# Matrix Factorization Implementation

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import logging
import tensorflow as tf
logging.getLogger("tensorflow").setLevel(logging.ERROR)

### Hyperparameters

In [2]:
# Default hyperparameters; the model class below uses them as constructor defaults.
lr = 3e-4    # gradient-descent step size
l2 = 4e-2    # weight of the L2 penalty
seed = 777   # random seed

## Load Dataset

In [3]:
# Read the three pre-formatted tables: startups, investors and funding rounds.
df_startups = pd.read_csv('./data/startups_formatted.csv')
df_investors = pd.read_csv('./data/investors_formatted.csv')
df_investments = pd.read_csv('./data/funding_round_formatted.csv')
# Constant placeholder column: gives the pivot table below a value to aggregate.
df_investments["X"] = "X"

### Generate Matrices

In [4]:
# Startup-Investor matrix: one row per funded_object_id, one column per
# investor_object_id. A cell holds the number of distinct "X" markers found for
# that (startup, investor) pair, and 0 where the pair has no funding rows.
startup_investor_matrix = pd.pivot_table(
    df_investments,
    index='funded_object_id',
    columns='investor_object_id',
    values='X',
    aggfunc=lambda marks: len(marks.unique()),
    fill_value=0,
)


In [5]:
# Display the pivoted startup-by-investor matrix as the cell output.
startup_investor_matrix

investor_object_id,0,1,2,3,4,5,6,7,8,9,...,7584,7585,7586,7587,7588,7589,7590,7591,7592,7593
funded_object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21482,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Startup feature matrix: select rows by the pivot's row labels so that row i
# of P describes the same startup as row i of the interaction matrix. The first
# two columns of the table are skipped.
P = df_startups.loc[startup_investor_matrix.index].iloc[:, 2:].to_numpy()

# Investor feature matrix (first two columns skipped as well).
# NOTE(review): rows are taken in file order, not selected by the pivot's
# column labels -- confirm that this order matches the matrix columns.
investor_features = df_investors.iloc[:, 2:].to_numpy()

# P @ Q.T requires both factors to have the same number of columns. Zero-pad
# the narrower one on the right; the width is computed from the data instead of
# being hard-coded, so a change in either CSV cannot silently break the match.
width_gap = P.shape[1] - investor_features.shape[1]
if width_gap >= 0:
  Q = np.pad(investor_features, [(0, 0), (0, width_gap)])
else:
  Q = investor_features
  P = np.pad(P, [(0, 0), (0, -width_gap)])

# From here on only the raw array is needed; the row/column labels are dropped.
startup_investor_matrix = startup_investor_matrix.values

In [7]:
# Show the shapes of the interaction matrix and of both feature matrices.
startup_investor_matrix.shape, P.shape, Q.shape

((17852, 7594), (17852, 221), (7594, 221))

## Model Training

In [8]:
class MatrixFactorization:
  """Base class that fits two factors P and Q to a matrix R by gradient descent.

  The class stores the data and the trainable factors and implements the
  update step and a training loop. Subclasses must implement `loss()`.
  """

  def __init__(self, R, P, Q, lr=lr, l2=l2, seed=seed):
    """Store the target matrix and create the trainable factors.

    Args:
      R: 2-D matrix to factorize; it must expose `.shape`.
      P: initial values of the first factor.
      Q: initial values of the second factor.
      lr: step size applied in `grad_update`.
      l2: regularization weight; stored here, not used by this class itself.
      seed: seed passed to the random-normal initializer.
    """
    self.R = tf.convert_to_tensor(R, dtype=tf.float32)
    # Boolean mask that is True wherever R is non-zero.
    self.mask = tf.not_equal(self.R, 0)
    self.m, self.n = R.shape
    self.lr = lr
    self.l2 = l2
    # Loss threshold below which `train` stops early.
    self.tol = .001
    # Initialize trainable weights.
    # The initializer is only stored; P and Q start from the arrays passed in.
    self.weight_init = tf.random_normal_initializer(seed=seed)
    self.P = tf.Variable(P, dtype=tf.float32)
    self.Q = tf.Variable(Q, dtype=tf.float32)

  def loss(self):
    """Return the scalar objective; must be provided by subclasses."""
    raise NotImplementedError

  def grad_update(self):
    """Apply one gradient-descent step to P and Q.

    The loss evaluated for this step is kept in `self.current_loss`.
    """
    factors = [self.P, self.Q]
    with tf.GradientTape() as tape:
      tape.watch(factors)
      self.current_loss = self.loss()
    gradients = tape.gradient(self.current_loss, factors)
    for factor, gradient in zip(factors, gradients):
      factor.assign_sub(self.lr * gradient)

  def train(self, n_epoch=3000):
    """Run at most `n_epoch` updates, stopping once the loss drops below `self.tol`."""
    for _ in range(n_epoch):
      self.grad_update()
      if self.current_loss < self.tol:
        break

class BinaryMF(MatrixFactorization):
  """Matrix factorization trained with a sigmoid cross-entropy objective.

  Only the entries selected by `self.mask` (non-zero entries of R) contribute
  to the data term of the loss.

  NOTE(review): the labels treat -1 as the negative class. If R contains only
  0/1 values, every selected entry gets label 1 and the model never sees a
  negative example -- confirm the intended encoding of R.
  """

  def _observed_labels(self):
    """Return 0/1 float targets for the entries of R selected by the mask."""
    # Cast 1/-1 as binary encoding of 0/1: -1 maps to 0, anything else to 1.
    observed = tf.boolean_mask(self.R, self.mask)
    return tf.cast(tf.not_equal(observed, -1), dtype=tf.float32)

  def train(self, n_epoch=3000):
    """Run `n_epoch` gradient updates, printing the loss every 50 epochs.

    Unlike the base class, this loop has no early stopping on `self.tol`.
    """
    self.labels = self._observed_labels()
    for epoch in range(n_epoch):
      self.grad_update()
      if epoch % 50 == 0:
        print(f"Epoch: {epoch} \t\t Loss: {round(float(self.current_loss), 4)}")

  def loss(self):
    """Cross entropy loss.

    Returns the mean sigmoid cross-entropy over the masked entries plus
    `self.l2` times the summed squares of P and Q.
    """
    # Build the targets on first use so that `loss()` and `grad_update()` also
    # work when they are called before `train()`.
    if getattr(self, "labels", None) is None:
      self.labels = self._observed_labels()
    logits = tf.boolean_mask(tf.matmul(self.P, self.Q, transpose_b=True), self.mask)
    logloss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=logits)
    mlogloss = tf.reduce_mean(logloss)
    l2_norm = tf.reduce_sum(self.P**2) + tf.reduce_sum(self.Q**2)
    return mlogloss + self.l2 * l2_norm

### Training Loop

In [9]:
# Build the model; lr and l2 here override the notebook-level defaults.
bmf_model = BinaryMF(startup_investor_matrix, P, Q, lr=0.03, l2=1e-4, seed=seed)

# Fit the factors (prints the loss every 50 epochs).
bmf_model.train()

# Predict: sigmoid of P @ Q^T, one score per (startup, investor) pair.
b_predictions = tf.sigmoid(
    tf.matmul(bmf_model.P, bmf_model.Q, transpose_b=True)
).numpy()

Epoch: 0 		 Loss: 4.7524
Epoch: 50 		 Loss: 4.7496
Epoch: 100 		 Loss: 4.7468
Epoch: 150 		 Loss: 4.744
Epoch: 200 		 Loss: 4.7412
Epoch: 250 		 Loss: 4.7384
Epoch: 300 		 Loss: 4.7356
Epoch: 350 		 Loss: 4.7329
Epoch: 400 		 Loss: 4.7301
Epoch: 450 		 Loss: 4.7274
Epoch: 500 		 Loss: 4.7246
Epoch: 550 		 Loss: 4.7218
Epoch: 600 		 Loss: 4.7191
Epoch: 650 		 Loss: 4.7163
Epoch: 700 		 Loss: 4.7136
Epoch: 750 		 Loss: 4.7108
Epoch: 800 		 Loss: 4.7081
Epoch: 850 		 Loss: 4.7054
Epoch: 900 		 Loss: 4.7026
Epoch: 950 		 Loss: 4.6999
Epoch: 1000 		 Loss: 4.6972
Epoch: 1050 		 Loss: 4.6944
Epoch: 1100 		 Loss: 4.6917
Epoch: 1150 		 Loss: 4.689
Epoch: 1200 		 Loss: 4.6862
Epoch: 1250 		 Loss: 4.6835
Epoch: 1300 		 Loss: 4.6808
Epoch: 1350 		 Loss: 4.6781
Epoch: 1400 		 Loss: 4.6753
Epoch: 1450 		 Loss: 4.6726
Epoch: 1500 		 Loss: 4.6699
Epoch: 1550 		 Loss: 4.6672
Epoch: 1600 		 Loss: 4.6645
Epoch: 1650 		 Loss: 4.6618
Epoch: 1700 		 Loss: 4.6591
Epoch: 1750 		 Loss: 4.6564
Epoch: 1800 		 Lo

In [10]:
# Binarize the interaction matrix in place: every positive entry becomes 1.
is_positive = startup_investor_matrix > 0
startup_investor_matrix[is_positive] = 1

# Score every (startup, investor) cell: ground truth vs. predicted probability.
y_true = startup_investor_matrix.ravel()
y_score = b_predictions.ravel()
auc = roc_auc_score(y_true, y_score)
print("AUC Score: {}".format(auc))

AUC Score: 0.9075085137733652
