https://everdark.github.io/k9/notebooks/ml/matrix_factorization/matrix_factorization.nb.html#22_binary_matrix_factorization

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import logging
import tensorflow as tf
logging.getLogger("tensorflow").setLevel(logging.ERROR)

### Hyperparameters

In [2]:
# Notebook-wide defaults; they become the default arguments of the
# MatrixFactorization constructor defined further down.
seed = 777   # random seed
lr = 3e-4    # learning rate (gradient step size)
l2 = 4e-2    # weight of the L2 penalty

## Load Dataset

In [3]:
# Startup and investor tables.
df_startups = pd.read_csv('./data/startups_formatted.csv')
df_investors = pd.read_csv('./data/investors_formatted.csv')
# Funding rounds, plus a constant dummy column "X" that the pivot below
# counts to build the startup-investor matrix.
df_investments = pd.read_csv('./data/funding_round_formatted.csv').assign(X="X")

### Generate Matrices

##### Note: 1 means a link exists, 0 means missing (to be predicted), and -1 means no link

In [4]:
import random
def split_ele(ele):
    '''Randomly relabel a zero entry; return any other value unchanged.

    A zero is drawn from ten equally likely outcomes: eight are -1 and two
    are 0, i.e. 80% become -1 and 20% stay 0.
    '''
    if ele == 0:
        outcomes = (-1,) * 8 + (0,) * 2
        return random.choice(outcomes)
    return ele


In [5]:
def train_test_split(series):
    """Run ``split_ele`` on every element of ``series``; return the results as a list."""
    return list(map(split_ele, series))

In [6]:
# Startup-Investor matrix: one row per funded_object_id, one column per
# investor_object_id. Each cell counts the funding rows linking that pair
# (the dummy "X" column is what gets counted); pairs with no rows are 0.
startup_investor_matrix = df_investments.pivot_table(index='funded_object_id', columns='investor_object_id',
               values='X', aggfunc='count', fill_value=0)

# Disabled step: column by column, randomly relabel roughly 80% of the zero
# cells as -1 ("no link").
# NOTE(review): while this stays disabled the matrix holds no -1 entries, so
# the model's non-zero mask keeps only linked pairs and every training label
# comes out as 1 -- confirm that this is intended.
#startup_investor_matrix = startup_investor_matrix.apply(train_test_split)

In [7]:

# Startup feature matrix: rows are looked up by label in the order of the
# startup-investor matrix's index; the first two columns are dropped.
P = df_startups.loc[startup_investor_matrix.index].iloc[:, 2:].values

# Investor feature matrix (first two columns dropped), zero-padded on the
# right so that it has exactly as many columns as P. The model multiplies P
# by Q transposed, so both must share the same width. The pad width is
# derived from the data rather than hard-coded.
# NOTE(review): rows are taken in file order -- confirm that they line up
# with the investor_object_id columns of the startup-investor matrix.
investor_features = df_investors.iloc[:, 2:].to_numpy()
n_pad_cols = P.shape[1] - investor_features.shape[1]
if n_pad_cols < 0:
    raise ValueError(
        "Investor features have %d columns but startup features only %d; "
        "cannot pad Q to match P." % (investor_features.shape[1], P.shape[1])
    )
Q = np.pad(investor_features, [(0, 0), (0, n_pad_cols)])

# Only the raw array is needed from here on. Because the name is rebound,
# this cell can only be re-run after re-running the pivot cell above.
startup_investor_matrix = startup_investor_matrix.values


In [8]:
# Shapes of the interaction matrix, the startup features and the investor features.
tuple(arr.shape for arr in (startup_investor_matrix, P, Q))

((17852, 7594), (17852, 221), (7594, 221))

## Model Training

In [11]:
class MatrixFactorization:
  """Base class for gradient-descent factorization of R into P and Q.

  Subclasses implement `loss()`; this class stores the data, holds the
  trainable factors and provides the plain gradient-descent update loop.

  Args:
    R: 2-D interaction matrix (anything with a `.shape` that TensorFlow can
      convert to float32).
    P: initial row-factor matrix, one row per row of R.
    Q: initial column-factor matrix, one row per column of R, with the same
      number of columns as P.
    lr: gradient-descent step size.
    l2: L2 penalty weight, stored for use by subclasses.
    seed: seed for `weight_init`.

  Raises:
    ValueError: if P and Q are not 2-D or their shapes do not fit R.
  """

  def __init__(self, R, P, Q, lr=lr, l2=l2, seed=seed):
    self.R = tf.convert_to_tensor(R, dtype=tf.float32)
    # True wherever R is non-zero.
    self.mask = tf.not_equal(self.R, 0)
    self.m, self.n = R.shape
    self.lr = lr
    self.l2 = l2
    self.tol = .001
    # Kept as an attribute for compatibility. It is not used to initialise
    # P and Q, which start from the matrices supplied by the caller.
    self.weight_init = tf.random_normal_initializer(seed=seed)
    # Trainable weights.
    self.P = tf.Variable(P, dtype=tf.float32)
    self.Q = tf.Variable(Q, dtype=tf.float32)
    self._check_shapes()

  def _check_shapes(self):
    """Fail early unless P is (m, k) and Q is (n, k) for R of shape (m, n)."""
    p_shape = self.P.shape.as_list()
    q_shape = self.Q.shape.as_list()
    if len(p_shape) != 2 or len(q_shape) != 2:
      raise ValueError(
          "P and Q must be 2-D, got shapes %s and %s." % (p_shape, q_shape))
    if p_shape[0] != self.m or q_shape[0] != self.n or p_shape[1] != q_shape[1]:
      raise ValueError(
          "Incompatible shapes: R is (%s, %s), P is %s, Q is %s; expected "
          "P of shape (m, k) and Q of shape (n, k)."
          % (self.m, self.n, p_shape, q_shape))

  def loss(self):
    """Return the scalar training loss; must be provided by subclasses."""
    raise NotImplementedError

  def grad_update(self):
    """Take one gradient-descent step on P and Q.

    The loss evaluated before the step is stored in `self.current_loss`.
    """
    # Trainable tf.Variables are watched by the tape automatically, so no
    # explicit watch() call is needed.
    with tf.GradientTape() as t:
      self.current_loss = self.loss()
    gP, gQ = t.gradient(self.current_loss, [self.P, self.Q])
    self.P.assign_sub(self.lr * gP)
    self.Q.assign_sub(self.lr * gQ)

  def train(self, n_epoch=3000):
    """Run up to `n_epoch` updates, stopping once the loss drops below `self.tol`."""
    for epoch in range(n_epoch):
      self.grad_update()
      if self.current_loss < self.tol:
        break

class BinaryMF(MatrixFactorization):
  """Matrix factorization with a sigmoid cross-entropy loss.

  Only entries selected by `self.mask` (the non-zero entries of R) enter the
  loss. Among them an entry equal to -1 is the negative class (0) and any
  other value is the positive class (1).
  """

  def _binary_labels(self):
    """Return the 0/1 targets for the masked entries of R (-1 -> 0, otherwise 1)."""
    observed = tf.boolean_mask(self.R, self.mask)
    return tf.cast(tf.not_equal(observed, -1), dtype=tf.float32)

  def train(self, n_epoch=3000):
    """Run `n_epoch` gradient updates, printing the loss every 50 epochs."""
    # Rebuilt on every call, exactly as before.
    self.labels = self._binary_labels()
    for epoch in range(n_epoch):
      self.grad_update()
      if epoch % 50 == 0:
        print(epoch, "\t\t", self.current_loss)

  # The implementation is far from optimized since we don't need the product of entire P'Q.
  # We only need scores for non-missing entries.
  # The code is hence for educational purpose only.
  def loss(self):
    """Mean cross-entropy over the masked entries plus the L2 penalty.

    Returns:
      Scalar tensor: mean sigmoid cross-entropy between the labels and the
      masked entries of P Q^T, plus `self.l2` times the sum of squared
      entries of P and Q.
    """
    # Build the labels on demand so that loss() also works before train()
    # has been called (it used to fail with AttributeError).
    labels = getattr(self, "labels", None)
    if labels is None:
      labels = self.labels = self._binary_labels()
    logits = tf.boolean_mask(tf.matmul(self.P, self.Q, transpose_b=True), self.mask)
    logloss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
    mlogloss = tf.reduce_mean(logloss)
    l2_norm = tf.reduce_sum(self.P**2) + tf.reduce_sum(self.Q**2)
    return mlogloss + self.l2 * l2_norm

### Training Loop

In [12]:
# Fit the binary model on the interaction matrix, starting P and Q from the
# feature matrices. The lr and l2 passed here replace the notebook defaults.
bmf_model = BinaryMF(startup_investor_matrix, P, Q, lr=.03, l2=.0001, seed=seed)
bmf_model.train()   # train (default 3000 epochs, loss printed every 50)
# Predicted link probability for every (startup, investor) pair: sigmoid(P Q^T).
b_predictions = tf.sigmoid(tf.matmul(bmf_model.P, bmf_model.Q, transpose_b=True)).numpy()   # predict
b_predictions

0 		 tf.Tensor(4.75237, shape=(), dtype=float32)
50 		 tf.Tensor(4.749568, shape=(), dtype=float32)
100 		 tf.Tensor(4.7467685, shape=(), dtype=float32)
150 		 tf.Tensor(4.7439785, shape=(), dtype=float32)
200 		 tf.Tensor(4.7411885, shape=(), dtype=float32)
250 		 tf.Tensor(4.738403, shape=(), dtype=float32)
300 		 tf.Tensor(4.7356343, shape=(), dtype=float32)
350 		 tf.Tensor(4.7328715, shape=(), dtype=float32)
400 		 tf.Tensor(4.7301116, shape=(), dtype=float32)
450 		 tf.Tensor(4.727355, shape=(), dtype=float32)
500 		 tf.Tensor(4.724598, shape=(), dtype=float32)
550 		 tf.Tensor(4.7218447, shape=(), dtype=float32)
600 		 tf.Tensor(4.719091, shape=(), dtype=float32)
650 		 tf.Tensor(4.716343, shape=(), dtype=float32)
700 		 tf.Tensor(4.7135954, shape=(), dtype=float32)
750 		 tf.Tensor(4.710849, shape=(), dtype=float32)
800 		 tf.Tensor(4.7081065, shape=(), dtype=float32)
850 		 tf.Tensor(4.705364, shape=(), dtype=float32)
900 		 tf.Tensor(4.702625, shape=(), dtype=float32)
950 		 

array([[0.6343095 , 0.5262393 , 0.5018861 , ..., 0.5015253 , 0.5038488 ,
        0.50164855],
       [0.58984995, 0.5175127 , 0.50100666, ..., 0.5006924 , 0.5022855 ,
        0.50076795],
       [0.58833534, 0.5188004 , 0.5021503 , ..., 0.50201315, 0.5033142 ,
        0.50209415],
       ...,
       [0.5932297 , 0.51873016, 0.5018867 , ..., 0.5018798 , 0.5033861 ,
        0.5019858 ],
       [0.63097733, 0.52629554, 0.50209755, ..., 0.5022282 , 0.5040083 ,
        0.5021026 ],
       [0.53636205, 0.5071791 , 0.50023425, ..., 0.50012684, 0.5008801 ,
        0.5001847 ]], dtype=float32)

In [15]:
# Indicator of the non-zero interactions, with the same shape and dtype as
# the interaction matrix: 1 where a cell is non-zero, 0 elsewhere.
b_mask = (startup_investor_matrix != 0).astype(startup_investor_matrix.dtype)
# Predictions with every zero-interaction cell blanked out, to 2 decimals.
(b_predictions * b_mask).round(2)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# Binary ground truth built as a separate array, so the interaction matrix
# is no longer modified in place: any positive count is a link (1), and
# everything else (0, or -1 if present) is a non-link (0). This keeps the
# target strictly binary for roc_auc_score.
y_true = (startup_investor_matrix > 0).astype(np.int8)
# NOTE(review): the score covers every cell of the matrix, including the
# cells the model was fitted on, so it is not a held-out estimate.
roc_auc_score(y_true.ravel(), b_predictions.ravel())

0.9075085137733652

In [20]:
# Display the predicted-probability matrix again (same array as produced by the training cell).
b_predictions

array([[0.6343095 , 0.5262393 , 0.5018861 , ..., 0.5015253 , 0.5038488 ,
        0.50164855],
       [0.58984995, 0.5175127 , 0.50100666, ..., 0.5006924 , 0.5022855 ,
        0.50076795],
       [0.58833534, 0.5188004 , 0.5021503 , ..., 0.50201315, 0.5033142 ,
        0.50209415],
       ...,
       [0.5932297 , 0.51873016, 0.5018867 , ..., 0.5018798 , 0.5033861 ,
        0.5019858 ],
       [0.63097733, 0.52629554, 0.50209755, ..., 0.5022282 , 0.5040083 ,
        0.5021026 ],
       [0.53636205, 0.5071791 , 0.50023425, ..., 0.50012684, 0.5008801 ,
        0.5001847 ]], dtype=float32)