In [1]:
import numpy as np
import time
from scipy.sparse import load_npz, issparse
import pandas as pd

In [2]:
def cross_validation(X, n_samples=5):
    """Yield boolean ``(train, test)`` row masks for a shuffled k-fold split.

    The row indices of ``X`` are shuffled once with ``np.random.shuffle`` and
    cut into ``n_samples`` consecutive folds of ``X.shape[0] // n_samples``
    rows each; the last fold additionally receives any leftover rows.

    Each yielded pair consists of two boolean arrays of length ``X.shape[0]``
    that are complements of each other: ``test`` marks the rows of the
    current fold, ``train`` marks all remaining rows.
    """
    n_rows = X.shape[0]
    shuffled = np.arange(n_rows)
    np.random.shuffle(shuffled)
    fold_len = n_rows // n_samples

    # (start, stop) slice bounds per fold; ``None`` lets the final fold run to
    # the end of the shuffled indices so it also picks up the remainder.
    bounds = [(k * fold_len, (k + 1) * fold_len) for k in range(n_samples - 1)]
    bounds.append(((n_samples - 1) * fold_len, None))

    for lo, hi in bounds:
        test = np.zeros(n_rows, dtype=bool)
        test[shuffled[lo:hi]] = True
        yield ~test, test

In [3]:
def mserror(y, y_pred):
    """Mean squared error between targets ``y`` and predictions ``y_pred``.

    Both arguments are converted with ``np.asarray``, so lists are accepted.
    When the two inputs hold the same number of elements but have different
    shapes (for example ``(n,)`` targets against ``(n, 1)`` predictions),
    ``y_pred`` is reshaped to the shape of ``y`` before subtracting.  Without
    that step NumPy would broadcast the pair to an ``(n, n)`` array and the
    result would be wrong.  Inputs of different sizes (e.g. a scalar
    prediction) still follow ordinary NumPy broadcasting.

    Returns the sum of squared differences divided by ``y.size``.
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)
    if y_pred.shape != y.shape and y_pred.size == y.size:
        # Same element count, different layout: align instead of broadcasting.
        y_pred = y_pred.reshape(y.shape)
    return np.sum(np.subtract(y_pred, y) ** 2) / y.size

def rmserror(y, y_pred):
    """Root mean squared error: the square root of ``mserror(y, y_pred)``."""
    mean_squared = mserror(y, y_pred)
    return np.sqrt(mean_squared)

In [4]:
class FactorizationMachine(object):
    """Second-order factorization machine for regression, trained by mini-batch SGD.

    The prediction for a batch ``X`` is::

        w0 + X @ W + 0.5 * sum_f((X @ V)[:, f]**2 - (X**2 @ V**2)[:, f])

    and the parameters are fitted by gradient descent on the mean squared
    error of each mini-batch.  ``X`` may be a dense NumPy array or a SciPy
    sparse matrix (checked with ``scipy.sparse.issparse``).
    """

    def __init__(self, step=0.000001, k=5):
        """Create an unfitted model.

        Parameters
        ----------
        step : float
            Initial learning rate; during ``fit`` the rate used in epoch
            ``e`` (0-based) is ``step / sqrt(e + 1)``.
        k : int
            Number of latent factors per feature (columns of ``V``).
        """
        self.k = k
        self.w0 = 0        # global bias
        self.W = None      # linear weights, shape (d, 1); created by fit
        self.V = None      # latent factors, shape (d, k); created by fit
        self.Z = None      # cache: X @ V from the most recent predict call
        self.X2 = None     # cache: element-wise square of X from the most recent predict call
        self.initStep = step
        self.step = self.initStep

    def predict(self, X):
        """Return the model output for every row of ``X`` as an ``(n, 1)`` array.

        Side effect: stores ``self.Z = X @ V`` and ``self.X2`` (element-wise
        square of ``X``).  ``fit`` reads both right after predicting a batch
        to build the gradient for that same batch.

        ``W`` and ``V`` are ``None`` until ``fit`` has run, so calling this
        on an unfitted model fails.
        """
        self.Z = X @ self.V
        if issparse(X):
            self.X2 = X.power(2)
        else:
            self.X2 = X**2
        return self.w0 + X @ self.W + np.sum(self.Z**2 - (self.X2 @ self.V**2), axis=1, keepdims=True) / 2

    def get_batches(self, dataset, batch_size):
        """Yield ``(X_batch, Y_batch)`` pairs covering ``dataset`` once in random order.

        ``dataset`` is an ``(X, Y)`` pair.  Row indices are shuffled with
        ``np.random.shuffle`` and consumed in chunks of ``batch_size``; the
        final chunk may be shorter.  Both ``X`` and ``Y`` are indexed with an
        integer index array.
        """
        X, Y = dataset
        n_samples = X.shape[0]

        indices = np.arange(n_samples)
        np.random.shuffle(indices)

        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            batch_idx = indices[start:end]

            yield X[batch_idx], Y[batch_idx]

    def update_learning_rate(self, epoch):
        """Set ``self.step`` to ``initStep / sqrt(epoch + 1)`` (``epoch`` is 0-based)."""
        self.step = self.initStep / np.sqrt(epoch + 1)

    def fit(self, X, y, n_epoch=2, batch_size=128):
        """Train the model on ``(X, y)`` with mini-batch gradient descent.

        Parameters
        ----------
        X : array or sparse matrix, shape (n, d)
        y : array-like with ``n`` target values
        n_epoch : int
            Number of full passes over the data.
        batch_size : int
            Maximum number of rows per gradient step.

        ``W`` and ``V`` are re-drawn uniformly from ``[-1/sqrt(n), 1/sqrt(n)]``
        on every call.
        NOTE(review): ``w0`` is *not* reset here, so a second ``fit`` on the
        same instance starts from the previous bias - confirm this is intended.
        """
        n, d = X.shape
        # Plain ndarray so that batch indexing in get_batches is positional.
        y = np.asarray(y)

        stdv = 1 / np.sqrt(n)
        self.W = np.random.uniform(-stdv, stdv, size=(d, 1))
        self.V = np.random.uniform(-stdv, stdv, size=(d, self.k))

        for epoch in range(n_epoch):
            # The learning rate depends only on the epoch, so set it once here
            # rather than once per batch.
            self.update_learning_rate(epoch)

            for x_batch, y_batch in self.get_batches((X, y), batch_size):
                # predict() also refreshes self.Z and self.X2 for this batch.
                predictions = self.predict(x_batch)
                # Gradient of the batch MSE w.r.t. the predictions, shape (b, 1).
                dLoss = 2 * np.subtract(predictions, y_batch.reshape(-1, 1)) / len(y_batch)
                x_batch_t = x_batch.transpose()

                self.w0 -= np.multiply(self.step, np.sum(dLoss))
                self.W -= np.multiply(self.step, x_batch_t @ dLoss)

                # Gradient w.r.t. V for all k factors at once.  Per sample the
                # derivative of the prediction w.r.t. V[i, f] is
                #     x_i * Z_f - V[i, f] * x_i**2
                # so summing over the batch weighted by dLoss gives
                #     X^T @ (dLoss * Z) - (X2^T @ dLoss) * V
                # This uses only matrix products, which behave the same for
                # dense and sparse batches, and avoids a Python loop over factors.
                scaled = np.multiply(self.step, dLoss)
                self.V -= x_batch_t @ (scaled * self.Z) - (self.X2.transpose() @ scaled) * self.V

In [5]:
# Feature matrix (SciPy sparse) and target vector for the 100-film subset.
X = load_npz("sparse_X_100_films.npz")
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the context manager closes it once the array has been read into memory.
with np.load("y_100_films.npz") as y_archive:
    y = y_archive["arr_0"]

In [6]:
# Per-fold results, filled in fold order.
time_train = []
rmse_train = []
rmse_test = []

for train, test in cross_validation(X):
    fm = FactorizationMachine(step = 0.1, k = 3)
    # perf_counter is the clock intended for measuring elapsed intervals.
    start = time.perf_counter()
    # Slice the training fold once (still inside the timed region, as before)
    # and reuse it for fitting and for the train-set score below.
    X_train, y_train = X[train], y[train]
    fm.fit(X_train, y_train, n_epoch = 1, batch_size = 512)
    stop = time.perf_counter()

    time_train_i = (stop - start) / 60
    print('Training time: %.5f min' % (time_train_i))
    time_train.append(time_train_i)

    # predict returns an (n, 1) column; flatten to match the 1-D targets.
    predictions_train = fm.predict(X_train).reshape(-1)
    predictions_test = fm.predict(X[test]).reshape(-1)
    rmse_train_i = rmserror(y_train, predictions_train)
    rmse_test_i = rmserror(y[test], predictions_test)
    print('RMSE train: %.5f, RMSE test: %.5f' % (rmse_train_i, rmse_test_i))
    rmse_train.append(rmse_train_i)
    rmse_test.append(rmse_test_i)

Training time: 9.48437 min
RMSE train: 0.99572, RMSE test: 0.99593
Training time: 9.39786 min
RMSE train: 0.99668, RMSE test: 0.99674
Training time: 9.70138 min
RMSE train: 0.99596, RMSE test: 0.99601
Training time: 10.11357 min
RMSE train: 0.99594, RMSE test: 0.99643
Training time: 9.65815 min
RMSE train: 0.99566, RMSE test: 0.99686


In [7]:
# Gather the per-fold metrics; after transposing, each metric is a row and
# each fold is a column.
result_dictionary = {
    'RMSE train': rmse_train,
    'RMSE test': rmse_test,
    'Training time': time_train,
}
result = pd.DataFrame(result_dictionary).transpose()

In [8]:
# Display the per-fold summary table (rows: metrics, columns: fold index).
result

Unnamed: 0,0,1,2,3,4
RMSE train,0.995722,0.996675,0.995955,0.995945,0.995662
RMSE test,0.995929,0.996742,0.99601,0.996432,0.996862
Training time,9.484369,9.397865,9.701377,10.113574,9.658153


In [5]:
# Feature matrix (SciPy sparse) and target vector for the single-file dataset.
X = load_npz("sparse_X_1_file.npz")
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# the context manager closes it once the array has been read into memory.
with np.load("y_1_file.npz") as y_archive:
    y = y_archive["arr_0"]

In [6]:
# Per-fold results, filled in fold order.
time_train = []
rmse_train = []
rmse_test = []

for train, test in cross_validation(X):
    fm = FactorizationMachine(step = 0.1, k = 3)
    # perf_counter is the clock intended for measuring elapsed intervals.
    start = time.perf_counter()
    # Slice the training fold once (still inside the timed region, as before)
    # and reuse it for fitting and for the train-set score below.
    X_train, y_train = X[train], y[train]
    fm.fit(X_train, y_train, n_epoch = 1, batch_size = 512)
    stop = time.perf_counter()

    time_train_i = (stop - start) / 60
    print('Training time: %.5f min' % (time_train_i))
    time_train.append(time_train_i)

    # predict returns an (n, 1) column; flatten to match the 1-D targets.
    predictions_train = fm.predict(X_train).reshape(-1)
    predictions_test = fm.predict(X[test]).reshape(-1)
    rmse_train_i = rmserror(y_train, predictions_train)
    rmse_test_i = rmserror(y[test], predictions_test)
    print('RMSE train: %.5f, RMSE test: %.5f' % (rmse_train_i, rmse_test_i))
    rmse_train.append(rmse_train_i)
    rmse_test.append(rmse_test_i)

Training time: 23.81083 min
RMSE train: 1.00949, RMSE test: 1.01009
Training time: 23.61169 min
RMSE train: 1.01089, RMSE test: 1.01050
Training time: 24.08423 min
RMSE train: 1.03048, RMSE test: 1.03110
Training time: 23.48509 min
RMSE train: 1.00879, RMSE test: 1.00942
Training time: 23.89751 min
RMSE train: 1.00876, RMSE test: 1.00941


In [7]:
# Gather the per-fold metrics; after transposing, each metric is a row and
# each fold is a column.
result_dictionary = {
    'RMSE train': rmse_train,
    'RMSE test': rmse_test,
    'Training time': time_train,
}
result = pd.DataFrame(result_dictionary).transpose()

In [8]:
# Display the per-fold summary table (rows: metrics, columns: fold index).
result

Unnamed: 0,0,1,2,3,4
RMSE train,1.009488,1.010885,1.030476,1.008787,1.008758
RMSE test,1.010086,1.010502,1.031098,1.009424,1.00941
Training time,23.810829,23.611692,24.084233,23.485086,23.897509
