<a href="https://colab.research.google.com/github/stuartheeb/cil-runtime-terror/blob/main/Samyak_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write
# CSV files under 'drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Load Data**

In [None]:
# Dimensions of the rating matrix: one row per user, one column per movie.
number_of_users = 10000
number_of_movies = 1000

# Each CSV row carries an 'Id' such as 'r44_c1' and its 'Prediction' (the rating).
data_pd = pd.read_csv('drive/My Drive/data_train.csv')

# Preview the first rows, leave one blank line, then report the frame's shape.
print(data_pd.head(5), end='\n\n')
print('Shape', data_pd.shape)

       Id  Prediction
0  r44_c1           4
1  r61_c1           3
2  r67_c1           4
3  r72_c1           3
4  r86_c1           5

Shape (1176952, 2)


**Create User-Item Matrix (-1 = missing rating)**

In [None]:
def extract_users_items_predictions(data_pd):
    """Split 'r<user>_c<movie>' identifiers into zero-based index arrays.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Frame with a ``Prediction`` column. The identifiers are taken from an
        ``Id`` column when present, otherwise from the frame's index, so the
        function works for both the training file and an ``Id``-indexed
        submission file.

    Returns
    -------
    users, movies : numpy.ndarray
        1-D integer arrays of zero-based row/column indices (the identifiers
        are one-based, hence the ``- 1``). Always 1-D, even for a single row.
    predictions : numpy.ndarray
        The values of the ``Prediction`` column, in the same row order.

    Raises
    ------
    ValueError
        If an identifier does not match the ``r<digits>_c<digits>`` pattern
        (the unmatched groups are NaN and cannot be cast to int).
    """
    id_strings = data_pd['Id'] if 'Id' in data_pd.columns else data_pd.index
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    coordinates = id_strings.str.extract(r'r(\d+)_c(\d+)').astype(int).to_numpy() - 1
    users, movies = coordinates[:, 0], coordinates[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

train_users, train_movies, train_predictions = extract_users_items_predictions(data_pd)

# Dense user x movie matrix; -1 marks a (user, movie) pair without a rating.
data = np.full((number_of_users, number_of_movies), -1)

# Scatter all observed ratings in one vectorized assignment instead of a
# Python loop over every row of the training file.
# NOTE(review): assumes each (user, movie) pair occurs at most once; NumPy does
# not guarantee which value wins when an index pair is repeated.
data[train_users, train_movies] = train_predictions
  

In [None]:
# Inspect the user x movie matrix (-1 = no rating); NumPy abbreviates the repr of large arrays.
data

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ...,  5,  3,  3],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1,  3]])

**Normalize the Ratings per Item (z-score)**

In [None]:
# Per-movie z-score normalisation of the observed ratings.
# Unobserved cells (-1 in `data`) stay at 0 in `norm_data`, i.e. at the movie mean.
observed = data != -1                            # True where a rating exists
rating_counts = observed.sum(axis=0)             # number of ratings per movie
safe_counts = np.maximum(rating_counts, 1)       # avoids 0/0 for movies without ratings

rating_sums = np.where(observed, data, 0).sum(axis=0)
movie_means = rating_sums / safe_counts          # 0 for movies without ratings

centered = np.where(observed, data - movie_means, 0.0)
# Population standard deviation (ddof=0), matching np.std's default.
movie_stds = np.sqrt((centered ** 2).sum(axis=0) / safe_counts)

# A movie whose ratings are all identical has std 0. Dividing by it would put
# NaN into the matrix handed to the SVD, so divide by 1 there instead; the
# centered values are already 0 for such a movie.
safe_stds = np.where(movie_stds > 0, movie_stds, 1.0)
norm_data = centered / safe_stds

# Kept as per-movie lists (indexable by movie) for the later de-normalisation.
mean_rating = movie_means.tolist()
std_rating = movie_stds.tolist()

**Perform SVD**

In [None]:
# Number of leading singular values kept for the low-rank approximation.
k_singular_values = 3
number_of_singular_values = min(number_of_users, number_of_movies)

assert(k_singular_values <= number_of_singular_values), "choose correct number of singular values"

# Thin SVD: U is (users x K), s has K entries, Vt is (K x movies), K = min(users, movies).
U, s, Vt = np.linalg.svd(norm_data, full_matrices=False)

# Diagonal matrix holding only the k leading singular values. It must be K x K
# to be conformable with the thin factors (K equals number_of_movies only when
# there are no more movies than users).
S = np.zeros((number_of_singular_values, number_of_singular_values))
S[:k_singular_values, :k_singular_values] = np.diag(s[:k_singular_values])

# Same product as U.dot(S).dot(Vt), but using only the k leading factors
# instead of multiplying through a mostly-zero dense S.
reconstructed_matrix = (U[:, :k_singular_values] * s[:k_singular_values]).dot(Vt[:k_singular_values, :])

**ALS Method on the SVD-Reconstructed Matrix (randomly initialized factors)**

In [None]:
class ExplicitMF:
    """
    Train a matrix factorization model using Alternating Least Squares
    to predict empty entries in a matrix

    Parameters
    ----------
    n_iters : int
        number of iterations to train the algorithm

    n_factors : int
        number of latent factors to use in matrix
        factorization model, some machine-learning libraries
        denote this as rank

    reg : float
        regularization term for item/user latent factors,
        since lambda is a keyword in python we use reg instead

    random_state : int or None, optional
        seed for the random initialisation of the factors; when None
        (the default) NumPy's global random state is used, so the
        result still follows any earlier ``np.random.seed`` call

    Attributes (set by ``fit``)
    ---------------------------
    user_factors : ndarray of shape (n_user, n_factors)
    item_factors : ndarray of shape (n_item, n_factors)
    train_mse_record, test_mse_record : list of float
        mean squared error after every iteration, computed over all
        cells of the respective matrix (no masking of entries)
    """

    def __init__(self, n_iters, n_factors, reg, random_state=None):
        self.reg = reg
        self.n_iters = n_iters
        self.n_factors = n_factors
        self.random_state = random_state

    def fit(self, train, test=None):
        """
        Fit the factors to ``train`` and record convergence.

        Both matrices are expected in User x Item form with cells as
        ratings. ``test`` is optional; when it is omitted only the
        training MSE is recorded and ``test_mse_record`` stays empty.
        Returns ``self``.
        """
        self.n_user, self.n_item = train.shape

        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.user_factors = rng.random_sample((self.n_user, self.n_factors))
        self.item_factors = rng.random_sample((self.n_item, self.n_factors))

        # record the training and testing mse for every iteration
        # to show convergence later (usually, not worth it for production)
        self.test_mse_record = []
        self.train_mse_record = []
        for _ in range(self.n_iters):
            self.user_factors = self._als_step(train, self.user_factors, self.item_factors)
            self.item_factors = self._als_step(train.T, self.item_factors, self.user_factors)
            predictions = self.predict()
            self.train_mse_record.append(self._mse(train, predictions))
            if test is not None:
                self.test_mse_record.append(self._mse(test, predictions))

        return self

    @staticmethod
    def _mse(ratings, predictions):
        """mean squared error over every cell of the two matrices"""
        return float(np.mean((ratings - predictions) ** 2))

    def _als_step(self, ratings, solve_vecs, fixed_vecs):
        """
        when updating the user matrix,
        the item matrix is the fixed vector and vice versa

        ``solve_vecs`` is accepted for interface compatibility only; the
        new factors are computed from ``ratings`` and ``fixed_vecs`` alone.
        """
        A = fixed_vecs.T.dot(fixed_vecs) + np.eye(self.n_factors) * self.reg
        b = ratings.dot(fixed_vecs)
        # A is symmetric, so b.dot(inv(A)) equals solve(A, b.T).T; solving the
        # system avoids forming the explicit inverse.
        return np.linalg.solve(A, b.T).T

    def predict(self):
        """predict ratings for every user and item"""
        pred = self.user_factors.dot(self.item_factors.T)
        return pred
 

In [None]:
# ALS with 3 latent factors, 20 alternating sweeps and regularisation 0.1.
# The SVD reconstruction is passed as both `train` and `test`, so no held-out
# data is involved in this fit.
als = ExplicitMF(n_iters = 20, n_factors = 3, reg = 0.1)
als.fit(reconstructed_matrix,reconstructed_matrix)

<__main__.ExplicitMF at 0x7fc7bb2e3750>

In [None]:
# Dense user x movie matrix of ALS predictions, still on the normalised (z-score) scale.
predict_matrix = als.predict()

In [None]:
# Undo the per-movie z-score: multiply each column by that movie's std and add
# its mean (broadcast across all users).
# The result is recomputed from als.predict() rather than updated in place, so
# re-running this cell does not de-normalise the matrix a second time.
predict_matrix = als.predict() * np.asarray(std_rating) + np.asarray(mean_rating)

In [None]:
# Inspect the de-normalised predictions; NumPy abbreviates the repr of large arrays.
predict_matrix

array([[3.37368071, 3.49815172, 3.4747804 , ..., 3.23837057, 3.34842777,
        3.67829539],
       [3.35932916, 3.4962577 , 3.45617099, ..., 3.24086679, 3.33558994,
        3.64978608],
       [3.35654582, 3.49418393, 3.46953203, ..., 3.22900577, 3.32434184,
        3.60498506],
       ...,
       [3.3745746 , 3.49674744, 3.48316892, ..., 3.23184541, 3.34507985,
        3.664877  ],
       [3.36804292, 3.49359901, 3.47585139, ..., 3.22911244, 3.33750603,
        3.65342832],
       [3.39783717, 3.51726267, 3.51448895, ..., 3.25148143, 3.37448746,
        3.67449417]])

**Evaluation of the Predicted Matrix on the Training Ratings**

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(x, y):
    """Return the square root of mean_squared_error(x, y)."""
    return math.sqrt(mean_squared_error(x, y))


# test our predictions with the true values
def get_score(predictions, target_values):
    """Return the RMSE between `predictions` and `target_values` (delegates to `rmse`)."""
    return rmse(predictions, target_values)

def extract_prediction_from_full_matrix(reconstructed_matrix, users, movies):
    """Look up the predictions for the given (user, movie) pairs.

    Parameters
    ----------
    reconstructed_matrix : array-like of shape (m, n)
        Full matrix of predictions, indexed as [user][movie].
    users, movies : sequence of int
        Zero-based row and column indices; must have equal length.

    Returns
    -------
    numpy.ndarray
        1-D float array whose i-th entry is
        ``reconstructed_matrix[users[i]][movies[i]]``.
    """
    assert(len(users) == len(movies)), "users-movies combinations specified should have equal length"

    # Explicit integer dtype so that empty index lists are valid indices too.
    user_idx = np.asarray(users, dtype=np.intp)
    movie_idx = np.asarray(movies, dtype=np.intp)

    # One fancy-indexing lookup replaces the per-pair Python loop; only the
    # selected entries are converted to float.
    return np.asarray(reconstructed_matrix)[user_idx, movie_idx].astype(float)

In [None]:
# Look up the predictions at the (user, movie) pairs of the training ratings.
predictions = extract_prediction_from_full_matrix(predict_matrix,train_users,train_movies)

# NOTE(review): this compares against the training ratings themselves, so the
# printed value is an in-sample RMSE, not a held-out estimate.
print("RMSE using SVD is: {:.4f}".format(get_score(predictions,train_predictions)))

RMSE using SVD is: 1.0019


**Submission Code**

In [None]:
# Load the sample submission with 'Id' as the index; the cells below parse that
# index into (user, movie) pairs and overwrite the 'Prediction' column.
sub_pd = pd.read_csv('drive/My Drive/sampleSubmission.csv',index_col='Id')

In [None]:
def extract_users_items_predictions(data_pd):
    """Split 'r<user>_c<movie>' identifiers into zero-based index arrays.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Frame with a ``Prediction`` column. The identifiers are taken from an
        ``Id`` column when present, otherwise from the frame's index, so the
        function works for an ``Id``-indexed submission frame as well as for
        a frame that still carries ``Id`` as a regular column.

    Returns
    -------
    users, movies : numpy.ndarray
        1-D integer arrays of zero-based row/column indices (the identifiers
        are one-based, hence the ``- 1``). Always 1-D, even for a single row.
    predictions : numpy.ndarray
        The values of the ``Prediction`` column, in the same row order.

    Raises
    ------
    ValueError
        If an identifier does not match the ``r<digits>_c<digits>`` pattern
        (the unmatched groups are NaN and cannot be cast to int).
    """
    id_strings = data_pd['Id'] if 'Id' in data_pd.columns else data_pd.index
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    coordinates = id_strings.str.extract(r'r(\d+)_c(\d+)').astype(int).to_numpy() - 1
    users, movies = coordinates[:, 0], coordinates[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

users, movies, subpred = extract_users_items_predictions(sub_pd)

# Dense user x movie matrix holding the sample file's placeholder predictions;
# 0 marks a (user, movie) pair that is not part of the submission.
subdata = np.full((number_of_users, number_of_movies), 0)

# One vectorized scatter instead of a Python loop over every submission row.
# NOTE(review): assumes each (user, movie) pair occurs at most once; NumPy does
# not guarantee which value wins when an index pair is repeated.
subdata[users, movies] = subpred

In [None]:
# Read the prediction for every requested (user, movie) pair directly, in the
# same row order as sub_pd. Scanning the whole matrix instead would only line
# up with sub_pd if the sample file were sorted movie-major and every
# placeholder value were non-zero.
Id = sub_pd.index.tolist()
pred = predict_matrix[users, movies]

assert len(pred) == len(sub_pd), "one prediction per submission row expected"

In [None]:
# Overwrite the placeholder column; `pred` is assigned positionally, so its
# order must match the row order of sub_pd.
sub_pd['Prediction'] = pred

In [None]:
# Write the submission to Drive; the 'Id' index is written as the first column.
sub_pd.to_csv("drive/My Drive/submission1.csv")