In [62]:
import pandas as pd
import statistics
import numpy as np
from io import StringIO
import numpy.linalg as lin
import matplotlib.pyplot as plt
import scipy.sparse
from scipy.sparse.linalg import svds
import random
import sys
import time
from pandas import Series
from math import pow
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Task 1: Naive Approaches

### Import Data

In [63]:
movies = pd.read_table('ml-1m/movies.dat',
                   header=None, 
                   encoding= 'ISO-8859-1',
                   engine='python', 
                   sep = '::')
movies.columns = ['MovieID','Title','Genres']

In [64]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [65]:
ratings = pd.read_table('ml-1m/ratings.dat',
                   header=None, 
                   encoding= 'ISO-8859-1',
                   engine='python', 
                   sep = '::')
ratings.columns = ['UserID','MovieID','Rating','Timestamp']

In [66]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [67]:
users = pd.read_table('ml-1m/users.dat',
                   header=None, 
                   encoding= 'ISO-8859-1',
                   engine='python', 
                   sep = '::')
users.columns = ['UserID','Gender','Age','Occupation','Zip-code']

In [68]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


### Calculating the 5 formulas from slide  17: 
R_global(User, Item)=mean(all ratings)

R_item(User, Item)=mean(all ratings for Item)

R_user(User, Item)=mean(all ratings for User)

In [69]:
#R_global i.e. the mean of all ratings
R_global = ratings['Rating'].mean()

In [70]:
#R_user i.e. the mean of all ratings per user
R_user = ratings.groupby(['UserID'])['Rating'].mean()

In [71]:
#R_item i.e. the mean of all ratings per movie
R_movie = ratings.groupby(['MovieID'])['Rating'].mean()

In [72]:
R = pd.merge(ratings,R_user,on='UserID')
R = pd.merge(R,R_movie, on='MovieID')
R = R[['Rating_x','Rating_y', 'Rating', 'MovieID', 'UserID']]
R = R.rename(columns={"Rating_x": "Rating", "Rating_y": "R_user", "Rating": "R_movie"})
R.head()

Unnamed: 0,Rating,R_user,R_movie,MovieID,UserID
0,5,4.188679,4.390725,1193,1
1,5,3.713178,4.390725,1193,2
2,4,3.826087,4.390725,1193,12
3,4,3.323383,4.390725,1193,15
4,5,4.075829,4.390725,1193,17


### 5-fold Cross Validation 

#### For R_global

In [73]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)
y = ratings["Rating"]


RMSE_list = []
MAE_list = []


for train_index, test_index in kf5.split(ratings):
    #print("TRAIN:", train_index, "TEST:", test_index)
    y_tst = y[test_index]
    R_global = ratings.iloc[train_index]['Rating'].mean()
    X_global = np.full(len(y_tst), R_global)
    RMSE_list.append(mean_squared_error(X_global, y_tst, squared=False))
    MAE_list.append(mean_absolute_error(X_global, y_tst))
    
    
print(RMSE_list)
print(MAE_list)

print("Average RMSE over the 5 splits:", statistics.mean(RMSE_list))
print("Average MAE over the 5 splits:", statistics.mean(MAE_list))

[1.1179186657505382, 1.1169571541967311, 1.1164811584703562, 1.1165766550458018, 1.1175860868686842]
[0.9350448623301201, 0.9336287710304297, 0.9334056641662095, 0.9328779368001785, 0.9343568646146218]
Average RMSE over the 5 splits: 1.1171039440664223
Average MAE over the 5 splits: 0.9338628197883119


#### For R_movie

In [74]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)


RMSE_list = []
MAE_list = []


for train_index, test_index in kf5.split(ratings):
    tr = ratings.iloc[train_index]
    tst = ratings.iloc[test_index]
    R_global = ratings.iloc[train_index]['Rating'].mean()
    R_movie = tr.groupby('MovieID')['Rating'].mean()
    R_movie = pd.merge(tst, R_movie, how='left', on='MovieID')
    R_movie = R_movie.fillna(R_global)
    

      
    RMSE_list.append(mean_squared_error(R_movie['Rating_x'], R_movie['Rating_y'], squared=False))
    MAE_list.append(mean_absolute_error(R_movie['Rating_x'],  R_movie['Rating_y']))
    
    
print(RMSE_list)
print(MAE_list)

print("Average RMSE over the 5 splits:", statistics.mean(RMSE_list))
print("Average MAE over the 5 splits:", statistics.mean(MAE_list))

[0.9831783587582448, 0.9777055268250058, 0.979383745379347, 0.978704600138562, 0.9785350342388109]
[0.7853112767520519, 0.7811362146640797, 0.7823110828330951, 0.7814558722648273, 0.7818258749033196]
Average RMSE over the 5 splits: 0.9795014530679941
Average MAE over the 5 splits: 0.7824080642834748


#### For R_user

In [75]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)


RMSE_list = []
MAE_list = []


for train_index, test_index in kf5.split(ratings):
    tr = ratings.iloc[train_index]
    tst = ratings.iloc[test_index]
    R_global = ratings.iloc[train_index]['Rating'].mean()
    R_user = tr.groupby('UserID')['Rating'].mean()
    R_user = pd.merge(tst, R_user, how='left', on='UserID')
    R_user = R_user.fillna(R_global)
      
    RMSE_list.append(mean_squared_error(R_user['Rating_x'], R_user['Rating_y'], squared=False))
    MAE_list.append(mean_absolute_error(R_user['Rating_x'],  R_user['Rating_y']))
    
    
print(RMSE_list)
print(MAE_list)

print("Average RMSE over the 5 splits:", statistics.mean(RMSE_list))
print("Average MAE over the 5 splits:", statistics.mean(MAE_list))

[1.0341540530512852, 1.0366831836316819, 1.0367548911084858, 1.0351620226366867, 1.0342310841258635]
[0.8278011687890378, 0.8295052568118579, 0.8307004050630244, 0.8290178974894602, 0.8275930787817327]
Average RMSE over the 5 splits: 1.0353970469108007
Average MAE over the 5 splits: 0.8289235613870226


#### 5-fold Cross Validation - for linear regression

In [76]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)
X = R[['R_user', 'R_movie']].to_numpy()
y = np.array(list(R['Rating']))

Index = []
X_train = []
X_test = []
y_train = []
y_test = []

for train_index, test_index in kf5.split(X):
    Index.append(("TRAIN:", train_index, "TEST:", test_index))
    X_tr, X_tst = X[train_index], X[test_index]
    y_tr, y_tst = y[train_index], y[test_index]
    X_train.append(X_tr)
    X_test.append(X_tst)
    y_train.append(y_tr)
    y_test.append(y_tst)

#### Linear Regression without gamma

In [77]:
RMSE_list = []
coefficients = []
MAE_list = []
for i in range(0,5):
    regressor = LinearRegression(fit_intercept=False)
    model = regressor.fit(X_train[i], y_train[i])
    y_predicted = regressor.predict(X_test[i])
    RMSE = mean_squared_error(y_test[i], y_predicted, squared=False)
    RMSE_list.append(RMSE)
    MAE_list.append(mean_absolute_error(y_test[i], y_predicted))
    coefficients.append([model.coef_])
print("RMSE:", statistics.mean(RMSE_list))
print("Alpha, Beta:", coefficients)
print("MAE:", statistics.mean(MAE_list))

RMSE: 0.9473800934567167
Alpha, Beta: [[array([0.36638847, 0.64072531])], [array([0.36777684, 0.63929526])], [array([0.36801483, 0.63896155])], [array([0.36816123, 0.63870314])], [array([0.36797364, 0.63908854])]]
MAE: 0.7592216393240905


#### Linear Regression with gamma

In [78]:
RMSE_list = []
coefficients = []
gamma = []
MAE_list = []
for i in range(0,5):
    regressor = LinearRegression(fit_intercept=True)
    model = regressor.fit(X_train[i], y_train[i])
    y_predicted = regressor.predict(X_test[i])
    RMSE = mean_squared_error(y_test[i], y_predicted, squared=False)
    RMSE_list.append(RMSE)
    MAE_list.append(mean_absolute_error(y_test[i], y_predicted))
    coefficients.append([model.coef_])
    gamma.append(model.intercept_)
print("RMSE:", statistics.mean(RMSE_list))
print("Alpha, Beta:",coefficients)
print("Gamma:",gamma)
print("MAE:", statistics.mean(MAE_list))

RMSE: 0.9155921038075315
Alpha, Beta: [[array([0.78078827, 0.87736718])], [array([0.78235655, 0.87547122])], [array([0.78370875, 0.87621962])], [array([0.78203965, 0.87447233])], [array([0.78174993, 0.87517107])]]
Gamma: [-2.356941390319213, -2.3559225808808737, -2.363749170876681, -2.3519232294198886, -2.3524632219328985]
MAE: 0.7258558382684496


# Task 2: UV Matrix

In [80]:
random.seed(123)

ratings = pd.read_csv('ml-1m/ratings.dat', engine='python', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'])
X = ratings.copy()
kf = KFold(n_splits=5, shuffle=True)
d = 2
l_r = 0.5
iterations = 10

for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    R_df = X_train.pivot(index = 'user_id', columns ='movie_id', values = 'rating')
    user_mean = R_df.mean(axis=1)
    R_df_array = R_df.to_numpy()
    user_mean = user_mean.to_numpy()
    normalized = R_df_array - user_mean.reshape(-1,1)
    M = normalized.copy()
    u = np.full((normalized.shape[0],2), 1)
    v = np.full((2,normalized.shape[1]), 1)
    u = u.astype(np.float32)
    v = v.astype(np.float32)
    uv = np.dot(u,v)
    print("TRAIN:", train_index, "TEST:", test_index)
    for iteration in range(iterations):
        for r in range(6040):
            for s in range(d):
                sums = 0
                u_rk = u[r,:]
                v_kj = v[:,:]
                u_rk_del = np.delete(u_rk, s, 0)
                v_kj_del = np.delete(v_kj, s, 0)
                v_sj = v[s,:]
                v_sj_squared = v_sj ** 2

                u_rk_v_kj = np.dot(u_rk_del, v_kj_del)
                m_rj = M[r,:]

                error = m_rj - u_rk_v_kj

                vsj_dot_er = v_sj * error
                sums = np.nansum(vsj_dot_er)
                v_sj_ssum = np.nansum((v_sj_squared) * (~np.isnan(m_rj)))
                newval_u = sums / v_sj_ssum
                u[r,s] = u[r,s] + ((newval_u - u[r,s]))
        #update v 
        for r in range(d):
            for s in range(R_df_array.shape[1]):
                sums = 0
                #sumvsj_2 = 0

                u_ik = u[:,:]
                v_ks = v[:,s]
                u_ik_del = np.delete(u_ik, r, 1)

                v_ks_del = np.delete(v_ks, r, 0)
                u_ir = u[:,r]
                u_ir_squared = u_ir ** 2

                u_ik_v_ks = np.dot(u_ik_del, v_ks_del)
                m_is = M[:,s]
                error = m_is - u_ik_v_ks

                uir_dot_er = u_ir * error
                sumsv = np.nansum(uir_dot_er)
                u_ir_ssum = np.nansum(u_ir_squared * (~np.isnan(m_is)))
                newval_v = sumsv / u_ir_ssum
                v[r,s] = v[r,s] + ((newval_v - v[r,s]))

        uv = np.dot(u,v)
        difference = uv - normalized
        difference_squared = difference ** 2
        difference_squared_with_0s = np.nan_to_num(difference_squared)
        difference_squared_sum = np.sum(difference_squared_with_0s,axis=0)
        sum_sum = difference_squared_sum.sum()
        non_zero_count = np.count_nonzero(difference_squared_with_0s)
        RMSE = sum_sum / non_zero_count
        print("Iteration: ", iteration)
        print(RMSE)


TRAIN: [      0       1       2 ... 1000206 1000207 1000208] TEST: [      3      11      12 ... 1000168 1000199 1000205]
Iteration:  0
0.8422958972689429
Iteration:  1
0.7948269794696434
Iteration:  2
0.7847370130482627
Iteration:  3
0.7781739188502861
Iteration:  4
0.7734944361010938
Iteration:  5
0.770166485209555
Iteration:  6
0.7677691005312302
Iteration:  7
0.7659928398046777
Iteration:  8
0.7646397131547209
Iteration:  9
0.7635722366023221
TRAIN: [      1       2       3 ... 1000204 1000205 1000208] TEST: [      0      16      21 ... 1000197 1000206 1000207]
Iteration:  0
0.8439388795777046
Iteration:  1
0.796318876325698
Iteration:  2
0.78608344721233
Iteration:  3
0.779437060667424
Iteration:  4
0.7747075290156327
Iteration:  5
0.7713388188424871
Iteration:  6
0.7689151591210315
Iteration:  7
0.7671237449321173
Iteration:  8
0.7657555838924036
Iteration:  9
0.7646828005483265
TRAIN: [      0       1       2 ... 1000205 1000206 1000207] TEST: [      4      15      17 ... 1000200

# Task 3:  Matrix Factorization with Gradient Descent and Regularization

------------------------------------------------------------------------------------

In [81]:
random.seed(123)
def matrixFactorization(X, U, M, K, num_iter = 75, learn_rate=0.005, regularization=0.05):
    print('---Executing Matrix Factorization---')
    # transpose the M matrix
    M = M.T
    result = []
    count = 0
    for step in range(num_iter):
        print('---the {} times iteration---'.format(count))
        count = count + 1
        for i in range(len(X)):
             for j in range(len(X[i])):
                    if X[i][j] > 0:
                    # calculate the training error on the (i,j) example
                        eij = X[i][j]-np.dot(U[i,:], M[:,j])
                    # use regularization to avoid large weights
                        for k in range(K):
                            U[i][k] = U[i][k] + learn_rate * (2 * eij * M[k][j] - regularization * U[i][k])
                            M[k][j] = M[k][j] + learn_rate * (2 * eij * U[i][k] - regularization * M[k][j])
        eR = np.dot(U,M)
        se = 0
        # calulate the total squared error (with regularization)
        for i in range(len(X)):
             for j in range(len(X[i])):
                    if X[i][j] > 0:
                        se = se + pow(X[i][j] - np.dot(U[i,:],M[:,j]),2)
                     # consider the regularization to avoid overfitting
                        for k in range(K):
                            se = se + (regularization / 2) * (pow(U[i][k],2) + pow(M[k][j],2))
        result.append(se)
        if se < 0.001:
            break
    print('---Finish Matrix Factorization---')
    return U, M.T, result,se

In [None]:
random.seed(123)
ratings = pd.read_csv('ml-1m/ratings.dat', engine='python', sep='::', names=['UserID', 'MovieID', 'Rating', 'timestamp'])
kf = KFold(n_splits=5, shuffle=True)
X = ratings[["UserID", "Rating", "MovieID"]]
y = ratings["Rating"]
rmse=[]

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    R_df = X_train.pivot(index = 'UserID', columns ='MovieID', values = 'Rating', ).fillna(0)
    
    user_mean = R_df.mean(axis=1)
    R = R_df.to_numpy()
    user_mean = user_mean.to_numpy()
    normalized = R - user_mean.reshape(-1,1)
    userids = R_df.index
    movieids = R_df.columns
   

    userNum = R.shape[0]
    movieNum = R.shape[1]

    # set the number of hidden factors
    K = 10
    np.random.seed = time.time() * 256
    userMatrix = np.random.rand(userNum, K)
    np.random.seed = time.time() * 256
    movieMatrix = np.random.rand(movieNum, K)

    newUserMatrix, newMovieMatrix, result, error = matrixFactorization(R, userMatrix, movieMatrix, K)
   
    print('----Orignal dataset---\n', R)
    ratingMF = np.dot(newUserMatrix, newMovieMatrix.T)
    print('---Rating matrix after MF algorithm---\n', ratingMF)
    predicted = pd.DataFrame(ratingMF)
    print('Error:',error)
    rmse.append(error)
    print('Result:', result)

TRAIN: [      0       1       2 ... 1000204 1000206 1000207] TEST: [      8       9      23 ... 1000200 1000205 1000208]
---Executing Matrix Factorization---
---the 0 times iteration---
---the 1 times iteration---
---the 2 times iteration---
---the 3 times iteration---
---the 4 times iteration---
---the 5 times iteration---
---the 6 times iteration---
---the 7 times iteration---
