In [2]:
from scipy import stats
import pandas as pd
import numpy as np
import math
from scipy.sparse.linalg import svds
import surprise
from scipy.sparse import coo_matrix
from numpy.linalg import norm
from sklearn.metrics import mean_squared_error
from numpy import linalg as LA

In [3]:
import matplotlib.pyplot as plt


In [4]:
cols=["user_id","movie_id","ratings","timestamp"]
df=pd.read_csv("u1.base",sep="\t",names=cols,encoding="latin-1")
test_df=pd.read_csv("u1.test",sep="\t",names=cols,encoding="latin-1")
ratings_pivot = df.pivot(index='movie_id', columns='user_id', values="ratings").fillna(0)
test_ratings_pivot = df.pivot(index='movie_id', columns='user_id', values="ratings").fillna(0)

test_df_group_by_user = test_df.groupby('user_id')
test_df_group_by_movie = test_df.groupby('movie_id')

train_df_user_movie = df.pivot(
    index='movie_id',
    columns='user_id',
    values='ratings'
).fillna(0)


test_df_user_movie = test_df.pivot(
    index='movie_id',
    columns='user_id',
    values='ratings'
).fillna(0)

#Creating the rating matrix (rows as movies, columns as users)
ratings_mat = np.ndarray(
    shape=(np.max(df.movie_id.values), np.max(df.user_id.values)),
    dtype=np.float)
ratings_mat[df.movie_id.values-1, df.user_id.values-1] = df.ratings.values.astype(float)

mat = train_df_user_movie.as_matrix()

samples = [
            (i, j, ratings_mat[i, j])
            for i in range(1,df.movie_id.max())
            for j in range(1,df.user_id.max())
            if ratings_mat[i, j] > 0
        ]





In [5]:
def SGD(R, K, lamda=0.0005,steps=10):
    
    M,N = R.shape
    # According to slide i have initialized P and Q by SVD
    P,U,Q=svds(R,K)
    minErr = error(R,P,Q,lamda);
    for step in range(steps):
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                eui = 2*(rui - np.dot(P[u,:],Q[:,i]))
                if math.isnan(eui):
                    P[u,:]=P[u,:]+lamda*eui*(Q[:,i])
                    Q[:,i]=Q[:,i]+lamda*eui*(P[u,:])
        err = error(R,P,Q,lamda);
        rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
        if rmse<0.5:
            break
        if err > minErr:
            break;
        minErr = err
    return P,Q

def error(R,P,Q,lamda=0.0005):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0 
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            e= e + pow(rui-np.dot(P[u,:],Q[:,i]),2)
    return e

def get_group(group,key):
    if key in group.groups: return group.get_group(key)
    return pd.DataFrame()
    

def SGDWithRegularization(R, K, lamda=0.0005,steps=10,L1=0.01,L2=0.03):
    
    M,N = R.shape
    # According to slide i have initialized P and Q by SVD
    P,U,Q=svds(R,K)
    minErr = np.zeros(df.user_id.max()+1) 
    finalMinEr = np.Inf;
    for i in range(df.user_id.max()+1):
        minErr[i] = np.Inf;
    #hl, = plt.plot([], [])  
    for step in range(steps):
        px =0;
        qx =0;
        movies = {-1}
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if i in movies:
                continue;
            movies.add(i);
            if rui>0:
                eui = 2*(rui - np.dot(P[u,:],Q[:,i]))
                tempP=P[u,:]+lamda*eui*Q[:,i]
                tempQ=Q[:,i]+lamda*eui*P[u,:]
                tempPx = LA.norm(P[u,:])
                tempQx= LA.norm(Q[:,i])
                err = error(R,P,Q,lamda);
                if  minErr[i] > err:
                    P[u,:]=tempP
                    Q[:,i]=tempQ
                    px +=  tempPx
                    qx +=  tempQx
                    minErr[i]=err
                if finalMinEr > err:
                    finalMinEr = err;
                
                rmse = np.sqrt(err/len(R.data))
               
        err = error(R,P,Q,lamda);
        rmse = np.sqrt(err/len(R.data))
        err = err + L1*px+L2*qx # added Regularization
        if rmse<0.5:
            break;
        if err > finalMinEr:
            break;
        minErr = err
    return P,Q



In [14]:
def caculateRatingOnLatentFactor(res,key):
    for i in list(test_df_group_by_movie.groups.keys()):
        for j in list(test_df_group_by_user.groups.keys()):
            test_df.loc[((test_df["movie_id"]==i) & (test_df["user_id"]==j)),key] =res[i,j]

            
def getAvgPrecisionAt10(key):
    threashold=4
    grpDfCalc = test_df.sort_values([key]).groupby('user_id')
    group_by_key = 'movie_id';
    actual_data = ratings_pivot.values;
    cnt=0;
    total =0;
    for mov in actual_data:
            mov_idx_ratingGtThreashold = np.argwhere(mov>threashold)
            for userKey in list(grpDfCalc.groups.keys()):
                user_movies = grpDfCalc.get_group(userKey)
                union = len(np.union1d(mov_idx_ratingGtThreashold,user_movies));
                intersection = len(np.intersect1d(mov_idx_ratingGtThreashold,user_movies));
                if union>0 and intersection>0:          
                    total+=(intersection/union);
                cnt+=1;
    if total>0:
        return (total/cnt)
    return 0;

In [15]:
R = coo_matrix(ratings_pivot.values)
latentFactors = [2,5,10]
for factor in latentFactors:
    P,Q=SGD(R,K=factor,lamda=0.0005, steps=20)
    print("RMSE {} for K {}".format(math.sqrt(mean_squared_error(test_ratings_pivot.values,np.dot(P,Q))),factor))
    res = np.dot(P,Q)
    key = 'Caculate{}'.format(factor)
    caculateRatingOnLatentFactor(res,key)
    print("Precision {} for K {}".format(getAvgPrecisionAt10(key),factor))

RMSE 0.8387334849954835 for K 2
Precision 0.002583076836709251 for K 2
RMSE 0.8383884750079765 for K 5
Precision 0.0020645231993078623 for K 5
RMSE 0.8379963202330859 for K 10
Precision 0.0017285485901582709 for K 10


With Regularization

In [52]:
R = coo_matrix(ratings_pivot.values)
latentFactors = [2,5,10]
regs = [(0.001,0.003),(0.05,0.05),(0.5,0.75)]
for l1,l2 in regs:
    for factor in latentFactors:
        P,Q=SGDWithRegularization(R,K=factor, steps=20,L1=l1,L2=l2)
        print("RMSE {} for K {}".format(math.sqrt(mean_squared_error(test_ratings_pivot.values,np.dot(P,Q))),factor))
        res = np.dot(P,Q)
        key = 'RegCaculate{}'.format(factor)
        caculateRatingOnLatentFactor(res,key)
        print("Precision {} for K {}".format(getAvgPrecisionAt10(key),factor))

RMSE 0.8387256587772448 for K 2
Precision 0.0014911515890229862 for K 2
RMSE 0.8383790436555402 for K 5
Precision 0.0013136048731691354 for K 5
RMSE 0.8379858381052904 for K 10
Precision 0.0011753573730405359 for K 10
RMSE 0.8387256587772448 for K 2
Precision 0.0011753573730405359 for K 2
RMSE 0.8383790436555402 for K 5
Precision 0.0011753573730405359 for K 5
RMSE 0.8379858381052904 for K 10
Precision 0.0011753573730405359 for K 10
RMSE 0.8387256587772448 for K 2
Precision 0.0011753573730405359 for K 2
RMSE 0.8383790436555402 for K 5
Precision 0.0011753573730405359 for K 5
RMSE 0.8379858381052904 for K 10
Precision 0.0011753573730405359 for K 10
