This project builds a recommender system on the MovieLens "latest small" dataset (`ml-latest-small`, roughly 100k ratings). The dataset contains 100,004 ratings and 1,296 tag applications across 9,125 movies, created by 671 users between January 9, 1995 and October 16, 2016.

Sources:

https://grouplens.org/datasets/movielens/

https://beckernick.github.io/matrix-factorization-recommender/ 

http://www.cs.rochester.edu/twiki/pub/Main/HarpSeminar/Factorization_Meets_the_Neighborhood-_a_Multifaceted_Collaborative_Filtering_Model.pdf

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse.linalg import svds
import scipy
%matplotlib inline

plt.style.use('ggplot')
np.set_printoptions(suppress=True)
from sklearn.utils import shuffle

# Dataset: MovieLens 100k (`ml-latest-small`)

https://grouplens.org/datasets/movielens/

In [10]:

# Load the ratings. Forward slashes keep the relative path portable; the
# original '.\ml-...' spelling only resolved on Windows.
data = pd.read_csv('./ml-latest-small/ratings.csv')
# Ids are used as labels, not numbers (a later cell looks up user '500').
data['userId'] = data['userId'].astype(str)
data['movieId'] = data['movieId'].astype(str)

users = list(set(data['userId'])) #list of all users
movies = list(set(data['movieId'])) #list of all movies

test_ratio = 0.2 #adjust it as per wish

# Per-user chronological split: the most recent int(test_ratio * n) ratings of
# each user go to `test`, everything older goes to `train`. Positional slicing
# (.iloc) is end-exclusive, so the test set has exactly `test_size` rows; the
# removed label-based `.ix` slice was end-inclusive and held out one extra
# rating per user.
train_parts = []
test_parts = []
for u, temp in data.groupby('userId', sort=False):
    temp = temp.sort_values('timestamp').reset_index(drop=True)
    n = len(temp)
    test_size = int(test_ratio*n)
    split_at = n - test_size

    train_parts.append(temp.iloc[:split_at])
    test_parts.append(temp.iloc[split_at:])

# Concatenate once instead of growing the frames inside the loop.
train = pd.concat(train_parts) if train_parts else data.iloc[:0]
test = pd.concat(test_parts) if test_parts else data.iloc[:0]

# Every rating must land in exactly one of the two sets.
assert len(train) + len(test) == len(data)


In [35]:
# Sanity check of the split arithmetic on a single user: order that user's
# ratings by time and show how many of them would be held out.
temp = data[data['userId'] == '500']
n = len(temp)
test_size = int(test_ratio*n)
temp = temp.sort_values('timestamp').reset_index(drop=True)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(n)
print(test_size)
temp

249
49


Unnamed: 0,userId,movieId,rating,timestamp
0,500,2915,3.0,1228920094
1,500,1947,3.0,1228920101
2,500,3863,1.0,1228920119
3,500,4571,3.0,1228920127
4,500,362,4.0,1228920135
5,500,616,3.0,1228920138
6,500,2145,4.5,1228920153
7,500,3247,3.0,1228920168
8,500,1441,3.5,1228920193
9,500,1025,3.5,1228920224


In [8]:
import numpy as np
import pandas as pd
from scipy.linalg import sqrtm



def create_utility_matrix(data, formatizer = {'user':0, 'item': 1, 'value': 2}):
    """
    Pivot a long table of (user, item, value) records into a dense utility matrix.

    :param data:        pandas DataFrame, or any 2D array-like accepted by
                        ``pd.DataFrame``, holding one record per row
    :param formatizer:  dict with the keys 'user', 'item' and 'value'; each value
                        identifies the column holding that field -- a column label
                        when it matches one, otherwise an integer column position.
                        The default dict is only read, never modified.
    :return:            tuple ``(X, users_index, items_index)``: X is the utility
                        matrix as a DataFrame, n x m, n=users (rows), m=items
                        (columns), NaN where no value was recorded; users_index
                        maps user label -> row position and items_index maps
                        item label -> column position
    """
    frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

    user_col = _select_column(frame, formatizer['user'])
    item_col = _select_column(frame, formatizer['item'])
    value_col = _select_column(frame, formatizer['value'])

    # Order of first appearance keeps the row/column layout identical between
    # runs (iterating a set of strings is not stable across interpreter runs).
    users = user_col.drop_duplicates().tolist()
    items = item_col.drop_duplicates().tolist()

    users_index = {user: pos for pos, user in enumerate(users)}

    # One list per item (a future column), pre-filled with NaN for every user.
    n_users = len(users)
    pd_dict = {item: [np.nan] * n_users for item in items}

    # If a (user, item) pair occurs more than once, the last record wins.
    records = zip(user_col.tolist(), item_col.tolist(), value_col.tolist())
    for user, item, value in records:
        pd_dict[item][users_index[user]] = value

    X = pd.DataFrame(pd_dict, index=users, columns=items)

    # Derived from the frame itself so the mapping always matches X's layout.
    items_index = {item: pos for pos, item in enumerate(X.columns)}

    return X, users_index, items_index


def _select_column(frame, field):
    """Return one column of `frame`: by label if `field` names a column, else by position."""
    if field in frame.columns:
        return frame.loc[:, field]
    return frame.iloc[:, field]


In [None]:
# Pivot the full ratings table into the utility matrix and keep the
# label -> position lookups that create_utility_matrix returns with it.
utility, users_index, items_index = create_utility_matrix(data)

In [None]:
# `X` only exists inside create_utility_matrix; the matrix built by the cell
# above is bound to `utility`. Show a preview rather than the whole frame.
utility.head()

In [None]:
# Build the training matrix from the training split only; using the full
# `data` here would let the held-out ratings leak into the model.
X_train, users_train, items_train = create_utility_matrix(train)

In [None]:
def svd(train, k):
    """
    Predict every cell of a utility matrix from a rank-k truncated SVD.

    Missing ratings (NaN) are first imputed with the mean of their item
    (column); the imputed matrix is centred on the item means, factorised,
    truncated to the k largest singular values and shifted back.

    :param train:   2D array-like (users x items) of ratings, NaN = not rated
    :param k:       number of singular values to keep (non-negative int);
                    values larger than the available rank keep everything,
                    0 yields the item means alone
    :return:        plain numpy array with the same shape as `train` holding
                    the reconstructed ratings
    :raises ValueError: if k is negative or `train` is not two-dimensional
    """
    if k < 0:
        # A negative k would silently slice from the end of the spectrum.
        raise ValueError("k must be non-negative, got %r" % (k,))

    utilMat = np.array(train, dtype=float)
    if utilMat.ndim != 2:
        raise ValueError("train must be 2-dimensional, got %d dimension(s)" % utilMat.ndim)

    # Per-item mean over the observed ratings only.
    observed = ~np.isnan(utilMat)
    counts = observed.sum(axis=0)
    totals = np.where(observed, utilMat, 0.0).sum(axis=0)

    # An item with no rating at all has no mean of its own: fall back to the
    # mean of all observed ratings (0.0 when nothing is observed).
    n_observed = counts.sum()
    global_mean = totals.sum() / n_observed if n_observed else 0.0
    item_means = np.where(counts > 0, totals / np.maximum(counts, 1), global_mean)

    # Impute the gaps, then centre; item_means broadcasts over the rows.
    centered = np.where(observed, utilMat, item_means) - item_means

    U, s, Vt = np.linalg.svd(centered, full_matrices=False)

    # U_k * diag(s_k) * Vt_k. Scaling the columns of U_k by s_k is the same
    # product as (U_k * sqrt(S_k)) * (sqrt(S_k) * Vt_k) without a matrix sqrt.
    low_rank = np.dot(U[:, :k] * s[:k], Vt[:k, :])

    UsV = low_rank + item_means

    print("svd done")
    return UsV


In [None]:
# Reconstruct the training matrix from its 50 largest singular values.
preds = svd(X_train, 50)

In [None]:
# Display the ratings table loaded from ratings.csv.
data

In [None]:
def rmse(true, pred):
    """
    Root-mean-square error between two equally shaped collections of ratings.

    Cells where either input is NaN (e.g. items a user never rated) are left
    out of the average.

    :param true:    array-like of reference values (list, ndarray, DataFrame)
    :param pred:    array-like of predictions with the same shape as `true`
    :return:        the RMSE as a float; NaN when there is nothing to compare
    """
    # this will be used towards the end
    errors = np.asarray(true, dtype=float) - np.asarray(pred, dtype=float)
    comparable = ~np.isnan(errors)
    if not comparable.any():
        return float('nan')
    # The square root is what makes this RMSE rather than plain MSE.
    return float(np.sqrt(np.mean(errors[comparable] ** 2)))
# `X` is local to create_utility_matrix and undefined at notebook scope;
# score the predictions against the matrix they were fitted on.
rmse(X_train, preds)

In [None]:
# Display the matrix of reconstructed ratings returned by svd.
preds