In [1]:
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import warnings 

%matplotlib inline
warnings.filterwarnings('ignore')

### Data Preprocessing

In [2]:
data = pd.read_csv('ml-20m/ratings.csv')

In [3]:
print("NaNs cells: ", data.isnull().values.sum())
data.head()

NaNs cells:  0


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [4]:
data.drop("timestamp", inplace=True, axis=1)

In [5]:
popularMovie = data[['rating', 'movieId']].groupby("movieId").count()
popularMovie = popularMovie.loc[popularMovie['rating'] > 15000].index

popularUser = data[['rating', 'userId']].groupby("userId").count()
popularUser = popularUser.loc[popularUser['rating'] > 3000].index

print("the number of popular movies: ", len(popularMovie))
print("the number of active users: ", len(popularUser))

the number of popular movies:  252
the number of active users:  56


In [7]:
data = data.loc[ 
    (data['movieId'].isin(popularMovie)) & (data['userId'].isin(popularUser)) 
]

In [8]:
train, test = train_test_split(data, test_size=0.50)
train, valid = train_test_split(train, test_size=0.2)

In [10]:
print(train.shape)
print(valid.shape)
print(test.shape)
train.head(5)

(5050, 3)
(1263, 3)
(6313, 3)


Unnamed: 0,userId,movieId,rating
5128472,35128,788,4.0
18981431,131347,6874,5.0
2671006,18138,1704,3.5
2962539,20132,4995,5.0
18713721,129583,1,4.5


In [11]:
'''
this is the same as 
train.pivot(
    index="movieId", 
    columns="userId",
    values="rating"
)
'''
movieCat = pd.api.types.CategoricalDtype(categories=sorted(train['movieId'].unique()))
userCat = pd.api.types.CategoricalDtype(categories=sorted(train['userId'].unique()))

rowIndex = train['movieId'].astype(movieCat).cat.codes
colIndex = train['userId'].astype(userCat).cat.codes
sparse_matrix = csr_matrix(
    (train["rating"], (rowIndex, colIndex)), 
    shape=(movieCat.categories.size, userCat.categories.size)
)

UserItemDF = pd.SparseDataFrame(
    sparse_matrix,
    index=movieCat.categories,
    columns=userCat.categories,
    default_fill_value=0
)

In [12]:
UserItemDF

Unnamed: 0,7201,8405,8963,9544,12131,14705,15617,18138,18611,20132,...,123606,125794,125978,129583,130459,130767,131347,131894,131904,136268
1,4.5,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,5.0,0.0,4.5,0.0,4.5,5.0,5.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,...,4.0,0.0,0.0,3.5,0.0,0.0,2.5,0.0,0.0,0.0
6,2.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,4.0,4.0,...,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,5.0
10,0.0,0.0,0.0,2.0,0.0,0.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
11,0.0,4.0,3.0,0.0,3.0,0.0,3.5,4.5,0.0,5.0,...,2.5,5.0,0.0,4.5,3.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8636,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,4.5,2.5,3.5,4.0,5.0,0.0,0.0
8961,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,...,0.0,5.0,0.0,0.0,3.5,4.0,4.0,5.0,0.0,0.0
32587,3.5,4.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33794,0.0,0.0,3.0,3.5,0.0,0.0,3.5,3.5,0.0,0.0,...,4.0,0.0,4.5,0.0,4.0,0.0,4.0,5.0,0.0,0.0


### Model

In [13]:
model = NearestNeighbors(
    metric='cosine', 
    n_neighbors=20, 
    n_jobs=-1,
    algorithm='brute'
)

In [14]:
model.fit(UserItemDF)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [15]:
dis, ind = model.kneighbors(return_distance=True)

In [16]:
'''
dis: distance in high dimension, smaller distance means high similarity
ind: Indices of the nearest points in the population matrix.
'''
# movieId = 1
dis[0], ind[0]

(array([0.39541697, 0.42931302, 0.45193691, 0.46954898, 0.47094594,
        0.47513066, 0.47927382, 0.47946976, 0.47976719, 0.48545616,
        0.4918362 , 0.49549485, 0.50363493, 0.50625285, 0.51838939,
        0.53310134, 0.53565417, 0.53934555, 0.5454519 , 0.54628882]),
 array([153, 248,  80, 174, 246, 126, 110,  81, 121, 114,  24, 247,  84,
        103, 146,   5,   4, 152, 101,  52]))

In [17]:
# key: movieId
# value: the row index in User-Item matrix
movieIdIndexMap = {movId: idx for idx, movId in enumerate(UserItemDF.index)}
showMovieIdIndexMap = list(movieIdIndexMap.items())
showMovieIdIndexMap[:5] + showMovieIdIndexMap[-5:]

[(1, 0),
 (2, 1),
 (6, 2),
 (10, 3),
 (11, 4),
 (8636, 247),
 (8961, 248),
 (32587, 249),
 (33794, 250),
 (58559, 251)]

In [21]:
def predict(movieId, userId, dis, ind, UserItemMatrix, returnAvg=False):
    movieIdIndexMap = {movId: idx for idx, movId in enumerate(UserItemMatrix.index)}
    movieIdIndexMapReversed = {idx: movId for idx, movId in enumerate(UserItemMatrix.index)}
    
    # since we used cosine similarity so the closest neighbor distance should be 1
    nonZeroUserIndex = np.nonzero(UserItemMatrix.loc[movieId])[0]
    sim = dis[ movieIdIndexMap[movieId] ]
    mu_movie = UserItemMatrix.loc[movieId]\
                             .iloc[nonZeroUserIndex]\
                             .mean()
    
    nonZeroMovieIndex = np.nonzero(UserItemMatrix.iloc[ind[ movieIdIndexMap[movieId] ]].loc[:, userId])[0]
    denom = np.sum( abs(sim[nonZeroMovieIndex]) )
    nom = (UserItemMatrix.iloc[ind[ movieIdIndexMap[movieId] ]]
                     .loc[:, userId]\
                     .iloc[nonZeroMovieIndex]
           - UserItemMatrix.iloc[ind[ movieIdIndexMap[movieId] ]]\
                           .iloc[nonZeroMovieIndex]
                           .to_dense()\
                           .replace({0: np.nan})\
                           .mean(axis=1, skipna=True)
    ).values

    if len(nonZeroMovieIndex) == 0:
        prediction = mu_movie
    else:
        prediction = mu_movie + np.sum(nom * sim[nonZeroMovieIndex]) / denom

    if returnAvg:
        return prediction, mu_movie
    else:
        return prediction

In [19]:
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=7, progress_bar=True)
# for parallel computing the lambda function

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 7 workers


In [22]:
def func(s):
    return predict(
        movieId=s['movieId'],
        userId=s['userId'], 
        dis=dis, ind=ind, 
        UserItemMatrix=UserItemDF
    )

In [23]:
%%time
train['predict'] = train.parallel_apply(func, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=722), Label(value='0 / 722'))), HB…

CPU times: user 4.46 s, sys: 1 s, total: 5.46 s
Wall time: 1min 14s


In [24]:
%%time
test['predict'] = test.parallel_apply(func, axis=1)
# test.to_csv(os.getcwd() + 'predictedTest.csv')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=902), Label(value='0 / 902'))), HB…

CPU times: user 6.11 s, sys: 1.57 s, total: 7.68 s
Wall time: 1min 38s


In [28]:
train.head()

Unnamed: 0,userId,movieId,rating,predict
5128472,35128,788,4.0,3.77332
18981431,131347,6874,5.0,4.152031
2671006,18138,1704,3.5,3.787893
2962539,20132,4995,5.0,4.505833
18713721,129583,1,4.5,4.827462


In [29]:
test.head()

Unnamed: 0,userId,movieId,rating,predict
3615183,24688,2640,4.0,3.74551
11934583,82418,2640,5.0,4.140078
17431902,120575,786,2.0,1.777278
2960707,20132,733,4.0,3.899836
3348835,22901,318,5.0,5.002449


In [30]:
from sklearn.metrics import mean_squared_error
print("-"*10 + "model performance" + "-"*10)
print("on training set: ", mean_squared_error(train['rating'], train['predict']))
print("on testing set: ", mean_squared_error(test['rating'], test['predict']))

----------model performance----------
on training set:  0.549638489799794
on testing set:  0.7434849955224105


### Choosing Hyperparameter

In [31]:
modelOne = NearestNeighbors(
    metric='cosine', 
    n_neighbors=30, 
    n_jobs=-1,
    algorithm='brute'
)
modelOne.fit(UserItemDF)
disOne, indOne = modelOne.kneighbors(return_distance=True)


modelTwo = NearestNeighbors(
    metric='cosine', 
    n_neighbors=20, 
    n_jobs=-1,
    algorithm='brute'
)
modelTwo.fit(UserItemDF)
disTwo, indTwo = modelTwo.kneighbors(return_distance=True)


modelThree = NearestNeighbors(
    metric='cosine', 
    n_neighbors=10, 
    n_jobs=-1,
    algorithm='brute'
)
modelThree.fit(UserItemDF)
disThree, indThree = modelThree.kneighbors(return_distance=True)

In [32]:
def func(s, dis, ind, UserItemMatrix):
    return predict(
        movieId=s['movieId'],
        userId=s['userId'], 
        dis=dis, ind=ind, 
        UserItemMatrix=UserItemMatrix
    )

In [34]:
valid['predictOne'] = valid.parallel_apply(
    func, axis=1, dis=disOne, ind=indOne, UserItemMatrix=UserItemDF
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=181), Label(value='0 / 181'))), HB…

In [35]:
valid['predictTwo'] = valid.parallel_apply(
    func, axis=1, dis=disTwo, ind=indTwo, UserItemMatrix=UserItemDF
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=181), Label(value='0 / 181'))), HB…

In [36]:
valid['predictThree'] = valid.parallel_apply(
    func, axis=1, dis=disThree, ind=indThree, UserItemMatrix=UserItemDF
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=181), Label(value='0 / 181'))), HB…

In [37]:
print('neighbor size - 30: ', mean_squared_error(valid['rating'], valid['predictOne']))
print('neighbor size - 20: ', mean_squared_error(valid['rating'], valid['predictTwo']))
print('neighbor size - 10: ', mean_squared_error(valid['rating'], valid['predictThree']))

neighbor size - 30:  0.6348045163027772
neighbor size - 20:  0.6812016986834581
neighbor size - 10:  0.8210218624069362


##### according to the validation set RMSE result, we choose neighbor size = 30

In [38]:
test['predict'] = test.parallel_apply(
    func, axis=1, dis=disOne, ind=indOne, UserItemMatrix=UserItemDF
)
print('model performance on testing set: ', mean_squared_error(test['rating'], test['predict']))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=902), Label(value='0 / 902'))), HB…

model performance on testing set:  0.6937660757298684
