# Movie Recommendation System

Predict movie ratings for the MovieLens dataset

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics.pairwise import cosine_similarity

warnings.simplefilter(action='ignore', category=FutureWarning)

#Insert file
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols)

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols)

i_cols = ['movie_id', 'movie_title' ,'release_date','video_release_date', 'imdb_url', 'unknown', 'action', 'adventure',
'animation', 'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'noir', 'horrow', 'musical', 'mystery', 'romance', 'scifi', 'thriller', 'war', 'western']

items = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

In [3]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

In [4]:
df = pd.pivot_table(ratings, values='rating', index='user_id', columns='movie_id')
df

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


## User-Based Collaborative Filtering

### Normalization

In [5]:
#Normalize our values due to the excessive amount of NaN
df_mean = df.mean(axis=1)
rating_matrix = (df - df_mean).fillna(0)
rating_matrix = rating_matrix.values

In [6]:
rating_matrix

array([[ 1.38970588, -0.70967742,  1.2037037 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.38970588,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.38970588,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  1.29032258,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### Similarity Matrix

In [7]:
results = []
for i in range(len(rating_matrix)):
    for j in range(len(rating_matrix)):
        matrix = np.row_stack((rating_matrix[i], rating_matrix[j]))
        results.append(cosine_similarity(matrix)[0,1])

In [8]:
# Change the similarity score of the same sure to NaN; we should not factor that in during the score prediction
similarity_matrix = results
similarity_matrix = [np.NaN if i > 0.999 else i for i in similarity_matrix]
similarity_matrix = np.reshape(similarity_matrix, (-1, len(rating_matrix)))

### Score Prediction with user-based neighborhood technique

In [9]:
#Version 2
def score(rating, similarity, k=5):
    score = []

    for i in range(len(rating)):
        
        #Obtain rating only from k-nearest users
        top = np.argsort(-similarity[i])[:k]
        
        #Calculate prediction based on normalized rating
        prediction = np.dot(rating[[top]].T, similarity[i][[top]])/ np.sum(similarity[i][[top]])
        
        #Add prediction back to the actual average
        score.append(df_mean.values[i]+prediction)
    
    table = pd.DataFrame(data=score,index=df.index, columns=df.columns)
    return table

In [10]:
score(rating_matrix, similarity_matrix, 10)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.368783,3.282638,4.112122,3.473091,3.767034,3.610294,4.044994,3.699217,3.970378,3.610294,...,3.610294,3.610294,3.610294,3.610294,3.610294,3.610294,3.610294,3.610294,3.610294,3.610294
2,3.788503,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677,3.915323,3.709677,...,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677,3.709677
3,2.833256,2.796296,2.814653,2.796296,2.796296,2.796296,2.539813,2.796296,2.591493,2.597459,...,2.796296,2.796296,2.796296,2.796296,2.796296,2.796296,2.796296,2.796296,2.796296,2.796296
4,4.446751,4.333333,4.333333,4.112552,4.333333,4.333333,4.336620,4.304770,4.272262,4.333333,...,4.333333,4.333333,4.333333,4.333333,4.333333,4.333333,4.333333,4.333333,4.333333,4.333333
5,2.821021,1.714034,2.856664,2.201617,2.982786,2.874286,2.826263,2.615305,2.794515,2.852443,...,2.874286,2.874286,2.874286,2.874286,2.874286,2.874286,2.874286,2.874286,2.874286,2.874286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,4.589418,4.265306,4.265306,4.265306,4.265306,4.265306,4.155042,4.265306,4.239428,4.364283,...,4.265306,4.265306,4.265306,4.265306,4.265306,4.265306,4.265306,4.265306,4.265306,4.265306
940,3.492949,3.457944,3.457944,3.457944,3.648883,3.457944,3.286640,3.566037,3.408452,3.457944,...,3.457944,3.457944,3.457944,3.457944,3.457944,3.457944,3.457944,3.457944,3.457944,3.457944
941,4.511556,4.045455,4.045455,4.045455,4.045455,4.045455,4.072837,4.045455,4.017368,4.045455,...,4.045455,4.045455,4.045455,4.045455,4.045455,4.045455,4.045455,4.045455,4.045455,4.045455
942,5.056331,4.244491,4.265823,4.224562,4.277145,4.265823,4.391849,4.874406,4.278771,4.342588,...,4.265823,4.265823,4.265823,4.265823,4.265823,4.265823,4.265823,4.265823,4.265823,4.265823


## Misc Codes
These codes will be deleted in the final version

In [None]:
# #Vanilla Veresion

# score = []
# for i in range(len(rating_matrix)):
    
#     np.argsort
#     #Calculate the prediction
#     predition = np.nansum(rating_matrix.T* similarity_matrix[i], axis=1) / np.nansum(similarity_matrix[i])
#     score.append(prediction + df_mean.values[i])

In [None]:
# def movies(user_id, k=5):
    
#     k_movies = np.argsort(-score[user_id-1])[:k]
   
#     return print('Our top picks for user {} are: {}'.format(user_id, items.movie_title[k_movies].values))

In [None]:
# #Function for cosine_similiarity
# def cosine(matrix):
#     dot_prod = (matrix[0] * matrix[1]).sum()
#     a_sum = (matrix[0]**2).sum()
#     b_sum = (matrix[1]**2).sum()
#     results = dot_prod / sqrt(a_sum*b_sum)
#     return results

In [None]:
# # results = []
# for i in range(len(rating_matrix)):
#     for j in range(len(rating_matrix)):
#         matrix = np.row_stack((rating_matrix[i], rating_matrix[j]))
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[0])), 1)
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[1])), 1)
#         try:
#             results.append(cosine_similarity(matrix)[0,1])
#         except:
#             results.append(np.nan)

In [None]:
# start_time = time.time()
# results = []
# for i in range(len(test)):
#     for j in range(len(test)):
#         matrix = np.row_stack((test[i], test[j]))
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[0])), 1)
#         matrix = np.delete(matrix, np.argwhere(np.isnan(matrix[1])), 1)
#         print(cosine_similarity(matrix)[0,1])
        
# print("--- %s seconds ---" % (time.time() - start_time))