# RECOMMENDER MOVIE RATINGS TEST

In [1]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise import Dataset, Reader,accuracy
from surprise import get_dataset_dir, dump
from surprise.model_selection import train_test_split
from surprise import KNNBaseline

In [2]:
#Movie information
mv_info = pd.read_csv('../input/dataset/movie_titles.csv',error_bad_lines=False, header=None, names = ['MovieID','YearOfRelease','Title'])
mv_info['YearOfRelease'] = mv_info['YearOfRelease'].fillna(0).astype(int)

#Movie Ratings Subset from data preparation Netflix file
mv_ss = pd.read_csv('../input/dataset-balanced/movies_subset_balanced.csv')
mv_ss['RatingDate'] = pd.to_datetime(mv_ss['RatingDate'])
mv_ss['MovieID'] = mv_ss['MovieID'].astype(int)
mv_ss['UserID'] = mv_ss['UserID'].astype(int)
mv_ss_full = mv_ss.merge(mv_info, left_on = 'MovieID', right_on = 'MovieID')
mv_ss_full['RatingDate'] = pd.to_datetime(mv_ss_full['RatingDate'])
mv_ss['MovieID'] = mv_ss['MovieID'].astype(int)
mv_ss['UserID'] = mv_ss['UserID'].astype(int)

In [3]:
mv_ss_full.head()

Unnamed: 0,MovieID,UserID,Rating,RatingDate,YearOfRelease,Title
0,30,1366162,1,2005-10-26,2003,Something's Gotta Give
1,30,483101,1,2005-07-07,2003,Something's Gotta Give
2,30,2148796,1,2004-11-20,2003,Something's Gotta Give
3,30,998271,1,2004-06-21,2003,Something's Gotta Give
4,30,607621,1,2005-06-07,2003,Something's Gotta Give


In [4]:
mv_ss_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 250000 entries, 0 to 249999
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   MovieID        250000 non-null  int64         
 1   UserID         250000 non-null  int64         
 2   Rating         250000 non-null  int64         
 3   RatingDate     250000 non-null  datetime64[ns]
 4   YearOfRelease  250000 non-null  int64         
 5   Title          250000 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 13.4+ MB


In [5]:
mv_info.head()

Unnamed: 0,MovieID,YearOfRelease,Title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [6]:
mv_ss.head()

Unnamed: 0,MovieID,UserID,Rating,RatingDate
0,30,1366162,1,2005-10-26
1,30,483101,1,2005-07-07
2,30,2148796,1,2004-11-20
3,30,998271,1,2004-06-21
4,30,607621,1,2005-06-07


In [7]:
pd.set_option('display.max_rows', None)
movies_tot = mv_ss_full.groupby(['Title', 'MovieID']).count().reset_index()[['Title', 'MovieID']]
movies_tot

Unnamed: 0,Title,MovieID
0,13 Going on 30,16082
1,50 First Dates,1962
2,A Beautiful Mind,1180
3,A Few Good Men,16668
4,About Schmidt,3756
5,Adaptation,12084
6,Air Force One,13651
7,Along Came Polly,2391
8,Along Came a Spider,12299
9,American Beauty,571


In [8]:
#Test for different movie ratings

INPUT = {
    '1962': 5,#50 First Dates
    '571' : 5,#American Beauty
    '10359': 5, #Runaway Bride
    '11149': 5, #Maid in Manhattan
    '3938':5,#Shrek 2
    '15107':5,#Ocean's Eleven
    '14644': 5, #The Stepford Wives
#     '13728' : 5,
#     '11443' : 5, #harry potter
#     '2452': 4, #Lord of the Rings: The Fellowship of the Ring
#     '8687': 5, #Star wars
#     '17324':4, #Hitch,
#     '6037':4 #The Bourne Identity
#     '14691':4,
#     '2372':4,
#     '17627':5,
#     '5582':4
}



In [9]:
# #New user
new_user_rated_movies = INPUT.keys()
new_user_ratings = INPUT.values()
new_user_id = 99
new_user_id_list = [new_user_id]*len(new_user_rated_movies)
df_new_user = pd.DataFrame({'MovieID': new_user_rated_movies, 
                           'UserID' : new_user_id_list,
                           'Rating' : new_user_ratings })
df_new_user['MovieID'] = df_new_user['MovieID'].astype(int)
df_new_user['UserID'] = df_new_user['UserID'].astype(int)
total_movies = set(mv_ss['MovieID'].unique())
new_user_unrated_movies = list(total_movies.difference(set(df_new_user['MovieID'])))
df_new_user_mv_rec = pd.DataFrame({'MovieID' : new_user_unrated_movies,
                                  'UserID': [new_user_id]*len(new_user_unrated_movies)})

In [10]:
mv_ss = pd.concat([mv_ss, df_new_user])

In [11]:
# len(new_user_ids)

In [12]:
#Create pred ratings for other movies under the assumption that a rating for a given movie is 5

# new_user_ids = [i for i in range(100, 100 + len(movies_tot))]
# new_user_rating = [5]*len(new_user_ids)
# movie_id_new_user = list(movies_tot['MovieID'])

# df_new_users_rating = pd.DataFrame({'MovieID': movie_id_new_user,'UserID' : new_user_ids, 'Rating' : new_user_rating})

In [13]:
# df_new_users_rating

In [14]:
# mv_ss = pd.concat([mv_ss, df_new_users_rating])
# mv_ss.tail()

In [15]:
# len(mv_ss)

# Matrix Factorization

In [16]:
# The columns must correspond to user id, item id and ratings (in that order).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(mv_ss[['UserID', 'MovieID', 'Rating']], reader)
algo = SVD(n_epochs = 20, lr_all = 0.002 , reg_all = 0.4)
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3092e6c190>

In [17]:
def predict_rating(row):
    return algo.predict(row['UserID'], row['MovieID'], r_ui=None, clip=True, verbose=False).est

In [18]:
df_new_user_mv_rec['pred_rating'] = df_new_user_mv_rec.apply(predict_rating, axis = 1)
df_new_user_mv_rec = df_new_user_mv_rec.merge(mv_info, left_on = 'MovieID', right_on = 'MovieID')
df_new_user_mv_rec = df_new_user_mv_rec.sort_values('pred_rating', ascending = False)


In [19]:
rec_mf_top_5 = df_new_user_mv_rec.reset_index(drop = True)[:5]

In [20]:
rec_mf_top_5

Unnamed: 0,MovieID,UserID,pred_rating,YearOfRelease,Title
0,30,99,4.148587,2003,Something's Gotta Give
1,175,99,4.13483,1992,Reservoir Dogs
2,191,99,4.134347,2003,X2: X-Men United
3,457,99,4.125524,2004,Kill Bill: Vol. 2
4,313,99,4.12435,2000,Pay It Forward


In [21]:
pd.reset_option('display.max_rows')

# KNN item-item collaborative filtering

In [22]:
train, test = train_test_split(data, test_size=.2)
sim_options = {'name': 'msd',
               'min_support': 5,
               'user_based': False}
base1 = KNNBaseline(k=30,sim_options=sim_options)
base1.fit(train)
base1_preds = base1.test(test)
accuracy.rmse(base1_preds)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.4684


1.4683754545372827

In [23]:
def predict_rating(row):
    return base1.predict(row['UserID'], row['MovieID'], r_ui=None, clip=True, verbose=False).est

In [24]:
df_new_user_mv_rec['pred_rating'] = df_new_user_mv_rec.apply(predict_rating, axis = 1)
df_new_user_mv_rec = df_new_user_mv_rec.merge(mv_info, left_on = 'MovieID', right_on = 'MovieID')
df_new_user_mv_rec = df_new_user_mv_rec.sort_values('pred_rating', ascending = False)

In [25]:
pd.set_option('display.max_rows', None)
df_new_user_mv_rec.head()

Unnamed: 0,MovieID,UserID,pred_rating,YearOfRelease_x,Title_x,YearOfRelease_y,Title_y
80,9756,99,5.0,2002,Mr. Deeds,2002,Mr. Deeds
98,11443,99,5.0,2002,Harry Potter and the Chamber of Secrets,2002,Harry Potter and the Chamber of Secrets
1,175,99,5.0,1992,Reservoir Dogs,1992,Reservoir Dogs
95,7745,99,5.0,1995,Apollo 13,1995,Apollo 13
175,16384,99,5.0,1993,The Fugitive,1993,The Fugitive
