In [2]:
# Import necessary modules
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import math
import re
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
sns.set_style("darkgrid")

In [3]:
df_title = pd.read_csv('../data/raw/movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name'])
df_title.set_index('Movie_Id', inplace = True)
print (df_title.head(10))

            Year                          Name
Movie_Id                                      
1         2003.0               Dinosaur Planet
2         2004.0    Isle of Man TT 2004 Review
3         1997.0                     Character
4         1994.0  Paula Abdul's Get Up & Dance
5         2004.0      The Rise and Fall of ECW
6         1997.0                          Sick
7         1992.0                         8 Man
8         2004.0    What the #$*! Do We Know!?
9         1991.0      Class of Nuke 'Em High 2
10        2001.0                       Fighter


In [4]:
df = pd.read_csv('../data/processed/df.csv')

In [10]:
# drop the bottom 30% of movies with fewest number of ratings to speed things up
f = ['count','mean']

df_movie_summary = df.groupby('Movie_Id')['Rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

## Collaborative Filtering Recommendation Model

Use [collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering), with reduced number of records to test the model, say 250,000 instead of the full 75million in the reduced dataset (eliminated zero ratings and fewest 30% ratings)

NOTE: 250,000 records in the model takes 15 minutes on my PC to run, so using all 75million of course 450 minutes appx 7.5 hours for one prediction.

In [20]:
df_short = df.head(250000)

In [21]:
reader = Reader()
data = Dataset.load_from_df(df_short[['Cust_Id', 'Movie_Id', 'Rating']][:], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])

{'test_rmse': array([0.98189988, 0.98133738, 0.98444953, 0.98166476, 0.97574285]),
 'test_mae': array([0.78114346, 0.78018646, 0.78241731, 0.77826625, 0.77586465]),
 'fit_time': (19.916154623031616,
  17.7994441986084,
  19.001749277114868,
  19.427056312561035,
  19.414900064468384),
 'test_time': (106.49023675918579,
  0.6062641143798828,
  0.5980896949768066,
  0.5137159824371338,
  0.5056664943695068)}

#### Show some customer Ids and run some predictions on what those customers might like to see

In [22]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Cust_Id,Rating,Movie_Id
0,696,712664,5.0,3
1,697,1331154,4.0,3
2,698,2632461,3.0,3
3,699,44937,5.0,3
4,700,656399,4.0,3
5,701,439011,1.0,3
6,703,1644750,3.0,3
7,704,2031561,4.0,3
8,705,616720,4.0,3
9,706,2467008,4.0,3


In [27]:
Customer_Id = 1331154

### Show the above customer's favorite movies

In [28]:
Customer = df[(df['Cust_Id'] == Customer_Id) & (df['Rating'] == 5)]
Customer = Customer.set_index('Movie_Id')
Customer = Customer.join(df_title)['Name']
print(Customer)


Movie_Id
143                                       The Game
270                     Sex and the City: Season 4
361      The Phantom of the Opera: Special Edition
457                              Kill Bill: Vol. 2
482                                          Frida
                           ...                    
16860                        Law & Order: Season 1
16954           Indiana Jones and the Last Crusade
17085                                 24: Season 2
17627        Harry Potter and the Sorcerer's Stone
17709                      A River Runs Through It
Name: Name, Length: 158, dtype: object


### Predict which movies customer would like:

In [29]:
Customer = df_title.copy()
Customer = Customer.reset_index()
Customer = Customer[~Customer['Movie_Id'].isin(drop_movie_list)]

data = Dataset.load_from_df(df_short[['Cust_Id', 'Movie_Id', 'Rating']], reader)

trainset = data.build_full_trainset()
svd.fit(trainset)

Customer['Estimate_Score'] = Customer['Movie_Id'].apply(lambda x: svd.predict(785314, x).est)

Customer = Customer.drop('Movie_Id', axis = 1)

Customer = Customer.sort_values('Estimate_Score', ascending=False)
print(Customer.head(10))

         Year                                          Name  Estimate_Score
57     1996.0                                   Dragonheart        3.080407
27     2002.0                               Lilo and Stitch        3.079255
11799  1998.0                                     Fireworks        2.956333
11802  2005.0                                         Zeher        2.956333
11804  2004.0                                The Big Bounce        2.956333
11805  1998.0  The Secret of N-I-M-H 2: Timmy to the Rescue        2.956333
11806  1995.0                 Chinese Odyssey 2: Cinderella        2.956333
11807  1977.0                                   Eaten Alive        2.956333
11808  1996.0          Bob Hope: Hollywood's Brightest Star        2.956333
11811  2004.0                           Million Dollar Baby        2.956333
