In [2]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

Reading Movie data from https://grouplens.org/datasets/movielens/100k/https://grouplens.org/datasets/movielens/100k/

In [3]:
data=pd.read_csv('u.data', sep='\t',)
data=data.rename(columns={"196": "Cust_Id", "242": "Movie_Id", "3": "Rating"})
data=data[["Cust_Id", "Movie_Id",  "Rating"]]
movie=pd.read_csv('u.item', sep='|' , encoding = "ISO-8859-1", header = None)
movie= movie.rename(columns={0: "Movie_Id", 1: "Name"})
movie.set_index('Movie_Id', inplace = True)
movie=movie[['Name']]


In [4]:
movie.head(2)

Unnamed: 0_level_0,Name
Movie_Id,Unnamed: 1_level_1
1,Toy Story (1995)
2,GoldenEye (1995)


In [5]:
data.head(2)

Unnamed: 0,Cust_Id,Movie_Id,Rating
0,186,302,3
1,22,377,1


Getting count and mean of every movie

In [6]:
f = ['count','mean']
df_movie_summary = data.groupby('Movie_Id')['Rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)


In [7]:
df_movie_summary.head()

Unnamed: 0_level_0,count,mean
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


List of movies with count more than 70% quartile (atleast 65 count of reviews)

In [8]:
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

Let's pivot the data set and put it into a giant matrix - we need it for our recommendation system:



In [9]:
df_p = pd.pivot_table(data,values='Rating',index='Cust_Id',columns='Movie_Id')
print(df_p.shape)

(943, 1682)


In [10]:
df_p

Movie_Id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [11]:
total_length=df_p.shape[0]*df_p.shape[1]

In [12]:
df_p.isnull().sum().sum()/total_length

0.9369539368246911

#93.6% of data is missing

#Recommend with Collaborative Filtering¶

We are using powerful SVD algorithm here with Surprise library CV method. The minimization is performed by a very straightforward stochastic gradient descent.



https://surprise.readthedocs.io/en/v1.1.1/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD

https://surprise.readthedocs.io/en/v1.1.1/getting_started.html



In [13]:
reader = Reader()

df= Dataset.load_from_df(data[['Cust_Id', 'Movie_Id', 'Rating']], reader)

svd = SVD()
cross_validate(svd, df, measures=['RMSE', 'MAE'])

{'fit_time': (4.949059247970581,
  4.878285884857178,
  4.839358329772949,
  4.757091522216797,
  4.729170083999634),
 'test_mae': array([0.74148459, 0.73613162, 0.73994874, 0.74187864, 0.73566258]),
 'test_rmse': array([0.94053107, 0.93394258, 0.94113505, 0.94168461, 0.93137486]),
 'test_time': (0.2371993064880371,
  0.2258610725402832,
  0.1886730194091797,
  0.2255716323852539,
  0.14863038063049316)}

Below is what user 100_liked in the past:



In [14]:
df_100 = data[(data['Cust_Id'] == 100) & (data['Rating'] == 5)]
df_100 = df_100.set_index('Movie_Id')
df_100 = df_100.join(movie)['Name']
print(df_100)

Movie_Id
316    As Good As It Gets (1997)
313               Titanic (1997)
315             Apt Pupil (1998)
Name: Name, dtype: object


#Let's predict which movies user_100 would love to watch using SVD





In [15]:
user_100 = movie['Name'].copy()
user_100 = user_100.reset_index()
user_100 = user_100[~user_100['Movie_Id'].isin(drop_movie_list)]

#Building traing data and fitting SVD
trainset = df.build_full_trainset()
svd.fit(trainset)

user_100['Estimate_Score'] = user_100['Movie_Id'].apply(lambda x: svd.predict(100, x).est)

user_100 = user_100.drop('Movie_Id', axis = 1)

user_100= user_100.sort_values('Estimate_Score', ascending=False)
print(user_100.head(10))

                                       Name  Estimate_Score
168              Wrong Trousers, The (1993)        4.295900
407                   Close Shave, A (1995)        4.282272
317                 Schindler's List (1993)        4.271074
49                         Star Wars (1977)        4.249429
63         Shawshank Redemption, The (1994)        4.238933
271                Good Will Hunting (1997)        4.221585
11               Usual Suspects, The (1995)        4.207729
167  Monty Python and the Holy Grail (1974)        4.172774
482                       Casablanca (1942)        4.160453
315               As Good As It Gets (1997)        4.125141


#Recommend with Pearsons' R correlations

The way it works is we use Pearsons' R correlation to measure the linear correlation between review scores of all pairs of movies, then we provide the top 10 movies with highest correlations:

In [16]:

def recommend(movie_title, min_count, movie_data, user_movie_matrix):
    print("For movie ({})".format(movie_title))

    print("Top 10 movies recommended based on Pearsons'R correlation")

    i = int(movie_data.index[movie_data['Name'] == movie_title][0])
    similar_to_target = user_movie_matrix.corrwith(user_movie_matrix[i])
    corr_target = pd.DataFrame(similar_to_target, columns = ['PearsonR'])
    corr_target.dropna(inplace = True)
    corr_target = corr_target.sort_values('PearsonR', ascending = False)
    corr_target.index = corr_target.index.map(int)
    corr_target = corr_target.join(movie_data).join(df_movie_summary)[['PearsonR', 'Name', 'count', 'mean']]
    print(corr_target[corr_target['count']>min_count][:10].to_string(index=False))

#A recommendation for you if you like Star Wars (1977)



In [17]:
recommend("Star Wars (1977)", 20, movie, df_p)

For movie (Star Wars (1977))
Top 10 movies recommended based on Pearsons'R correlation


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


 PearsonR                                       Name  count     mean
 1.000000                           Star Wars (1977)    583 4.358491
 0.747981            Empire Strikes Back, The (1980)    367 4.204360
 0.672556                  Return of the Jedi (1983)    507 4.007890
 0.633312                       Meet John Doe (1941)     25 3.920000
 0.599564 Ghost in the Shell (Kokaku kidotai) (1995)     26 3.653846
 0.536117             Raiders of the Lost Ark (1981)    420 4.252381
 0.515291            Night Falls on Manhattan (1997)     32 3.375000
 0.515164                  When We Were Kings (1996)     44 4.045455
 0.515108                         Brassed Off (1996)     32 3.937500
 0.509016    Some Folks Call It a Sling Blade (1993)     41 4.292683
