In [1]:
import numpy as np
import pandas as pd

Load ratings csv file

In [2]:
ratings_df = pd.read_csv("ratings_small.csv")

In [3]:
ratings_df.shape

(100004, 4)

In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


Load Movies metadata csv file

In [5]:
movies_df = pd.read_csv("../Data Exploration and Preparation/master_data_with_imputed_budget_and_revenue.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [48]:
ratings_df.movieId.shape

(45006,)

In [49]:
movies_df.movie_id.shape

(45538,)

Merge the two dataframe to keep only userId, movieId, rating and title data

In [37]:
ratings_df = pd.merge(ratings_df,movies_df[['title','movie_id']],left_on='movieId',right_on='movie_id')

In [44]:
ratings_df.drop(['timestamp','movie_id'],axis=1,inplace=True)

In [45]:
ratings_df.shape

(45006, 4)

In [47]:
ratings_df.sample(5)

Unnamed: 0,userId,movieId,rating,title
14499,592,104,5.0,Run Lola Run
21857,94,923,3.0,Dawn of the Dead
42258,382,73290,3.0,Urban Explorer
20603,307,4973,5.0,Under the Sand
32109,97,44555,4.5,"A Woman, a Gun and a Noodle Shop"


In [57]:
ratings_df.isnull().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

Let's see the number of ratings for each movie

In [85]:
ratings_count = ratings_df.groupby(by="title")['rating'].count().reset_index().rename(columns={'rating':'totalRatings'})[['title','totalRatings']]

See the count of unique movies in the ratings_count dataframe

In [86]:
ratings_count.shape[0]

2794

We can also double check this number

In [88]:
len(ratings_df['title'].unique())

2794

In [89]:
ratings_count.sample(5)

Unnamed: 0,title,totalRatings
1,'Gator Bait,1
503,Club Paradise,3
1292,Mandroid,1
898,Grill Point,145
40,50 First Dates,3


In [90]:
ratings_count.head()

Unnamed: 0,title,totalRatings
0,!Women Art Revolution,2
1,'Gator Bait,1
2,'Twas the Night Before Christmas,2
3,...And God Created Woman,1
4,00 Schneider - Jagd auf Nihil Baxter,2


In [91]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1371,2.5,Rocky III
1,4,1371,4.0,Rocky III
2,7,1371,3.0,Rocky III
3,19,1371,4.0,Rocky III
4,21,1371,3.0,Rocky III


In [131]:
ratings_total = pd.merge(ratings_df,ratings_count,on='title',how='left')

In [132]:
ratings_total.shape

(45006, 5)

In [133]:
ratings_total.head()

Unnamed: 0,userId,movieId,rating,title,totalRatings
0,1,1371,2.5,Rocky III,47
1,4,1371,4.0,Rocky III,47
2,7,1371,3.0,Rocky III,47
3,19,1371,4.0,Rocky III,47
4,21,1371,3.0,Rocky III,47


Let's see some statistics for the totalRatings

In [134]:
ratings_count['totalRatings'].describe()

count    2794.000000
mean       16.108089
std        31.481588
min         1.000000
25%         1.000000
50%         4.000000
75%        16.000000
max       324.000000
Name: totalRatings, dtype: float64

There is a huge difference between minimum number of ratings and maximum number of ratings.

In [135]:
ratings_count['totalRatings'].quantile(np.arange(.6,1,0.01))

0.60      7.00
0.61      7.00
0.62      7.00
0.63      8.00
0.64      8.00
0.65      9.00
0.66      9.00
0.67     10.00
0.68     10.00
0.69     11.00
0.70     12.00
0.71     12.00
0.72     13.00
0.73     14.00
0.74     14.82
0.75     16.00
0.76     17.00
0.77     18.00
0.78     19.00
0.79     20.00
0.80     21.00
0.81     22.33
0.82     24.00
0.83     26.00
0.84     27.00
0.85     29.00
0.86     31.00
0.87     34.00
0.88     37.00
0.89     41.77
0.90     45.00
0.91     49.00
0.92     52.56
0.93     59.00
0.94     64.42
0.95     71.00
0.96     83.28
0.97     98.21
0.98    119.14
0.99    168.49
Name: totalRatings, dtype: float64

About top 21% of the movies received more than 20 votes. Let's remove all the other movies so that we are only left with significant movies (in terms of total votes count)

In [136]:
votes_count_threshold = 20

In [137]:
ratings_top = ratings_total.query('totalRatings > @votes_count_threshold')

In [145]:
ratings_top.shape

(34554, 5)

In [146]:
ratings_top.head()

Unnamed: 0,userId,movieId,rating,title,totalRatings
0,1,1371,2.5,Rocky III,47
1,4,1371,4.0,Rocky III,47
2,7,1371,3.0,Rocky III,47
3,19,1371,4.0,Rocky III,47
4,21,1371,3.0,Rocky III,47


Make data consistent by ensuring there are unique entries for [title,userId] pairs

In [148]:
if not ratings_top[ratings_top.duplicated(['userId','title'])].empty:
    ratings_top = ratings_top.drop_duplicates(['userId','title'])

In [149]:
ratings_top.shape

(34412, 5)

Reshape the data using pivot function

In [167]:
df_for_knn = ratings_top.pivot(index='title',columns='userId',values='rating').fillna(0)

In [298]:
df_for_knn.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"20,000 Leagues Under the Sea",0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
24 Hour Party People,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Days Later,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28 Weeks Later,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
df_for_knn.shape

(580, 671)

Let's use sparse matrix representation of this matrix

In [199]:
from scipy.sparse import csr_matrix

In [200]:
df_for_knn_sparse = csr_matrix(df_for_knn.values)

## Recommendations using KNearestNeighbors method

In [201]:
from sklearn.neighbors import NearestNeighbors

In [202]:
model_knn = NearestNeighbors(metric='cosine',algorithm='brute')

In [203]:
model_knn.fit(df_for_knn_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [273]:
query_index = np.random.choice(df_for_knn.shape[0])

In [303]:
distances, indices = model_knn.kneighbors(df_for_knn.loc['Batman Returns'].values.reshape(1,-1),n_neighbors=6)

In [324]:
distances, indices = model_knn.kneighbors(df_for_knn.iloc[query_index,:].values.reshape(1,-1),n_neighbors=6)

In [325]:
for i in range(0,len(distances.flatten())):
    if i==0:
        print("Recommendations for movie: {0}\n".format(df_for_knn.index[query_index]))
    else:
        print("{0}: {1}, with distance of {2}".format(i,df_for_knn.index[indices.flatten()[i]],distances.flatten()[i]))

Recommendations for movie: Batman Returns

1: Silent Hill, with distance of 0.30336126037089606
2: To Kill a Mockingbird, with distance of 0.3064181508774485
3: Reservoir Dogs, with distance of 0.3387139430593612
4: Monsoon Wedding, with distance of 0.37021315086973283
5: Wag the Dog, with distance of 0.41973556386139477


## Recommendations using Association Rules

In [258]:
def encode_units(x):
    if x<=0:
        return 0
    if x>=1:
        return 1

In [259]:
df_for_ar = df_for_knn.T.applymap(encode_units)

In [262]:
df_for_ar.shape

(671, 580)

In [358]:
df_for_ar.head()

title,"20,000 Leagues Under the Sea",2001: A Space Odyssey,24 Hour Party People,28 Days Later,28 Weeks Later,300,48 Hrs.,5 Card Stud,7 Virgins,8 Women,...,Within the Woods,X-Men Origins: Wolverine,Y Tu Mamá También,Yankee Doodle Dandy,Yesterday,Young Adam,Young Frankenstein,Young and Innocent,Zatoichi,xXx
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0,0,0,0.0,0,0,0.0,0,0,...,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0
2,0.0,1,0,0,0.0,0,1,0.0,0,0,...,0,0,0.0,0,0.0,0.0,0,0.0,0.0,0
3,0.0,0,0,0,0.0,1,0,0.0,0,0,...,0,0,0.0,0,0.0,0.0,0,1.0,0.0,0
4,1.0,0,0,0,0.0,0,0,0.0,0,0,...,0,1,0.0,0,0.0,0.0,1,0.0,0.0,0
5,0.0,0,0,0,0.0,0,1,0.0,0,0,...,0,0,0.0,0,0.0,0.0,0,1.0,0.0,0


In [266]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [267]:
frequent_itemsets = apriori(df_for_ar, min_support=0.07, use_colnames=True)

  support_dict = {1: support[support >= min_support]}
  itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}


In [268]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules.head()

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(Batman Returns),(2001: A Space Odyssey),0.298063,0.25,1.928161
1,(2001: A Space Odyssey),(Batman Returns),0.129657,0.574713,1.928161
2,(2001: A Space Odyssey),(Beauty and the Beast),0.129657,0.643678,2.570881
3,(Beauty and the Beast),(2001: A Space Odyssey),0.250373,0.333333,2.570881
4,(2001: A Space Odyssey),(Men in Black II),0.129657,0.643678,1.928161


## Comparing recommendations of KNN and apriori algorithms

Let's compare the two recommendations engines for movie recommended against 'Batman Returns'

Let's first find the index of 'Batman Returns' movie in our knn dataframe

In [359]:
query_index = df_for_knn.index.get_loc('Batman Returns')

In [360]:
query_index

65

In [363]:
distances, indices = model_knn.kneighbors(df_for_knn.iloc[query_index,:].values.reshape(1,-1),n_neighbors=6)
for i in range(0,len(distances.flatten())):
    if i==0:
        print("KNN Recommendations for movie: {0}\n".format(df_for_knn.index[query_index]))
    else:
        print("{0}: {1}, with distance of {2}".format(i,df_for_knn.index[indices.flatten()[i]],distances.flatten()[i]))

KNN Recommendations for movie: Batman Returns

1: Silent Hill, with distance of 0.30336126037089606
2: To Kill a Mockingbird, with distance of 0.3064181508774485
3: Reservoir Dogs, with distance of 0.3387139430593612
4: Monsoon Wedding, with distance of 0.37021315086973283
5: Wag the Dog, with distance of 0.41973556386139477


Let's fetch the correpsonding association rules generated via Apriori algorithm for same movie and sort them in descending order of lift

In [382]:
all_antecedents = [list(x) for x in rules['antecedants'].values]
desired_indices = [i for i in range(len(all_antecedents)) if len(all_antecedents[i])==1 and all_antecedents[i][0]=='Batman Returns']
apriori_recommendations=rules.iloc[desired_indices,].sort_values(by=['lift'],ascending=False)
apriori_recommendations.head()

Unnamed: 0,antecedants,consequents,support,confidence,lift
6726,(Batman Returns),"(Ariel, To Kill a Mockingbird, Reservoir Dogs)",0.298063,0.275,3.295089
6740,(Batman Returns),"(To Kill a Mockingbird, Ariel, The Million Dol...",0.298063,0.24,3.2208
1919,(Batman Returns),"(Ariel, To Kill a Mockingbird)",0.298063,0.33,3.163286
7692,(Batman Returns),"(Romeo + Juliet, To Kill a Mockingbird, Reserv...",0.298063,0.245,3.161442
6698,(Batman Returns),"(Ariel, The Million Dollar Hotel, Reservoir Dogs)",0.298063,0.255,3.111


Let's filter out the first 5 recommendations

In [391]:
apriori_recommendations_list = [list(x) for x in apriori_recommendations['consequents'].values]
print("Apriori Recommendations for movie: Batman Returns\n")
for i in range(5):
    print("{0}: {1} with lift of {2}".format(i+1,apriori_recommendations_list[i],apriori_recommendations.iloc[i,4]))

Apriori Recommendations for movie: Batman Returns

1: ['Ariel', 'To Kill a Mockingbird', 'Reservoir Dogs'] with lift of 3.295089285714285
2: ['To Kill a Mockingbird', 'Ariel', 'The Million Dollar Hotel'] with lift of 3.2208000000000006
3: ['Ariel', 'To Kill a Mockingbird'] with lift of 3.1632857142857147
4: ['Romeo + Juliet', 'To Kill a Mockingbird', 'Reservoir Dogs'] with lift of 3.1614423076923077
5: ['Ariel', 'The Million Dollar Hotel', 'Reservoir Dogs'] with lift of 3.111


We can also take a look at the single movie recommendations rather than "baskets"

In [393]:
apriori_single_recommendations.head()

Unnamed: 0,antecedants,consequents,support,confidence,lift
182,(Batman Returns),(Grbavica: The Land of My Dreams),0.298063,0.3,2.875714
208,(Batman Returns),(Reservoir Dogs),0.298063,0.595,2.609444
124,(Batman Returns),(Ariel),0.298063,0.405,2.539766
242,(Batman Returns),(To Kill a Mockingbird),0.298063,0.65,2.478125
212,(Batman Returns),(Romeo + Juliet),0.298063,0.405,2.4705


In [395]:
apriori_single_recommendations = apriori_recommendations.iloc[[x for x in range(len(apriori_recommendations_list)) if len(apriori_recommendations_list[x])==1],]
apriori_single_recommendations_list = [list(x) for x in apriori_single_recommendations['consequents'].values]
print("Apriori single-movie Recommendations for movie: Batman Returns\n")
for i in range(5):
    print("{0}: {1}, with lift of {2}".format(i+1,apriori_single_recommendations_list[i][0],apriori_single_recommendations.iloc[i,4]))

Apriori single-movie Recommendations for movie: Batman Returns

1: Grbavica: The Land of My Dreams, with lift of 2.8757142857142863
2: Reservoir Dogs, with lift of 2.6094444444444447
3: Ariel, with lift of 2.5397663551401872
4: To Kill a Mockingbird, with lift of 2.478125
5: Romeo + Juliet, with lift of 2.4705000000000004


Next, we will build some classification models with this dataset.