In [1]:
import pandas as pd
import time

df_tags= pd.read_csv('data/tags.csv')
df_movies = pd.read_csv('data/movies.csv')
df_movies['genres'] = df_movies['genres'].apply(lambda x: x.split('|'))
df_tags_combined = df_tags.groupby('movieId').apply(lambda x: list(x['tag'])).reset_index().rename(columns={0:'tags'})
df_movies = pd.merge(df_movies, df_tags_combined, on = 'movieId', how = 'left')

df_movies['tags'] = df_movies['tags'].apply(lambda x: x if isinstance(x,list) else [])
df_movies['keywords'] = df_movies['genres']+df_movies['tags']
df_movies['keywords'] = df_movies['keywords'].apply(lambda x: set([str.lower(i.replace(" ", "")) for i in x]))
df_movies.set_index('movieId', inplace= True)

all_keywords = set()
for this_movie_keywords in df_movies['keywords']:
    all_keywords = all_keywords.union(this_movie_keywords)



In [2]:
df_movies

Unnamed: 0_level_0,title,genres,tags,keywords
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, fun]","{fantasy, animation, adventure, fun, comedy, c..."
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, magic board game, Robin Williams, game]","{fantasy, game, robinwilliams, adventure, chil..."
3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old]","{romance, old, comedy, moldy}"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[],"{drama, romance, comedy}"
5,Father of the Bride Part II (1995),[Comedy],"[pregnancy, remake]","{pregnancy, comedy, remake}"
6,Heat (1995),"[Action, Crime, Thriller]",[],"{action, thriller, crime}"
7,Sabrina (1995),"[Comedy, Romance]",[remake],"{romance, comedy, remake}"
8,Tom and Huck (1995),"[Adventure, Children]",[],"{adventure, children}"
9,Sudden Death (1995),[Action],[],{action}
10,GoldenEye (1995),"[Action, Adventure, Thriller]",[],"{action, adventure, thriller}"


In [3]:
df_ratings = pd.read_csv('data/ratings.csv')

In [4]:
df_mxk = pd.DataFrame(0, index = df_movies.reset_index()['movieId'].unique(), columns = all_keywords)
df_mxk['mean_rating'] = df_ratings.groupby('movieId')['rating'].mean()

for index,row in df_mxk.iterrows():
    df_mxk.loc[index,df_movies.loc[index]['keywords']] = 1

df_mxk['mean_rating'].fillna(df_mxk['mean_rating'].mean(), inplace=True)
df_mxk = df_mxk.loc[:,df_mxk.sum() > 5]

In [5]:
df_mxk

Unnamed: 0,astaireandrogers,willferrell,children,disability,horror,timetravel,heist,disney,bible,film-noir,...,racism,sequel,politics,innetflixqueue,surreal,vietnam,thought-provoking,twistending,comicbook,mean_rating
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.920930
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.431818
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.259615
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.357143
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.071429
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.946078
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.185185
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.875000
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.125000
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.496212


In [6]:
from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(random_state=42)
X = df_mxk.drop('mean_rating', axis = 1).as_matrix()
y = df_mxk['mean_rating'].as_matrix()

reg.fit(X,y)
keyword_scores = pd.Series(reg.feature_importances_ , index = df_mxk.drop('mean_rating', axis=1).columns)
keyword_frequency = df_mxk.sum()

In [7]:
df_movies['chief_keyword'] = df_movies['keywords'].apply(lambda x: (keyword_scores[x]/keyword_frequency).idxmax())
df_movies

Unnamed: 0_level_0,title,genres,tags,keywords,chief_keyword
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, fun]","{fantasy, animation, adventure, fun, comedy, c...",animation
2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, magic board game, Robin Williams, game]","{fantasy, game, robinwilliams, adventure, chil...",fantasy
3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old]","{romance, old, comedy, moldy}",romance
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[],"{drama, romance, comedy}",drama
5,Father of the Bride Part II (1995),[Comedy],"[pregnancy, remake]","{pregnancy, comedy, remake}",pregnancy
6,Heat (1995),"[Action, Crime, Thriller]",[],"{action, thriller, crime}",action
7,Sabrina (1995),"[Comedy, Romance]",[remake],"{romance, comedy, remake}",remake
8,Tom and Huck (1995),"[Adventure, Children]",[],"{adventure, children}",children
9,Sudden Death (1995),[Action],[],{action},action
10,GoldenEye (1995),"[Action, Adventure, Thriller]",[],"{action, adventure, thriller}",adventure


In [8]:
df_movies.tail(2500)

Unnamed: 0_level_0,title,genres,tags,keywords,chief_keyword
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
73876,Undisputed II: Last Man Standing (2006),"[Action, Crime, Drama]",[],"{action, drama, crime}",drama
73881,3 Idiots (2009),"[Comedy, Drama, Romance]",[],"{drama, romance, comedy}",drama
73929,Legion (2010),"[Action, Fantasy, Horror, Thriller]",[],"{action, fantasy, horror, thriller}",fantasy
74075,Stan Helsing (2009),"[Comedy, Horror]",[],"{horror, comedy}",horror
74089,Peter Pan (1960),"[Children, Fantasy, Musical]",[],"{fantasy, children, musical}",fantasy
74095,Wicked City (Yôjû toshi) (1987),"[Animation, Fantasy, Horror, Sci-Fi]",[],"{fantasy, horror, animation, sci-fi}",animation
74154,When in Rome (2010),"[Comedy, Romance]",[],"{romance, comedy}",romance
74226,"Dream of Light (a.k.a. Quince Tree Sun, The) (...","[Documentary, Drama]",[],"{drama, documentary}",documentary
74228,Triangle (2009),"[Drama, Horror, Mystery, Thriller]",[],"{drama, mystery, horror, thriller}",mystery
74275,I Love You Phillip Morris (2009),"[Comedy, Drama, Romance]",[],"{drama, romance, comedy}",drama


In [23]:
all_chief_keywords = df_movies['chief_keyword'].unique()
df_uxk = pd.DataFrame(0, index = df_ratings['userId'].unique(), columns = all_chief_keywords)

start = time.time()
for row in df_ratings.itertuples(index=True, name='Pandas'):
    this_movie_chief_keyword = df_movies.loc[getattr(row, 'movieId'), 'chief_keyword']
    this_user_this_movie_rating = getattr(row, 'rating')
    this_user_id = getattr(row, 'userId')
    df_uxk.loc[this_user_id,this_movie_chief_keyword] += this_user_this_movie_rating
end = time.time()

print 'Time Taken:  '+ str(end-start)


Time Taken:  79.6070158482


In [21]:
df_uxk

Unnamed: 0,animation,fantasy,romance,drama,pregnancy,action,remake,children,adventure,politics,...,dark,depressing,philosophical,stylized,witty,socialcommentary,blackcomedy,bittersweet,(nogenreslisted),tense
1,126.0,123.0,16.0,51.0,0.0,29.0,0.0,5.0,78.0,10.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
2,0.0,0.0,0.0,17.5,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
3,2.0,13.5,0.0,1.0,0.0,4.5,0.5,0.5,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
4,21.0,45.0,17.0,190.0,0.0,5.0,0.0,0.0,29.0,4.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
5,18.0,7.0,0.0,30.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
6,47.0,65.0,44.0,244.0,5.0,46.0,4.0,60.0,53.0,8.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
7,32.0,16.0,12.0,53.0,0.0,21.0,0.0,1.5,25.0,0.0,...,0.0,0.0,0,4.0,0.0,0.0,0,0,0.0,0
8,0.0,10.0,13.0,7.0,0.0,12.0,0.0,0.0,11.0,4.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
9,4.0,5.0,0.0,47.0,0.0,1.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.0,0
10,39.5,18.5,78.5,150.5,0.0,7.0,0.0,0.0,19.5,4.0,...,0.0,0.0,0,4.0,3.5,0.0,0,0,0.0,0


In [24]:

nok = len(all_chief_keywords)
df_co_rating = pd.DataFrame(0, index = all_chief_keywords, columns = all_chief_keywords)

start = time.time()
for index,row in df_uxk.iterrows():
    print index
    for i, first_keyword in enumerate(all_chief_keywords):
        for j in range(i+1,nok):
            second_keyword = all_chief_keywords[j]
            df_co_rating.loc[first_keyword,second_keyword] += min(row[first_keyword],row[second_keyword])
            df_co_rating.loc[second_keyword,first_keyword] = df_co_rating.loc[first_keyword,second_keyword]
         

end = time.time()
print 'Time Taken:  '+ str(end-start)       
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [27]:
df_co_rating.to_pickle('co_rating.pkl')

In [10]:
import scipy.stats


def sim_matrix(co): # returns the similarity matrix for the given co-occurence matrix
    chief_keywords = co.columns
    df_sim = pd.DataFrame(index = co.index, columns = co.columns)
    f = co.sum()
    n = sum(f)

    for first_chief_keyword in chief_keywords:
        for second_chief_keyword in chief_keywords:
            k11 = co.loc[first_chief_keyword][second_chief_keyword]
            k12 = f[first_chief_keyword]-k11
            k21 = f[second_chief_keyword]-k11
            k22 = n - k12 - k21 + k11
            df_sim.loc[first_chief_keyword][second_chief_keyword], p, dof, expctd= scipy.stats.chi2_contingency([[k11,k12],[k21,k22]], lambda_="log-likelihood")
            if ((k11/k21) < f[first_chief_keyword]/(n-f[first_chief_keyword])):
                df_sim.loc[first_chief_keyword][second_chief_keyword] = 0
                
    return df_sim


In [9]:
df_co_rating = pd.read_pickle('co_rating.pkl')

In [11]:
df_sim_chief_keyword = sim_matrix(df_co_rating)

In [12]:
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [13]:
df_sim_chief_keyword

Unnamed: 0,animation,fantasy,romance,drama,pregnancy,action,remake,children,adventure,politics,horror,mafia,comedy,crime,mystery,sci-fi,shakespeare,innetflixqueue,highschool,timetravel,animalmovie,twins,funny,england,journalism,wedding,twistending,heist,prostitution,music,war,documentary,quirky,musical,revenge,assassination,thriller,leonardodicaprio,imax,superhero,western,film-noir,witty,psychology,stephenking,moviebusiness,australia,bad,court,classic,christmas,satire,imdbtop250,death,mentalillness,remade,heartwarming,gambling,darkcomedy,sexuality,menindrag,military,racism,thought-provoking,race,future,adolescence,coenbrothers,india,spoof,terrorism,aliens,adultery,kidnapping,divorce,screwball,television,ghosts,astaireandrogers,nickandnoracharles,disney,bible,holocaust,predictable,disability,emotional,boxing,newyork,space,atmospheric,murder,worldwarii,anime,religion,police,martialarts,vietnam,business,family,drugs,serialkiller,dystopia,willferrell,magic,cinematography,violence,dark,depressing,philosophical,stylized,mindfuck,visuallyappealing,socialcommentary,creepy,blackcomedy,tense,bittersweet,(nogenreslisted),suspense,gritty
animation,0.0,4905.27,836.692,5715.23,0.0,1166.59,0.0,54.5386,2301.59,0.0,242.652,0.0,2476.14,17.9062,940.158,3760.73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,438.018,44.4517,0.0,102.134,0.0,0.0,0.0,0.0,2392.75,0.0,5.81561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fantasy,4905.27,0.0,975.579,8152.77,0.0,1688.92,0.0,4.17221,3783.08,0.0,530.213,0.0,3785.77,19.682,1775.71,6704.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,936.784,9.87856,0.0,32.54,0.0,0.0,0.0,0.0,1747.97,0.0,24.2436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
romance,836.692,975.579,0.0,1322.62,0.0,588.162,0.0,45.5897,743.605,0.0,128.117,0.0,1419.34,230.169,456.362,453.03,0.0,0.071813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.277,98.5379,0.0,340.408,0.0,0.0,0.0,0.0,205.011,0.0,92.8077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
drama,5715.23,8152.77,1322.62,0.0,0.0,2394.8,0.0,0.0,4673.98,0.0,562.605,0.0,7365.91,1.09754,1836.7,13198.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1805.09,144.831,0.0,5.58956,0.0,0.0,0.0,0.0,2089.98,0.0,21.7648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pregnancy,0.0,0.0,0.0,0.0,0.0,0.0,135.047,6.66,0.0,41.4375,0.0,0.0,0.0,0.0,0.0,0.0,17.0142,0.0,0.00102562,0.0,65.6446,75.0934,0.0,12.5766,37.1554,34.6471,0.304955,0.0,96.1273,30.9673,0.0,0.0,0.0,0.0,3.00185,4.55336,17.5909,18.9818,0.0,0.0,0.0,0.0,4.27698,0.0,0.428485,19.1435,29.6856,41.4577,4.36723,0.0,12.0659,3.66192,0.0,0.130455,34.399,6.99512,0.0,30.1595,15.7137,4.77968,51.4992,50.1539,0.598095,5.73025,14.5541,0.195708,28.089,9.99243,21.4141,0.950985,92.0135,0.0,18.8193,34.6873,13.1787,5.23217,53.5715,6.54217,15.3541,20.7408,35.0793,26.8512,6.81661,17.5185,39.4828,0.0487645,0.129128,15.9466,13.1133,0.0,36.7577,0.273481,0.023258,18.3177,11.1714,0.0,1.37895,4.9824,40.2665,4.39362,23.1577,0.201365,19.7228,0.0,35.6378,3.09958,0.70337,0.00755237,6.646,3.86823,0.909438,4.12929,7.95003,0.0,6.50155,0.0,0.0,0.000838621,0.113819,0.440207
action,1166.59,1688.92,588.162,2394.8,0.0,0.0,0.0,12.4144,2552.82,0.0,299.62,0.0,1571.93,192.085,1197.13,2381.75,0.0,0.0,0.0,14.5764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,609.041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,933.864,0.0,139.035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
remake,0.0,0.0,0.0,0.0,135.047,0.0,0.0,14.3425,0.0,91.9951,0.0,0.0,0.0,0.0,0.0,0.0,40.3045,3.24793,7.01806,0.0,57.0568,82.6671,1.06011,41.6171,51.0509,104.553,0.705209,0.0,142.666,56.1921,0.0,0.0,0.0,0.0,1.22145,9.76392,30.6003,17.9003,0.0,0.0506133,0.0,0.0,4.87156,0.0,6.91933,23.2463,61.6397,135.924,30.9191,10.8809,17.0399,3.9371,0.0,6.77554,24.9681,52.903,0.755052,38.2458,17.8709,13.6604,83.4727,45.3659,12.6741,15.8678,38.2925,11.2411,32.0632,42.1652,10.9655,27.9557,101.518,3.96349,66.4059,55.5361,24.5189,21.0092,40.91,24.14,20.5796,10.2708,73.1905,29.173,12.2577,22.3741,23.3636,3.02759,10.0094,33.0228,48.5852,0.0,53.4733,17.2845,0.0,15.6086,34.1692,0.678971,2.61979,18.0504,13.683,8.61333,37.2663,5.49367,14.6728,0.0,25.7248,1.33611,0.0,5.97064,3.54024,0.238638,4.50372,1.48309,5.87989,8.73219,2.2924,0.0,0.0,5.38845,0.00395964,1.47313
children,54.5386,4.17221,45.5897,0.0,6.66,12.4144,14.3425,0.0,5.59461,0.238292,72.3953,0.0,14.9345,29.3951,10.0023,0.0,0.394314,0.0,16.101,7.53981,18.9819,9.4699,4.83979,0.0,1.24634,4.69174,0.0,0.0,0.0552923,4.55327,1.19009,0.0,0.0,137.934,0.0,0.0,48.1862,0.0,0.0,29.6495,120.49,0.0,0.0,0.0,0.0,0.0,0.135464,13.6253,0.0,0.007301,125.46,0.0,0.0,0.0,0.614546,0.0,0.0,1.4965,0.0,0.0,6.86528,1.38566,0.0,0.0,0.0,0.0,0.0508158,0.0,0.0,1.51771,0.664675,3.28876,0.67224,0.0,0.0,0.0,0.0,0.160726,0.0640987,0.723865,16.2742,0.0,0.0,0.0,0.00144257,0.0,0.0,0.0,0.0,0.0,0.247255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0828936,0.0,0.981699,0.0371522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101003,0.0218725,0.0,0.0
adventure,2301.59,3783.08,743.605,4673.98,0.0,2552.82,0.0,5.59461,0.0,0.0,317.196,0.0,2617.83,71.5315,1842.41,4586.6,0.0,0.0,0.0,0.00563702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1264.22,4.60812,0.0,13.0368,0.0,0.0,0.0,0.0,1056.45,0.0,125.175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
politics,0.0,0.0,0.0,0.0,41.4375,0.0,91.9951,0.238292,0.0,0.0,0.0,5.83076,0.0,0.0,0.0,0.0,94.1959,10.7819,2.31307,0.0,79.7196,62.1104,0.781531,48.8488,118.407,119.688,10.5083,0.00225223,179.715,55.482,0.0,0.0,0.0,0.0,4.49551,45.1581,36.8568,8.30236,0.0,0.157214,0.0,0.0,17.0972,0.0,16.9356,70.0867,72.3076,46.7287,54.717,0.242263,22.901,6.32111,0.0,44.7053,18.2459,32.143,2.88174,43.0024,35.3453,16.1201,61.5348,44.695,17.445,36.1893,22.5588,5.32359,14.234,52.7182,49.7025,27.878,60.0072,0.000867413,73.3943,17.4508,20.5952,62.4586,21.726,47.7582,31.259,5.54577,28.5207,33.1694,24.3898,22.9219,9.59372,8.87978,23.037,65.6576,40.2499,0.0,23.7784,31.539,0.0,16.475,33.7656,0.162918,3.707,21.0238,20.3438,17.2259,4.4572,0.0195969,0.240503,0.0,14.7067,0.587607,0.311205,5.03452,4.08741,0.0,2.74366,0.0,0.0,0.818961,2.20785,9.40668,0.277851,2.72315,1.74027,0.507102


In [60]:
df_sim_chief_keyword['children'].sort_values(ascending = False)

musical                  137.934
christmas                 125.46
western                   120.49
horror                   72.3953
animation                54.5386
thriller                 48.1862
romance                  45.5897
superhero                29.6495
crime                    29.3951
animalmovie              18.9819
disney                   16.2742
highschool                16.101
comedy                   14.9345
remake                   14.3425
bad                      13.6253
action                   12.4144
mystery                  10.0023
twins                     9.4699
timetravel               7.53981
menindrag                6.86528
pregnancy                   6.66
adventure                5.59461
funny                    4.83979
wedding                  4.69174
music                    4.55327
fantasy                  4.17221
aliens                   3.28876
spoof                    1.51771
gambling                  1.4965
military                 1.38566
journalism

In [45]:
df_movies[df_movies['chief_keyword'] == 'superhero']

Unnamed: 0_level_0,title,genres,tags,keywords,chief_keyword
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
153,Batman Forever (1995),"[Action, Adventure, Comedy, Crime]","[superhero, superhero]","{action, comedy, superhero, adventure, crime}",superhero
592,Batman (1989),"[Action, Crime, Thriller]",[superhero],"{action, superhero, thriller, crime}",superhero
1377,Batman Returns (1992),"[Action, Crime]",[superhero],"{action, superhero, crime}",superhero
2640,Superman (1978),"[Action, Adventure, Sci-Fi]",[superhero],"{action, superhero, adventure, sci-fi}",superhero
2641,Superman II (1980),"[Action, Sci-Fi]",[superhero],"{action, superhero, sci-fi}",superhero
2642,Superman III (1983),"[Action, Adventure, Sci-Fi]",[superhero],"{action, superhero, adventure, sci-fi}",superhero
2723,Mystery Men (1999),"[Action, Comedy, Fantasy]",[superhero],"{action, fantasy, comedy, superhero}",superhero
3793,X-Men (2000),"[Action, Adventure, Sci-Fi]","[action, comic book, hugh jackman, marvel, sup...","{superhero, sci-fi, comicbook, adventure, acti...",superhero
3877,Supergirl (1984),"[Action, Adventure, Fantasy]",[superhero],"{action, fantasy, superhero, adventure}",superhero
5349,Spider-Man (2002),"[Action, Adventure, Sci-Fi, Thriller]",[superhero],"{action, superhero, adventure, thriller, sci-fi}",superhero


In [47]:


from surprise import SVD, Reader, Dataset

reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)
svd = SVD()
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f112de0fb10>

In [56]:
def collaborative(userId):
    df_movies['est'] = df_movies.reset_index()['movieId'].apply(lambda x: svd.predict(userId,x).est)
    return df_movies.sort_values('est', ascending=False).head(10)
    

In [57]:
collaborative(1)

Unnamed: 0_level_0,title,genres,tags,keywords,chief_keyword,est
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
901,Funny Face (1957),"[Comedy, Musical]",[],"{comedy, musical}",musical,5
899,Singin' in the Rain (1952),"[Comedy, Musical, Romance]",[movie business],"{romance, comedy, musical, moviebusiness}",moviebusiness,5
6315,Wildcats (1986),[Comedy],[],{comedy},comedy,5
224,Don Juan DeMarco (1995),"[Comedy, Drama, Romance]","[mental illness, psychology]","{drama, romance, comedy, mentalillness, psycho...",psychology,5
933,To Catch a Thief (1955),"[Crime, Mystery, Romance, Thriller]",[],"{mystery, romance, thriller, crime}",mystery,5
709,Oliver & Company (1988),"[Adventure, Animation, Children, Comedy, Musical]",[],"{comedy, musical, animation, adventure, children}",animation,5
277,Miracle on 34th Street (1994),[Drama],[Christmas],"{drama, christmas}",christmas,5
1258,"Shining, The (1980)",[Horror],"[atmospheric, disturbing, Horror, jack nichols...","{stephenking, atmospheric, horror, stanleykubr...",atmospheric,5
907,"Gay Divorcee, The (1934)","[Comedy, Musical, Romance]",[divorce],"{romance, divorce, comedy, musical}",divorce,5
4025,Miss Congeniality (2000),"[Comedy, Crime]",[pageant],"{pageant, comedy, crime}",crime,5


In [58]:

title_to_id = df_movies.reset_index()[['movieId', 'title']].set_index('title')

def hybrid(userId, title):
    this_movie_id = title_to_id.loc[title]
    all_movieids = list(df_movies.index)
    sim_scores_series = pd.Series(0,index = all_movieids)
    for movieid in all_movieids:
        sim_scores_series.loc[movieid] = df_sim_chief_keyword.loc[df_movies.loc[this_movie_id,'chief_keyword'],df_movies.loc[movieid,'chief_keyword']].iloc[0]
        
    top_25_ids = sim_scores_series.sort_values(ascending=False)[:26].index
    df_movies_top25 = df_movies.loc[top_25_ids].reset_index()
    
    df_movies_top25['est'] = df_movies_top25['index'].apply(lambda x: svd.predict(userId,x).est)
    
    #Sort the movies in decreasing order of predicted rating
    df_movies_top25 = df_movies_top25.sort_values('est', ascending=False)
    
    #Return the top 10 movies as recommendations
    return df_movies_top25.head(10)

hybrid(1, 'Spider-Man (2002)')

In [59]:
hybrid(1, 'Spider-Man (2002)')

Unnamed: 0,index,title,genres,tags,keywords,chief_keyword,est
11,1270,Back to the Future (1985),"[Adventure, Comedy, Sci-Fi]",[time travel],"{timetravel, comedy, adventure, sci-fi}",timetravel,4.802465
4,589,Terminator 2: Judgment Day (1991),"[Action, Sci-Fi]","[apocalypse, Arnold Schwarzenegger, nuclear wa...","{sci-fi, robots, scifimasterpiece, suspense, a...",timetravel,4.645798
0,7254,The Butterfly Effect (2004),"[Drama, Sci-Fi, Thriller]","[alternate reality, sci-fi, science fiction, t...","{sciencefiction, sci-fi, drama, alternatereali...",timetravel,4.597376
13,1097,E.T. the Extra-Terrestrial (1982),"[Children, Drama, Sci-Fi]",[aliens],"{drama, aliens, children, sci-fi}",aliens,4.585646
10,1240,"Terminator, The (1984)","[Action, Sci-Fi, Thriller]","[Action, artificial intelligence, robots, Sci-...","{specialeffects, sci-fi, robots, tense, artifi...",timetravel,4.565978
8,2968,Time Bandits (1981),"[Adventure, Comedy, Fantasy, Sci-Fi]",[time travel],"{timetravel, fantasy, comedy, adventure, sci-fi}",timetravel,4.551937
6,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"[Mystery, Sci-Fi, Thriller]","[time travel, time travel, Brad Pitt, Bruce Wi...","{mystery, sci-fi, mindfuck, postapocalyptic, t...",timetravel,4.551658
19,1253,"Day the Earth Stood Still, The (1951)","[Drama, Sci-Fi, Thriller]",[aliens],"{drama, aliens, sci-fi, thriller}",aliens,4.549467
12,1200,Aliens (1986),"[Action, Adventure, Horror, Sci-Fi]","[action, aliens, horror, sci-fi, space, space ...","{sci-fi, space, horror, spacecraft, suspense, ...",aliens,4.504616
1,68358,Star Trek (2009),"[Action, Adventure, Sci-Fi, IMAX]","[future, lack of development, lack of story, q...","{timetravel, simonpegg, sci-fi, space, lackofs...",timetravel,4.461477
