In [1]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [37]:
movies = pd.read_csv("../../movies.csv")
ratings = pd.read_csv("../../ratings.csv")

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [38]:
movies['title'].value_counts().sort_values(ascending=False).head(10) > 1

Aladdin (1992)           True
Beneath (2013)           True
Johnny Express (2014)    True
Girl, The (2012)         True
Paradise (2013)          True
Emma (1996)              True
Offside (2006)           True
Blackout (2007)          True
Darling (2007)           True
Casanova (2005)          True
Name: title, dtype: bool

In [39]:
duplicate_movies = movies.groupby('title').filter(lambda x: len(x) == 2)
duplic_ids = duplicate_movies['movieId'].values
#Duplicated titles
duplicate_movies = duplicate_movies[['movieId','title']]
# Checking the id with most reviews
review_count = pd.DataFrame(ratings[ratings['movieId'].isin(duplic_ids)]['movieId'].value_counts())
review_count.reset_index(inplace=True)
review_count.columns = ['movieId','count']
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId')
display(duplicated_df)
## Getting duplicates with low review count
duplicated_df.sort_values(by=['title','count'],ascending=[True,False])
duplicated_ids = duplicated_df.drop_duplicates(subset ="title", 
                     keep = 'last', inplace = False)['movieId']

Unnamed: 0,movieId,title,count
0,588,Aladdin (1992),41842
1,838,Emma (1996),7938
2,1788,Men with Guns (1997),683
3,3598,Hamlet (2000),1168
4,26958,Emma (1996),19
5,26982,Men with Guns (1997),32
6,34048,War of the Worlds (2005),6456
7,42015,Casanova (2005),578
8,47254,Chaos (2005),244
9,48682,Offside (2006),88


In [40]:
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [11]:
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)

In [41]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
#Droping genres
movies.drop('genres', axis=1,inplace= True)  
ratings.drop('timestamp', axis=1,inplace= True)

In [43]:
df = pd.merge(ratings, movies, on='movieId')
print(df.shape)

(19999685, 4)


In [70]:
max_no_rows = (int)(1e7)

df_small = df[1:max_no_rows]

In [72]:
pivot_item_based = pd.pivot_table(df_small,
                                  index='title',
                                  columns=['userId'], values='rating')
sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0))
recommender = pw.cosine_similarity(sparse_pivot)
recommender_df = pd.DataFrame(recommender, 
                              columns=pivot_item_based.index,
                              index=pivot_item_based.index)

In [58]:
def item_based_recom(film_name):    
    ## Item Rating Based Cosine Similarity
    cosine_df = pd.DataFrame(recommender_df[film_name].sort_values(ascending=False))
    cosine_df.reset_index(level=0, inplace=True)
    cosine_df.columns = ['title','cosine_sim']
    return cosine_df

In [77]:
item_based_recom('iron man 2').head(10)

Unnamed: 0,title,cosine_sim
0,Iron Man 2 (2010),1.0
1,Captain America: The First Avenger (2011),0.563698
2,Thor (2011),0.553673
3,Iron Man (2008),0.536621
4,"Avengers, The (2012)",0.530285
5,X-Men: First Class (2011),0.529303
6,Sherlock Holmes (2009),0.524113
7,Iron Man 3 (2013),0.507974
8,Kick-Ass (2010),0.477293
9,Avatar (2009),0.464223


In [62]:
recommender_df = pd.read_parquet('../models/recommender_df.parquet.gzip')
titles = recommender_df.columns

In [63]:
import re
a = titles[30]

In [64]:
re.compile('\(\d+\)').split(a)[0].strip().lower()

'300'

In [67]:
recommender_df.columns = titles_parsed
recommender_df.columns[1:10]

Index(['10 things i hate about you', '10,000 bc', '100 girls',
       '101 dalmatians', '101 dalmatians (one hundred and one dalmatians)',
       '102 dalmatians', '12 angry men', '127 hours', '13 ghosts'],
      dtype='object')

In [46]:
def parse_movie_title(title_name):
    title_name = remove_date_lower(title_name)
    
    print(title_name)
    print(title_name in titles_parsed)

In [50]:
parse_movie_title('Iron Man 2')

iron man 2
True


In [51]:
def remove_date_lower(title):
    return re.compile('\(\d+\)').split(title)[0].strip().lower()

In [40]:
titles_parsed = [remove_date_lower(t) for t in recommender_df.index.values]

In [41]:
titles_parsed[:30]

['*batteries not included',
 '10 things i hate about you',
 '10,000 bc',
 '100 girls',
 '101 dalmatians',
 '101 dalmatians (one hundred and one dalmatians)',
 '102 dalmatians',
 '12 angry men',
 '127 hours',
 '13 ghosts',
 '13 going on 30',
 '13th warrior, the',
 '1408',
 '15 minutes',
 '16 blocks',
 '1984 (nineteen eighty-four)',
 '2 days in the valley',
 '2 fast 2 furious (fast and the furious 2, the)',
 '20,000 leagues under the sea',
 '2001: a space odyssey',
 '2010: the year we make contact',
 '2012',
 '2046',
 '21',
 '21 grams',
 '27 dresses',
 '28 days',
 '28 days later',
 '28 weeks later',
 '30 days of night']