In [3]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [37]:
movies = pd.read_csv("../../movies.csv")
ratings = pd.read_csv("../../ratings.csv")

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [38]:
movies['title'].value_counts().sort_values(ascending=False).head(10) > 1

Aladdin (1992)           True
Beneath (2013)           True
Johnny Express (2014)    True
Girl, The (2012)         True
Paradise (2013)          True
Emma (1996)              True
Offside (2006)           True
Blackout (2007)          True
Darling (2007)           True
Casanova (2005)          True
Name: title, dtype: bool

In [39]:
duplicate_movies = movies.groupby('title').filter(lambda x: len(x) == 2)
duplic_ids = duplicate_movies['movieId'].values
#Duplicated titles
duplicate_movies = duplicate_movies[['movieId','title']]
# Checking the id with most reviews
review_count = pd.DataFrame(ratings[ratings['movieId'].isin(duplic_ids)]['movieId'].value_counts())
review_count.reset_index(inplace=True)
review_count.columns = ['movieId','count']
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId')
display(duplicated_df)
## Getting duplicates with low review count
duplicated_df.sort_values(by=['title','count'],ascending=[True,False])
duplicated_ids = duplicated_df.drop_duplicates(subset ="title", 
                     keep = 'last', inplace = False)['movieId']

Unnamed: 0,movieId,title,count
0,588,Aladdin (1992),41842
1,838,Emma (1996),7938
2,1788,Men with Guns (1997),683
3,3598,Hamlet (2000),1168
4,26958,Emma (1996),19
5,26982,Men with Guns (1997),32
6,34048,War of the Worlds (2005),6456
7,42015,Casanova (2005),578
8,47254,Chaos (2005),244
9,48682,Offside (2006),88


In [40]:
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [11]:
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)

In [41]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
#Droping genres
movies.drop('genres', axis=1,inplace= True)  
ratings.drop('timestamp', axis=1,inplace= True)

In [43]:
df = pd.merge(ratings, movies, on='movieId')
print(df.shape)

(19999685, 4)


In [70]:
max_no_rows = (int)(1e7)

df_small = df[1:max_no_rows]

In [72]:
pivot_item_based = pd.pivot_table(df_small,
                                  index='title',
                                  columns=['userId'], values='rating')
sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0))
recommender = pw.cosine_similarity(sparse_pivot)
recommender_df = pd.DataFrame(recommender, 
                              columns=pivot_item_based.index,
                              index=pivot_item_based.index)

In [73]:
def item_based_recom(film_name):    
    ## Item Rating Based Cosine Similarity
    cosine_df = pd.DataFrame(recommender_df[film_name].sort_values(ascending=False))
    cosine_df.reset_index(level=0, inplace=True)
    cosine_df.columns = ['title','cosine_sim']
    return cosine_df

In [83]:
item_based_recom('Seven (a.k.a. Se7en) (1995)').head(10)

Unnamed: 0,title,cosine_sim
0,Seven (a.k.a. Se7en) (1995),1.0
1,Pulp Fiction (1994),0.662649
2,"Silence of the Lambs, The (1991)",0.634034
3,"Usual Suspects, The (1995)",0.613564
4,"Shawshank Redemption, The (1994)",0.586279
5,Forrest Gump (1994),0.581476
6,Terminator 2: Judgment Day (1991),0.578067
7,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0.577908
8,Braveheart (1995),0.574283
9,Jurassic Park (1993),0.561129
