In [1]:
import pandas as pd
import numpy as np
import re
import sklearn.metrics.pairwise as pw
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies['title'].value_counts().sort_values(ascending=False).head(10) > 1

Men with Guns (1997)                True
War of the Worlds (2005)            True
Levottomat 3 (2004)                False
Urban Legends: Final Cut (2000)    False
Detour (1945)                      False
Eddie Murphy Delirious (1983)      False
Flesh and the Devil (1926)         False
Transylvania 6-5000 (1985)         False
Rio Grande (1950)                  False
Murder by Numbers (2002)           False
Name: title, dtype: bool

In [4]:

duplicate_movies = movies.groupby('title').filter(lambda x: len(x) == 2)
duplic_ids = duplicate_movies['movieId'].values
#Duplicated titles
duplicate_movies = duplicate_movies[['movieId','title']]
# Checking the id with most reviews
review_count = pd.DataFrame(ratings[ratings['movieId'].isin(duplic_ids)]['movieId'].value_counts())
review_count.reset_index(inplace=True)
review_count.columns = ['movieId','count']
duplicated_df = pd.merge(duplicate_movies, review_count, on='movieId')
display(duplicated_df)
## Getting duplicates with low review count
duplicated_df.sort_values(by=['title','count'],ascending=[True,False])
duplicated_ids = duplicated_df.drop_duplicates(subset ="title", 
                     keep = 'last', inplace = False)['movieId']

Unnamed: 0,movieId,title,count
0,1788,Men with Guns (1997),3
1,26982,Men with Guns (1997),2
2,34048,War of the Worlds (2005),33
3,64997,War of the Worlds (2005),3


In [5]:
movies = movies.loc[~movies['movieId'].isin(duplicated_ids)]
ratings = ratings.loc[~ratings['movieId'].isin(duplicated_ids)]

In [6]:
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres,Action,IMAX,War,Musical,Western,Horror,Film-Noir,...,Romance,Animation,Drama,Thriller,Mystery,Crime,Adventure,Documentary,Sci-Fi,Comedy
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [8]:
#Droping genres
movies.drop('genres', axis=1,inplace= True)  
ratings.drop('timestamp', axis=1,inplace= True)

In [9]:
df = pd.merge(ratings, movies, on='movieId')
print(df.shape)
df

(105334, 23)


Unnamed: 0,userId,movieId,rating,title,Action,IMAX,War,Musical,Western,Horror,...,Romance,Animation,Drama,Thriller,Mystery,Crime,Adventure,Documentary,Sci-Fi,Comedy
0,1,16,4.0,Casino (1995),0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,9,16,4.0,Casino (1995),0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
2,12,16,1.5,Casino (1995),0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,24,16,4.0,Casino (1995),0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,29,16,3.0,Casino (1995),0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105329,668,140098,2.5,Runoff (2015),0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
105330,668,140816,2.5,Tangerine (2015),0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
105331,668,141472,2.5,The 50 Year Argument (2014),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
105332,668,142488,4.0,Spotlight (2015),0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
pivot_item_based = pd.pivot_table(df,index='title',columns=['userId'], values='rating')
pivot_item_based

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,2.5
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
"'burbs, The (1989)",,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),,,,,,,,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,,,,...,,,,,,,,,,3.5
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,
¡Three Amigos! (1986),,,,,,,,,,,...,,,,,,,,,,2.0


In [11]:
sparse_pivot = sparse.csr_matrix(pivot_item_based.fillna(0))
sparse_pivot

<10323x668 sparse matrix of type '<class 'numpy.float64'>'
	with 105334 stored elements in Compressed Sparse Row format>

In [12]:
recommender = pw.cosine_similarity(sparse_pivot)
recommender

array([[1.        , 0.        , 0.        , ..., 0.        , 0.09832433,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.14748649,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.09832433,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.04899235,
        0.        ],
       [0.09832433, 0.14748649, 0.09832433, ..., 0.04899235, 1.        ,
        0.09832433],
       [0.        , 0.        , 1.        , ..., 0.        , 0.09832433,
        1.        ]])

In [13]:
recommender_df = pd.DataFrame(recommender, columns=pivot_item_based.index,index=pivot_item_based.index)

In [14]:
recommender_df

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 (1979),...,[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),a/k/a Tommy Chong (2005),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.342682,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.158272,0.000000,0.098324,0.000000
'Hellboy': The Seeds of Creation (2004),0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.147486,0.000000
'Round Midnight (1986),0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.081094,0.000000,0.257012,0.680414,...,0.000000,0.227429,0.141421,0.000000,0.100219,0.0,0.221581,0.000000,0.098324,1.000000
'Til There Was You (1997),0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
"'burbs, The (1989)",0.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.031610,0.231897,0.100923,0.000000,...,0.057358,0.000000,0.000000,0.000000,0.212684,0.0,0.104192,0.000000,0.161820,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
loudQUIETloud: A Film About the Pixies (2006),0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.202735,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000
xXx (2002),0.158272,0.000000,0.221581,0.0,0.104192,0.189927,0.232954,0.000000,0.214237,0.279996,...,0.106301,0.050394,0.031336,0.221581,0.159413,0.0,1.000000,0.344367,0.135389,0.221581
xXx: State of the Union (2005),0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.225606,0.000000,0.113833,0.000000,...,0.141405,0.000000,0.000000,0.498273,0.170617,0.0,0.344367,1.000000,0.048992,0.000000
¡Three Amigos! (1986),0.098324,0.147486,0.098324,0.0,0.161820,0.098324,0.089702,0.137282,0.134776,0.133802,...,0.000000,0.022362,0.013905,0.000000,0.176140,0.0,0.135389,0.048992,1.000000,0.098324


In [15]:
def item_based_recom(film_name):    
    ## Item Rating Based Cosine Similarity
    cosine_df = pd.DataFrame(recommender_df[film_name].sort_values(ascending=False))
    cosine_df.reset_index(level=0, inplace=True)
    cosine_df.columns = ['title','cosine_sim']
    return cosine_df

In [19]:
item_based_recom('Liar Liar (1997)').head(10)

Unnamed: 0,title,cosine_sim
0,Liar Liar (1997),1.0
1,"Wedding Singer, The (1998)",0.499534
2,Happy Gilmore (1996),0.499495
3,Pleasantville (1998),0.493828
4,Austin Powers: International Man of Mystery (1...,0.477878
5,"Honey, I Shrunk the Kids (1989)",0.473103
6,Air Force One (1997),0.471592
7,Scream (1996),0.471343
8,Grease (1978),0.470676
9,Mars Attacks! (1996),0.466977
