In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load dataset
dataset_path = "../Datasets/MovieLens_20M_Dataset/"

path = os.path.join(dataset_path, 'movie.csv')
movies = pd.read_csv(path)

path = os.path.join(dataset_path, 'rating.csv')
rating = pd.read_csv(path)

path = os.path.join(dataset_path, 'tag.csv')
tag = pd.read_csv(path)

path = os.path.join(dataset_path, 'link.csv')
link = pd.read_csv(path)

path = os.path.join(dataset_path, 'genome_tags.csv')
genome_tags = pd.read_csv(path)

path = os.path.join(dataset_path, 'genome_scores.csv')
genome_scores = pd.read_csv(path)

In [5]:
# Dataset shape
print("genome_tags shape is {}".format(genome_tags.shape))
print("genome_scores shape is {}".format(genome_scores.shape))
print("movies shape is {}".format(movies.shape))
print("rating shape is {}".format(rating.shape))
print("tag shape is {}".format(tag.shape))

genome_tags shape is (1128, 2)
genome_scores shape is (11709768, 3)
movies shape is (27278, 3)
rating shape is (20000263, 4)
tag shape is (465564, 4)


In [6]:
print(genome_tags.columns)
print(genome_scores.columns)
print(movies.columns)
print(rating.columns)
print(tag.columns)
genome_scores.head()

Index(['tagId', 'tag'], dtype='object')
Index(['movieId', 'tagId', 'relevance'], dtype='object')
Index(['movieId', 'title', 'genres'], dtype='object')
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [7]:
# genome_scores dataset has relevance feature which says that how much a tag is relevant to the movie and
# it's value range from 0 to 1, we'll consider only the value which has more than 0.5 relevance. So this gives better 
# predicrion. And We'll merge the tag with genome_scores dataset.
genome_scores = genome_scores[genome_scores['relevance']> 0.7].merge(genome_tags,on='tagId',how='left') 

# concatenating all the tag that belongs to a movie and forming a tag collection for each movie
genome_scores = genome_scores.groupby('movieId')['tag'].apply(' '.join).reset_index()


In [11]:

final_dataset = pd.merge(movies,genome_scores,on='movieId',how='left')
genome_scores.head()

Unnamed: 0,movieId,tag
0,1,adventure animated animation cartoon cgi child...
1,2,adventure animals big budget childhood childre...
2,3,comedy good sequel original sequel sequels
3,4,chick flick girlie movie romantic unlikely fri...
4,5,comedy destiny family father daughter relation...


In [12]:
# renaming tag as keywords
tag = tag.rename(columns = {"tag":"keywords"})
tag['keywords'].fillna('',inplace=True)
tag = tag.groupby('movieId')['keywords'].apply(' '.join).reset_index()

final_dataset = pd.merge(final_dataset,tag,on='movieId',how='left')
final_dataset['genres'].head()

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object

In [13]:
final_dataset['keywords'] = final_dataset['keywords'] + " " +final_dataset['tag'] +  " " + \
    final_dataset['genres'].str.replace("|"," ")
final_dataset['keywords'].fillna("",inplace=True)

In [14]:
# Both tag and genres values has added to keywords so we drop this 
final_dataset.drop(['tag','genres'],inplace=True,axis=1)

In [15]:
c_vect = TfidfVectorizer()
X = c_vect.fit_transform(final_dataset['keywords'])

In [16]:
# There are other similiary distance metric available which are euclidean distance,manhattan distance, 
# Pearson coefficient etc. But for sparse matrix cosine similarity works better
cosine_sim = cosine_similarity(X)

In [17]:
def get_movie_recommendation(movie_name):
    idx = final_dataset[final_dataset['title'].str.contains(movie_name)].index
    if len(idx):
        movie_indices = sorted(list(enumerate(cosine_sim[idx[0]])), key=lambda x: x[1], reverse=True)[1:11]
        movie_indices = [i[0] for i in movie_indices]
        return movie_indices
    else : 
        return []

In [18]:
title = "Dark Knight"
recommended_movie_list = get_movie_recommendation(title)
movies.iloc[recommended_movie_list].set_index('movieId')

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
33794,Batman Begins (2005),Action|Crime|IMAX
91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
42015,Casanova (2005),Action|Adventure|Comedy|Drama|Romance
4299,"Knight's Tale, A (2001)",Action|Comedy|Romance
5611,"Four Feathers, The (2002)",Adventure|War
27073,Two Hands (1999),Comedy|Crime|Thriller
2572,10 Things I Hate About You (1999),Comedy|Romance
3213,Batman: Mask of the Phantasm (1993),Animation|Children
39183,Brokeback Mountain (2005),Drama|Romance
90603,Batman: Year One (2011),Action|Animation|Crime
