In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

# Ignore warnings :
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading ratings file
# Ignore the timestamp column

ratings = pd.read_csv('ratings.csv', sep=',', usecols=['userId', 'movieId', 'rating'])
movies = pd.read_csv('movies.csv', sep=',', usecols=[ 'movieId','title', 'genres'])
links = pd.read_csv('links.csv', sep=',', usecols=[ 'movieId','imdbId', 'tmdbId'])
tmdb_data = pd.read_csv('tmdb_data.csv', sep=',')


In [3]:
tmdb_data

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,vote_count,year,cast,crew,keywords,cast_size,crew_size,director,Film_director,metadata
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"['Animation', 'Comedy', 'Family']",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,5415.0,1995,"['tomhanks', 'timallen', 'donrickles']","[{'credit_id': '52fe4284c3a36847f8024f55', 'de...","['jealousi', 'toy', 'boy', 'friendship', 'frie...",13,106,"['johnlasseter', 'johnlasseter', 'johnlasseter']",John Lasseter,jealousi toy boy friendship friend rivalri new...
1,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"['Adventure', 'Action', 'Thriller']",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,1194.0,1995,"['piercebrosnan', 'seanbean', 'izabellascorupco']","[{'credit_id': '52fe426ec3a36847f801e16f', 'de...","['cuba', 'falselyaccus', 'secretident', 'compu...",20,46,"['martincampbell', 'martincampbell', 'martinca...",Martin Campbell,cuba falselyaccus secretident computervirus se...
2,False,,62000000,"['Comedy', 'Drama', 'Romance']",,9087,tt0112346,en,The American President,"Widowed U.S. president Andrew Shepherd, one of...",...,199.0,1995,"['michaeldouglas', 'annettebening', 'michaelj....","[{'credit_id': '52fe44dac3a36847f80adfa3', 'de...","['whitehous', 'usapresid', 'newlov', 'widow']",18,6,"['robreiner', 'robreiner', 'robreiner']",Rob Reiner,whitehous usapresid newlov widow michaeldougla...
3,False,,44000000,"['History', 'Drama']",,10858,tt0113987,en,Nixon,An all-star cast powers this epic look at Amer...,...,72.0,1995,"['anthonyhopkins', 'joanallen', 'powersboothe']","[{'credit_id': '52fe43c59251416c7501d705', 'de...","['usapresid', 'presidentialelect', 'watergates...",34,8,"['oliverstone', 'oliverstone', 'oliverstone']",Oliver Stone,usapresid presidentialelect watergatescand bio...
4,False,,98000000,"['Action', 'Adventure']",,1408,tt0112760,en,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",...,137.0,1995,"['geenadavis', 'matthewmodine', 'franklangella']","[{'credit_id': '52fe42f4c3a36847f802f69f', 'de...","['exoticisland', 'treasur', 'map', 'ship', 'sc...",31,16,"['rennyharlin', 'rennyharlin', 'rennyharlin']",Renny Harlin,exoticisland treasur map ship scalp pirat geen...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3443,False,,0,['Drama'],,340611,tt4193394,en,Indignation,"In 1951, Marcus Messner, a working-class Jewis...",...,69.0,2016,"['loganlerman', 'sarahgadon', 'tracyletts']","[{'credit_id': '58514b91c3a3682dfe017405', 'de...","['basedonnovel', 'jewishlif', 'ohio', '1950s']",33,23,"['jamesschamus', 'jamesschamus', 'jamesschamus']",James Schamus,basedonnovel jewishlif ohio 1950s loganlerman ...
3444,False,,19000000,"['Mystery', 'Adventure', 'Crime']",http://www.playnerve.com/,328387,tt3531824,en,Nerve,"Industrious high school senior, Vee Delmonico,...",...,2262.0,2016,"['emmaroberts', 'davefranco', 'emilymeade']","[{'credit_id': '57993c2f925141234800341d', 'de...","['basedonnovel', 'technolog', 'internet', 'hac...",22,42,"['henryjoost', 'henryjoost', 'henryjoost']",Henry Joost,basedonnovel technolog internet hack teenag ne...
3445,False,,25000000,"['Crime', 'Drama', 'Thriller']",http://www.theinfiltrator.com/,325789,tt1355631,en,The Infiltrator,A U.S Customs official uncovers a massive mone...,...,592.0,2016,"['bryancranston', 'dianekruger', 'johnleguizamo']","[{'credit_id': '578af3a79251417aca003525', 'de...","['undercov', 'biographi', 'drug', 'druglord']",21,48,"['bradfurman', 'bradfurman', 'bradfurman']",Brad Furman,undercov biographi drug druglord bryancranston...
3446,False,,20000000,['Comedy'],http://stxmovies.com/badmoms/,376659,tt4651520,en,Bad Moms,When three overworked and under-appreciated mo...,...,1287.0,2016,"['milakunis', 'kristenbell', 'kathrynhahn']","[{'credit_id': '5690c7adc3a3686b52001c68', 'de...","['alcohol', 'bar', 'parti', 'divorc', 'famili'...",33,4,"['jonlucas', 'jonlucas', 'jonlucas']",Jon Lucas,alcohol bar parti divorc famili hitandrun bath...


In [4]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [5]:
content = pd.merge(ratings, movies, on = 'movieId')
content_data= pd.merge(links, content, on = 'movieId')
content_data

Unnamed: 0,movieId,imdbId,tmdbId,userId,rating,title,genres
0,1,114709,862.0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,114709,862.0,5,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,1,114709,862.0,7,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,1,114709,862.0,15,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,1,114709,862.0,17,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...,...
100831,193581,5476944,432131.0,184,4.0,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
100832,193583,5914996,445030.0,184,3.5,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
100833,193585,6397426,479308.0,184,3.5,Flint (2017),Drama
100834,193587,8391976,483455.0,184,3.5,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
content_data = content_data.sample(frac = 0.5)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(content_data['genres'])
tfidf_matrix.shape

(50418, 175)

In [8]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.13067286,
        0.12570048],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 1.        , ..., 1.        , 0.        ,
        0.        ],
       [0.13067286, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.12570048, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [9]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [10]:
get_recommendations('Avatar (2009)').head(10)

60     French Twist (Gazon maudit) (1995)
62             From Dusk Till Dawn (1996)
78               Dunston Checks In (1996)
83                 Beautiful Girls (1996)
96                         Catwalk (1996)
100               Before and After (1996)
108         Flirting With Disaster (1996)
110                 Jupiter's Wife (1994)
146                      Jury Duty (1995)
195           Death and the Maiden (1994)
Name: title, dtype: object

In [11]:
get_recommendations('Man Who Knew Too Much, The (1934)').head(10)

27              Persuasion (1995)
44              Pocahontas (1995)
58                Bio-Dome (1996)
157            Nine Months (1995)
167           Strange Days (1995)
183         Before Sunrise (1995)
187              Cure, The (1995)
190                 Clerks (1994)
229    Legends of the Fall (1994)
255      Perez Family, The (1995)
Name: title, dtype: object

In [12]:
get_recommendations('Twelve Monkeys (a.k.a. 12 Monkeys) (1995)').head(10)

76                      Angels and Insects (1995)
372                          Bronx Tale, A (1993)
726                     To Be or Not to Be (1942)
1785    Police Academy 6: City Under Siege (1989)
1788                       Very Bad Things (1998)
2535                                Nadine (1987)
2592                                   Hud (1963)
2989                   She's Having a Baby (1988)
3326                Ernest Saves Christmas (1988)
3410                 America's Sweethearts (2001)
Name: title, dtype: object