In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/the-movies-dataset/keywords.csv
/kaggle/input/the-movies-dataset/ratings.csv
/kaggle/input/the-movies-dataset/credits.csv
/kaggle/input/the-movies-dataset/movies_metadata.csv
/kaggle/input/the-movies-dataset/links.csv
/kaggle/input/the-movies-dataset/links_small.csv
/kaggle/input/the-movies-dataset/ratings_small.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold
from surprise.model_selection.validation import cross_validate

In [3]:
# The main Movies Metadata file
meta = pd.read_csv('../input/the-movies-dataset/movies_metadata.csv')
meta.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# The subset of 100,000 ratings from 700 users on 9,000 movies
ratings = pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
ratings.head() # Movies in this dataset are rated out of 5 instead of 10

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# TMDb and IMDb IDs of a small subset of 9,000 movies of the Full Dataset
links = pd.read_csv('../input/the-movies-dataset/links_small.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
# Movie plot keywords for the MovieLens movies
keywords = pd.read_csv('../input/the-movies-dataset/keywords.csv')
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [7]:
# Cast and Crew Information for all movies in the dataset
credits = pd.read_csv('../input/the-movies-dataset/credits.csv')
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [8]:
# -- Content-focused recommender --

meta['overview'] = meta['overview'].fillna('')
meta['overview'].head() # Sample descriptions

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [9]:
# Check the datatype of "id" in movies_metadata.csv
pd.DataFrame({'feature':meta.dtypes.index, 'dtype':meta.dtypes.values})

Unnamed: 0,feature,dtype
0,adult,object
1,belongs_to_collection,object
2,budget,object
3,genres,object
4,homepage,object
5,id,object
6,imdb_id,object
7,original_language,object
8,original_title,object
9,overview,object


In [10]:
meta = meta.drop([19730, 29503, 35587]) # Remove these ids to solve ValueError: "Unable to parse string..."

# Convert object to int64 for compatibility during merging
meta['id'] = pd.to_numeric(meta['id'])

# Run  the following code for converting more than one value to integer
# def convert_int(x):
#     try:
#         return int(x)
#     except:
#         return np.nan

In [11]:
# Check the datatype of "tmdbId" in links_small.csv
pd.DataFrame({'feature':links.dtypes.index, 'dtype':links.dtypes.values})

Unnamed: 0,feature,dtype
0,movieId,int64
1,imdbId,int64
2,tmdbId,float64


In [12]:
# Convert float64 to int64
col=np.array(links['tmdbId'], np.int64)
links['tmdbId']=col

In [13]:
# Merge the dataframes on column "tmdbId"
meta.rename(columns={'id':'tmdbId'}, inplace=True)
meta = pd.merge(meta,links,on='tmdbId')
meta.drop(['imdb_id'], axis=1, inplace=True)
meta.head()

# Alternatively, run the following code to reduce the size of movies_metadata.csv to match links_small.csv
# meta = meta[meta['tmdbId'].isin(links)]
# meta.shape

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,tmdbId,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,movieId,imdbId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1,114709
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,2,113497
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,3,113228
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,4,114885
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,5,113041


In [14]:
# Remove stop words and use TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')
# Construct TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(meta['overview'])
tfidf_matrix.shape

(9099, 29727)

In [15]:
# Compute cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Get corresponding indices of the movies
indices = pd.Series(meta.index, index=meta['original_title']).drop_duplicates()

In [16]:
# Recommendation function
def recommend(title, cosine_sim=cosine_sim):
    
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Get the pairwise similarity scores of all movies with the given movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 15 most similar movies
    sim_scores = sim_scores[1:16]
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    # Remove low-rated movies or outliers
    for i in movie_indices:
        pop = meta.at[i,'vote_average']
        if pop<5 or pop>10:
            movie_indices.remove(i)

    # Return the most similar movies qualifying the 5.0 rating threshold
    return meta[['original_title','vote_average']].iloc[movie_indices]

In [17]:
recommend('Iron Man')

Unnamed: 0,original_title,vote_average
7516,Iron Man 2,6.6
8296,Iron Man 3,6.8
5670,Scarface,7.5
8766,Avengers: Age of Ultron,7.3
8100,Brake,5.3
4274,Saturday Night Fever,6.5
6063,Hostage,6.2
2322,The Dark Half,5.4
6152,Batman Begins,7.5
1650,Return from Witch Mountain,5.6


In [18]:
recommend('The Conjuring')

Unnamed: 0,original_title,vote_average
9074,The Conjuring 2,7.0
5587,The Boston Strangler,6.7
8726,The Borderlands,5.1
3077,The Spiral Staircase,6.6
5790,The Turning Point,6.0
8139,Sinister,6.8
3784,Things Behind the Sun,5.9
353,Jason's Lyric,5.9
6118,The Amityville Horror,6.0
8389,Koch,5.0


In [19]:
# -- User-focused recommender --

reader = Reader() # Used to parse a file containing ratings
df = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
kf = KFold(n_splits=5)
kf.split(df) # Split the data into folds

<generator object KFold.split at 0x7f00ddf66150>

In [20]:
# Use Single Value Decomposition (SVD) for cross-validation and fitting
svd = SVD()
cross_validate(svd, df, measures=['RMSE', 'MAE'])

trainset = df.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f00dc59ed10>

In [21]:
# Check a random user's ratings
ratings[ratings['userId'] == 10]

Unnamed: 0,userId,movieId,rating,timestamp
744,10,50,5.0,942766420
745,10,152,4.0,942766793
746,10,318,4.0,942766515
747,10,344,3.0,942766603
748,10,345,4.0,942766603
749,10,592,3.0,942767328
750,10,735,4.0,942766974
751,10,1036,3.0,942767258
752,10,1089,3.0,942766420
753,10,1101,2.0,942767328


In [22]:
# Read the smaller links file again
links_df = pd.read_csv('../input/the-movies-dataset/links_small.csv')
col=np.array(links_df['tmdbId'], np.int64)
links_df['tmdbId']=col

# Merge movies_metadata.csv and links_small.csv files
links_df = links_df.merge(meta[['title', 'tmdbId']], on='tmdbId').set_index('title')
links_index = links_df.set_index('tmdbId') # For label indexing

In [23]:
# Recommendation function
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = links_df.loc[title]['tmdbId'] # Get the corresponding tmdb id
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31] # Scores of the 30 most similar movies
    movie_indices = [i[0] for i in sim_scores]
    
    movies = meta.iloc[movie_indices][['title', 'vote_average', 'tmdbId']]
    movies['est'] = movies['tmdbId'].apply(lambda x: svd.predict(userId, links_index.loc[x]['movieId']).est) # Estimated prediction using svd
    movies = movies.sort_values('est', ascending=False) # Rank movies according to the predicted values
    movies.columns = ['Title', 'Vote Average', 'TMDb Id', 'Estimated Prediction']
    return movies.head(15) # Display top 15 similar movies

In [24]:
# Recommendations for user with id 1
hybrid(1, 'The Conjuring')

Unnamed: 0,Title,Vote Average,TMDb Id,Estimated Prediction
7795,Midnight in Paris,7.4,59436,3.310206
5587,The Boston Strangler,6.7,26690,2.863504
5364,Night of the Living Dead,6.7,19185,2.855486
5859,Shining Through,6.2,31962,2.839961
8389,Koch,5.0,138217,2.834015
8993,Ashby,6.2,330112,2.821417
1965,The Texas Chain Saw Massacre,7.1,30497,2.813384
8262,Safe Haven,6.9,112949,2.783426
9074,The Conjuring 2,7.0,259693,2.770597
3784,Things Behind the Sun,5.9,102933,2.762046


In [25]:
# Recommendations for user with id 30
hybrid(30, 'The Conjuring')

Unnamed: 0,Title,Vote Average,TMDb Id,Estimated Prediction
7795,Midnight in Paris,7.4,59436,4.647098
5933,Salem's Lot,5.5,36763,4.12546
5364,Night of the Living Dead,6.7,19185,4.025749
9074,The Conjuring 2,7.0,259693,3.910156
8389,Koch,5.0,138217,3.883676
8262,Safe Haven,6.9,112949,3.87865
8726,The Borderlands,5.1,207774,3.868494
8993,Ashby,6.2,330112,3.848328
353,Jason's Lyric,5.9,22067,3.830028
6863,[REC],7.1,8329,3.807929


In [26]:
# Recommendations for user with id 500
hybrid(500, 'The Conjuring')

Unnamed: 0,Title,Vote Average,TMDb Id,Estimated Prediction
5364,Night of the Living Dead,6.7,19185,3.508786
5933,Salem's Lot,5.5,36763,3.501342
353,Jason's Lyric,5.9,22067,3.272869
6100,Electra Glide in Blue,6.6,26332,3.234464
8139,Sinister,6.8,82507,3.185044
7795,Midnight in Paris,7.4,59436,3.181651
1070,Amityville II: The Possession,5.9,16235,3.168537
3077,The Spiral Staircase,6.6,27452,3.167259
7049,Magicians,6.2,10078,3.155611
5859,Shining Through,6.2,31962,3.113176
