# Movie Recommendation Engine

In [34]:
# Import datasets
import pandas as pd 
import numpy as np 
# Load the TMDB credits table and the TMDB movie-metadata table.
data1 = pd.read_csv('tmdb_5000_credits.csv')
data2 = pd.read_csv('tmdb_5000_movies.csv')

# Join credits onto movies (movies.id == credits.movie_id), then discard the
# redundant join key and the second title column produced by the merge.
data = pd.merge(data2, data1, left_on='id', right_on='movie_id').drop(
    columns=['movie_id', 'title_y']
)

In [35]:
# Inspect the column names of the merged movies + credits frame.
data.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'cast', 'crew'],
      dtype='object')

## Demographic Filtering

### Weighted Rating(WR) = (R*v) / (v+m) + (m*C) / (v+m)
#### v is the number of votes for the movie;
#### m is the minimum votes required to be listed in the chart;
#### R is the average rating of the movie; and
#### C is the mean vote across the whole dataset.

In [31]:
# Top 10 Highest Rated Movies
# C: mean of 'vote_average' over every movie in `data`.
C = data['vote_average'].mean()
# m: 90th-percentile 'vote_count'; used below as the minimum-votes cutoff.
m = data['vote_count'].quantile(0.9)

def wr(x, C=C, m=m):
    """Return the weighted rating of a single movie row.

    The result blends the movie's own average rating with the global mean,
    weighting each by its share of ``vote_count + m``.

    Parameters
    ----------
    x : mapping-like row exposing 'vote_average' and 'vote_count'.
    C : mean rating used as the prior (defaults to the module-level ``C``
        as it was when this function was defined).
    m : vote-count weight given to the prior (defaults to the module-level
        ``m`` as it was when this function was defined).
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    return (votes / total * rating) + (m / total * C)

# Keep only movies with more than m votes. Take an explicit copy so that the
# 'Score' column added below is written to an independent frame rather than
# to a slice of `data` (which is what raises pandas' SettingWithCopyWarning).
demo_data = data.loc[data['vote_count'] > m].copy()
demo_data['Score'] = demo_data.apply(wr, axis=1)

# Rank the qualifying movies by weighted score and show the ten best.
d_filtering = demo_data[['title_x', 'vote_count', 'vote_average', 'Score']].sort_values('Score', ascending=False)
d_filtering.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,title_x,vote_count,vote_average,Score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884


## Content Based Filtering

#### Compute pairwise similarity scores for all movies based on their overviews and recommend movies based on those scores. To do this, first compute a TF-IDF vector for each overview.

In [36]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that drops English stop words.
tfidf = TfidfVectorizer(stop_words='english')

# Work on a copy of the data in which missing overviews are empty strings,
# so that every overview handed to the vectorizer is a string.
c_data = data.assign(overview=data['overview'].fillna(''))

# Fit the vectorizer on the overviews and build the TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(c_data['overview'])


#### Cosine Similarity Score

In [37]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
# NOTE(review): linear_kernel is a plain dot product; it equals cosine
# similarity only when the TF-IDF rows are L2-normalised (presumably the
# TfidfVectorizer default -- confirm).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Construct a reverse map from movie title to its row index in `data`.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index (the titles) instead, keeping the first occurrence, so that
# indices[title] always yields a single integer.
indices = pd.Series(data.index, index=data['title_x'])
indices = indices[~indices.index.duplicated(keep='first')]

# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title as it appears in ``data['title_x']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows/columns follow the row order
        of ``data``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title. If several movies
    # share the title the lookup yields a Series; use the first match.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every *other* movie with its similarity score. The queried movie is
    # excluded explicitly rather than by dropping the first sorted entry: when
    # scores tie (e.g. an empty overview gives all-zero similarities) the
    # queried movie is not guaranteed to sort first.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order).
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar movies.
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    return data['title_x'].iloc[movie_indices]

In [42]:
# Example: movies whose overviews are most similar to that of 'The Martian'.
get_recommendations('The Martian')

373                  Mission to Mars
487                       Red Planet
1507                           Alive
3158                           Alien
1198        Escape from Planet Earth
2964           The Last Days on Mars
91      Independence Day: Resurgence
635                        Apollo 13
3993               Journey to Saturn
1735                  Ghosts of Mars
Name: title_x, dtype: object

## Collaborative Filtering

#### The dataset we used before does not include a userId column, so let's load another dataset called ratings.

##### User-based filtering suffers from sparsity and scalability issues, so we use Singular Value Decomposition (SVD) to leverage latent factors instead.

In [46]:
# NOTE(review): `evaluate` appears to have been removed from newer
# scikit-surprise releases (superseded by
# surprise.model_selection.cross_validate) -- confirm the installed version.
from surprise import Reader, Dataset, SVD, evaluate
# Reader built with default settings.
# NOTE(review): the default rating scale is presumably 1-5 -- confirm it
# matches the range of 'rating' in ratings.csv.
reader = Reader()
# Load the per-user ratings. This rebinds data1, which held the credits table
# in the first cell.
data1 = pd.read_csv('ratings.csv')
data1.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [47]:
# Wrap the userId / movieId / rating columns in a Surprise Dataset. This
# rebinds data2, which held the movies table in the first cell.
data2 = Dataset.load_from_df(data1[['userId', 'movieId', 'rating']], reader)
# Split into 3 folds for the evaluation run in the next cell.
# NOTE(review): Dataset.split looks like a legacy API -- confirm it exists in
# the installed scikit-surprise release.
data2.split(n_folds=3)

In [None]:
# Imported in this cell so the cell works on its own; cross_validate is the
# replacement for the legacy `evaluate` helper.
from surprise.model_selection import cross_validate

svd = SVD()

# 3-fold cross-validated RMSE. `measures` must be a list of measure names
# (a bare string would be iterated character by character), and `cv=3`
# states the fold count explicitly instead of relying on a prior split.
cross_validate(svd, data2, measures=['RMSE'], cv=3, verbose=True)

# Refit on every available rating so later predictions use the full dataset.
trainset = data2.build_full_trainset()
svd.fit(trainset)

In [None]:
# Example
# NOTE(review): the positional arguments (1, 302, 3) are presumably the user
# id, the movie id and a known true rating -- confirm against the Surprise
# `predict` documentation.
svd.predict(1, 302, 3)
# For movie with ID 302, we get an estimated prediction of 2.686

## Hybrid Recommender

#### Content based filtering & User based collaborative filtering

In [55]:
def convert_int(x):
    """Convert ``x`` to ``int``, returning NaN when conversion is impossible.

    Parameters
    ----------
    x : any
        Value to convert (e.g. a numeric string, float or int).

    Returns
    -------
    int or float
        ``int(x)`` on success, otherwise ``numpy.nan``.
    """
    try:
        return int(x)
    # Catch only genuine conversion failures (bad type, unparsable text or
    # NaN, infinity). A bare ``except`` would also swallow KeyboardInterrupt
    # and SystemExit, making a long ``.apply`` impossible to interrupt.
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [77]:
# Coerce user ids to integers (NaN where a value cannot be converted).
data1['userId'] = data1['userId'].apply(convert_int)

# Attach the TMDB title to every rating by matching ratings.movieId to the
# TMDB id. data3 is keyed by title; data4 holds the same rows keyed by id.
data3 = pd.merge(
    data1,
    data[['title_x', 'id']],
    left_on='movieId',
    right_on='id',
).set_index('title_x')
data4 = data3.set_index('id')

In [89]:
def hybrid(userId, title):
    """Recommend movies similar to ``title``, ranked for a specific user.

    The ten movies whose overviews are most similar to ``title`` are taken
    as candidates, and each candidate is then scored with the fitted ``svd``
    model's estimated rating for ``userId``.

    Parameters
    ----------
    userId : user identifier as it appears in the ratings data.
    title : str
        Movie title as it appears in ``data['title_x']``.

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows with columns 'title_x', 'vote_count', 'vote_average',
        'release_date', 'id' and 'est', sorted by 'est' descending.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # If several movies share the title the lookup yields a Series; use the
    # first match so that a single row of the similarity matrix is selected.
    idx = indices[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every other movie by similarity. The queried movie is excluded
    # explicitly because, when scores tie, it is not guaranteed to sort first.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    # Ten candidates, matching the ten rows returned below (the previous
    # [1:10] slice produced only nine).
    movie_indices = [i for i, _ in sim_scores[:10]]

    # Copy so that adding 'est' does not write into a slice of `data`.
    movies = data.iloc[movie_indices][
        ['title_x', 'vote_count', 'vote_average', 'release_date', 'id']
    ].copy()

    # data3/data4 were built by merging on movieId == id, so the TMDB id is
    # itself the movieId known to the model. Looking it up through data4
    # returned a Series for movies with several ratings and raised KeyError
    # for movies with none, so the id is passed to the model directly.
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, x).est)

    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [None]:
#Example
#hybrid(500, 'Avatar')