# Movie Recommender System

> Import all required libraries and packages.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from ast import literal_eval

import warnings
warnings.filterwarnings('ignore')

import os
# Show the contents of the TMDB metadata input directory.
print(os.listdir("../input/tmdb-movie-metadata/"))

Read the two data sets (movie metadata and credits).

In [None]:
# Load the TMDB movie metadata and the credits table, in that order.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tmdb-movie-metadata/tmdb_5000_movies.csv',
        '../input/tmdb-movie-metadata/tmdb_5000_credits.csv',
    )
)

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# Preview the first rows of the movie metadata table.
movies.head()

In [None]:
# Rename the credits columns by position so that the key column is called 'id',
# matching the column the merge below joins on.
# NOTE(review): this assumes the CSV has exactly four columns in
# (id, title, cast, crew) order -- confirm against credits.head().
credits.columns=['id','title','cast','crew']

We merge the two data sets on their shared `id` column.

In [None]:
# Join cast/crew onto the movie metadata by 'id' (inner join, the pandas default).
# Both frames carry a 'title' column, so pandas suffixes them as 'title_x'
# (from movies) and 'title_y' (from credits).
# This cell rebinds `movies`, so running it twice merges the already-merged frame
# again; re-run from the load cell instead.
movies=movies.merge(credits,on='id')

In [None]:
# (rows, columns) of the merged frame.
movies.shape

In [None]:
# Column dtypes and non-null counts of the merged frame.
movies.info()

In [None]:
# Summary statistics of the numeric columns.
movies.describe()

In [None]:
def _genre_names(raw):
    """Return the list of genre names held in one `genres` cell.

    The CSV stores each cell as the string form of a list of dicts, each with
    a 'name' key. A cell that is already a list (this cell was run before) is
    accepted as-is, and a missing value yields an empty list, so the cell can
    be re-run safely.
    """
    if isinstance(raw, str):
        raw = literal_eval(raw)
    if not isinstance(raw, list):
        # NaN / None: no genre information for this movie.
        return []
    # Already-converted cells hold plain strings; raw cells hold dicts.
    return [entry['name'] if isinstance(entry, dict) else entry for entry in raw]


movies['genres'] = movies['genres'].apply(_genre_names)

In [None]:
# Keep the title that came from the movies side of the merge under the plain name 'title'.
movies['title']=movies['title_x']

Drop the now-redundant `title_x` and `title_y` columns from the merged data set.

In [None]:
# Remove the two suffixed title columns left over from the merge.
movies = movies.drop(columns=['title_x', 'title_y'])

# Demographic Filtering


In [None]:
# Only rows with a known value contribute to each statistic.
vote_counts = movies['vote_count'].dropna().astype('int')
# Keep ratings as floats: truncating them to integers before averaging
# (e.g. 7.9 -> 7) would bias the mean downward.
vote_averages = movies['vote_average'].dropna().astype('float')
# C: mean rating over all movies, used as the prior in the weighted rating.
C = vote_averages.mean()
C

In [None]:
# m: the 95th percentile of vote counts, used below as the minimum number of
# votes a movie needs to enter the chart.
m = vote_counts.quantile(0.95)
m

In [None]:
# Release year as a string ('YYYY'), or NaN when the release date is missing.
# NaN never compares equal to anything (including itself), so a `!= np.nan`
# test is always True; pd.notnull is the correct missing-value check.
movies['year'] = movies['release_date'].apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [None]:
# Show one row to check the newly added columns.
movies.head(1)

In [None]:
# Movies with at least `m` votes and a known rating.
has_enough_votes = movies['vote_count'].notnull() & (movies['vote_count'] >= m)
has_rating = movies['vote_average'].notnull()
chart_columns = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']
# .copy() gives an independent frame, so adding the 'wr' column later does not
# write into a slice of `movies` (which pandas flags as chained assignment).
qualified = movies.loc[has_enough_votes & has_rating, chart_columns].copy()

In [None]:
# Number of movies that passed the vote-count threshold, and the selected columns.
qualified.shape

In [None]:
def weighted_ratio(x):
    """Weighted rating for a single movie row.

    Blends the row's own 'vote_average' with the mean rating `C`: the more
    votes the row has relative to the threshold `m`, the more its own average
    counts. `m` and `C` are read from the notebook's global scope at call time.

    Parameters
    ----------
    x : mapping-like row providing 'vote_count' and 'vote_average'.

    Returns
    -------
    The weighted rating (v / (v + m)) * R + (m / (v + m)) * C.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    own_part = (votes / total) * rating
    prior_part = (m / total) * C
    return own_part + prior_part

In [None]:
# Compute the weighted rating row by row and store it in a new 'wr' column.
qualified['wr']=qualified.apply(weighted_ratio,axis=1)

In [None]:
# Keep two decimal places of the weighted rating.
qualified['wr'] = qualified['wr'].round(2)

In [None]:
# Best weighted rating first.
qualified=qualified.sort_values(by='wr',ascending=False)

In [None]:
# One row per (movie, genre) pair. Series.explode repeats the movie's index
# label for every element of its genre list; a movie with an empty list keeps
# a single row whose genre is NaN. This replaces building a pd.Series per row
# and stacking, which is much slower and relies on stack() dropping NaN cells.
gen = movies['genres'].explode().rename('genre')
# Join on the index so each movie's other columns are repeated per genre.
gen_movies = movies.drop('genres', axis=1).join(gen)

In [None]:
# Preview the per-genre frame.
gen_movies.head(3)

In [None]:
def build_chart(genre, percentile=0.95, data=None):
    """Return the top movies of one genre ranked by weighted rating.

    Parameters
    ----------
    genre : str
        Value to match in the 'genre' column.
    percentile : float, default 0.95
        Quantile of the genre's vote counts used as the minimum number of
        votes (`m`) a movie needs to qualify.
    data : DataFrame, optional
        Frame with one row per (movie, genre) and the columns 'genre',
        'title', 'year', 'vote_count', 'vote_average' and 'popularity'.
        Defaults to the notebook-level `gen_movies`.

    Returns
    -------
    DataFrame
        At most 250 qualifying movies with columns 'title', 'year',
        'vote_count', 'vote_average', 'popularity' and 'wr', sorted by 'wr'
        in descending order. Empty when no movie matches `genre`.
    """
    source = gen_movies if data is None else data
    df = source[source['genre'] == genre]

    vote_counts = df['vote_count'].dropna().astype(int)
    # Ratings stay floats: truncating to int before averaging biases the mean.
    vote_averages = df['vote_average'].dropna().astype(float)
    c = vote_averages.mean()              # mean rating within this genre
    m = vote_counts.quantile(percentile)  # minimum votes to qualify

    keep = (
        df['vote_count'].notnull()
        & df['vote_average'].notnull()
        & (df['vote_count'] >= m)
    )
    # .copy() so the new column is written to an independent frame.
    qualified = df.loc[keep, ['title', 'year', 'vote_count', 'vote_average', 'popularity']].copy()

    # Weighted rating, using the genre-level mean `c` computed above (not the
    # notebook-wide mean). Vectorised so it also works on an empty frame.
    v = qualified['vote_count']
    R = qualified['vote_average']
    qualified['wr'] = (v / (v + m)) * R + (m / (v + m)) * c

    return qualified.sort_values('wr', ascending=False).head(250)

In [None]:
# Ten best-ranked Romance movies.
build_chart('Romance').head(10)

In [None]:
# Ten best-ranked Action movies.
build_chart('Action').head(10)

# Content-Based Recommender System

Recommendations are based on the plot summaries given in the `overview` column. When a user gives a movie title, our goal is to recommend movies whose plot summaries are most similar to it.

we take overview column from the movies data set

In [None]:
# Preview the raw plot summaries.
movies['overview'].head()

A model cannot work directly with the raw text shown in the output of the cell above, so we use TF-IDF (term frequency–inverse document frequency) to turn each overview into a numeric vector.
This is a standard text-vectorisation technique from NLP (natural language processing).

In [None]:
# TF-IDF vectoriser from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing plot summaries become empty strings so that every row can be vectorised
movies['overview'] = movies['overview'].fillna('')

# Vectoriser that ignores common English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix (one row per movie) in one step
tfidf_matrix = tfidf.fit_transform(movies['overview'])

tfidf_matrix.shape

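We import the linear kernel (a plain dot product) to measure how closely two overview vectors are related. Because the TF-IDF vectors are length-normalised (scikit-learn's default), this equals their cosine similarity and lies between 0 and 1. A value of 0.6, for example, means overviews 1 and 2 are that similar.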

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all overview vectors: entry [i, j] is the
# similarity of movie i and movie j. With TfidfVectorizer's default L2
# normalisation the dot product is the cosine similarity.
# The result is a dense (n_movies x n_movies) array.
cosine_sim=linear_kernel(tfidf_matrix,tfidf_matrix)

In [None]:
# Similarity of the first movie to every movie (including itself).
cosine_sim[0]

In [None]:
# Reverse mapping: movie title -> row label in `movies`.
indices = pd.Series(movies.index, index=movies['title'])
# De-duplicate on the index (the titles). Series.drop_duplicates() compares the
# values, i.e. the row labels, so it would leave repeated titles in place and
# indices[title] would return a Series instead of a single label for them.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Inspect the title -> row lookup.
indices

We write a function that returns recommendations for a given movie title.
`idx` is the row index of that movie; its similarity scores against all movies are then sorted in descending order.
The 10 most similar movies (other than the movie itself) are returned as the top 10 recommendations.

In [None]:
def content_based(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the notebook-level `indices` Series
        (title -> row position). Raises KeyError for an unknown title.
    cosine_sim : 2-D array
        Pairwise similarity matrix; row i holds the similarities of movie i
        to every movie. Defaults to the matrix that exists when this function
        is defined.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    Series
        Titles of the `top_n` most similar movies, most similar first.

    Notes
    -----
    Reads the globals `indices` and `movies`. A later cell in this notebook
    rebinds `indices` to the k-nearest-neighbour output, so re-create the
    lookup before calling this function after that cell has run.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several movies share this title: use the first one.
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly rather than assuming it is ranked first;
    # with ties (for example an empty overview) it may appear anywhere.
    ranked = ranked[ranked != idx][:top_n]
    return movies['title'].iloc[ranked]

In [None]:
# Movies with plot summaries closest to 'Black Swan'.
content_based('Black Swan')

In [None]:
# Movies with plot summaries closest to 'Avatar'.
content_based('Avatar')

In [None]:
# Movies with plot summaries closest to 'The Avengers'.
content_based('The Avengers')

# Collaborative Filtering

Collaborative filtering uses the users' rating histories: if two users have watched the same movie, they are treated as similar users, and a movie is suggested to one of them based on the other's watchlist.

In [None]:
# Per-user ratings, read from a second dataset.
ratings = pd.read_csv('../input/the-movies-dataset/ratings_small.csv')
# Reload the raw movie metadata. This rebinds `movies`, replacing the merged
# and cleaned frame built in the earlier sections.
movies=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

ratings.head()

In [None]:
# Keep only the three rating columns needed below.
df1=pd.DataFrame(ratings,columns=['userId','movieId','rating'])
df1

In [None]:
# Keep only the title and id of each movie.
df=pd.DataFrame(movies,columns=['title','id'])
df

In [None]:
# Give the movie id the same column name the ratings use, so the frames can be merged on it.
df=df.rename(columns={'id':'movieId'})

merging ratings and movies datasets on movieId

In [None]:
# Attach a title to every rating (inner join: ratings without a matching movie are dropped).
# This rebinds `ratings` to the merged frame.
# NOTE(review): the two files come from different datasets; verify that the
# ratings' `movieId` and the TMDB `id` are the same identifier scheme. If they
# are not, this join pairs ratings with the wrong titles and needs an id-mapping table.
ratings=df1.merge(df,on='movieId')
ratings.head()

here we count the number of ratings given to a movie

In [None]:
# Number of ratings each movie received, as a two-column frame
# ('movieId', 'totalratingcnt').
ratings_per_movie = ratings.groupby('movieId')['rating'].count()
rate_cnt = ratings_per_movie.rename('totalratingcnt').reset_index()
rate_cnt.head()

merging the total rating count with previous data set

In [None]:
# Add each movie's total rating count to every one of its rating rows.
# 'movieId' is the only column the two frames share, so it is the join key.
rating_count = ratings.merge(rate_cnt, on='movieId')
rating_count.head()

This shows summary statistics of the number of ratings per movie.

In [None]:
# Display floats with three decimals from here on (a notebook-wide pandas option).
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Distribution of the per-movie rating counts.
print(rate_cnt['totalratingcnt'].describe())

We consider movies with a total rating count of at least 50 to be popular movies.

In [None]:
# Minimum number of ratings for a movie to count as popular.
popularity_threshold = 50
is_popular = rating_count['totalratingcnt'] >= popularity_threshold
rating_popular_movie = rating_count[is_popular]
rating_popular_movie.head()

In [None]:
# Number of rating rows that belong to popular movies.
rating_popular_movie.shape


We create a matrix that shows the ratings given by different users to different movies.

In [None]:
## First lets create a Pivot matrix

movie_features_df=rating_popular_movie.pivot_table(index='title',columns='userId',values='rating').fillna(0)
movie_features_df.head()

We use Nearest Neighbours with the cosine distance between the movies' rating vectors to find the movies most similar to a given movie.

In [None]:
from scipy.sparse import csr_matrix

# Sparse copy of the movie x user rating matrix (most cells are 0).
movie_features_df_matrix = csr_matrix(movie_features_df.values)

from sklearn.neighbors import NearestNeighbors


# Brute-force nearest-neighbour search using cosine distance between the
# movies' rating vectors.
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_features_df_matrix)

In [None]:
# (number of popular movies, number of users).
movie_features_df.shape


In [None]:
# Pick one movie (by row position) at random. No seed is set, so the choice
# differs from run to run.
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)
# Ask for 6 neighbours of that movie's rating vector; the printing cell below
# uses position 0 only for the heading, which leaves 5 recommendations.
# NOTE: this rebinds the name `indices`, which earlier held the
# title -> row lookup that content_based() reads.
distances, indices = model_knn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

In [None]:
# Preview the movie x user rating matrix.
movie_features_df.head()


Finally, we print the recommendations for the randomly selected movie.

In [None]:
# Walk the neighbours in the order kneighbors returned them. Position 0 only
# triggers the heading for the query movie; the remaining positions are
# printed as numbered recommendations with their distance.
neighbour_rows = indices.flatten()
neighbour_distances = distances.flatten()
for rank, (row, distance) in enumerate(zip(neighbour_rows, neighbour_distances)):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, movie_features_df.index[row], distance))