# MOVIE RECOMMENDATION SYSTEM

### The code below builds a very basic movie recommendation engine that ranks movies by a weighted average of the ratings (votes) provided by customers.

### Details of the logic are explained further alongside the code below

#### TMDB movie and credits data from Kaggle is used in the code below

https://www.kaggle.com/tmdb/tmdb-movie-metadata

The code below is part of my effort to learn how to build recommendation systems (based on a tutorial by Krish Naik on YouTube).

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

import os

In [None]:
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset locations used by the loading cell can be confirmed.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:

# Locations of the two TMDB CSV files inside the Kaggle dataset folder.
credits_csv_path = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv'
movies_csv_path = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv'

# Load each file into its own DataFrame.
credits = pd.read_csv(credits_csv_path)
movies_df = pd.read_csv(movies_csv_path)

In [None]:
# Preview the first rows of the credits table.
credits.head()

In [None]:
# Preview the first rows of the movies table.
movies_df.head()

In [None]:
# Report the (rows, columns) shape of each loaded table.
for label, frame in (('Credits: ', credits), ('Movies Dataframe: ', movies_df)):
    print(label, frame.shape)

In [None]:
# Rename the credits table's `movie_id` column to `id` so that it matches the
# key column of movies_df and the two tables can be merged on it.
# Only the column label is changed: the earlier `index=str` argument also
# converted every row label to a string, which the merge on `id` does not need.
credits_column_renamed = credits.rename(columns={'movie_id': 'id'})

In [None]:
# Join every movie with its credits record on the shared `id` key.
movies_df_merge = pd.merge(movies_df, credits_column_renamed, on='id')

# Preview the combined table.
movies_df_merge.head()

In [None]:
# Print pandas' summary of the merged frame to inspect its columns.
movies_df_merge.info()

In [None]:
# These columns are not needed for the ranking, so they are left out of the
# working frame.
unused_columns = ['homepage', 'title_x', 'title_y', 'status', 'production_countries']
movies_cleaned_df = movies_df_merge.drop(unused_columns, axis=1)

# Preview the trimmed table.
movies_cleaned_df.head()

In [None]:
# Print pandas' summary of the trimmed frame to confirm which columns remain.
movies_cleaned_df.info()

In [None]:
# Count the missing (NULL) entries in every column.
movies_cleaned_df.isna().sum()

USING A WEIGHTED AVERAGE FOR EACH MOVIE'S AVERAGE RATING

$$ W =  \frac{Rv + Cm}{v + m} $$
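where:

- $W$ = weighted rating
- $R$ = average rating of the movie, a number from 0 to 10 (the mean rating, `vote_average`)
- $v$ = number of votes for the movie (`vote_count`)
- $m$ = minimum number of votes required to be listed in the top 250 (currently 3000); note that the code below uses the 70th percentile of `vote_count` as $m$ instead
- $C$ = mean rating across the whole report, i.e. the mean of `vote_average` over all movies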


In [None]:
# Components of the weighted-rating formula above.
# v: votes per movie, R: average rating per movie.
v, R = movies_cleaned_df['vote_count'], movies_cleaned_df['vote_average']
# C: mean of the average ratings over all movies.
C = R.mean()
# m: vote-count threshold, taken as the 70th percentile of vote_count.
m = v.quantile(0.70)

In [None]:
# W = (R*v + C*m) / (v + m), evaluated element-wise for every movie.
weighted_votes = R.mul(v).add(C * m)
movies_cleaned_df['weighted_average'] = weighted_votes.div(v.add(m))

In [None]:
# Preview the frame, which now includes the `weighted_average` column.
movies_cleaned_df.head()

In [None]:
# Rank every movie by its weighted rating (best first) and show the top 20
# together with the inputs that produced the ranking.
ranking_columns = ['original_title', 'vote_count', 'vote_average', 'weighted_average', 'popularity']
movie_sorted_ranking = movies_cleaned_df.sort_values(by="weighted_average", ascending=False)
movie_sorted_ranking.loc[:, ranking_columns].head(20)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the ten movies with the highest weighted rating.
weight_average = movie_sorted_ranking.sort_values(by='weighted_average', ascending=False)
top_weighted = weight_average.head(10)

fig, axis1 = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_weighted, x='weighted_average', y='original_title', ax=axis1)

# Start the x-axis at 4 so the differences between the top scores are visible.
axis1.set_xlim(4, 10)
axis1.set_title('Best Movies by average votes', weight='bold')
axis1.set_xlabel('Weighted Average Score', weight='bold')
axis1.set_ylabel('Movie Title', weight='bold')

# Keep a copy of the chart on disk.
fig.savefig('best_movies.png')

In [None]:
# Bar chart of the ten movies with the highest `popularity` value.
popularity = movie_sorted_ranking.sort_values(by='popularity', ascending=False)
top_popular = popularity.head(10)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_popular, x='popularity', y='original_title', ax=ax)

ax.set_title('Most Popular by Votes', weight='bold')
ax.set_xlabel('Score of Popularity', weight='bold')
ax.set_ylabel('Movie Title', weight='bold')

# Keep a copy of the chart on disk.
fig.savefig('best_popular_movies.png')

## Recommendation based on the scaled weighted average and the popularity score (each is given a 50% weight)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale both ranking signals with MinMaxScaler so that they are on a common
# scale before being blended into a single score.
scaling = MinMaxScaler()
movie_scaled_df = scaling.fit_transform(movies_cleaned_df[['weighted_average', 'popularity']])

# fit_transform returns a bare array without row labels. Re-attach the row
# index of movies_cleaned_df so that every scaled row keeps the label of the
# movie it was computed from; writing these columns back into
# movies_cleaned_df aligns on that index, and a fresh 0..n-1 index would
# silently misalign (or produce NaN) if the frame were ever filtered or re-indexed.
movie_normalized_df = pd.DataFrame(
    movie_scaled_df,
    columns=['weighted_average', 'popularity'],
    index=movies_cleaned_df.index,
)
movie_normalized_df.head()

In [None]:
# Copy the scaled columns back into the main frame under their new names
# (values are matched to rows by index label).
movies_cleaned_df['normalized_weight_average'] = movie_normalized_df['weighted_average']
movies_cleaned_df['normalized_popularity'] = movie_normalized_df['popularity']

In [None]:
# Preview the frame, which now includes the two normalized columns.
movies_cleaned_df.head()

In [None]:
# Blend the two normalized signals, giving each half of the final score.
rating_half = movies_cleaned_df['normalized_weight_average'] * 0.5
popularity_half = movies_cleaned_df['normalized_popularity'] * 0.5
movies_cleaned_df['score'] = rating_half + popularity_half

# Rank by the blended score (best first) and show the top 20.
score_columns = ['original_title', 'normalized_weight_average', 'normalized_popularity', 'score']
movies_scored_df = movies_cleaned_df.sort_values(by='score', ascending=False)
movies_scored_df.loc[:, score_columns].head(20)

In [None]:
# Bar chart of the ten movies with the highest blended score.
scored_df = movies_cleaned_df.sort_values(by='score', ascending=False)
top_scored = scored_df.head(10)

fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=top_scored, x='score', y='original_title', palette='deep', ax=ax)

ax.set_title('Best Rated & Most Popular Blend', weight='bold')
ax.set_xlabel('Score', weight='bold')
ax.set_ylabel('Movie Title', weight='bold')

# Keep a copy of the chart on disk.
fig.savefig('scored_movies.png')