## Name: Jay Shah 
## Date: 4-6-2021
# Recommendation System

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

# Reading the input data from the file

In [None]:
# Load the credits table and the movies table from the Kaggle input folder.
credits_data, movies_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tmdb-movie-metadata/tmdb_5000_credits.csv',
        '../input/tmdb-movie-metadata/tmdb_5000_movies.csv',
    )
)

# Displaying the data

In [None]:
# Preview the first ten rows of the credits table.
credits_data.iloc[:10]

In [None]:
# Preview the first ten rows of the movies table.
movies_data.iloc[:10]

# Displaying the name of columns.

In [None]:
# Column labels of the credits table.
credits_data.columns

In [None]:
# Column labels of the movies table.
movies_data.columns

# Checking the NA values

In [None]:
# Number of missing values in each column of the credits table.
credits_data.isna().sum()

In [None]:
# Number of missing values in each column of the movies table.
movies_data.isna().sum()

# Shape of the Dataset

In [None]:
# (rows, columns) of each table, one per line: credits first, then movies.
print(credits_data.shape, movies_data.shape, sep='\n')

# Merging both the dataset
#### The two datasets can be merged by matching the `id` column of the movies data with the `movie_id` column of the credits data. To do so, we first rename one of these columns so that both datasets share the same key column name.

In [None]:
# Give the credits key the same name as the movies key ('id') so the two
# tables can be merged on a shared column.
credits_data = credits_data.rename({'movie_id': 'id'}, axis='columns')
credits_data

In [None]:
# Join movies (left) with credits (right) on the shared 'id' column, keeping
# only ids present in both tables (merge's default inner join).
merged_data = pd.merge(movies_data, credits_data, on='id')
merged_data

# Displaying the column names of merged data

In [None]:
# Column labels of the merged table. Both inputs carry a 'title' column, so
# the merge suffixes them (title_x / title_y, handled in the following cell).
merged_data.columns

# Removing the title_x column and renaming title_y to title.
### The title_x values are the same as the title_y values, so one of the two columns is redundant; hence the step below.

In [None]:
# Keep a single title column: discard the copy from the movies table
# (title_x) and give the copy from the credits table (title_y) its plain name.
merged_data = (
    merged_data
    .drop(columns='title_x')
    .rename(columns={'title_y': 'title'})
)
merged_data.columns

In [None]:
# Preview the first ten rows of the merged table.
merged_data.iloc[:10]

# Removing irrelevant columns from the merged_data

In [None]:
# Discard columns that are not used anywhere in the ranking below.
merged_data = merged_data.drop(
    ['title', 'homepage', 'status', 'production_countries'],
    axis='columns',
)
merged_data

In [None]:
# Print a summary of the merged table (columns, non-null counts, dtypes).
merged_data.info()

# Applying the weighted average technique for each movie's average rating

In [None]:
# Inputs to the weighted-average formula applied in the next cell.
v = merged_data['vote_count']    # per-movie number of votes
R = merged_data['vote_average']  # per-movie mean rating
C = R.mean()                     # mean of vote_average over all movies
# 70th percentile of the vote counts. In the weighted average it is the weight
# given to the global mean C; no rows are filtered out by it.
m = v.quantile(0.7)

In [None]:
# Weighted average = (R*v + C*m) / (v + m): each movie's own rating R is
# weighted by its vote count v and blended with the global mean C weighted
# by m, so movies with few votes are pulled towards C.
merged_data['Weighted_average'] = R.mul(v).add(C * m).div(v.add(m))

In [None]:
# Preview the first ten rows, now including the Weighted_average column.
merged_data.iloc[:10]

# Sorting the values in descending order based on the weighted average and thereby displaying the most recommended movie

In [None]:
# Rank movies from highest to lowest weighted average and show the columns
# relevant to the ranking.
movie_rankings = merged_data.sort_values(by='Weighted_average', ascending=False)
movie_rankings.loc[
    :,
    ['original_title', 'vote_count', 'vote_average', 'Weighted_average', 'popularity'],
]

# Visualizing best movies by average votes

In [None]:
# Horizontal bar chart of the ten movies with the highest weighted average.
weighted_average = merged_data.sort_values(by='Weighted_average', ascending=False)
plt.figure(figsize=(14, 10))
axis1 = sns.barplot(
    x=weighted_average['Weighted_average'].head(10),
    y=weighted_average['original_title'].head(10),
)

# barplot draws on the current axes, so labelling through the returned Axes
# targets the same plot as the pyplot state functions would.
axis1.set_title('Best Movies according to Average Vote given by Users', weight='bold')
axis1.set_xlabel('Weighted Average Score', weight='bold')
axis1.set_ylabel('Movie Title', weight='bold')

# Visualizing best movies by popularity

In [None]:
# Horizontal bar chart of the ten movies with the highest popularity value.
popularity = merged_data.sort_values(by='popularity', ascending=False)
plt.figure(figsize=(14, 10))
axis1 = sns.barplot(
    x=popularity['popularity'].head(10),
    y=popularity['original_title'].head(10),
)
# barplot draws on the current axes, so labelling through the returned Axes
# targets the same plot as the pyplot state functions would.
axis1.set_title('Movies most popular by votes', weight='bold')
axis1.set_xlabel('Score of Popularity', weight='bold')
axis1.set_ylabel('Movie Title', weight='bold')

# Recommendation based on Scaled Weighted Average & Popularity Score

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the two ranking signals so they can be blended on a common
# scale in the Score computed later.
scaler = MinMaxScaler()
scaled_merged_data = scaler.fit_transform(merged_data[['Weighted_average','popularity']])

# The scaler output is wrapped in a new DataFrame. Give it merged_data's own
# index rather than a fresh 0..n-1 one: assigning these columns back into
# merged_data aligns on index labels, so if merged_data has been sorted or
# filtered (e.g. this cell is re-run after the Score sort) a fresh index would
# silently attach scaled values to the wrong movies.
normalized_merged_data = pd.DataFrame(
    scaled_merged_data,
    columns=['Weighted_average','popularity'],
    index=merged_data.index,
)
normalized_merged_data.head(10)

In [None]:
# Copy the scaled columns into merged_data under their "Normalized" names.
# Each assignment is matched to merged_data's rows by index label.
merged_data['Normalized  Weight Average'] = normalized_merged_data['Weighted_average']
merged_data['Normalized Popularity'] = normalized_merged_data['popularity']
merged_data

# Generating a new column in the merged data called Score, calculated by giving 50% weight each to the Normalized Popularity and the Normalized Weighted Average

In [None]:
# Column labels of merged_data, to check the exact names of the normalized
# columns used in the Score computation.
merged_data.columns

In [None]:
# Score: equal-weight (50/50) blend of the normalized weighted average and the
# normalized popularity.
merged_data['Score'] = (
    merged_data['Normalized  Weight Average'].mul(0.5)
    .add(merged_data['Normalized Popularity'].mul(0.5))
)
# Order movies from best to worst Score and show the columns behind it.
merged_data = merged_data.sort_values(by='Score', ascending=False)
merged_data.loc[
    :,
    ['original_title', 'Normalized  Weight Average', 'Normalized Popularity', 'Score'],
]

# Visualizing Data Based on the Score

In [None]:
# Horizontal bar chart of the ten movies with the highest combined Score.
score = merged_data.sort_values(by='Score', ascending=False)
plt.figure(figsize=(14, 10))
axis1 = sns.barplot(
    x=score['Score'].head(10),
    y=score['original_title'].head(10),
)
# barplot draws on the current axes, so labelling through the returned Axes
# targets the same plot as the pyplot state functions would.
axis1.set_title('Movies most popular by Score', weight='bold')
axis1.set_xlabel('Score', weight='bold')
axis1.set_ylabel('Movie Title', weight='bold')