In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the two TMDB tables: per-movie metadata and per-movie credits.
movies_csv_path = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv'
credits_csv_path = '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv'

movies_df = pd.read_csv(movies_csv_path)
credits_df = pd.read_csv(credits_csv_path)

# Preview the first two metadata rows.
movies_df.head(2)

In [None]:
# Preview the first rows of the credits table.
credits_df.head()

In [None]:
# (rows, columns) of the credits table and of the movies table, in that order.
credits_df.shape,movies_df.shape

In [None]:
# The join key must have the same column name in both tables, so rename
# credits_df's `movie_id` to `id` and then merge the two tables on `id`.
# Only the column is renamed: no `index=` mapper is passed because the row
# index plays no part in a column-based merge.
cred_col_ren = credits_df.rename(columns={'movie_id': 'id'})
movies_df_mer = movies_df.merge(cred_col_ren, on='id')
movies_df_mer.head()

In [None]:
# Columns that are not used by the rest of the notebook.
# NOTE(review): title_x / title_y are presumably the suffixed `title` columns
# produced by the merge -- confirm against the merged frame.
unused_columns = ['homepage', 'title_x', 'title_y', 'status', 'production_countries']
movies_new_df = movies_df_mer.drop(unused_columns, axis=1)
movies_new_df.head()

In [None]:
# Summarise the trimmed frame (columns, dtypes, non-null counts).
movies_new_df.info()

In [None]:
# Ingredients of the weighted rating  (R*v + C*m) / (v + m):
#   v - number of votes each movie received
#   R - each movie's average vote
#   C - mean of the average vote over all movies
#   m - vote-count threshold, taken as the 70th percentile of vote_count
v, R = movies_new_df['vote_count'], movies_new_df['vote_average']
C = R.mean()
m = v.quantile(0.70)

In [None]:
# Weighted rating per movie: (R*v + C*m) / (v + m).
rating_numerator = R.mul(v).add(C * m)
rating_denominator = v.add(m)
movies_new_df['weighted_average'] = rating_numerator.div(rating_denominator)

In [None]:
# Spot-check the first three rows, now including the weighted_average column.
movies_new_df.head(3)

In [None]:
# Rank every movie by weighted rating, best first, and show the top 20.
ranking_columns = ['original_title', 'vote_count', 'vote_average', 'weighted_average', 'popularity']
movie_sorted_ranking = movies_new_df.sort_values(by='weighted_average', ascending=False)
movie_sorted_ranking.loc[:, ranking_columns].head(20)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the ten movies with the highest weighted rating.
weight_average = movie_sorted_ranking.sort_values(by='weighted_average', ascending=False)
top_rated = weight_average.head(10)

plt.figure(figsize=(12, 6))
axis1 = sns.barplot(data=top_rated, x='original_title', y='weighted_average')

# Zoom the y-axis to the 6-10 band so differences between bars are visible.
axis1.set_ylim(6, 10)
axis1.set_title('Best Movies by average votes', weight='bold')
axis1.set_ylabel('Weighted Average Score', weight='bold')
axis1.set_xlabel('Movie Title', weight='bold')
plt.savefig('best_movies.png')

In [None]:
# Horizontal bar chart of the ten movies with the highest `popularity` value.
popularity = movie_sorted_ranking.sort_values(by='popularity', ascending=False)
most_popular = popularity.head(10)

plt.figure(figsize=(12, 6))
ax = sns.barplot(data=most_popular, x='popularity', y='original_title')

ax.set_title('Most Popular by Votes', weight='bold')
ax.set_xlabel('Score of Popularity', weight='bold')
ax.set_ylabel('Movie Title', weight='bold')

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale weighted_average and popularity to [0, 1] so they can be blended.
scaling = MinMaxScaler()
score_columns = ['weighted_average', 'popularity']
movie_scaled_df = scaling.fit_transform(movies_new_df[score_columns])

# fit_transform returns a bare ndarray; rebuild the frame with the SAME row
# index as movies_new_df so that any later label-aligned assignment back into
# movies_new_df lines rows up correctly even if that index is not 0..n-1.
movie_normalized_df = pd.DataFrame(
    movie_scaled_df,
    columns=score_columns,
    index=movies_new_df.index,
)
movie_normalized_df.head()

In [None]:
# Copy each scaled column into movies_new_df under its "normalized_" name
# (rows are matched by index label).
movies_new_df['normalized_weight_average'] = movie_normalized_df['weighted_average']
movies_new_df['normalized_popularity'] = movie_normalized_df['popularity']

In [None]:
# Check that the two normalized_* columns are now present.
movies_new_df.head()

In [None]:
# Blend the two normalized signals with equal (50/50) weight.
rating_part = movies_new_df['normalized_weight_average'] * 0.5
popularity_part = movies_new_df['normalized_popularity'] * 0.5
movies_new_df['score'] = rating_part + popularity_part

# Highest blended score first; show the top 20 with their two inputs.
movies_scored_df = movies_new_df.sort_values(by='score', ascending=False)
score_view_columns = ['original_title', 'normalized_weight_average', 'normalized_popularity', 'score']
movies_scored_df.loc[:, score_view_columns].head(20)

In [None]:
# Horizontal bar chart of the ten movies with the highest blended score.
scored_df = movies_new_df.sort_values(by='score', ascending=False)
top_scored = scored_df.head(10)

plt.figure(figsize=(16, 6))
ax = sns.barplot(data=top_scored, x='score', y='original_title', palette='deep')

ax.set_title('Best Rated & Most Popular Blend', weight='bold')
ax.set_xlabel('Score', weight='bold')
ax.set_ylabel('Movie Title', weight='bold')

plt.savefig('scored_movies.png')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word 1- to 3-grams of the overview text: accents stripped,
# English stop words removed, terms kept only if they occur in >= 3 documents.
tfidf_options = dict(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)
tfv = TfidfVectorizer(**tfidf_options)

# The vectorizer needs strings, so replace missing overviews with ''.
movies_new_df['overview'] = movies_new_df['overview'].fillna(value='')

In [None]:
# Learn the vocabulary from the overviews and encode them as a TF-IDF matrix
# (one row per movie).
overview_text = movies_new_df['overview']
tfv_matrix = tfv.fit_transform(overview_text)

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity between every pair of overviews.
# With the second argument omitted, the kernel is computed of the matrix with itself.
sig = sigmoid_kernel(tfv_matrix)

In [None]:
# Reverse mapping: original_title -> row index of movies_new_df.
# Series.drop_duplicates() compares the *values* (the row indices, which are
# already unique), so it never removed repeated titles. Deduplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# always yields a single scalar rather than a Series.
indices = pd.Series(movies_new_df.index, index=movies_new_df['original_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Inspect the title -> row-index lookup Series.
indices

In [None]:

def give_rec(title, sig=sig):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        An ``original_title`` value; it is looked up in the module-level
        ``indices`` Series to find the movie's row.
    sig : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` corresponds to
        row position ``i`` of ``movies_new_df``. Defaults to the module-level
        ``sig`` as it existed when this function was defined.

    Returns
    -------
    pandas.Series
        ``original_title`` of up to 10 other movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index corresponding to original_title
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series;
    # fall back to the first matching row so sig[idx] is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every movie by similarity to the query, highest first.
    # sorted() is stable, so tied scores keep their original row order.
    ranked = sorted(enumerate(sig[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie explicitly instead of assuming it is ranked
    # first: with tied scores (e.g. empty overviews) it may not be, and
    # slicing [1:11] would then return the query and drop a real match.
    movie_indices = [pos for pos, _score in ranked if pos != idx][:10]

    # Top 10 most similar movies
    return movies_new_df['original_title'].iloc[movie_indices]

In [None]:

# Try the content-based recommender on one title: Spy Kids.
give_rec(title='Spy Kids')