In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd

In [None]:
df1 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df1.head()

In [None]:
df2 = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
df2.head()

In [None]:
print(df1.shape, df2.shape)

In [None]:
df1.columns

In [None]:
# Relabel the credits columns positionally so the key column is named 'id',
# which is the column the merge with df2 joins on.
# NOTE(review): assumes df1 has exactly four columns in this order — confirm against df1.columns.
df1.columns = ['id', 'title', 'cast', 'crew']
df1.head()

In [None]:
# Attach the credits columns (cast, crew) to the movies table through the shared 'id' key.
# NOTE(review): later cells read 'title_y', so both frames presumably carry a 'title'
# column that pandas suffixes as title_x / title_y — confirm.
# This cell reassigns df2, so re-running it without reloading df2 merges the credits a second time.
df2 = df2.merge(df1, on = 'id')
df2.shape

## Simple Recommendation

In [None]:
df2.columns

In [None]:
c = df2['vote_average'].mean()
c

In [None]:
m = df2['vote_count'].quantile(0.9)
m

In [None]:
# Keep only the movies whose vote_count reaches the cutoff m.
# Filter first and copy the (smaller) result instead of copying all of df2 and then
# filtering; the explicit copy keeps q_movies independent of df2, so new columns can
# be added to q_movies without affecting df2.
q_movies = df2.loc[df2['vote_count'] >= m].copy()
q_movies.shape

In [None]:
def weighted_rate(x, m=m, c=c):
    """Return the IMDB-style weighted rating for one movie row.

    The result is (v / (v + m)) * R + (m / (m + v)) * c, where v is the row's
    'vote_count' and R is its 'vote_average'. ``m`` and ``c`` default to the
    module-level values that exist when this function is defined.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_part = votes / (votes + m) * rating   # weight given to the movie's own average
    prior_part = m / (m + votes) * c          # weight given to the value c
    return own_part + prior_part

In [None]:
q_movies['score'] = q_movies.apply( weighted_rate, axis = 1)

In [None]:
q_movies = q_movies.sort_values('score', ascending = False)
q_movies['original_title'].head()

# Content-based recommender system

In [None]:
df2.columns

In [None]:
df2['homepage'][1]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  #turns text into a sparse matrix of TF-IDF weights (floats); can also drop stop words such as a, an, and, the, ...

In [None]:
tfid = TfidfVectorizer(stop_words = 'english', lowercase = True)    #lower case is true by default

In [None]:
df2['overview'] = df2['overview'].fillna('')

In [None]:
tf_matrix = tfid.fit_transform(df2['overview'])

In [None]:
tf_matrix

In [None]:
tf_matrix.shape

In [None]:
tf_matrix[0]

In [None]:
#cosine similarity, euclidean distance, or jaccard similarity can be used as the similarity measure
#sklearn's pairwise functions can be used to compute cosine similarity

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# linear_kernel computes the plain dot product between every pair of rows.
# TfidfVectorizer L2-normalises each row by default (norm='l2'), so these dot
# products are the cosine similarities between the rows of tf_matrix.
cos_sim = linear_kernel(tf_matrix,tf_matrix)

In [None]:
cos_sim[0]

In [None]:
cos_sim.shape  #to find relation of each movie with other

In [None]:
# Reverse lookup: movie title (title_y) -> row label of that movie in df2.
# Series.drop_duplicates() compares the *values* (the row labels, which are already
# unique), so it never removed repeated titles; a repeated title then made
# indices[title] return several rows instead of one number. De-duplicate on the
# index instead, keeping the first row for each title.
indices = pd.Series(df2.index, index = df2['title_y'])
indices = indices[~indices.index.duplicated(keep = 'first')]

In [None]:
indices.head()

In [None]:
def get_5_rec(title, cosine_sim = cos_sim, n = 5):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarities of the
        movie at row ``i`` of ``df2`` to every movie. Defaults to the TF-IDF
        based ``cos_sim``.
    n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` entries of ``df2['title_y']``, most similar first. The
        queried movie itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields several row numbers; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Pair each row number with its similarity to the queried movie, leaving out
    # the queried movie itself. The previous `[:6]` slice kept it, so the movie
    # was returned as its own top recommendation.
    sim_scs = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # Highest similarity first.
    sim_scs = sorted(sim_scs, key = lambda pair: pair[1], reverse = True)
    movie_idx = [pair[0] for pair in sim_scs[:n]]
    return df2['title_y'].iloc[movie_idx]

In [None]:
df2['title_y'].head(10)

In [None]:
get_5_rec("Pirates of the Caribbean: At World's End") #getting 5 similar recommendations

In [None]:
from sklearn.metrics import pairwise_distances

In [None]:
# pairwise_distances(metric='cosine') returns the cosine *distance* (1 - cosine similarity)
# between every pair of rows, so subtracting it from 1 gives the cosine similarity
# between every pair of movie overviews.
cosine_sim_pair_dist = 1 - pairwise_distances(tf_matrix, metric = 'cosine')  #how similar each movie is to every other movie

In [None]:
pd.DataFrame(cosine_sim_pair_dist)

## Multi-feature based recommender system

> ### features used: genres, keywords, crew, cast

In [None]:
from ast import literal_eval


In [None]:
features = ['cast', 'crew', 'keywords', 'genres']

In [None]:
# These columns are read from CSV as strings holding Python literals; turn them
# into real Python objects. Only string values are parsed, so re-running this cell
# after the columns were already converted is a no-op instead of raising
# ValueError from literal_eval on an already-parsed list.
for x in features:
    df2[x] = df2[x].apply(lambda value: literal_eval(value) if isinstance(value, str) else value)

In [None]:
df2['crew'][0]

In [None]:
df2['genres'][0]

In [None]:
df2['keywords'][0]

In [None]:
df2['cast'][0]

In [None]:
def get_inf_dir(x):
    """Return the 'name' of the first entry in ``x`` whose 'job' is 'Director'.

    ``x`` is an iterable of dicts that have 'job' and 'name' keys. Returns
    None when no entry has the job 'Director'.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    # The generator is lazy, so scanning stops at the first match.
    return next(directors, None)

In [None]:
def get_list(x):
    """Return at most the first five 'name' values from a list of dicts.

    Any input that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []
    # Collect every name first, then keep the leading five.
    return [entry['name'] for entry in x][:5]

In [None]:
df2['director'] = df2['crew'].apply(get_inf_dir)

In [None]:
df2['director'].head(200)

In [None]:
df2['director'].isnull().sum()

In [None]:
df2['director'].tail()

In [None]:
features = ['genres', 'keywords', 'cast'] 

In [None]:
for x in features:
    df2[x] = df2[x].apply(get_list)

In [None]:
df2['director'][0]

In [None]:
df2['keywords']

In [None]:
def clean_data(x):
    """Normalise a feature value to lowercase text without spaces.

    Removing the spaces turns a multi-word name into a single token, so two
    different names that share a word do not look related once the text is
    vectorised.

    Parameters
    ----------
    x : list of str, str, or anything else
        A list of names, a single name, or a missing value.

    Returns
    -------
    list of str or str
        A list input gives a list with every item cleaned; a string input
        gives the cleaned string; any other input gives ''.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        # Previously this branch returned the bound method `x.lower` (missing call
        # parentheses) and did not strip spaces like the list branch does.
        return x.replace(" ", "").lower()
    return ''

In [None]:
features = ['genres', 'keywords', 'cast', 'director']

In [None]:
for x in features:
    df2[x] = df2[x].apply(clean_data)

In [None]:
df2['keywords']

In [None]:
def create_soup(x):
    """Build the space-separated feature text ("soup") for one movie row.

    The soup is: keywords, cast, director (when available), genres.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'keywords', 'cast' and 'genres' as lists of strings.
        'director' is optional and is used only when it is a non-empty string.

    Returns
    -------
    str
        All features joined by single spaces. Without a usable director the
        result is exactly keywords + ' ' + cast + ' ' + genres.
    """
    director = x.get('director', '')
    # The director was extracted and cleaned but previously never added to the soup.
    # Anything that is not a non-empty string (missing value, None, ...) is skipped
    # so the concatenation below cannot fail.
    director_part = director + ' ' if isinstance(director, str) and director else ''
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + director_part + ' '.join(x['genres'])

In [None]:
df2['soup'] = df2.apply(create_soup, axis = 1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
df2['director']

In [None]:
count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(df2['soup'])

In [None]:
count_matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
def get_5_rec(title, cosine_sim = cosine_sim, n = 5):
    """Return the titles of the ``n`` movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarities of the
        movie at row ``i`` of ``df2`` to every movie. Defaults to the
        count-vector based ``cosine_sim``.
    n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` entries of ``df2['title_y']``, most similar first. The
        queried movie itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields several row numbers; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Pair each row number with its similarity to the queried movie, leaving out
    # the queried movie itself. The previous `[:6]` slice kept it, so the movie
    # was returned as its own top recommendation.
    sim_scs = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    # Highest similarity first.
    sim_scs = sorted(sim_scs, key = lambda pair: pair[1], reverse = True)
    movie_idx = [pair[0] for pair in sim_scs[:n]]
    return df2['title_y'].iloc[movie_idx]

In [None]:
get_5_rec("Pirates of the Caribbean: At World's End") #getting 5 similar recommendations