In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the user ratings sample and the movie metadata table.
ratings = pd.read_csv("/kaggle/input/the-movies-dataset/ratings_small.csv")
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-dtype columns (and the
# DtypeWarning that comes with them) when a column holds inconsistent values.
movies = pd.read_csv(
    "/kaggle/input/the-movies-dataset/movies_metadata.csv",
    low_memory=False,
)
ratings.head()

In [None]:
ratings.isnull().sum()

In [None]:
movies.head()

In [None]:
movies.isnull().sum()

In [None]:
movies=movies.drop_duplicates()

In [None]:
movies.info()

# prepare movies df

In [None]:
ratings=ratings.rename(columns={"movieId":"id"})

In [None]:
# Keep only the columns used by the rest of the notebook.
movie_columns = [
    "id", "original_title", "genres", "overview", "popularity",
    "production_companies", "production_countries", "adult", "runtime",
    "vote_average", "vote_count",
]
# .copy() yields an independent frame, so the column assignments in the
# following cells write to it directly instead of raising pandas'
# SettingWithCopyWarning on a derived selection.
movies = movies[movie_columns].copy()
movies.head()

In [None]:
movies["genres"].unique()

In [None]:
movies[["production_companies","production_countries"]]

In [None]:
import ast
def parse_names(x,t):
    """Extract one identifier from a stringified list of production records.

    Parameters
    ----------
    x : str
        Python-literal text of a list of dicts,
        e.g. "[{'name': 'Pixar', 'id': 3}]".
    t : str
        "companies" selects the first record's 'id'; any other value selects
        the first record's 'iso_3166_1' code.

    Returns
    -------
    The selected field of the first record, or -2 when `x` cannot be parsed,
    holds no records, or the first record lacks the field.
    """
    field = "id" if t == "companies" else "iso_3166_1"
    try:
        records = ast.literal_eval(x)
        return records[0][field]
    # Only the failures that mean "no usable value" map to the -2 sentinel:
    # unparsable / non-string input (ValueError, SyntaxError), a parsed value
    # that is not subscriptable (TypeError), an empty list (IndexError) or a
    # record without the field (KeyError). Anything else is a real bug and
    # should surface instead of being swallowed.
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        return -2

movies["production_companies"]=movies["production_companies"].apply(lambda x :parse_names(x,t="companies"))
movies["production_countries"]=movies["production_countries"].apply(lambda x :parse_names(x ,t="countries"))

In [None]:
movies["adult"]=movies["adult"].apply(lambda x : 1 if x=="True" else 0)

In [None]:
def parse_genres(x):
    """Turn a stringified list of genre records into a list of genre names.

    Parameters
    ----------
    x : str
        Python-literal text of a list of dicts that each carry a 'name' key,
        e.g. "[{'id': 18, 'name': 'Drama'}]".

    Returns
    -------
    list of str, or the string "no genre"
        The 'name' of every record, in order. The literal string "no genre"
        is returned when `x` cannot be parsed or is not a list of such
        records (note: a string, not a list).
    """
    try:
        records = ast.literal_eval(x)
        return [record["name"] for record in records]
    # ValueError/SyntaxError: unparsable or non-string input (e.g. NaN);
    # TypeError: parsed value is not an iterable of mappings;
    # KeyError: a record has no 'name'. Other exceptions are not swallowed.
    except (ValueError, SyntaxError, TypeError, KeyError):
        return "no genre"
movies["genres"]=movies["genres"].apply(parse_genres)

In [None]:
genres=list(set([j for i in movies.genres for j in i]))

In [None]:
genres=["Romance",'Adventure',
         'Comedy','Music',
       'Mystery',
     'Fantasy',
     'Thriller',
     'Crime',
      'Horror',
 'Family',
    'History',
 'Foreign',
 'Western',
 'Science Fiction',  'Documentary','Animation',
 'Drama', 'TV Movie',
 'War',
 'Action' ]

In [None]:
def parse_genres(g):
    """Add a 0/1 indicator column named `g` to the global `movies` frame.

    A row gets 1 when `g in` its "genres" value is true, otherwise 0.
    This definition reuses the name of the genre-string parser defined
    earlier in the notebook and replaces it from this cell onward.
    """
    indicator = []
    for listed in movies["genres"]:
        if g in listed:
            indicator.append(1)
        else:
            indicator.append(0)
    movies[g] = indicator


In [None]:
# OHE genres
for genre in genres:
    parse_genres(genre)
        

In [None]:
# Label-encode production_countries: each distinct value is replaced by its
# position in `countries` (order of first appearance).
countries=list(movies["production_countries"].unique())
# A dict lookup gives the same codes as countries.index(x) without scanning the
# list once per row.
country_codes={country: code for code, country in enumerate(countries)}
movies["production_countries"]=movies["production_countries"].map(country_codes)

In [None]:
movies.head()

In [None]:
movies.drop("genres",axis=1,inplace=True)

In [None]:
import numpy as np
def parse_id(x):
    """Convert a raw id value to int, or NaN when it is not a valid integer.

    Parameters
    ----------
    x : object
        Raw value from the "id" column.

    Returns
    -------
    int or float
        `int(x)` when the conversion succeeds, otherwise `np.nan` so the row
        can be removed with `dropna`.
    """
    try:
        return int(x)
    # ValueError: non-numeric text or NaN; TypeError: None / unsupported type;
    # OverflowError: infinite float. Other exceptions are left to propagate.
    except (ValueError, TypeError, OverflowError):
        return np.nan
movies["id"]=movies["id"].apply(parse_id)

In [None]:
movies.dropna(axis=0,inplace=True)

In [None]:
movies["popularity"]=movies["popularity"].astype(np.float64)

In [None]:
movies["id"]=movies["id"].astype(int)

In [None]:
movies.info()

# calc weighted rating

weighted rating: $WR = \dfrac{v \cdot R + m \cdot C}{v + m}$

* v is the number of votes for the movie;
* m is the minimum number of votes required to be listed in the chart (here, the 90th percentile of `vote_count`);
* R is the average rating of the movie; and
* C is the mean vote across the whole dataset.

In [None]:
df=movies.copy()
C= df['vote_average'].mean()
m= df['vote_count'].quantile(0.9)

In [None]:
C,m

In [None]:
df["weighted_rating"]=(df["vote_count"]*df["vote_average"]+m*C)/(df["vote_count"]+m)

In [None]:
df.head()

In [None]:
df=df.dropna(axis=0)

In [None]:
df.sort_values(by="weighted_rating",ascending=False).head(5)

In [None]:
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
def plot_barh(x,y):
    """Plot a horizontal bar chart of the ten rows of the global `df` with the largest `x`.

    Parameters
    ----------
    x : str
        Column of `df` used both for ranking and as the bar length.
    y : str
        Column of `df` used for the bar labels.
    """
    # Rank once and reuse the result for both the labels and the bar lengths.
    top_rows = df.sort_values(by=x,ascending=False).head(10)

    plt.figure(figsize=(12,6))
    plt.barh(top_rows[y],top_rows[x])
    plt.xlabel(x)
    plt.ylabel(y)
    
plot_barh("weighted_rating","original_title")

In [None]:
plot_barh("popularity","original_title")

In [None]:
plot_barh("vote_count","original_title")

# content-based similarity

In [None]:
# We have to shrink df before computing the pairwise cosine similarity, or the
# kernel runs out of memory and restarts. The fixed random_state makes the
# sample, and every result derived from it below, reproducible across runs.
df=df.sample(frac=.5, random_state=42)

In [None]:
df.head()

In [None]:
df=df.drop_duplicates(subset=['id'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english'
                       ,ngram_range=(1,1))
df['overview'] = df['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(df['overview'])
tfidf_matrix.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on installations that predate the new one.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
list(feature_names[:12])

In [None]:
df["overview"].iloc[0]

In [None]:
tfidf.inverse_transform(tfidf_matrix[0]) # first movie overview

In [None]:
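# Import linear_kernel
# TfidfVectorizer L2-normalises each row by default, so the plain dot product of two rows
# already equals their cosine similarity; we therefore use sklearn's linear_kernel()
# instead of cosine_similarity() since it is faster.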
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
sim_df=pd.DataFrame(columns=df["original_title"],index=df["original_title"],data=cosine_sim)
sim_df.head()

# get similarity

In [None]:
import random
def get_sim(title,sim_df,how="content"):
    """Rank every row of a similarity table against one title.

    Parameters
    ----------
    title : str
        Column label of `sim_df` to rank against.
    sim_df : pandas.DataFrame
        Similarity table whose row index holds movie titles and whose columns
        are labelled by title.
    how : str, default "content"
        Suffix used in the name of the score column.

    Returns
    -------
    pandas.DataFrame
        Two columns, "titles" and "similarity {how}", ordered from most to
        least similar, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If `title` is not a column of `sim_df`.
    """
    scores = sim_df[title]
    # Several movies can share a title; the label then selects a DataFrame
    # (and sorting the table by that label would raise). Rank against the
    # first movie carrying the title.
    if isinstance(scores, pd.DataFrame):
        scores = scores.iloc[:, 0]
    # mergesort is stable, so rows with equal scores keep the table's order.
    ranked = scores.sort_values(ascending=False, kind="mergesort")
    return pd.DataFrame({
        "titles": list(ranked.index),
        "similarity {}".format(how): ranked.values,
    })
get_sim(random.choice(sim_df.columns),sim_df)

# collaborative filtering

In [None]:
ratings.head()

In [None]:
ratings=ratings.merge(df,on="id")[["userId","id","rating","original_title"]]

In [None]:
ratings

In [None]:
table=ratings.pivot_table(index="original_title",columns="userId",values="rating").fillna(0)
table

In [None]:
# a very sparse matrix , just to check if there are actually non-zero values
import seaborn as sns
sns.heatmap(table)

In [None]:
from scipy.sparse import csr_matrix

movie_features_df_matrix=csr_matrix(table.values)
movie_features_df_matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
csim_collab=cosine_similarity(movie_features_df_matrix,movie_features_df_matrix)

In [None]:
csim_collab.shape

In [None]:
collab_csim_df=pd.DataFrame(index=table.index,columns=table.index,data=csim_collab)
collab_csim_df

In [None]:
get_sim("12 Angry Men",collab_csim_df)

# using movie features

In [None]:
movies.head()

In [None]:
movies_ft=movies.drop(columns=["overview","id","original_title"])
movies_ft.head()

## normalize features

In [None]:
from sklearn.preprocessing import MinMaxScaler
ftscaler=MinMaxScaler()

movies_ft[movies_ft.columns]=ftscaler.fit_transform(movies_ft)
movies_ft.head()

In [None]:
movies_ft.index=movies["original_title"]

In [None]:
movies_ft=movies_ft.T

In [None]:
def similar_genre(title, n_candidates=5000):
    """Correlate one movie's feature vector with those of the leading candidates.

    Reads the global `movies_ft`, whose columns are labelled by movie title.

    Parameters
    ----------
    title : str
        Column label of `movies_ft` identifying the reference movie.
    n_candidates : int, default 5000
        Number of leading columns of `movies_ft` to compare against.

    Returns
    -------
    pandas.Series
        Correlation of each candidate column with the reference movie,
        indexed by column label and sorted in descending order.

    Raises
    ------
    KeyError
        If `title` is not a column of `movies_ft`.
    """
    reference = movies_ft[title]
    # A title shared by several movies selects a DataFrame; corrwith would then
    # align columns by label instead of comparing every candidate against one
    # vector. Use the first movie carrying the title.
    if isinstance(reference, pd.DataFrame):
        reference = reference.iloc[:, 0]
    candidates = movies_ft.iloc[:, :n_candidates]
    return candidates.corrwith(reference).sort_values(ascending=False)
similar_genre("2010")

In [None]:
def all_recom(titles, top_n=20):
    """List the top suggestions for one title from each recommender, side by side.

    Parameters
    ----------
    titles : str
        Title of the reference movie.
    top_n : int, default 20
        Maximum number of suggestions taken from each recommender.

    Returns
    -------
    pandas.DataFrame
        Columns "by genre", "by content" and "by other users", each holding
        movie titles in ranked order. A column is padded with NaN when its
        recommender yields fewer than the longest column.
    """
    def _top_titles(similarity_table):
        # Titles ranked by get_sim; empty when the title is absent from the
        # table (KeyError) or its label is ambiguous there (ValueError).
        try:
            ranked = get_sim(titles, similarity_table)
        except (KeyError, ValueError):
            return []
        # The movie names live in the "titles" column; the frame's own index
        # is just 0..n-1.
        return list(ranked["titles"].iloc[:top_n])

    suggestions = {
        "by genre": list(similar_genre(titles).index[:top_n]),
        "by content": _top_titles(sim_df),
        "by other users": _top_titles(collab_csim_df),
    }
    # Wrapping each list in a Series lets columns of different lengths coexist.
    return pd.DataFrame(
        {label: pd.Series(names, dtype="object") for label, names in suggestions.items()}
    )
all_recom("2010")

# Hybrid multi titles recommender

In [None]:
def hybrid_recomm_single(title):
    """Score candidate movies for one title by blending three similarity signals.

    Combines the feature correlation from `similar_genre` with the content
    and collaborative similarities looked up via `get_sim` in the global
    `sim_df` and `collab_csim_df` tables.

    Parameters
    ----------
    title : str
        Title of the reference movie.

    Returns
    -------
    pandas.DataFrame
        Up to 20 rows with columns "titles" and "overall_weight", excluding
        `title` itself. Rows keep the order produced by `similar_genre`.
    """
    genre_scores = similar_genre(title)
    hybrid_df = pd.DataFrame({
        "titles": list(genre_scores.index),
        "by genre": genre_scores.values,
    })

    for table, how in ((sim_df, "content"), (collab_csim_df, "collaborative")):
        try:
            # Inner merge: only movies present in this table are kept.
            hybrid_df = hybrid_df.merge(get_sim(title, table, how), on="titles")
        # The title is missing from the table (KeyError) or its label is
        # ambiguous there (ValueError): score this signal as 0 for every
        # candidate rather than dropping the title altogether.
        except (KeyError, ValueError):
            hybrid_df["similarity {}".format(how)] = 0

    hybrid_df["overall_weight"] = (
        0.25 * hybrid_df["by genre"]
        + 0.5 * hybrid_df["similarity collaborative"]
        + 0.7 * hybrid_df["similarity content"]
    )
    # NOTE(review): candidates are intentionally left in similar_genre order,
    # not re-sorted by overall_weight — confirm this is the desired ranking.
    hybrid_df = hybrid_df.drop_duplicates(subset="titles", keep="first")
    # Remove the reference movie by name instead of assuming it is row 0; it
    # is not when the title lies outside similar_genre's candidate columns.
    hybrid_df = hybrid_df[hybrid_df["titles"] != title]
    return hybrid_df.iloc[:20][["titles", "overall_weight"]]
hybrid_recomm_single("Stuart Little")

In [None]:
def hybrid_recomm_multi(titles):
    """Recommend movies for several titles by summing their per-title scores.

    Parameters
    ----------
    titles : iterable of str
        Titles to base the recommendation on.

    Returns
    -------
    pandas.Series
        Up to 20 entries indexed by "titles", holding the summed
        "overall_weight" across all input titles, largest first.
    """
    per_title = [hybrid_recomm_single(title) for title in titles]
    if not per_title:
        # pd.concat rejects an empty list; return an empty result instead.
        return pd.Series(
            dtype="float64",
            name="overall_weight",
            index=pd.Index([], name="titles"),
        )
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    combined = pd.concat(per_title, ignore_index=True)
    totals = combined.groupby("titles")["overall_weight"].sum()
    return totals.sort_values(ascending=False).head(20)
hybrid_recomm_multi(["Shrek","2010","Minions","Alien","Monsters, Inc."])