In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import hstack

In [None]:
df = pd.read_csv("../input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv")

In [None]:
df.head()

In [None]:
df=df.iloc[:,1:]

In [None]:
df.shape

In [None]:
df.dtypes

# Dropping ID as it is of no significance

In [None]:
# Remove the ID column; raises KeyError if the column is not present.
df = df.drop(columns=['ID'])

In [None]:
# Splitting the variables into Numerical and Textual columns

num = df.select_dtypes(include=['float64',"int64"])
num.head()

In [None]:
char = df.select_dtypes(include= 'object')
char.head()

In [None]:
# Missing Value check

num.isnull().mean()

In [None]:
char.isnull().mean()

In [None]:
# Missing Value Imputation

num = num.loc[:,num.isnull().mean() <= .25]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
num1 =pd.DataFrame(imputer.fit_transform(num),index=df.index,columns=num.columns)
num1.isnull().mean()

In [None]:
char = char.loc[:,char.isnull().mean() <= .25]
imputer=SimpleImputer(missing_values=np.nan, strategy='most_frequent') 
char1=pd.DataFrame(imputer.fit_transform(char),index=char.index,columns=char.columns)
char1.isna().mean()

In [None]:
# Converting "IMDb" and "Rotten Tomatoes" into numerical variables.
# The leading number is extracted with a regex instead of a fixed-width slice:
# slicing the first two characters turns a three-digit score into 10 and leaves a
# separator character in a one-digit score, which then breaks the integer cast.
# Values without any leading number become NaN; the int64 cast below then raises,
# so malformed input still fails loudly.
char1["IMDb"] = pd.to_numeric(
    char1["IMDb"].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False)
).astype('float')
char1["Rotten Tomatoes"] = pd.to_numeric(
    char1["Rotten Tomatoes"].astype(str).str.extract(r'(\d+)', expand=False)
).astype('int64')

In [None]:
char2 = char1.select_dtypes(include=['float64',"int64"])
char = char1.select_dtypes(include= 'object')

In [None]:
# Merging "IMDb" and "Rotten Tomatoes" with numerical Variables

num = pd.concat([num1,char2],axis=1,join="inner")

## Distribution Plots

In [None]:
# Year
plt.figure(figsize=(20,5))
sns.distplot(num['Year'])
plt.show()

In [None]:
# IMDb
plt.figure(figsize=(20,5))
sns.distplot(num['IMDb'])
plt.show()

In [None]:
# Rotten Tomatoes
plt.figure(figsize=(20,5))
sns.distplot(num['Rotten Tomatoes'])
plt.show()

In [None]:
# Runtime
plt.figure(figsize=(20,5))
sns.distplot(num['Runtime'])
plt.show()

In [None]:
# Function to calculate the movie count platform wise

def movie_count(platfor, count=False):
    """Report how many titles are flagged for a platform in the notebook-level ``df``.

    Parameters
    ----------
    platfor : str
        Name of the platform column in ``df`` whose values are summed.
    count : bool, default False
        When False the total is printed and nothing is returned;
        otherwise the total is returned without printing.

    Returns
    -------
    The column sum when ``count`` is not False, else ``None``.
    """
    total = df[platfor].sum()
    if count != False:
        return total
    print('Platform {} Count: {}'.format(platfor, total))

In [None]:
# Print the title count of each platform, in the same order as before.
for platform in ('Netflix', 'Hulu', 'Prime Video', 'Disney+'):
    movie_count(platform)

In [None]:
# Pie chart showing each platform's share of the catalogue

# The labels and the wedge sizes are built from the same tuple so they cannot get out of
# step (previously the label order differed from the order of the counts, so the Netflix
# and Prime Video wedges were swapped).
plat = 'Netflix', 'Prime Video', 'Hulu', 'Disney+'
por = [movie_count(name, count=True) for name in plat]
fig1, ax1 = plt.subplots()
ax1.pie(por, labels=plat, autopct='%1.1f%%', explode=(0.05, 0.05, 0.05, 0.05), shadow=True, startangle=100)
plt.show()

In [None]:
# splitting Multiple Generes in a single column

gen = char['Genres'].str.split(',').apply(pd.Series, 1).stack()
gen.index = gen.index.droplevel(-1)
gen.name = 'Genres_spl'
df_genres = char.join(gen)

In [None]:
# plotting Generes

plt.figure(figsize=(15,5))
sns.countplot(x='Genres_spl', data=df_genres)
plt.xticks(rotation=90)
plt.show()

In [None]:
# splitting Multiple Countries in a single column

countr = char['Country'].str.split(',').apply(pd.Series, 1).stack()
countr.index = countr.index.droplevel(-1)
countr.name = 'Country_spl'
df_country = char.join(countr)

In [None]:
# plotting Countries displaying top 15
df_country['Country_spl'].value_counts()[:15].plot(kind='bar',figsize=(23,5))
plt.show()

In [None]:
# splitting Multiple Language in a single column

lang = char['Language'].str.split(',').apply(pd.Series, 1).stack()
lang.index = lang.index.droplevel(-1)
lang.name = 'Language_spl'
df_language = char.join(lang)

In [None]:
# plotting Language displaying top 10
df_language['Language_spl'].value_counts()[:10].plot(kind='bar',figsize=(20,5))
plt.show()

In [None]:
# Recombine the numeric and the text columns side by side, keeping only shared row labels

df1 = pd.concat(objs=[num, char], join="inner", axis="columns")

## IMDb Rating Distribution on Each Platform

In [None]:
df2 = pd.melt(df1, id_vars=["Title","Year","IMDb","Type","Runtime"], var_name="platform")
df2 = df2[df2.value==1]
df2.drop(columns=["value"],axis=1,inplace=True)

In [None]:
plot = sns.FacetGrid(df2, col = "platform")
plot.map(plt.hist, "IMDb")
plt.show()

## Runtime Per Platform Along with Age Group

In [None]:
# Converting "IMDb" and "Rotten Tomatoes" to numbers.
# The leading number of each rating is extracted and parsed, so the columns really become
# numeric. Missing or unparseable ratings stay NaN instead of turning into the strings
# "nan" / "na", and scores are no longer truncated to a fixed number of characters.
df["IMDb"] = pd.to_numeric(
    df["IMDb"].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors="coerce",
)
df["Rotten Tomatoes"] = pd.to_numeric(
    df["Rotten Tomatoes"].astype(str).str.extract(r'(\d+)', expand=False),
    errors="coerce",
)
df.head()

In [None]:
df3 = pd.melt(df, id_vars=["Title","Year","Age","IMDb","Rotten Tomatoes","Type","Runtime"], var_name="platform")
df3 = df3[df3.value==1]
df3.drop(columns=["value"],axis=1,inplace=True)

In [None]:
bar = sns.barplot(x="platform", y="Runtime",hue="Age", estimator=sum, data=df3)

## Recommender System

In [None]:
# Recommender input built from the numerical columns only, each rescaled to [0, 1]

scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
num_min_max = pd.DataFrame(scaler.fit_transform(num), columns=num.columns)
num_min_max.head()

In [None]:
# Compute the cosine similarity

cos_sim = cosine_similarity(num_min_max, num_min_max)

# Reverse mapping of indices and movie titles
indices = pd.Series(df1.index, index=df1['Title']).drop_duplicates()

In [None]:
indices.head()

In [None]:
def give_rec(title, sim = cos_sim):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the notebook-level ``indices`` Series.
    sim : 2-D array
        Pairwise similarity matrix; row ``i`` holds the similarities of the movie at
        position ``i``. Defaults to the ``cos_sim`` object that existed when this
        function was defined.

    Returns
    -------
    pandas.Series
        Up to 10 entries of ``df1['Title']``, most similar first. The queried movie
        itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return several entries;
    # use the first one instead of failing further down.
    if np.ndim(idx) > 0:
        idx = np.ravel(idx)[0]
    # NOTE(review): the value is used as a row *position* in `sim` and `df1`; this assumes
    # the values stored in `indices` coincide with positions (default RangeIndex) — confirm.
    idx = int(idx)

    scores = np.asarray(sim[idx], dtype=float).ravel()
    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind="stable")
    # Remove the queried movie explicitly rather than assuming it is ranked first
    # (another movie with an identical feature vector can tie with it).
    top = order[order != idx][:10]
    # Top 10 most similar movies
    return df1['Title'].iloc[top]

In [None]:
# Call give_rec() to get recommendations based on the numeric-feature similarity matrix
give_rec('Swades',sim = cos_sim)

In [None]:
#the function performs all the important preprocessing steps

def preprocess(df):
    """Build the combined text + numeric feature matrix used for similarity search.

    The text part is a TF-IDF encoding of all object-dtype columns except ``Title``,
    joined per row with commas. The numeric part consists of the min-max scaled
    ``IMDb``, ``Netflix``, ``Hulu``, ``Prime Video``, ``Disney+`` and ``Runtime`` columns,
    appended in that order after the TF-IDF columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns and the float64/int64 columns listed above.
        The frame is not modified.

    Returns
    -------
    scipy sparse matrix
        One row per row of ``df``; TF-IDF columns followed by the six scaled columns.
    """
    # Text columns: every object column except the title
    text_cols = [
        col for col in df.select_dtypes(include=['object']).columns if col != "Title"
    ]
    # Join all text columns with commas into one string per row. This is kept in a local
    # Series: writing it back into `df` would alter the caller's frame and make a second
    # call fold the previous `all_text` column into itself.
    all_text = df[text_cols].apply(lambda row: ','.join(row.dropna().astype(str)), axis=1)

    # Tokenizer that keeps alphabetic runs only, discarding symbols and numbers
    token = RegexpTokenizer(r'[a-zA-Z]+')

    # TF-IDF encoding of the combined text
    tfv = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1), tokenizer=token.tokenize)
    text_counts = tfv.fit_transform(all_text)

    # Selecting the numerical variables and scaling each to [0, 1]
    num_1 = df.select_dtypes(include=['float64', "int64"])
    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    num_min_max_1 = pd.DataFrame(scaler.fit_transform(num_1), columns=num_1.columns)

    # Append the scaled numeric columns to the TF-IDF matrix in a single hstack
    feature_cols = ["IMDb", "Netflix", "Hulu", "Prime Video", "Disney+", "Runtime"]
    X_train_dtm = hstack([text_counts, num_min_max_1[feature_cols].values])
    return X_train_dtm

In [None]:
matri = preprocess(df1)
matri.shape

In [None]:
cos_sim1 = cosine_similarity(matri, matri)

# Reverse mapping of indices and movie titles
indices = pd.Series(df1.index, index=df1['Title']).drop_duplicates()

In [None]:
give_rec("Swades",sim = cos_sim1)