In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, entry) for entry in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import datetime
import missingno as msno

# 1. Data Preparation

In [None]:
# Read the Netflix titles CSV from the Kaggle input directory, print the
# column/dtype summary, then display the frame itself.
NETFLIX_TITLES_CSV = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(NETFLIX_TITLES_CSV)
df.info()
df

Convert the `date_added` column to dtype `datetime64`

In [None]:
# Parse `date_added` into datetime64 so the date parts can be extracted below.
# The raw strings are stripped first: stray leading/trailing whitespace makes
# pandas' format inference reject otherwise valid dates.
# NOTE(review): assumes the raw column holds strings (object dtype) — confirm.
# The dtype guard keeps this cell re-runnable once the column is already parsed.
if not pd.api.types.is_datetime64_any_dtype(df['date_added']):
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip())
df.info()

In [None]:
# Derive calendar features from `date_added` with the vectorized .dt accessor
# instead of a per-row apply; rows whose date is missing get NaN in every
# derived column.
added = df['date_added'].dt
df['Date'] = added.day
df['Day of Week'] = added.dayofweek
df['Month'] = added.month
df['Year'] = added.year
df

Check Missing Values

In [None]:
# Plot missingno's matrix view of `df` to inspect where values are missing;
# the trailing `;` suppresses the repr of the returned object.
msno.matrix(df);

In [None]:
# Plot missingno's bar view of `df` (per-column completeness);
# the trailing `;` suppresses the repr of the returned object.
msno.bar(df);

In [None]:
# Per-column missing-value summary: absolute count and percentage of rows,
# sorted so the most incomplete columns come first.
null_counts = df.isnull().sum()
miss = null_counts.rename('count')
percent_miss = (null_counts / df.shape[0] * 100).round(2).rename('percent')
pd.DataFrame([miss, percent_miss]).T.sort_values(by='percent', ascending=False)

Number of unique values in each column

In [None]:
# Count distinct values per column and list the (column, count) pairs from
# the lowest cardinality to the highest.
unique = {col: df[col].nunique() for col in df.columns}
uni = list(unique.values())
sorted(unique.items(), key=lambda item: item[1])

Replace Missing Values

In [None]:
# Fill missing directors with a placeholder. The result is assigned back to
# the column rather than calling fillna(inplace=True) on the attribute-accessed
# Series: that chained in-place form acts on a temporary under pandas
# copy-on-write and is deprecated, so `df` would silently stay unchanged.
df['director'] = df['director'].fillna('Unknown')
df['director']

In [None]:
# Show the rows whose `country` is missing.
df.loc[df['country'].isna()]

In [None]:
# Manually patch the country of the row labelled 38.
# NOTE(review): relies on a fixed row label in this dataset snapshot — confirm
# it still points at the intended title if the CSV is updated.
df.loc[38, 'country'] = 'Thailand'

# Columns that still contain NaN, excluding `date_added` (its missing rows are
# dropped rather than filled). Filtering here instead of calling list.remove()
# avoids a ValueError when `date_added` happens to have no missing values.
miss_col = [col for col in df.columns[df.isnull().any()] if col != 'date_added']
print('Missing columns:',miss_col)

In [None]:
# Drop rows without `date_added` first. Any column that is NaN only because
# the date is missing then has nothing left to fill, so it is not polluted
# with the 'Unknown' string placeholder.
df = df.dropna(subset=['date_added'])

# Fill the remaining gaps with a placeholder. The filled column is assigned
# back explicitly: `df[col].fillna(..., inplace=True)` is a chained in-place
# call that acts on a temporary under pandas copy-on-write.
for col in miss_col:
    df[col] = df[col].fillna('Unknown')

# check missing value
df.isnull().any()

# Analysis

## Recommendation System (content-based filtering)

In [None]:
# Build the 'text' column by joining the descriptive fields with single
# spaces, left to right.
combined = df['title']
for field in ['director', 'cast', 'rating', 'listed_in', 'description']:
    combined = combined + ' ' + df[field]
df['text'] = combined

# Keep one row per distinct text and renumber the rows 0..n-1.
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)

In [None]:
# Show one randomly chosen combined text as a one-element list.
df['text'].sample(n=1).tolist()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

# Extend the built-in English stop words with the placeholder used for missing
# values. It must be lower-case: the vectorizer lowercases text (default
# lowercase=True) before stop words are removed, so 'Unknown' would never match.
my_words = {'unknown'}

my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

# Pass the extended list to the vectorizer (previously it was built but the
# plain 'english' list was used, so the placeholder stayed in the vocabulary).
tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words=sorted(my_stop_words))

tfidf_term_vectors = tfidfvectorizer.fit_transform(df['text'])

print(tfidf_term_vectors.shape)

# Display the sparse matrix summary; converting documents x vocabulary to a
# dense array only for display can exhaust memory.
tfidf_term_vectors

In [None]:
# Import metrics
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances, manhattan_distances, cosine_similarity

# Pairwise scores between every pair of documents. With the second argument
# omitted, each function compares the matrix against itself.
linear_ke = linear_kernel(tfidf_term_vectors)
euclidean = euclidean_distances(tfidf_term_vectors)
manhattan = manhattan_distances(tfidf_term_vectors)
# Matches linear_kernel because TfidfVectorizer L2-normalizes rows by default.
cosine = cosine_similarity(tfidf_term_vectors)

print(linear_ke.shape)

linear_ke

In [None]:
# Map each title to its row position in `df`. De-duplicate on the title labels
# (keeping the first occurrence) so a lookup always yields a single position;
# Series.drop_duplicates() would only compare the values, which are the row
# positions and already unique.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def get_recommendations(title, metric, top_n=10, higher_is_better=None):
    """Return the titles most similar to `title` according to `metric`.

    Parameters
    ----------
    title : str
        Title to look up in the module-level `indices` Series.
    metric : 2-D array-like
        Square pairwise matrix whose rows/columns follow the row order of `df`.
        May hold similarities (larger = closer) or distances (smaller = closer).
    top_n : int, default 10
        Number of recommendations to return.
    higher_is_better : bool or None, default None
        True for a similarity matrix, False for a distance matrix. When None
        it is inferred: if the title's score against itself is the largest
        value in its row the matrix is treated as a similarity, otherwise as
        a distance.

    Returns
    -------
    pandas.Series
        Up to `top_n` titles from `df['title']`, best match first, never
        including the queried title's own row.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]
    # A duplicated title makes the lookup return a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise scores of every row against the requested one.
    row = np.asarray(metric[idx]).ravel()

    if higher_is_better is None:
        self_score, best = row[idx], row.max()
        higher_is_better = bool(self_score >= best or np.isclose(self_score, best))

    # Stable sort so ties keep their original row order; negate similarities
    # so the best match always comes first.
    order = np.argsort(-row if higher_is_better else row, kind='stable')

    # Exclude the queried row explicitly instead of assuming it sorts first.
    movie_indices = order[order != idx][:top_n]

    return df['title'].iloc[movie_indices]

In [None]:
# List every distinct title in the catalogue.
df['title'].unique().tolist()

In [None]:
# Print the recommendations for one title under each pairwise matrix built
# from the original text.
name = 'Avengers: Infinity War'
metric = [linear_ke, euclidean, manhattan]

for score_matrix in metric:
    print(get_recommendations(name, score_matrix), '\n')

## Clean data by lowercasing text and removing spaces

In [None]:
# Second text variant: strip every space character, then lowercase.
df['text2'] = df['text'].str.replace(" ", "", regex=False).str.lower()
df['text2']

In [None]:
# Refit the same vectorizer on the cleaned text and recompute the pairwise
# matrices; with the second argument omitted each function compares the
# matrix against itself.
tfidf_term_vectors2 = tfidfvectorizer.fit_transform(df['text2'])

linear_ke2 = linear_kernel(tfidf_term_vectors2)
euclidean2 = euclidean_distances(tfidf_term_vectors2)
manhattan2 = manhattan_distances(tfidf_term_vectors2)

In [None]:
# Print the recommendations for the same title under each pairwise matrix
# built from the cleaned text.
name = 'Avengers: Infinity War'
metric = [linear_ke2, euclidean2, manhattan2]

for score_matrix in metric:
    print(get_recommendations(name, score_matrix), '\n')

In [None]:
# Titles recommended by the linear kernel on both the original and the
# cleaned text.
set1 = set(get_recommendations('Avengers: Infinity War', linear_ke))
set2 = set(get_recommendations('Avengers: Infinity War', linear_ke2))

intersec = set1 & set2
intersec

In [None]:
# Titles recommended by the Euclidean matrix on both the original and the
# cleaned text.
set1 = set(get_recommendations('Avengers: Infinity War', euclidean))
set2 = set(get_recommendations('Avengers: Infinity War', euclidean2))

intersec = set1 & set2
intersec

In [None]:
# Titles recommended by the Manhattan matrix on both the original and the
# cleaned text.
set1 = set(get_recommendations('Avengers: Infinity War', manhattan))
set2 = set(get_recommendations('Avengers: Infinity War', manhattan2))

intersec = set1 & set2
intersec

Different metrics and different text-cleaning steps produce different recommendations.