In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Netflix Data: Task 1 - What to watch on Netflix?

Last updated: July 13th, 2020

We create a recommendation engine to suggest similar shows on Netflix.
The similarity is calculated using the following features of the shows:
### **Features used:** 
We select a number of features based on importance.
* Description 
* Genre
* Rating
* Country of Release

### **Approach:**
Each feature uses a different approach for its similarity calculation. Finally, we combine the per-feature similarities into a single cumulative number that serves as the overall similarity score.
* Description - We filter the description by removing stopwords and punctuation. Then we use TF-IDF to find the weight (importance) of each word in the description. Cosine similarity is used to measure the similarity between the descriptions of two shows.
* Genre - Each show has a set of genre types that apply to its content. We match the genres of the two shows, and the similarity is calculated as the [intersection / union] of their genre sets.
* Rating - As some of the rating titles denote similar categories, we group the given ratings into appropriate clusters, e.g., Y, Y7, and Y7-FV are grouped into one cluster. Each rating has a distance from every other rating, i.e., Y is closer to PG than to R, which means that a person who watches Y-rated shows is more likely to watch Y or PG shows than R-rated shows. Thus, each cluster is strategically placed relative to the other clusters according to the likelihood of closeness.
* Country of Release - Country of release follows the same approach as that of Genre.

### **Final similarity score:**

We assign a weight to each metric: description has the highest weight of 0.4, while genre, rating and country have equal weights of 0.2 each.

Do the following:
1. 

## **Netflix Data EDA**

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


In [None]:
# Location of the Netflix titles CSV inside the Kaggle input directory.
path = "/kaggle/input/netflix-shows/netflix_titles.csv"

In [None]:
# Read the catalogue into a DataFrame and preview its first five rows.
data_df = pd.read_csv(filepath_or_buffer=path)
data_df.head(n=5)

### Distribution of shows over release year

In [None]:
# Histogram of how many shows were released in each year.
year_x = data_df['release_year'].values
# distplot returns a matplotlib Axes (not a Figure), so name it accordingly.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; sns.histplot is the
# documented replacement once the minimum seaborn version is known to provide it.
ax = sns.distplot(year_x, kde=False)
plt.xlabel("Year of Release")
plt.ylabel("Frequency")
plt.title("Distribution of Netflix Shows over Release Year")
# pyplot.show() takes no figure/axes argument; passing one is misread as another option
# (or rejected outright), so call it bare.
plt.show()

### Distribution of shows over type (Movie or TV)

In [None]:
# Count titles per type (Movie / TV Show) and draw the counts as a bar chart.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which is deprecated in recent pandas releases.
data_df['type'].value_counts().plot(kind="bar")
plt.xlabel("Type of Show")
plt.ylabel("Frequency")
plt.title("Distribution of Netflix Shows over Type of Show")
plt.show()

### Distribution of shows over Country

In [None]:
# Number of titles per distinct `country` value, most frequent first.
country = (
    data_df
    .groupby('country')
    .count()
    .sort_values(by='show_id', ascending=False)
)
country_top = country.head(10)

# Horizontal bar chart of the ten most frequent `country` values.
country_top['show_id'].plot.barh(figsize=(11, 15))
plt.title("Distribution of Netflix Shows over Country")
plt.xlabel('Frequency')
plt.ylabel('Country')
plt.show()

### Distribution of Shows over Genre

In [None]:
# Number of titles per distinct `listed_in` value, most frequent first.
genre = (
    data_df
    .groupby('listed_in')
    .count()
    .sort_values(by='show_id', ascending=False)
)
genre_top = genre.head(10)

# Horizontal bar chart of the ten most frequent `listed_in` values.
genre_top['show_id'].plot.barh(figsize=(11, 15))
plt.title("Distribution of Netflix Shows over Genre")
plt.xlabel('Frequency')
plt.ylabel('Genre')
plt.show()

## Content Based Filtering

A recommendation engine is a method for predicting what a user desires at a given moment in time. Recommendation engines can be divided into two sub-classes:

- Content-based Filtering
- Collaborative Filtering

#### Content-based Filtering

#### Collaborative Filtering

## CODE

### Recommendation by country

In [None]:
def clean_country(country_list):
    """Return the set of country names in `country_list`, stripped of surrounding whitespace.

    The input list is left untouched (the previous version overwrote its elements
    in place). Tokens that are empty after stripping -- produced by a leading or
    trailing comma in the raw field -- are dropped so that they cannot count as a
    "shared country" between two shows.

    Parameters
    ----------
    country_list : iterable of str
        Raw country tokens, typically the result of ``str.split(',')``.

    Returns
    -------
    set of str
        Unique, whitespace-trimmed, non-empty country names.
    """
    stripped = (name.strip() for name in country_list)
    return {name for name in stripped if name}

def find_country_score(movie_1, movie_2):
    """Jaccard similarity (intersection / union) of the two shows' country sets.

    Parameters
    ----------
    movie_1, movie_2 : mapping-like (e.g. a DataFrame row)
        Each must provide a ``'country'`` entry holding a comma-separated string.

    Returns
    -------
    float
        A value in [0, 1]. 0.0 is returned when either country value is not a
        string (a missing value is stored as NaN) or when no country name remains
        after cleaning.

    Raises
    ------
    KeyError
        If a row has no ``'country'`` entry. Unlike the previous blanket
        ``except Exception``, unexpected errors are no longer turned into 0.0.
    """
    raw_1 = movie_1['country']
    raw_2 = movie_2['country']

    # A missing country is NaN (a float) and cannot be split: similarity is undefined -> 0.
    if not isinstance(raw_1, str) or not isinstance(raw_2, str):
        return 0.0

    country_m1 = clean_country(raw_1.split(','))
    country_m2 = clean_country(raw_2.split(','))

    union = country_m1 | country_m2
    if not union:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return len(country_m1 & country_m2) / len(union)
    
# test country similarity

# Pairwise country similarity over the first `num` rows of the catalogue.
num = 50
test_data = data_df.head(num)
country_sim = [
    [find_country_score(row1, row2) for _, row2 in test_data.iterrows()]
    for _, row1 in test_data.iterrows()
]

In [None]:
# Show the pairwise country-similarity matrix as an annotated heatmap.
df_cm = pd.DataFrame(data=country_sim, index=range(num), columns=range(num))
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.4)  # scale up the axis-label text
sns.heatmap(
    df_cm,
    annot=True,
    annot_kws={"size": 5},  # keep the per-cell numbers small
    cmap="Blues",
)

plt.show()

### Methodology

The above heatmap shows the similarity score between movies based on the countries each movie was released in. To compute this score we use the intersection-over-union (Jaccard) approach. First, we find the intersection of the country lists of the two movies and the total number of unique countries across the two movies. We then divide the number of shared countries by the number of unique countries to get the similarity score.

About 8% of the values in our dataset are missing (NaN). When a value is NaN we cannot compute a similarity, so we assume the score is 0.

### Recommendation by Ratings

In [None]:
# Coarse audience clusters for the raw rating labels. The cluster labels themselves
# ('PG', 'R', 'U', 'Y', 'NAN') also map to themselves, so re-running this cell on the
# already-clustered column is a no-op. Previously a second run turned every 'U' and
# 'Y' into 'NAN' because those labels fell through to the catch-all branch.
rating_clusters = {
    'TV-PG': 'PG', 'PG': 'PG', 'PG-13': 'PG', 'TV-14': 'PG',
    'TV-MA': 'R', 'NC-17': 'R', 'R': 'R',
    'NR': 'U', 'UR': 'U', 'TV-G': 'U', 'G': 'U',
    'TV-Y7-FV': 'Y', 'TV-Y7': 'Y', 'TV-Y': 'Y',
    'U': 'U', 'Y': 'Y', 'NAN': 'NAN',
}

# Build the replacement table for exactly the values present in the data;
# anything unrecognised (including a missing rating) goes to the 'NAN' cluster.
ratings = data_df['rating'].unique()
replace_rating = {rating: rating_clusters.get(rating, 'NAN') for rating in ratings}

# fillna is a safety net in case a missing rating is not matched by the lookup table.
data_df['rating'] = data_df['rating'].map(replace_rating).fillna('NAN')
data_df['rating']

In [None]:
def find_rating_score(movie_1, movie_2):
    """Score how suitable `movie_2`'s rating cluster is for a viewer of `movie_1`.

    Both rows are expected to carry an already-clustered ``'rating'`` value, one of
    'U', 'Y', 'PG', 'R' or 'NAN'. The score is directional: e.g. a viewer of an
    R-rated show gets 0.5 for a PG show, while a viewer of a PG show gets 0.0 for
    an R show.

    Fix over the previous version: two shows in the same cluster (Y/Y, PG/PG, R/R)
    now score 1.0. They used to fall through to the final ``else`` and score 0.0,
    i.e. identical ratings were treated as completely dissimilar.

    Parameters
    ----------
    movie_1 : mapping-like
        The reference show; must provide ``'rating'``.
    movie_2 : mapping-like
        The candidate show; must provide ``'rating'``.

    Returns
    -------
    float
        Score in [0, 1]; 0.0 for any combination involving an unknown cluster
        (except that a 'U' reference show scores 1.0 against everything, as before).
    """
    rating_1 = movie_1['rating']
    rating_2 = movie_2['rating']

    # A 'U' reference show is treated as compatible with every candidate.
    if rating_1 == 'U':
        return 1.0

    # affinity[reference cluster][candidate cluster]
    affinity = {
        'Y':  {'Y': 1.0, 'U': 1.0, 'PG': 0.25, 'R': 0.0},
        'PG': {'PG': 1.0, 'U': 1.0, 'Y': 0.75, 'R': 0.0},
        'R':  {'R': 1.0, 'U': 1.0, 'PG': 0.5, 'Y': 0.25},
    }
    # Unknown reference or candidate cluster -> no evidence of similarity.
    return affinity.get(rating_1, {}).get(rating_2, 0.0)

# Pairwise rating similarity over the first `num` rows of the catalogue.
num = 50
test_data = data_df.head(num)
rating_sim = [
    [find_rating_score(row1, row2) for _, row2 in test_data.iterrows()]
    for _, row1 in test_data.iterrows()
]

In [None]:
# Show the pairwise rating-similarity matrix as an annotated heatmap.
df_cm = pd.DataFrame(data=rating_sim, index=range(num), columns=range(num))
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.4)  # scale up the axis-label text
sns.heatmap(
    df_cm,
    annot=True,
    annot_kws={"size": 5},  # keep the per-cell numbers small
    cmap="Blues",
)

plt.show()

### Recommendation by Genre

In [None]:
# Collect every individual genre label that appears in the comma-separated
# `listed_in` column (duplicates are kept in `list_genre`).
genre = data_df['listed_in'].values
list_genre = [label.strip() for entry in genre for label in entry.split(',')]

# Display the distinct labels in sorted order: a bare list(set(...)) comes out in a
# different order on every kernel restart, which makes the output irreproducible.
sorted(set(list_genre))

In [None]:
def clean_genre(genre_list):
    """Return the set of genre labels in `genre_list`, stripped of surrounding whitespace.

    The input list is left untouched (the previous version overwrote its elements
    in place). Tokens that are empty after stripping -- produced by a leading or
    trailing comma in the raw field -- are dropped so that they cannot count as a
    "shared genre" between two shows.

    Parameters
    ----------
    genre_list : iterable of str
        Raw genre tokens, typically the result of ``str.split(',')``.

    Returns
    -------
    set of str
        Unique, whitespace-trimmed, non-empty genre labels.
    """
    stripped = (label.strip() for label in genre_list)
    return {label for label in stripped if label}

def find_genre_score(movie_1, movie_2):
    """Jaccard similarity (intersection / union) of the two shows' genre sets.

    Parameters
    ----------
    movie_1, movie_2 : mapping-like (e.g. a DataFrame row)
        Each must provide a ``'listed_in'`` entry holding a comma-separated string.

    Returns
    -------
    float
        A value in [0, 1]. 0.0 is returned when either genre value is not a string
        (e.g. a missing value stored as NaN) or when no genre label remains after
        cleaning.

    Raises
    ------
    KeyError
        If a row has no ``'listed_in'`` entry. Unlike the previous blanket
        ``except Exception``, unexpected errors are no longer turned into 0.0.
    """
    raw_1 = movie_1['listed_in']
    raw_2 = movie_2['listed_in']

    # A non-string value (such as NaN) cannot be split: similarity is undefined -> 0.
    if not isinstance(raw_1, str) or not isinstance(raw_2, str):
        return 0.0

    genre_m1 = clean_genre(raw_1.split(','))
    genre_m2 = clean_genre(raw_2.split(','))

    union = genre_m1 | genre_m2
    if not union:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return len(genre_m1 & genre_m2) / len(union)
    
# test genre similarity: pairwise scores over the first `num` rows of the catalogue

num = 50
test_data = data_df.head(num)
genre_sim = [
    [find_genre_score(row1, row2) for _, row2 in test_data.iterrows()]
    for _, row1 in test_data.iterrows()
]

In [None]:
# Show the pairwise genre-similarity matrix as an annotated heatmap.
df_cm = pd.DataFrame(data=genre_sim, index=range(num), columns=range(num))
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.4)  # scale up the axis-label text
sns.heatmap(
    df_cm,
    annot=True,
    annot_kws={"size": 5},  # keep the per-cell numbers small
    cmap="Blues",
)

plt.show()

### Recommendations by Description

In [None]:
# "stopwords" are the words that appear very frequently in a language and have very low importance in determining the context of the sentence. It is a good practice to remove these stopwords before we start processing the description for TF-IDF.
#  define all the stop words
# English stop words, kept in their original order; .split() turns the
# whitespace-separated text below into a plain list of strings.
stopwords = (
    # personal, possessive and reflexive pronouns
    "i me my myself we our ours ourselves "
    "you your yours yourself yourselves "
    "he him his himself she her hers herself "
    "it its itself they them their theirs themselves "
    # interrogatives and demonstratives
    "what which who whom this that these those "
    # forms of "be", "have" and "do"
    "am is are was were be been being "
    "have has had having do does did doing "
    # articles and conjunctions
    "a an the and but if or because as until while "
    # prepositions
    "of at by for with about against between into through "
    "during before after above below to from up down "
    "in out on off over under "
    # adverbs
    "again further then once here there when where why how "
    # quantifiers and qualifiers
    "all any both each few more most other some such "
    "no nor not only own same so than too very "
    # contraction fragments and common auxiliaries
    "s t can will just don should now"
).split()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# extract all the descriptions; a missing description becomes an empty document
# instead of making the vectorizer fail on NaN
descriptions = data_df['description'].fillna('').values
des = list(descriptions)

# create the vectorizer with the given stopwords
vectorizer = TfidfVectorizer(stop_words = stopwords)
X = vectorizer.fit_transform(des)
# toarray() yields a regular ndarray; todense() returns the legacy np.matrix type,
# which NumPy discourages for new code. Row indexing keeps working either way.
# NOTE(review): the dense copy is (n_shows x vocabulary) floats and can be large;
# keeping X sparse would need a matching change in find_description_score.
XX = X.toarray()
print(XX.shape)
## XX is the matrix which stores TFIDF scores for each description

In [None]:
# function to calculate similarity score using TFIDF on description

def find_description_score(movie_1, movie_2):

    '''
    Cosine similarity between the TF-IDF vectors of two shows' descriptions.

    INPUT: movie_1 : row of features of show1, the show given by the user &
           movie_2 : row of features of show2, the show we are comparing show1 with
    OUTPUT: Similarity score between the descriptions of show1 and show2
            (0.0 when either TF-IDF vector is all zeros, where cosine is undefined)
    RAISES: IndexError if a show's title cannot be found in data_df
    '''

    def tfidf_row(movie):
        # Rows obtained through iterrows()/iloc carry their own index label in `.name`.
        # Using it avoids a full scan of the title column on every call and picks the
        # exact row even when several shows share a title.
        # Assumes data_df has the default 0..n-1 index, so a label is also the row
        # number in XX (the title-scan lookup below relies on the same assumption).
        label = getattr(movie, 'name', None)
        if (isinstance(label, (int, np.integer))
                and 0 <= label < len(data_df)
                and data_df['title'].iat[label] == movie['title']):
            return int(label)

        # Fallback for inputs without a usable label (e.g. a plain dict): first title match.
        matches = data_df.index[data_df['title'] == movie['title']]
        if len(matches) == 0:
            raise IndexError("title not found in data_df: %r" % (movie['title'],))
        return matches[0]

    # ravel() flattens the row whether XX is an ndarray or an np.matrix
    a = np.asarray(XX[tfidf_row(movie_1)]).ravel()
    b = np.asarray(XX[tfidf_row(movie_2)]).ravel()

    # A description made up only of stop words has an all-zero vector; cosine
    # similarity is undefined there, so report "no similarity" rather than NaN.
    if not a.any() or not b.any():
        return 0.0

    similarity_score = 1-spatial.distance.cosine(a, b)

    return similarity_score


### Combining the Recommendations

In [None]:
# Seed show for the demo: the catalogue row at position 17.
seed_position = 17
movie_1 = data_df.iloc[seed_position]
print(movie_1)

def find_recommendations(data_df, movie_1, w_genre=0.2, w_rating=0.2,
                         w_country=0.2, w_description=0.4):
    """Score every show in `data_df` by its overall similarity to `movie_1`.

    The overall score is the weighted sum of the genre, rating, country and
    description similarities.

    Parameters
    ----------
    data_df : pandas.DataFrame
        The catalogue; a ``'score'`` column is added (or overwritten) in place.
    movie_1 : row of the catalogue
        The reference show.
    w_genre, w_rating, w_country, w_description : float, optional
        Weights of the four similarity components. The defaults reproduce the
        original hard-coded 0.2 / 0.2 / 0.2 / 0.4 split.

    Returns
    -------
    pandas.DataFrame
        The same `data_df` object, now carrying the ``'score'`` column.
    """
    rec_score = []
    # total= lets tqdm show a real progress bar; iterrows() has no length of its own.
    for _, movie_2 in tqdm(data_df.iterrows(), total=len(data_df)):
        rec_country = find_country_score(movie_1, movie_2)
        rec_genre = find_genre_score(movie_1, movie_2)
        rec_rating = find_rating_score(movie_1, movie_2)
        rec_description = find_description_score(movie_1, movie_2)

        score = (w_genre*rec_genre + w_rating*rec_rating
                 + w_country*rec_country + w_description*rec_description)
        rec_score.append(score)

    # Assign the plain list so scores are matched to rows by position. Wrapping it in a
    # DataFrame aligns on index labels, which gives NaN scores whenever data_df's index
    # is not exactly 0..n-1.
    data_df['score'] = rec_score
    return data_df

# Score the whole catalogue against the seed show.
data_df = find_recommendations(data_df, movie_1)

# The seed show matches itself better than anything else, so leave it out of its
# own recommendation list before showing the ten best-scoring shows.
(
    data_df
    .drop(index=movie_1.name, errors='ignore')
    .sort_values(by=['score'], ascending=False)
    .head(10)
)