# Machine Learning and Data Mining Case Study - Final Review
#### Submitted by,
### *R. SREE RANJANI* *(CB.EN.U4CSE18255)*
### *THAANVI SUDARSAN MEDA* *(CB.EN.U4CSE18262)*
### **Date of Submission**: 6/11/2021





## **Topic**
### ***Netflix Recommendation System***

## Exploratory Data Analysis(EDA)
### 1. Loading the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import io
# Read the Netflix titles CSV from the relative input path and preview the first rows.
netflix_df = pd.read_csv("../input/netflix-shows/netflix_titles.csv")
netflix_df.head()

### 2. Data Description

In [None]:
# Report the dataframe's dimensions; shape is (rows, columns), so it is
# unpacked straight into the two placeholders.
print("Netflix dataframe details:")
print("Data Dimensions- Total Rows: {}\nTotal Columns: {}".format(*netflix_df.shape))

In [None]:
# For every column, report whether it contains at least one missing value.
print("Checking on the Null values found in columns")
netflix_df.isna().any()

In [None]:
# Summary statistics of the dataframe (pandas' describe() with its default settings).
netflix_df.describe()

### 3. Data Preprocessing
##### The columns 'director', 'cast', 'country', 'date_added' and 'rating' contain null values, so we replace each null value with the mode (most frequent value) of its column.

In [None]:
# Impute missing values: the NaNs of every column are replaced with that
# column's mode (its most frequent value).
for column in netflix_df.columns:
    modes = netflix_df[column].mode()
    if modes.empty:
        # A column made up entirely of NaN has no mode; leave it untouched.
        continue
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selection netflix_df[column]: the in-place form works on an
    # intermediate object, which pandas warns about and which does not update
    # netflix_df at all when Copy-on-Write is enabled.
    netflix_df[column] = netflix_df[column].fillna(modes[0])
# Independent snapshot of the imputed data; the content-based recommender
# further down is built from it.
filledna = netflix_df.copy()
filledna.head()

In [None]:
# IPython shell escape (not plain Python): runs pip to install unidecode.
!pip install unidecode
import unidecode
# Work on a separate copy so the columns derived in the next cell leave
# netflix_df unchanged.
netflix_data = netflix_df.copy()
netflix_data.head()

# Inspect the raw 'date_added' values before the year is extracted from them.
netflix_data['date_added']

In [None]:
# Keep only the year of 'date_added': the last four characters of each value,
# converted to an integer.
netflix_data['date_added'] = netflix_data['date_added'].astype(str).str[-4:].astype('int')
# Age of each title relative to 2021, stored as text such as "2 years".
netflix_data['age_of_show'] = (2021 - netflix_data['date_added']).astype(str) + " years"

# Separate copy used by the visualisation cells.
netflix_vis = netflix_data.copy()

In [None]:
netflix_vis.head()

### 4. Data Visualization
##### We observe that the dataset contains more movies than TV shows

In [None]:
from numpy import dot
import seaborn as sns
from numpy import random
from numpy.linalg import norm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors  
# Bar chart of the number of titles for each value of the 'type' column.
sns.countplot(x = 'type', data = netflix_vis)

# Axis labels are applied to the axes that countplot just drew on.
plt.xlabel("Show Type")
plt.ylabel("Count Of Content")

##### We infer from the plots below that Netflix targets a mature audience, as most of its content is rated TV-MA (mature audience)

In [None]:
# Rating distribution for movies (left panel) and TV shows (right panel).
# 'rating' is a categorical label, so each rating gets its own bar through
# value_counts(); a fixed 10-bin histogram would merge neighbouring categories
# whenever more than ten distinct ratings are present.
plt.figure(figsize=(10,3))

plt.subplot(1,2,1)
movies_rating = netflix_vis[netflix_vis['type'] == "Movie"]
movies_rating['rating'].value_counts().plot(kind='bar', color='magenta')
plt.xlabel("Rating")
plt.title("Movie ratings frequency")
plt.xticks(rotation = 45)


plt.subplot(1,2,2)
show_rating = netflix_vis[netflix_vis['type'] == "TV Show"]
show_rating['rating'].value_counts().plot(kind='bar', color='green')
plt.xlabel("Rating")
plt.title("TV Show ratings frequency")
plt.xticks(rotation = 45)

plt.tight_layout(pad=0.4)

##### Content addition year wise

In [None]:
# Box plot of 'date_added' (reduced to the year in the preprocessing cell),
# with one box per content type.
sns.boxplot(x = 'type',y = 'date_added', data=netflix_vis, palette='rocket')
plt.ylabel("Date Added")
plt.xlabel("Show Type")

##### Largest content contributors country wise

In [None]:
# The ten most frequent values of the 'country' column, shown as a bar chart.
country = netflix_vis['country'].value_counts().iloc[:10]
country.plot.bar(title='No. of movies over the years contributed by different countries', color='red')

#### Movies v/s shows contributed by countries

In [None]:
# One small bar chart per top country, comparing its number of movies and TV
# shows; the panels are laid out on a 2 x 5 grid, each in its own colour.
plt.figure(figsize=(20,5))
cpal=['red','green','blue','magenta','purple','pink','yellow','brown','grey','orange']
for position, country_name in enumerate(country.index):
    plt.subplot(2,5,position+1)
    from_country = netflix_vis['country'] == country_name
    moviesvsshows = netflix_vis.loc[from_country, 'type'].value_counts()
    moviesvsshows.plot(kind='bar',color=cpal[position])
    plt.title("Movies vs Shows in {}".format(country_name))
    plt.xticks(rotation = 360)

plt.tight_layout(pad=0.5)

##### Age of shows in Netflix

In [None]:
# The ten most common 'age_of_show' values, shown as a bar chart.
age = netflix_vis['age_of_show'].value_counts().iloc[:10]
age.plot.bar(title='Number of movies over the years', color='violet')


##### Targeted genre of content

In [None]:
# The ten most common 'listed_in' (genre) values, shown as horizontal bars.
genre = netflix_vis["listed_in"].value_counts().iloc[:10]
genre.plot.barh(title='Genre of movies over the years', color='purple')

##### Content age of addition v/s release

In [None]:
# Number of titles per year added and per release year. Each pivot table is
# indexed by the year and has a single 'title' column holding the count.
added_year = pd.pivot_table(netflix_vis, values='title', index='date_added', aggfunc=np.count_nonzero)
release_year = pd.pivot_table(netflix_vis, values='title', index='release_year', aggfunc=np.count_nonzero)

plt.figure(figsize=(8,4))
# Both curves share one figure so the two year distributions can be compared.
sns.lineplot(x='date_added', y='title', data=added_year, label='Year Added')
sns.lineplot(x='release_year', y='title', data=release_year, label='Year Release')
plt.xlabel("Year")
plt.ylabel("Number of Content")

plt.legend(shadow=True)

##### Content duration 

In [None]:
# Ten most common 'duration' values for movies (left panel) and for TV shows
# (right panel).
plt.figure(figsize=(10,3))

plt.subplot(1,2,1)
movies_duration = netflix_vis[netflix_vis['type'] == "Movie"]
movies_duration = movies_duration['duration'].value_counts()[:10]
# NOTE(review): the title= passed to plot() looks like it is immediately
# replaced by the plt.title() call below - confirm and drop it if so.
movies_duration.plot(kind='bar',title='Number of movies over the years',color='orange')
plt.xlabel("Duration")
plt.title("Top 10 Movie duration")
plt.xticks(rotation = 45)

plt.subplot(1,2,2)
show_duration = netflix_vis[netflix_vis['type'] == "TV Show"]
show_duration = show_duration['duration'].value_counts()[:10]
# NOTE(review): same as above - this title= appears to be overwritten below.
show_duration.plot(kind='bar',title='Number of movies over the years',color='brown')
plt.xlabel("Seasons")
plt.title("Top 10 TV Shows seasons")
plt.xticks(rotation = 45)

plt.tight_layout(pad=0.3)

## Implementation of the Recommender System
### ***Content Based filtering***

In [None]:
# Re-check filledna for missing values, column by column, before building the model.
filledna.isna().any()

In [None]:
def clean_data(x):
    """Return the string ``x`` with every space removed and in lower case."""
    without_spaces = x.replace(" ", "")
    return without_spaces.lower()

In [None]:
#Features on which the model is to be filtered
features=['title', 'director', 'cast', 'listed_in', 'description']
# .copy() turns the column subset into an independent frame, so the column
# assignments made in the following cells modify filledna itself and do not
# raise pandas' SettingWithCopyWarning.
filledna=filledna[features].copy()

In [None]:
# Normalise the text features before they are merged into one "soup" string.
# clean_data removes every space, which fuses a multi-word name into a single
# token ("Aamir Khan" -> "aamirkhan"). That suits titles, people and genre
# labels, but applied to 'description' it glues whole sentences together into
# tokens that can never match another title, so the description would add
# nothing to the similarity. The description is therefore only lower-cased and
# its words stay separate tokens for the vectorizer.
for feature in features:
    if feature == 'description':
        filledna[feature] = filledna[feature].str.lower()
    else:
        filledna[feature] = filledna[feature].apply(clean_data)

filledna.head()

In [None]:
def create_soup(x):
    """Join the five text features of row ``x`` into one space-separated string.

    The order is title, director, cast, listed_in, description.
    """
    soup_fields = ('title', 'director', 'cast', 'listed_in', 'description')
    return ' '.join(x[field] for field in soup_fields)

In [None]:
# Build the 'soup' column: one combined text string per row (axis=1 applies create_soup row-wise).
filledna['soup'] = filledna.apply(create_soup, axis=1)

In [None]:
# Preview the first rows, now including the 'soup' column.
filledna.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Token-count matrix of the soup strings, with English stop words removed.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filledna['soup'])

# Cosine similarity between every pair of rows of count_matrix (title vs title).
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Renumber the rows 0..n-1. drop=True discards the previous index instead of
# inserting it as an extra column, so this cell can be re-run without piling
# up 'index' / 'level_0' columns (and eventually failing).
filledna = filledna.reset_index(drop=True)
# Lookup table: normalised title -> row position in filledna / cosine_sim2.
indices = pd.Series(filledna.index, index=filledna['title'])
# Keep only the first row of a repeated title so that indices[title] always
# yields a single position rather than a Series of positions.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def content_recommendation_system(title, cosine_sim=cosine_sim2):
    """Return the 10 titles whose text features are most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up. Spaces and letter case are ignored, matching the
        normalisation applied to the stored titles.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of title
        ``i`` to every title. Defaults to ``cosine_sim2``.

    Returns
    -------
    pandas.Series
        Original (un-normalised) titles from ``netflix_df``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in the catalogue.
    """
    key = title.replace(' ', '').lower()
    if key not in indices.index:
        raise KeyError("Title not found in the catalogue: {!r}".format(title))

    idx = indices[key]
    if isinstance(idx, pd.Series):
        # Several rows share this normalised title; use the first one.
        idx = idx.iloc[0]

    # Pair every other title with its similarity to the query. The query row
    # is excluded explicitly rather than assumed to sort into first place,
    # which would fail if another title tied with (or exceeded) its score.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; sorting is stable, so ties keep catalogue order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the 10 best matches.
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Map the positions back to the original titles.
    return netflix_df['title'].iloc[movie_indices]



In [None]:
# Example query: content-based recommendations for '3 Idiots'.
content_recommendation_system('3 Idiots', cosine_sim2)

In [None]:
# Example query: content-based recommendations for 'Friends'.
content_recommendation_system('Friends', cosine_sim2)

In [None]:
# Example query: content-based recommendations for 'Narcos'.
content_recommendation_system('Narcos', cosine_sim2)

In [None]:
# Example query: content-based recommendations for 'Gilmore Girls'.
content_recommendation_system('Gilmore Girls', cosine_sim2)

In [None]:
# Example query: content-based recommendations for 'Kota Factory'.
content_recommendation_system('Kota Factory', cosine_sim2)

### ***Collaborative Filtering***

In [None]:
import datetime
import numpy as np
# Timestamp taken before the synthetic data is generated.
Begin = datetime.datetime.now()

# Simulated watch history: 2000 users ("uid1" .. "uid2000") each give every
# title a random integer score from 0 to 4.
userid = ["uid" + str(number) for number in range(1, 2000 + 1)]

usercols = netflix_df['title'].unique()
# A seeded generator makes the simulated scores - and therefore the
# recommendations derived from them - reproducible between runs.
rng = np.random.default_rng(42)
userwatch = rng.integers(5, size=(len(userid), len(usercols)))

watchhist = pd.DataFrame(userwatch,columns=usercols)
watchhist.insert(0,"user_id",userid)
watchhist = watchhist.set_index('user_id')
# Transpose so that each row is a title and each column is a user.
watchhist = watchhist.transpose()
watchhist.index.name = 'title'

watchhist.head()

In [None]:
from scipy.sparse import csr_matrix
# Compressed sparse row representation of the title x user score table.
watch_sparse=csr_matrix(watchhist)

In [None]:
# Move 'title' from the index into a regular column; rows are then addressed
# by integer position, which lines up with the rows of watch_sparse.
watchhist.reset_index(inplace=True)
from sklearn.neighbors import NearestNeighbors
# Brute-force nearest-neighbour search using cosine distance. n_neighbors=20
# is only the default; collab_recommendation_system passes its own value.
model=NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20) ## model

In [None]:
# Fit the neighbour model on the title x user matrix (one sample per title).
model.fit(watch_sparse)

In [None]:
def collab_recommendation_system(title):
    """Recommend the 5 titles whose user-score pattern is closest to ``title``.

    The title's row of user scores is compared with every other title using
    the fitted nearest-neighbour ``model`` (cosine distance).

    Parameters
    ----------
    title : str
        Exact title as stored in the 'title' column of ``watchhist``.

    Returns
    -------
    pandas.DataFrame or None
        Rows of ``netflix_df`` for the recommended titles, nearest first, with
        the last column dropped. ``None`` (after printing a message) when the
        title is unknown.
    """
    matches = watchhist.index[watchhist['title'] == title]
    if len(matches) == 0:
        # Unknown title: keep the friendly message instead of raising. Any
        # other failure is allowed to propagate so real bugs are not hidden
        # behind a blanket except.
        print("Sorry there seems to be a problem. Kindly try another movies or show")
        return None
    query_row = matches[0]

    # Request one neighbour more than needed, because the query's own row is
    # normally returned as its own nearest neighbour.
    distances, neighbours = model.kneighbors(watch_sparse[query_row], n_neighbors=5 + 1)

    # Drop the query row itself (wherever it appears), order by distance and
    # keep the 5 nearest. Self is removed exactly once here; the previous code
    # skipped it in the loop and then sliced off one more row, losing the
    # best recommendation.
    ranked = sorted(
        ((position, distance)
         for position, distance in zip(neighbours[0], distances[0])
         if position != query_row),
        key=lambda pair: pair[1],
    )[:5]

    # Look up the catalogue rows of the recommended titles.
    frames = []
    for position, _ in ranked:
        movie_name = watchhist['title'].iloc[position]
        frames.append(netflix_df[netflix_df['title'] == movie_name])

    if not frames:
        # No neighbours left: return an empty frame with the usual columns.
        return netflix_df.iloc[0:0, :-1]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat(frames).iloc[:, :-1]

In [None]:
# Example query: collaborative-filtering recommendations for "Mercy".
collab_recommendation_system("Mercy")   

In [None]:
# Example query: collaborative-filtering recommendations for "3 Idiots".
collab_recommendation_system("3 Idiots")

In [None]:
# Example query: collaborative-filtering recommendations for "Gilmore Girls".
collab_recommendation_system("Gilmore Girls")  

In [None]:
# Example query: collaborative-filtering recommendations for "The Conjuring".
collab_recommendation_system("The Conjuring")  

In [None]:
# Example query: collaborative-filtering recommendations for "Narcos".
collab_recommendation_system("Narcos")   