# Import Data And Libraries

In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory into `data`,
# the frame every later cell reads from.
data = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

# Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics; with default arguments pandas describes the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Preview the first rows of the frame.
data.head()

# Distribution of Movie/TV Show

In [None]:
# Share of each title type, drawn as a pie chart.
type_counts = data['type'].value_counts()
fig, ax = plt.subplots(figsize=(12, 12))
ax.pie(type_counts, labels=type_counts.index, autopct="%.1f%%")
ax.set_title('Distribution of Movie/TV Show', size=20)

In [None]:
# The five release years with the most titles (value_counts sorts by count).
top_release_years = data['release_year'].value_counts().iloc[:5]
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=top_release_years.index, y=top_release_years, ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
ax.set_title('Release Frequency over Years')

# Growth of Movie/TV Show over Years

In [None]:
movie_data = data[data['type'] == 'Movie']
tv_show_data = data[data['type'] == 'TV Show']

# Count titles per (type, release_year) pair; the rename covers pandas
# versions where the counts column is labelled 0.
release_counts = (
    data[['type', 'release_year']]
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
    .rename(columns={0: 'count'})
)

# Keep the five most frequent release years of each type, movies first.
temp = pd.concat([
    release_counts[release_counts['type'] == show_type].iloc[:5]
    for show_type in ('Movie', 'TV Show')
])


In [None]:
# catplot builds its own figure; label the single facet it returns.
grid = sns.catplot(data=temp, x='release_year', y='count', hue='type', kind='point')
grid.ax.set_xlabel('Release Year')
grid.ax.set_ylabel('Frequency')
grid.ax.set_title('Growth of Movie/TV Show over Years', size=14)

# Histogram For Movie Duration

In [None]:
# Distribution of movie running times.
temp = data[data['type'] == 'Movie'].reset_index()

# `duration` is stored as text; extract its leading number. Missing or
# malformed values become NaN instead of crashing `int(x.split()[0])`.
temp['movie_duration'] = pd.to_numeric(
    temp['duration'].str.extract(r'(\d+)', expand=False), errors='coerce'
)

fig, ax = plt.subplots(figsize=(10, 10))
# histplot(kde=True, stat='density') is the documented replacement for the
# deprecated distplot and draws the same histogram-plus-density picture.
sns.histplot(temp['movie_duration'].dropna(), kde=True, stat='density', ax=ax)

# Famous Director in Movie/TV Show

In [None]:
# One row per (type, director) pair.
# Only a missing director makes a row unusable here, so rows are no longer
# dropped because some unrelated column is empty.
clean_data = data.dropna(subset=['director']).reset_index(drop=True)

director_data = (
    clean_data[['type', 'director']]
    .assign(director=lambda frame: frame['director'].str.split(','))
    .explode('director')
    # "A, B" splits into "A" and " B"; strip so a director is not counted
    # separately depending on their position in the list.
    .assign(director=lambda frame: frame['director'].str.strip())
    # Drop blanks left by trailing commas or by directors filled with ''.
    .loc[lambda frame: frame['director'] != '']
    .reset_index(drop=True)
)
director_data

In [None]:
# Occurrences of every (type, director) pair, most frequent first.
director_data_count = (
    director_data
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
)
# The counts column is labelled 0 on some pandas versions; normalise it.
famous_director = director_data_count.rename(columns={0: 'count'})
famous_director

In [None]:

# One bar chart per title type showing its five most frequent directors.
for unique_type in famous_director['type'].unique():
    top_directors = famous_director[famous_director['type'] == unique_type].iloc[:5]
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.barplot(data=top_directors, x='director', y='count', ax=ax)
    ax.set_xlabel('Director in {}'.format(unique_type))
    ax.set_ylabel('Frequency')
    ax.set_title('Famous Director in {}'.format(unique_type), size=20)

# Top Three Countries with the Most Movies and TV Shows

In [None]:
# One row per (type, country) pair, then the frequency of each pair.
# Only a missing country makes a row unusable here, so rows are no longer
# dropped because some unrelated column is empty.
clean_data = data.dropna(subset=['country']).reset_index(drop=True)

country_data = (
    clean_data[['type', 'country']]
    .assign(country=lambda frame: frame['country'].str.split(','))
    .explode('country')
    # "A, B" splits into "A" and " B"; strip so the same country is not
    # counted under two different spellings.
    .assign(country=lambda frame: frame['country'].str.strip())
    # Drop blanks left by trailing commas.
    .loc[lambda frame: frame['country'] != '']
    .reset_index(drop=True)
)

# value_counts() sorts by count descending; the rename covers pandas versions
# where the counts column is labelled 0.
country = (
    country_data
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
    .rename(columns={0: 'count'})
)
country

In [None]:
# Side-by-side bar charts: first three Movie rows (horizontal bars) and
# first three TV Show rows (vertical bars) of `country`.
fig, axes = plt.subplots(1, 2, figsize=(14, 10))
movie_ax, show_ax = axes

top_movie_countries = country[country['type'] == 'Movie'].iloc[:3, :]
top_show_countries = country[country['type'] == 'TV Show'].iloc[:3, :]

sns.barplot(data=top_movie_countries, x='count', y='country', ax=movie_ax)
movie_ax.set_xlabel('Frequency')
movie_ax.set_ylabel('Country')
movie_ax.set_title('Top Three countries with most Movies')

sns.barplot(data=top_show_countries, x='country', y='count', ax=show_ax)
show_ax.set_xlabel('Country')
show_ax.set_ylabel('Frequency')
show_ax.set_title('Top Three countries with most TV Shows')

# Expansion of TV Shows/Movies over the Years in the U.S.A

In [None]:
# Titles whose country field is exactly 'United States', counted per
# (type, release_year) pair.
us_only = data['country'] == 'United States'
temp = (
    data.loc[us_only, ['type', 'release_year']]
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
    .rename(columns={0: 'count'})
)
temp

In [None]:
# Point plots of the five most frequent release years for US movies (left)
# and US TV shows (right).
fig, ax = plt.subplots(1, 2, figsize=(14, 8))
panels = zip(ax, ('Movie', 'TV Show'), ('Movies', 'TV Show'))
for axis, show_type, label in panels:
    top_years = temp[temp['type'] == show_type].iloc[:5]
    # `kind` belongs to catplot, not pointplot; passing it here is ignored by
    # old seaborn releases and rejected by newer ones, so it is dropped.
    sns.pointplot(x='release_year', y='count', data=top_years, ax=axis)
    axis.set_xlabel('Release Year')
    axis.set_ylabel('Release Frequency')
    axis.set_title('Expansion of {} over Years in U.S.A'.format(label), size=18)


# Most Famous Cast in Movies and TV Shows

In [None]:
# One row per (type, cast member) pair, then the frequency of each pair.
# Only a missing cast list makes a row unusable here, so rows are no longer
# dropped because some unrelated column is empty.
clean_data = data.dropna(subset=['cast']).reset_index(drop=True)

cast_data = (
    clean_data[['type', 'cast']]
    .assign(cast=lambda frame: frame['cast'].str.split(','))
    .explode('cast')
    # "A, B" splits into "A" and " B"; strip so the same person is not
    # counted under two different spellings.
    .assign(cast=lambda frame: frame['cast'].str.strip())
    # Drop blanks left by trailing commas or by cast lists filled with ''.
    .loc[lambda frame: frame['cast'] != '']
    # NOTE: the column holding cast names has always been labelled 'country'
    # and the pie-chart cell reads cast['country'], so the label is kept.
    .rename(columns={'cast': 'country'})
    .reset_index(drop=True)
)

# value_counts() sorts by count descending; the rename covers pandas versions
# where the counts column is labelled 0.
cast = (
    cast_data
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
    .rename(columns={0: 'count'})
)
cast

In [None]:
# One pie chart per title type from the first ten rows of `cast` for that
# type. The names are stored in the column labelled 'country'.
pie_specs = (
    ('Movie', 'Most Famous Cast in Movies'),
    ('TV Show', 'Most Famous Cast in TV Show'),
)
for show_type, chart_title in pie_specs:
    top_cast = cast[cast['type'] == show_type].iloc[:10]
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.pie(x=top_cast['count'], labels=top_cast['country'], autopct="%.1f%%")
    ax.set_title(chart_title, size=20)

# Top Two Genres per Year over the Last 10 Years

In [None]:
# One row per (release_year, genre) pair taken from the `listed_in` column.
# The frame keeps the name `cast_data` because the next cell reads it.
# Only the two columns used here decide whether a row is usable.
clean_data = data.dropna(subset=['release_year', 'listed_in']).reset_index(drop=True)

cast_data = (
    clean_data[['release_year', 'listed_in']]
    .rename(columns={'listed_in': 'genre'})
    .assign(genre=lambda frame: frame['genre'].str.split(','))
    .explode('genre')
    # "A, B" splits into "A" and " B"; strip so the same genre is not
    # counted under two different spellings.
    .assign(genre=lambda frame: frame['genre'].str.strip())
    # Drop blanks left by trailing commas.
    .loc[lambda frame: frame['genre'] != '']
    .reset_index(drop=True)
)
cast_data

In [None]:
# Frequency of every (release_year, genre) pair, most frequent first.
cast = (
    cast_data
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
    .rename(columns={0: 'count'})
)

years = [2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010]

# `cast` is sorted by count descending, so the first two rows of a year are
# its two most frequent genres. head(2) returns fewer rows (or none) for a
# year with fewer than two genres, instead of raising IndexError like
# .iloc[0] / .iloc[1] did.
top_two_per_year = [
    cast[cast['release_year'] == release_year].head(2)
    for release_year in years
]

# `year` is the frame read by the plotting cell. Columns are relabelled by
# position: release year, genre, count.
year = pd.concat(top_two_per_year, ignore_index=True)
year.columns = ['years', 'genre', 'count']


In [None]:
# Grouped bars: one group per year, one bar per genre row in `year`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=year, x='years', y='count', hue='genre', ax=ax)
ax.set_xlabel('Years')
ax.set_ylabel('Frequency')
ax.set_title('Top Two Genre in last 10 years', size=20)

# Maximum Growth of Rating in specific year

In [None]:
# Frequency of every (release_year, rating) pair, most frequent first.
temp = (
    data[['release_year', 'rating']]
    .value_counts()
    .to_frame()
    .reset_index(level=[0, 1])
    .rename(columns={0: 'count'})
)

# For each rating keep its first row in `temp`, i.e. the release year with
# the most titles carrying that rating. The pieces are collected first and
# concatenated once, rather than growing a frame inside the loop from an
# empty object-dtype seed. Missing ratings are skipped because they can never
# match a row.
top_year_per_rating = [
    temp[temp['rating'] == rating_element].iloc[:1, :]
    for rating_element in data['rating'].dropna().unique()
]
rating = (
    pd.concat(top_year_per_rating)
    if top_year_per_rating
    else pd.DataFrame(columns=['release_year', 'rating', 'count'])
)
rating

In [None]:
# Horizontal bars: one per rating, coloured by the release year of that row.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=rating, x='count', y='rating', hue='release_year', ax=ax)
ax.set_xlabel('Count')
ax.set_ylabel('Ratings')
ax.set_title('Maximum Growth of Rating in specific year', size=18)

# Content Based Recommendation System

In [None]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  
import nltk
import re

In [None]:
# Preview the first rows again before the text columns are combined below.
data.head()

In [None]:
# Missing values per column; `director` and `cast` are filled in the next cell.
data.isnull().sum()

In [None]:
# Missing directors and cast lists become empty strings so that string
# concatenation does not propagate NaN for those two columns.
for column in ('director', 'cast'):
    data[column] = data[column].fillna('')

# Single free-text field per title: the columns joined by one space each,
# in the order title, director, cast, listed_in, description.
combined_text = data['title']
for column in ('director', 'cast', 'listed_in', 'description'):
    combined_text = combined_text + ' ' + data[column]
data['text'] = combined_text

In [None]:
# Built once when the cell runs rather than on every call: the stop-word list
# and the lemmatizer do not depend on the text being processed.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = nltk.stem.WordNetLemmatizer()


def preprocess(text):
    """Normalise free text for TF-IDF vectorisation.

    Replaces every non-letter character with a space, lowercases the text,
    tokenises it, removes English stop words and lemmatises the remaining
    tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining lowercase lemmas joined by single spaces.
    """
    # '[^A-Za-z]' instead of '[^A-z]': the A-z range also covers the six ASCII
    # punctuation characters that sit between 'Z' and 'a', so brackets,
    # backslash, caret, underscore and backtick used to slip through.
    letters_only = re.sub('[^A-Za-z]', ' ', text)

    # Lowercase before filtering: the stop-word list is lowercase, so
    # capitalised stop words such as "The" were previously kept.
    tokens = word_tokenize(letters_only.lower())

    lemmas = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORDS
    ]
    return ' '.join(lemmas).strip()

In [None]:
# Clean the combined text of every title in place.
data['text'] = data['text'].apply(preprocess)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vocabulary on the cleaned text and encode every title as one
# row of `text_features`; `vectorizer` is reused later for unseen text.
vectorizer = TfidfVectorizer()
text_features = vectorizer.fit_transform(data['text'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of `text_features`;
# row i holds the similarity of title i to every title.
similarity_matrix = cosine_similarity(text_features)    

In [None]:
def get_recommendation(movie_name, topn=10):
    """Return the titles most similar to ``movie_name``.

    Similarity is read from the global ``similarity_matrix``; titles are read
    from the global ``data`` frame.

    Parameters
    ----------
    movie_name : str
        Exact value of the ``title`` column to look up. If several rows share
        the title, the first one is used.
    topn : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``topn`` titles, most similar first, excluding the queried row.

    Raises
    ------
    IndexError
        If no row has the requested title (same exception type as before,
        now with an explicit message).
    """
    # Positional row numbers of the matching titles; positions rather than
    # index labels, so a non-default index on `data` cannot mislead the lookup.
    matches = np.flatnonzero((data['title'] == movie_name).to_numpy())
    if matches.size == 0:
        raise IndexError('No title named {!r} in the dataset'.format(movie_name))
    movie_position = matches[0]

    # Use the actual number of titles instead of a hard-coded 6234, and ignore
    # any extra columns if the matrix was rebuilt with additional rows.
    scores = np.asarray(similarity_matrix[movie_position])[:len(data)]

    # Rank by descending similarity and drop the queried row explicitly rather
    # than assuming it is ranked first. Slicing [:topn] returns topn items;
    # the previous [1:topn] slice returned only topn - 1.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != movie_position][:topn]

    return data['title'].iloc[ranked].tolist()
            

In [None]:
# Example lookup by exact title.
get_recommendation('Transformers: Robots in Disguise')

# Recommendation For Completely New Movie

In [None]:
# Describe a title that is not in the dataset, using the same kinds of fields
# that were combined into data['text'].
new_movie_fields = {
    'movie_name': 'taare zameen par',
    'genre': 'childrens film, drama',
    'director': 'aamir khan, amole gupte',
    'cast': 'Aamir Khan, darsheel safary',
    'description': (
        ' The film explores the life and imagination of Ishaan, an 8-year-old dyslexic child. '
        'Although he excels in art, his poor academic performance leads his parents to send him to a boarding school. '
        'Ishaans new art teacher suspects that he is dyslexic and helps him to overcome his disability. '
        'Darsheel Safary stars as 8-year-old Ishaan, and Aamir Khan plays his art teacher.'
    ),
}
new_movie = preprocess(' '.join(new_movie_fields.values()))
new_movie_features = vectorizer.transform([new_movie])

# Compare the new movie against the existing titles only. This replaces
# densifying the whole TF-IDF matrix and recomputing every pairwise
# similarity to read a single row, and it no longer overwrites the global
# `similarity_matrix` that get_recommendation depends on.
new_movie_similarity = cosine_similarity(new_movie_features, text_features)[0]
catalogue_size = len(new_movie_similarity)

ranked_titles = pd.DataFrame({
    'similarity': new_movie_similarity,
    'index': np.arange(catalogue_size),
}).sort_values(by='similarity', ascending=False)

# The printing cell skips the first row of `temp`, which used to be the new
# movie matched against itself. Keep an equivalent placeholder row on top
# (similarity 1.0, index one past the last title) so that cell still starts
# at the best real match.
self_row = pd.DataFrame(
    {'similarity': [1.0], 'index': [catalogue_size]},
    index=[catalogue_size],
)
temp = pd.concat([self_row, ranked_titles])


In [None]:
# Print the titles at ranks 1..topn of `temp`; rank 0 is skipped.
topn = 10
ranked_indices = temp['index'].tolist()
for rank in range(1, topn + 1):
    index = ranked_indices[rank]
    print(data['title'][index])

# > **Please give suggestions and upvote if you like**