# Netflix has become very popular recently. It is a streaming service for TV shows and movies that can be watched on a TV, mobile phone, or tablet. With a subscription, users can watch as much as they like, and shows can also be downloaded.
# Netflix is interesting to explore, so I decided to analyze Netflix data. For further analysis, please check my site http://auroradata.id/, where I explain the visualizations so you can get more information.

# Data Description

In [None]:
# Import the libraries that are needed.

import numpy as np
import pandas as pd
import warnings
warnings.simplefilter("ignore")
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
import seaborn as sns
!pip install rake-nltk
from rake_nltk import Rake
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the Netflix titles CSV into `data` and preview its first rows.

data = pd.read_csv(filepath_or_buffer='/kaggle/input/netflix-shows/netflix_titles.csv')
data.head(n=5)

Data Cleansing for Visualization

In [None]:
# Count the missing values in every column.

data.isnull().sum()

In [None]:
# Check the number of rows and columns (the shape tuple is the cell output).

data.shape

In [None]:
# Check the data type of every column.

data.dtypes

In [None]:
# Rows with a missing date_added or rating are removed; the author considers
# them few enough that dropping them is acceptable.

# dropna() returns a new DataFrame, so the result has to be assigned back;
# otherwise the rows are never actually removed from `data`.
data = data.dropna(subset=['date_added', 'rating'])

In [None]:
# Remove the columns the exploration does not use, then keep one row per title.

data = data.drop(columns=['date_added', 'show_id']).drop_duplicates(subset=['title'])

Data Visualization

In [None]:
# Fill missing values in the country column, normalise the comma separators,
# and build a long-format frame (one row per type/country pair) for the plot.

data['country'] = data['country'].fillna('Unknown')
# regex=True is passed explicitly: the pattern ", | ," is a regular expression,
# and pandas 2.0+ treats str.replace patterns as literal text by default,
# which would leave the separators unchanged.
data['country'] = data.country.str.replace(", | ,", ",", regex=True)
data['listed_in'] = data.listed_in.str.replace(", | ,", ",", regex=True)
# Pair every country of a title with that title's type. The result has a
# fresh 0..n-1 index and the columns 'types' and 'country'.
country_data = pd.DataFrame(
    [
        (show_type, country)
        for show_type, countries in zip(data['type'], data['country'].str.split(','))
        for country in countries
    ],
    columns=['types', 'country'],
)

In [None]:
# Draw a world choropleth coloured by the number of entries per country.

data_countries = country_data['country'].replace("US", "United States").value_counts()

country_trace = go.Choropleth(
    z=data_countries.values,
    text=data_countries.index,
    locations=data_countries.index.values,
    locationmode='country names',
)
iplot([country_trace])

In [None]:
# Plot how many titles fall into each value of the `type` column.

plt.figure(figsize=(10, 5))
sns.set(style="dark")
ax = sns.countplot(x="type", data=data, palette='twilight')
ax.set_title('Netflix Shows Types Distribution', size=15, color='darkblue')
ax.set_xlabel('Types', color='darkblue')
ax.set_ylabel('Count', color='darkblue')
plt.show()

In [None]:
# Count how often each genre appears in the listed_in column.

from collections import Counter

genre = data['listed_in'].tolist()

# Split every entry on commas and remove all spaces from each genre name.
genres = [name.replace(" ", "") for entry in genre for name in entry.split(",")]

gen = Counter(genres)

In [None]:
# Plot the genres as a word cloud in which each genre is sized by its count.

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Unique genre names, kept because later cells may still refer to `genre_in`.
genre_in = list(set(gen))
plt.rcParams['figure.figsize'] = (15,15)
# Feed the counts from `gen` directly. Building the cloud from
# str(list(set(gen))) mentioned every genre exactly once, so the plot could
# not reflect the distribution announced in its title.
wordcloud = WordCloud(max_font_size = 50, max_words = 100, background_color = 'white').generate_from_frequencies(gen)

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title('Netflix Shows Genres Distribution', size = 23, color = 'darkblue')
plt.show()

In [None]:
# Build the list of running times in minutes for titles whose duration is not
# expressed in seasons.

# True where the duration text mentions seasons; missing durations count as False.
season = data['duration'].str.contains('Season|Seasons', regex=True, na=False)
# Rows without a duration cannot be parsed, so they are left out here.
# .copy() makes `time` an independent frame, so adding a column below does not
# write into a slice of `data`.
time = data[~season & data['duration'].notna()].copy()
time.index = np.arange(len(time))
# Take the leading number of the "<n> min" text instead of relying on fixed
# character positions, which only handled one- to three-digit values.
times = time['duration'].str.split().str[0].astype(int).tolist()
time['duration_'] = times

In [None]:
# Histogram of the running times stored in time['duration_'].

plt.figure(figsize=(10, 5))
ax = sns.distplot(time['duration_'], kde=False, color='blue')
ax.set_title('Netflix Shows Duration by Minutes', size=15, color='darkblue')
ax.set_xlabel('Minutes', color='darkblue')
ax.set_ylabel('Count', color='darkblue')
plt.show()

In [None]:
# Build the number of seasons for titles whose duration is given in seasons.

# .copy() makes `seasons` an independent frame, so adding the 'value' column
# does not write into a slice of `data`.
seasons = data[season].copy()
seasons.index = np.arange(len(seasons))
# Parse the leading number of the "<n> Season(s)" text; slicing the first two
# characters would silently truncate longer numbers.
lists = seasons['duration'].str.split().str[0].astype(int).tolist()
seasons['value'] = lists

In [None]:
# Show how many titles have each number of seasons.

plt.figure(figsize =(10,5))
# Name the variable explicitly: a Series passed positionally is taken as `x`
# by older seaborn releases but as wide-form `data` by seaborn 0.12+, which
# changes what is counted.
sns.countplot(x='value', data=seasons)
plt.title('Netflix Shows Duration by Seasons', size = 15, color='darkblue')
plt.xlabel('Season', color='darkblue')
plt.ylabel('Count', color='darkblue')
plt.show()

In [None]:
# Line plot of how many titles carry each rating.

rating_counts = data.rating.value_counts()

plt.figure(figsize=(10, 5))
sns.lineplot(x=rating_counts.index, y=rating_counts.values)
plt.title('Rating-types Netflix Shows Distribution', size=15, color='darkblue')
plt.ylabel('Count', color='darkblue')
plt.show()

# Data Prediction

We use certain columns (title, director, cast, listed_in, and description) to build a recommendation system. Before passing them to the recommender, we clean them up and vectorize the words.

Data Preparation

In [None]:
# Drop rows in which any of the text columns holds a whitespace-only string.

cols = ['title', 'director', 'cast', 'listed_in','description']

# Test each cell of the selected columns. Iterating data.iterrows() yields
# whole rows (Series), never strings, so a per-row isinstance(str) check can
# never match and no row would ever be dropped.
blank_rows = data[cols].apply(
    lambda column: column.map(lambda value: isinstance(value, str) and value.isspace())
).any(axis=1)

# Index labels of the rows that are removed.
clean_space = data.index[blank_rows].tolist()

data.drop(clean_space, inplace = True)


In [None]:
# Convert these columns to strings so they can be combined and vectorized.

# Missing values are replaced with an empty string first: astype(str) alone
# turns NaN into the literal text "nan", which would then be counted as a
# word shared by every title that lacks a director or cast.
data['director'] = data['director'].fillna('').astype(str)
data['cast'] = data['cast'].fillna('').astype(str)
data['listed_in'] = data['listed_in'].fillna('').astype(str)

In [None]:
# Extract the RAKE keywords of every description into the 'desc' column.

data['desc'] = ''
# One extractor is reused for every description instead of building a new
# Rake() per row.
r = Rake()
keywords = []
for descrip in data['description']:
    r.extract_keywords_from_text(descrip)
    key_descrip = r.get_word_degrees()
    keywords.append(list(key_descrip.keys()))
# Assign the column in one step. Writing to the `row` yielded by iterrows()
# only changes a temporary copy and leaves data['desc'] empty. Wrapping the
# lists in an object Series aligned on data.index stores one list per row.
data['desc'] = pd.Series(keywords, index=data.index, dtype=object)
    

In [None]:
# Turn each comma-separated text column into a list of its parts.

for column in ('cast', 'listed_in', 'director'):
    data[column] = data[column].map(lambda text: text.split(','))

In [None]:
# Give cast and director different shapes so that the same name appearing in
# both columns is not treated as the same token once the columns are combined:
# cast names become single lowercase tokens without spaces, while director
# names stay as separate lowercase words.

# The columns are assigned directly; changing the `row` yielded by iterrows()
# only modifies a temporary copy and never reaches `data`.
data['cast'] = data['cast'].map(
    lambda names: [name.lower().replace(' ', '') for name in names]
)
# Join with a space (after trimming) so adjacent director names cannot run
# together when the separator carried no space.
data['director'] = data['director'].map(
    lambda names: ' '.join(name.strip() for name in names).lower()
)

In [None]:
# Concatenate director, cast, listed_in and description (space separated) into
# a single text column that is vectorized later.

combined_parts = data[['director', 'cast', 'listed_in', 'description']].astype(str)
data['combined'] = combined_parts['director'].str.cat(
    combined_parts[['cast', 'listed_in', 'description']], sep=' '
)

In [None]:
# Use the title as the index so each row of features is labelled by its title.

data = data.set_index('title')
data.head(n=5)

In [None]:
# Turn the combined text of every title into a word-count matrix.

count = CountVectorizer()
count.fit(data['combined'])
count_matrix = count.transform(data['combined'])

In [None]:
# Compute the pairwise cosine similarity between all rows of count_matrix.
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

In [None]:
# Map each row position (0..n-1) to its title, matching the row order of the
# similarity matrix.

indicates = data.index.to_series().reset_index(drop=True)
indicates.head(5)

In [None]:
# Most important step. This is recommendation system.

def recommendation (Title, cosine_sim = cosine_sim):
    """Return up to ten titles most similar to ``Title``.

    Parameters
    ----------
    Title : str
        Title to look up in the module-level ``indicates`` series.
    cosine_sim : array-like
        Similarity matrix indexed by row position; row ``i`` holds the
        similarity of title ``i`` to every title. Defaults to the
        module-level ``cosine_sim`` as bound when this function was defined.

    Returns
    -------
    list
        Titles taken from ``data.index``, ordered from most to least
        similar. The queried title itself is never included.

    Raises
    ------
    IndexError
        If ``Title`` does not occur in ``indicates``.
    """
    # Row position of the requested title.
    positions = indicates[indicates == Title].index
    if len(positions) == 0:
        raise IndexError('Title {!r} was not found in the dataset'.format(Title))
    title = positions[0]

    scores = pd.Series(cosine_sim[title])
    # Remove the queried title explicitly rather than skipping the first
    # sorted entry: when another title ties with it, the queried title is not
    # guaranteed to sort first and would leak into the result.
    scores = scores.drop(title)
    # A stable sort keeps tied scores in their original row order, so the
    # result is reproducible.
    top_10_film = scores.sort_values(ascending = False, kind = 'mergesort').head(10).index

    # Look the titles up once instead of rebuilding list(data.index) per item.
    titles = data.index
    return [titles[i] for i in top_10_film]

In [None]:
# Check that the recommendation system works: request the titles most similar
# to one example title (the returned list is the cell output).

recommendation('Transformers: Robots in Disguise')

Hope this is useful! :)