In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows of the Netflix table.
df.head()

In [None]:
# (number of rows, number of columns) of the Netflix table.
df.shape

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Bar chart with one bar per value of the `type` column.
pt = sns.countplot(data=df, x='type')

It can be concluded that there are more movies than series on Netflix.

In [None]:
# Order the catalogue from earliest to latest release year and show the
# title/year of the first ten rows.
old = df.sort_values(by="release_year")
old.loc[:, ['title', 'release_year']].head(10)

In [None]:
# Bar order: rating labels from most to least frequent, leaving out the
# least frequent one.
rating_order = df['rating'].value_counts().index[:-1]
plt.figure(figsize=(10, 10))
rating = sns.countplot(data=df, x='rating', order=rating_order)

There are more than 2500 titles rated TV-MA, a rating assigned under the TV Parental Guidelines to content suitable only for mature audiences.
The second most common rating is TV-14, with slightly fewer than 2000 titles; it indicates content that is not appropriate for children younger than 14 years.

In [None]:
# Read only the columns that are used later from each IMDb file.
ratings = pd.read_csv(
    '/kaggle/input/imdb-extensive-dataset/IMDb ratings.csv',
    usecols=['weighted_average_vote'],
)
titles = pd.read_csv(
    '/kaggle/input/imdb-extensive-dataset/IMDb movies.csv',
    usecols=['title', 'year', 'genre'],
)

In [None]:
# Preview the IMDb title/year/genre columns.
titles.head()

In [None]:
# Preview the IMDb weighted-average-vote column.
ratings.head()

In [None]:
# Assemble one IMDb frame from the title columns and the vote column.
# NOTE(review): rows are paired purely by index, which assumes both CSVs list
# the same titles in the same order — confirm, or join on a shared id column.
rating = pd.DataFrame({
    'Title': titles['title'],
    'Release Year': titles['year'],
    'Rating': ratings['weighted_average_vote'],
    'Genre': titles['genre'],
})

In [None]:
# Preview the combined IMDb frame.
rating.head()

In [None]:
# Remove rows that repeat the same (title, release year, rating) triple,
# keeping the first occurrence of each.
rating = rating.drop_duplicates(subset=['Title', 'Release Year', 'Rating'])

In [None]:
# `dropna()` returns a new frame and leaves `rating` untouched, so the result
# has to be bound back to `rating` for the rows with missing values to be gone
# by the time the frame is joined with the Netflix data.
rating = rating.dropna()
rating.head()

Next, we inner-join the IMDb ratings dataset with the Netflix dataset on the title, keeping only the content that is on Netflix and also has an IMDb rating.

In [None]:
# Inner join: keep only rows whose title appears in both the IMDb frame and
# the Netflix frame.
# NOTE(review): the match is on title text alone, so unrelated works that share
# a name get paired — confirm whether release year should be part of the key.
datajoin = rating.merge(df, left_on='Title', right_on='title', how='inner')

# Sort best-rated first. `ascending` has to be the boolean False: the string
# 'False' is truthy (older pandas then sorts ascending, newer pandas raises on
# a non-boolean), so the highest-rated rows never ended up at the top.
datajoin = datajoin.sort_values(by='Rating', ascending=False)

In [None]:
# Display the joined IMDb/Netflix frame.
datajoin

In [None]:
# Preview the IMDb frame again (it is not modified by the join above).
rating.head()

In [None]:
import plotly.express as px
# First 15 rows of the joined frame, drawn as title -> country rings whose
# size and colour both follow the IMDb rating.
top15 = datajoin.head(15)
fig = px.sunburst(
    top15,
    path=['title', 'country'],
    values='Rating',
    color='Rating',
)
fig.show()

> Top 15 rated movies on Netflix.

In [None]:
# Number of joined rows per `country` value, largest first, as a one-column frame.
countcountries = (
    datajoin['country']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)
# Ten most frequent `country` values.
top10 = countcountries.head(10)
top10

Top 10 countries by number of titles in the joined Netflix–IMDb data.

In [None]:
# Funnel chart of the ten most frequent `country` values.
fig = px.funnel(top10)
fig.show()

In [None]:
# Plot only the 15 release years with the most titles, most frequent first.
busiest_years = df['release_year'].value_counts().index[:15]
plt.figure(figsize=(10, 10))
yer = sns.countplot(data=df, x='release_year', order=busiest_years)

So, 2018 was the year with the highest number of releases.

In [None]:
# Netflix rows whose `country` value is exactly 'United Kingdom'.
UK = df.loc[df['country'].eq('United Kingdom')]

In [None]:
# Ten UK rows with the most recent release years.
latest_UK = UK.sort_values(by='release_year', ascending=False).head(10)

In [None]:
# Display the ten most recent UK rows.
latest_UK

In [None]:
import plotly.graph_objects as go

In [None]:
# Render the ten most recent UK rows as a Plotly table.
uk_table_columns = ['title', 'release_year', 'type', 'duration']
uk_table = go.Table(
    header=dict(values=['Title', 'Release Year', 'type', 'duration']),
    cells=dict(values=[latest_UK[column] for column in uk_table_columns]),
)
fig = go.Figure(data=[uk_table])
fig.show()

In [None]:
# Joined rows whose `country` value is exactly 'United States'.
US = datajoin.loc[datajoin['country'].eq('United States')]

In [None]:
# Ten US rows with the earliest release years.
Oldest_US = US.sort_values(by='release_year').head(10)

In [None]:
# Display the ten oldest US rows.
Oldest_US

In [None]:
# Render the ten oldest US rows as a Plotly table; the 'rating' header shows
# the IMDb `Rating` column.
us_table_columns = ['title', 'release_year', 'Rating', 'duration']
us_table = go.Table(
    header=dict(values=['Title', 'Release Year', 'rating', 'duration']),
    cells=dict(values=[Oldest_US[column] for column in us_table_columns]),
)
fig = go.Figure(data=[us_table])
fig.show()

In [None]:
# NOTE(review): `season` is not referenced anywhere else in the visible
# notebook — confirm whether it is still needed.
season =['title', 'no']

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter

In [None]:
# Each `listed_in` entry is a comma-separated genre list. Split it and strip
# every space so that a multi-word genre becomes a single token.
genre = list(df['listed_in'])
gen = [
    label.replace(' ', "")
    for entry in genre
    for label in entry.split(',')
]
# Occurrences of each genre token.
g = Counter(gen)

In [None]:
plt.rcParams['figure.figsize'] = (13, 13)
# Size each genre by how often it occurs. The cloud used to be built from the
# string form of a de-duplicated set, which gave every genre the same weight
# and an arbitrary order. `random_state` keeps the layout reproducible.
wordcloud = WordCloud(
    max_font_size=40,
    max_words=50,
    random_state=42,
).generate_from_frequencies(g)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

A wordcloud of Genres

In [None]:
# Titles per release year as a two-column frame ('Year', 'Total Count').
# Naming the axis and the values explicitly avoids relying on the default
# names `value_counts()`/`reset_index()` produce, which changed in pandas 2.0
# and made the previous column rename miss 'Year'.
pclass = (
    datajoin['release_year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Total Count')
)


# Bubble chart: one marker per release year, with colour and size both driven
# by the number of titles in that year.
bubble_marker = dict(
    color=pclass['Total Count'],
    size=pclass['Total Count'] * 0.5,
    showscale=False,
)
year_trace = go.Scatter(
    x=pclass['Year'],
    y=pclass['Total Count'],
    mode='markers',
    marker=bubble_marker,
)
figure = go.Figure(data=[year_trace])

figure.layout.template = 'seaborn'

figure.update_layout(
    title='Analysis by Year',
    xaxis_title="Year Released",
    yaxis_title="Total",
)
figure.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Titles of the joined rows.
datajoin['title']

In [None]:
# Missing descriptions become empty strings so the vectorizer gets text for every row.
datajoin['description'] = datajoin['description'].fillna('')

# TF-IDF features of the descriptions, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tf_matrix = tfidf.fit_transform(datajoin['description'])

# (number of descriptions, vocabulary size)
tf_matrix.shape

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all description vectors; with the second
# argument omitted, `linear_kernel` compares the matrix with itself. The
# vectorizer uses its default L2 row normalisation, so these dot products are
# cosine similarities.
cosine_simalarity = linear_kernel(tf_matrix)

In [None]:
# Map each title to its row *position* in `datajoin`. The similarity matrix is
# built from `datajoin['description']` in row order, while `datajoin` itself
# was re-ordered by `sort_values`, so its index labels no longer equal row
# positions and cannot be used to address the matrix.
indices = pd.Series(np.arange(len(datajoin)), index=datajoin['title'])
# Keep only the first row of a repeated title so that a lookup by title always
# yields a single position rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def recommendation(title, cosine_simalarity=cosine_simalarity):
    """Return the titles of up to ten entries most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in the module-level `indices` Series.
    cosine_simalarity : 2-D array-like
        Pairwise similarity matrix; row ``k`` holds the scores of entry ``k``
        against every entry. Defaults to the module-level matrix.

    Returns
    -------
    pandas.Series
        Titles taken from `datajoin['title']` at the best-scoring positions,
        highest score first, never including the queried entry itself.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.

    Notes
    -----
    Assumes `indices` maps a title to the row position used by both
    `cosine_simalarity` and `datajoin` — verify where `indices` is built.
    """
    ids = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first entry so a single matrix row is selected.
    if isinstance(ids, pd.Series):
        ids = ids.iloc[0]

    # Drop the queried row explicitly. Slicing off the first ranked element is
    # not reliable, because another row with an identical description ties at
    # the top score and may be ranked ahead of the query.
    similar_scores = [
        (position, score)
        for position, score in enumerate(cosine_simalarity[ids])
        if position != ids
    ]
    similar_scores.sort(key=lambda pair: pair[1], reverse=True)

    movies_indices = [position for position, _ in similar_scores[:10]]
    return datajoin['title'].iloc[movies_indices]

 In Progress!!! 
