In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Inject CSS that sets the notebook's `.container` element to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

In [None]:
# Read the Netflix catalogue, then keep one view per content type.
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')
movie = data.loc[data['type'].eq('Movie')]
tv_show = data.loc[data['type'].eq('TV Show')]

In [None]:
# Apply the seaborn theme *before* creating the figure: the theme is stored in
# rcParams and only affects axes that are created afterwards.
sns.set(style='darkgrid')
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Draw on the axes created above rather than on whatever the current axes are.
ax = sns.countplot(x='type', data=data, palette='Reds', ax=ax)

Movies clearly dominate TV shows, at least in quantity.

## Which month is the least crowded for producers to publish content?

In [None]:
# Parse the date each title was added; month and year are derived from it.
data['date_added'] = pd.to_datetime(data['date_added'])

# Titles without a date keep a missing month instead of being filled with 0:
# a filled 0 would be counted by value_counts() and show up as a bogus "0"
# month in the bar chart built from df_month.
data['month'] = data['date_added'].dt.month_name()
data['year'] = data['date_added'].dt.year

# Count titles per month (missing months are skipped by value_counts()).
# rename_axis + reset_index(name=...) yields the columns ['month', 'count']
# on every pandas version; renaming 'index'/'month' only works before
# pandas 2.0, where value_counts() changed how it names its result.
df_month = (
    data['month']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
)

### Using plotly

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Horizontal bars: one per month, each annotated with its own count.
month_bars = go.Bar(
    x=df_month['count'],
    y=df_month['month'],
    text=df_month['count'],
    orientation='h',
    marker_color='rgb(229, 9, 20)',  # a single colour applied to every bar
)
fig = go.Figure(data=[month_bars])
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(title_text='Amount of content added each month')

Here we see that, overall, February is the month in which the least content is added. That probably makes it the best time to add new content, since it is not as saturated as other months.

In [None]:
# Calendar months in reverse order; the rows of `df` follow this order
# (presumably so that January ends up at the top of the heatmap drawn from it).
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'][::-1]

# Titles added per (year, month), pivoted to months-as-rows / years-as-columns.
# reindex() is used instead of plain column selection so that a month with no
# titles at all becomes a row of zeros instead of raising a KeyError.
df = (
    data.groupby('year')['month']
    .value_counts()
    .unstack()
    .reindex(columns=month_order)
    .fillna(0)
    .T
)


In [None]:
# Heatmap of `df`: one cell per (row label, column label) pair.
plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(df, cmap='afmhot_r', edgecolors='white', linewidths=2)

# Tick positions are offset by 0.5 so the labels sit at the cell centres.
tick_font = dict(fontsize=7, fontfamily='serif')
column_centres = np.arange(0.5, len(df.columns), 1)
row_centres = np.arange(0.5, len(df.index), 1)
plt.xticks(column_centres, df.columns, **tick_font)
plt.yticks(row_centres, df.index, **tick_font)

plt.title(
    'Netflix content Update by Months',
    fontsize=12,
    fontfamily='serif',
    fontweight='bold',
    position=(0.20, 1.0 + 0.02),
)

# Colour scale with small labels and minor ticks.
cbar = plt.colorbar()
cbar.ax.tick_params(labelsize=8)
cbar.ax.minorticks_on()

plt.show()

It seems like February should be the best month to publish new content on Netflix: it is the month with the least activity through the years. But we need to ask ourselves why that is: why is so little content added in this month?

### On which day is the most content added?

In [None]:
# Weekday name of the date each title was added.
data['day_added'] = data['date_added'].dt.day_name()

# Titles per weekday, largest first; the 'type' column holds the counts.
titles_per_day = data.groupby('day_added')['type'].count()
df_days = (
    titles_per_day
    .reset_index()
    .nlargest(7, ['type'])
    .reset_index()
)


In [None]:
# Horizontal bars: one per weekday, each annotated with its own count.
day_bars = go.Bar(
    x=df_days['type'],
    y=df_days['day_added'],
    text=df_days['type'],
    orientation='h',
    marker_color='rgb(229, 9, 20)',  # a single colour applied to every bar
)
fig = go.Figure(data=[day_bars])
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(title_text='Days content is usually added')

We see that Friday is the day on which most of the content is added to the platform, which makes sense, since people usually have more time around the weekend.

## Content rating visualization

In [None]:
# Apply the seaborn theme before the figure is created so it actually takes
# effect on these axes.
sns.set(style='darkgrid')
fig, ax = plt.subplots(figsize=(8, 6))

# The 15 most frequent ratings, most frequent first.
rating_order = data['rating'].value_counts().index[0:15]
ax = sns.countplot(x='rating', data=data, palette='Reds_r', order=rating_order, ax=ax)

# Shrink the x tick labels *after* plotting: sizing them before the bars are
# drawn acts on ticks that the plot call replaces.
ax.tick_params(axis='x', labelsize=8)

### Content Distribution by Age

In [None]:
# Transposed rating counts, kept under the same name in case a later cell
# still reads it; the grouping below no longer depends on it.
edad = data['rating'].value_counts().reset_index().T

# Rating labels per audience group. Grouping by label replaces the previous
# positional lookups (edad[0] + edad[3] + ...), which silently regroup
# ratings whenever their frequency ranking changes.
# NOTE(review): the labels were reconstructed from those positions assuming
# the ranking TV-MA, TV-14, TV-PG, R, PG-13, TV-Y, TV-Y7, PG, TV-G, NR, G,
# TV-Y7-FV, UR, NC-17 -- confirm against data['rating'].value_counts().
age_groups = {
    'Adults': ['TV-MA', 'R', 'NC-17'],
    'Teens': ['TV-14', 'PG-13'],
    'Kids': ['PG', 'TV-Y7-FV', 'TV-Y7', 'TV-PG'],
    'Little Kids': ['G', 'TV-Y', 'TV-G'],
}

rating_counts = data['rating'].value_counts()

# One row per audience group; the 'rating' column holds the number of titles.
# reindex() lets a label that is absent from the data count as 0.
clasificacion = pd.DataFrame(
    {
        'rating': [
            int(rating_counts.reindex(labels).fillna(0).sum())
            for labels in age_groups.values()
        ]
    },
    index=list(age_groups),
)
clasificacion

In [None]:
# One bar per audience group, drawn on explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 5))

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. The counts are cast to int so an object-dtype column still
# plots as numbers.
sns.barplot(
    x=clasificacion.index,
    y=clasificacion['rating'].astype(int),
    palette='RdGy',
    ax=ax,
)
ax.set_facecolor('#EDEDEB')
ax.set_title('Content Distribution by Age', fontsize=8, fontweight='bold')
ax.set_ylabel('count')
plt.show()

We see a clear dominance of adult content. I think this is because, obviously, only adults can pay for Netflix, and, let's be honest, there are probably a lot of kids who watch adult content too, so more adult content is produced.

## Best Movies by IMDb Ratings

In [None]:
# Read only the columns that are needed from the two IMDb files.
imdb_ratings = pd.read_csv(
    '../input/imdb-extensive-dataset/IMDb ratings.csv',
    usecols=['weighted_average_vote'],
)
imdb_titles = pd.read_csv(
    '../input/imdb-extensive-dataset/IMDb movies.csv',
    usecols=['title', 'year', 'genre'],
)

# The two files are combined by row label (index), not by a shared key.
# NOTE(review): this assumes both CSVs list the same titles in the same
# order -- confirm, or join on a common id column if one exists.
ratings = pd.DataFrame({
    'Title': imdb_titles['title'],
    'Release Year': imdb_titles['year'],
    'Rating': imdb_ratings['weighted_average_vote'],
    'Genre': imdb_titles['genre'],
})
ratings = ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'])

### Join Data from netflix and imdb

In [None]:
# Keep only titles present in both sources (matched on the title text alone),
# ordered from the highest to the lowest IMDb rating.
joint_data = (
    pd.merge(ratings, data, how='inner', left_on='Title', right_on='title')
    .sort_values(by='Rating', ascending=False)
)


### Top 10 movies and shows on netflix

In [None]:
# joint_data is already sorted by rating, so the first ten rows are the top ten.
top_rated = joint_data.iloc[:10]

# Sunburst: title in the inner ring, country in the outer ring.
fig = px.sunburst(
    top_rated,
    path=['title', 'country'],
    color='Rating',
    values='Rating',
)
fig.show()

This is interesting, at least for me: I did not think India had that many good movies, or at least highly rated ones, but I suppose it does; IMDb has never failed me.

### Top Movies

In [None]:
# Restrict the merged, rating-sorted table to movies and take the ten best.
movie2 = joint_data[joint_data['type'] == 'Movie']
top_rated2 = movie2[0:10]

# Plot the movie-only selection. Passing `top_rated` here would redraw the
# combined movies-and-shows chart instead of the top movies.
fig = px.sunburst(
    top_rated2,
    path=['title', 'country'],
    values='Rating',
    color='Rating')
fig.show()

## Word Cloud for Movies

In [None]:
from collections import Counter

# Each 'listed_in' entry is a comma-separated list of genre labels.
genres = movie2['listed_in'].tolist()

# Flatten to one label per occurrence, with every space removed so that a
# multi-word genre becomes a single token.
gen = [
    label.replace(' ', '')
    for entry in genres
    for label in entry.split(',')
]

# Occurrences per genre label.
g = Counter(gen)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
# Unique genre labels (kept for reference; the cloud itself uses frequencies).
text = list(set(gen))
plt.rcParams['figure.figsize'] = (13, 13)

# Load the mask image, closing the file handle once the pixels are copied.
with Image.open('../input/masks/loc.png') as mask_image:
    mask = np.array(mask_image)

# Build the cloud from genre frequencies so that word size reflects how often
# a genre occurs. Generating it from str(list(set(gen))) gave every genre a
# single occurrence, which made the word sizes meaningless.
wordcloud = WordCloud(
    max_words=1000000,
    background_color='black',
    mask=mask,
).generate_from_frequencies(Counter(gen))

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

### Word Cloud for TV Shows

In [None]:
# TV shows from the merged Netflix/IMDb table.
shows = joint_data[joint_data['type'] == 'TV Show']

# Each 'listed_in' entry is a comma-separated list of genre labels; flatten to
# one label per occurrence, with spaces removed so each genre is one token.
genre_show = list(shows['listed_in'])
gen_show = [
    label.replace(' ', '')
    for entry in genre_show
    for label in entry.split(',')
]
g_s = Counter(gen_show)

# Unique genre labels (kept for reference; the cloud itself uses frequencies).
text_show = list(set(gen_show))
plt.rcParams['figure.figsize'] = (13, 13)

# Load the mask image, closing the file handle once the pixels are copied.
with Image.open('../input/masks/comment.png') as mask_image:
    mask = np.array(mask_image)

# Build the cloud from genre frequencies so that word size reflects how often
# a genre occurs. Generating it from the de-duplicated label list gave every
# genre a single occurrence, which made the word sizes meaningless.
wordcloud = WordCloud(
    max_words=1000000,
    background_color='white',
    mask=mask,
).generate_from_frequencies(g_s)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Movie Genres

In [None]:
# Rebuild the genre counts as a plain dict ordered from most to least frequent.
g = dict(sorted(g.items(), key=lambda pair: pair[1], reverse=True))


In [None]:
# Create the figure and its axes in one call so the requested size applies to
# the figure that is actually drawn on; a separate plt.figure(figsize=...)
# call only opens a second, empty figure.
fig, ax = plt.subplots(figsize=(10, 10))

x = list(g.keys())
y = list(g.values())

# Lollipop chart: a red stem per genre with a black marker at its count.
ax.vlines(x, ymin=0, ymax=y, color='red')
ax.plot(x, y, 'o', color='black')

# Rotate the existing tick labels instead of overwriting them with
# set_xticklabels(), which relies on the tick positions matching the list.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('Count of Movies')
ax.set_title('Genres')
plt.show()

# Show Genres

In [None]:
# Rebuild the show-genre counts as a plain dict, most frequent first.
g_s = dict(sorted(g_s.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# Create the figure and its axes in one call so the requested size applies to
# the figure that is actually drawn on; a separate plt.figure(figsize=...)
# call only opens a second, empty figure.
fig, ax = plt.subplots(figsize=(10, 10))

x = list(g_s.keys())
y = list(g_s.values())

# One red bar per genre, in the order of g_s.
ax.bar(x, y, color='red')

# Rotate the existing tick labels instead of overwriting them with
# set_xticklabels(), which relies on the tick positions matching the list.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('Count of Shows')
ax.set_title('Genres')
plt.show()

## Largest Number of Seasons

In [None]:
feats = ['title', 'duration']

# Work on an independent copy so that adding a column does not write into a
# slice of `shows` (chained assignment).
durations = shows[feats].copy()

# Pull the leading number out of the 'duration' text and store it as an int.
# NOTE(review): assumes values look like "1 Season" / "3 Seasons" -- confirm.
durations['no_of_seasons'] = (
    durations['duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

t = ['title', 'no_of_seasons']
top = durations[t].sort_values(by='no_of_seasons', ascending=False)

# Remove duplicate rows *before* cutting to 20, so repeated titles do not
# shrink the final list below 20 entries.
top20 = top.drop_duplicates().head(20)

In [None]:
# Vertical bars: one per title, each annotated with its number of seasons.
season_bars = go.Bar(
    x=top20['title'],
    y=top20['no_of_seasons'],
    text=top20['no_of_seasons'],
    marker_color='rgb(229, 9, 20)',  # a single colour applied to every bar
)
fig = go.Figure(data=[season_bars])
fig.update_traces(textposition='outside', texttemplate='%{text:.2s}')
fig.update_layout(title_text='Most seasons')

# Content in India
India is second to the USA in terms of producing content, so I thought it would be interesting to look at the data.

In [None]:
# Movies whose country is exactly 'India', keeping only rows with no missing
# value in any column.
net_india = movie2.loc[movie2['country'].eq('India')].dropna()

## Top Directors of Indian Movies

In [None]:
# Number of movies per director, most prolific first.
india_dic = net_india['director'].value_counts()

# Top 20 as a two-column table. rename_axis + reset_index(name=...) produces
# the columns ['director', 'movies_directed'] on every pandas version;
# renaming 'index'/'director' only works before pandas 2.0, where
# value_counts() changed how it names its result.
top = (
    india_dic
    .head(20)
    .rename_axis('director')
    .reset_index(name='movies_directed')
)
top

In [None]:
import squarify
# Treemap of the director table: tile area follows the number of movies.
y = top
fig = plt.figure(figsize=(15, 15))

label_style = {'fontsize': 8, 'fontweight': 'bold'}
tile_colors = sns.color_palette('RdGy', n_colors=20)
squarify.plot(
    sizes=y['movies_directed'],
    label=y['director'],
    color=tile_colors,
    linewidth=4,
    text_kwargs=label_style,
)

plt.title(
    'Top 20 Indian Directors',
    position=(0.5, 1.0 + 0.03),
    fontsize=20,
    fontweight='bold',
)
plt.axis('off')
plt.show()

## Best Indian Movies

In [None]:
# Ten highest-rated rows of net_india.
top_rated_india = net_india.sort_values(by='Rating', ascending=False).head(10)

# Sunburst: title in the inner ring, director in the outer ring.
fig = px.sunburst(
    top_rated_india,
    path=['title', 'director'],
    color='Rating',
    values='Rating',
)
fig.show()

According to this data, India has a lot of really good movies, and there will probably be a lot more with the constant growth of Bollywood.