### In this notebook I analyse the descriptions of the films and TV shows, first by rating and then by country.
#### I use some tools from the nltk package to clean up the descriptions, and plotly for the visualizations.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
# Load the titles table from a CSV file. The relative path looks like a Kaggle
# input directory - NOTE(review): adjust it when running elsewhere.
df = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows to see which columns are available.
df.head()

In [None]:
# Column dtypes and non-null counts; shows which columns have missing values.
df.info()

## What are the most used words in the descriptions?

In [None]:
# Built once when this cell runs instead of once per token: the original called
# stopwords.words('english') inside the comprehension, rebuilding the list for
# every word, and tested membership against a list rather than a set.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# The tokenizer keeps runs of word characters, which drops punctuation.
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def desc_list(description):
    '''Takes a string, removes punctuation and stopwords and returns a list of words.

    The text is lower-cased before tokenizing, so the returned words are all
    lower case. Words are returned in their original order and repeated words
    are kept.

    Parameters
    ----------
    description : str
        Free-text description. Any non-string value (for example the NaN that
        pandas uses for a missing description) yields an empty list.

    Returns
    -------
    list of str
        The tokens of the description that are not English stopwords.
    '''
    if not isinstance(description, str):
        # Missing values cannot be lower-cased; treat them as having no words.
        return []
    tokens = _WORD_TOKENIZER.tokenize(description.lower())
    return [word for word in tokens if word not in _ENGLISH_STOPWORDS]

In [None]:
# Add a column holding, for every title, the list of cleaned words returned by
# desc_list for its description.
df['desc_lists'] = df['description'].apply(desc_list)

In [None]:
# Quick look at the new column to check the cleaning result.
df['desc_lists']

In [None]:
# Flatten the per-title word lists into a single list covering every description
all_words = [word for title_words in df.desc_lists for word in title_words]

In [None]:
# Build a two-column table (word, count) for plotting, most frequent word first
word_counts = (
    pd.DataFrame(Counter(all_words), index=[0])
    .T
    .reset_index()
    .rename(columns={'index': 'word', 0: 'count'})
    .sort_values(by='count', ascending=False)
)

In [None]:
# Bar chart of the first 30 rows of word_counts, with the count printed above
# each bar and the word labels tilted so they do not overlap.
fig = (
    px.bar(word_counts.head(30), x='word', y='count', text='count')
    .update_traces(textposition='outside')
    .update_layout(uniformtext_minsize=5)
    .update_xaxes(tickangle=45)
)

fig.show()

## What about most common words by audience suitability rating?

In [None]:
# Number of titles per rating category (missing ratings are not counted).
df['rating'].value_counts()

In [None]:
# Alphabetically sorted list of the distinct, non-missing rating categories.
sorted(set(df['rating'].dropna()))

In [None]:
# Create a dictionary with the rating categories as keys and, as values, dataframes
# holding every word used in that rating's descriptions together with its frequency.
#
# The frequency ('count' column) is the number of occurrences of the word divided by
# the number of titles carrying that rating, so ratings of different sizes are comparable.
# Each dataframe is sorted by ascending frequency: the most common words are the LAST rows.

# Titles per rating, computed once. value_counts() ignores missing ratings, so its
# index is exactly the set of non-null categories and no null check is needed below.
titles_per_rating = df['rating'].value_counts()

d_ratings = {}
for rating in sorted(titles_per_rating.index):
    rating_words = Counter()
    for title_words in df.loc[df['rating'] == rating, 'desc_lists']:
        rating_words.update(title_words)
    rating_freq = pd.DataFrame(list(rating_words.items()), columns=['word', 'count'])
    # Divide the word count by the total number of titles in this rating
    rating_freq['count'] = rating_freq['count'] / titles_per_rating[rating]
    d_ratings[rating] = rating_freq.sort_values(by='count', ascending=True)

In [None]:
# Show the five most frequent words per rating (the frames are sorted ascending,
# so the most frequent words are the last rows).
for rating_name, rating_table in d_ratings.items():
    print(rating_name)
    print(rating_table.tail(5))
    print('\n')

In [None]:
# Plotting the top 10 words for each rating, one horizontal bar chart per rating.
# The grid is sized from the number of ratings actually present instead of assuming
# a fixed count, so the cell neither raises nor silently drops ratings when the
# dataset has more or fewer categories.
n_cols = 4
n_rows = max(1, -(-len(d_ratings) // n_cols))  # ceiling division, at least one row
titles_per_rating = df['rating'].value_counts()

fig = make_subplots(rows=n_rows,
                    cols=n_cols,
                    subplot_titles=[key + ' (' + str(titles_per_rating[key]) + ' titles)' for key in d_ratings],
                    horizontal_spacing=0.08)

for position, rating_table in enumerate(d_ratings.values()):
    # The tables are sorted ascending, so the last 10 rows are the most frequent words.
    top_words = rating_table.iloc[-10:]
    # Fill the grid row by row, left to right (plotly rows/cols are 1-based).
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(go.Bar(y=top_words['word'],
                         x=top_words['count'],
                         text=[round(num, 2) for num in top_words['count']],
                         orientation='h'),
                  row=grid_row + 1,
                  col=grid_col + 1)

# 375 px per row keeps the original 1500 px height for a 4-row grid.
fig.update_layout(height=375 * n_rows, width=1200, showlegend=False)
fig.update_traces(textposition='inside', textfont_size=20)
fig.show()

### A few takeaways:
1. For TV shows, the word 'love' is more common in the TV-PG and TV-14 ratings, while it's not even in the top 10 in the TV-MA (Mature Audience) category.
2. In TV-Y shows, which are aimed at very young audiences, it's all about 'friends', 'fun' and 'adventures'.
3. In the TV-Y7 category, about 10% of the shows have the word 'evil' in their description.
4. For films, the R-rated category is the only one with the word 'woman' in its most common words.

## What about different countries?

In [None]:
# The ten most frequent values of the country column, with their title counts
df['country'].value_counts().head(10)

### We'll only use the top 6 countries.

In [None]:
# Names of the six most frequent country values, most frequent first
countries = df['country'].value_counts().index[:6].tolist()
countries

In [None]:
# Map each selected country to a dataframe of the words used in its descriptions.
# Titles are matched on the exact value of the country column. The 'count' column is
# the number of occurrences of the word divided by the number of titles for that
# country, and each dataframe is sorted ascending (most common words are the last rows).

# Titles per country value, computed once instead of once per loop iteration.
titles_per_country = df['country'].value_counts()

d_countries = {}
for country in countries:
    country_words = Counter()
    for title_words in df.loc[df['country'] == country, 'desc_lists']:
        country_words.update(title_words)
    country_freq = pd.DataFrame(list(country_words.items()), columns=['word', 'count'])
    # Normalise by the number of titles so countries of different sizes are comparable
    country_freq['count'] = country_freq['count'] / titles_per_country[country]
    d_countries[country] = country_freq.sort_values(by='count', ascending=True)

In [None]:
# Show the five most frequent words per country (the frames are sorted ascending,
# so the most frequent words are the last rows).
for country_name, country_table in d_countries.items():
    print(country_name)
    print(country_table.tail(5))
    print('\n')

In [None]:
# Plot the top 10 words for each selected country, one horizontal bar chart per country.
# The grid is sized from the number of countries in d_countries, so changing how many
# countries are selected does not require editing this cell.
n_cols = 2
n_rows = max(1, -(-len(d_countries) // n_cols))  # ceiling division, at least one row
titles_per_country = df['country'].value_counts()

fig = make_subplots(rows=n_rows,
                    cols=n_cols,
                    # Space before '(' added so titles read 'India (N titles)', matching the ratings figure
                    subplot_titles=[key + ' (' + str(titles_per_country[key]) + ' titles)' for key in d_countries],
                    vertical_spacing=0.1)

for position, country_table in enumerate(d_countries.values()):
    # The tables are sorted ascending, so the last 10 rows are the most frequent words.
    top_words = country_table.iloc[-10:]
    # Fill the grid row by row, left to right (plotly rows/cols are 1-based).
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(go.Bar(y=top_words['word'],
                         x=top_words['count'],
                         text=[round(num, 2) for num in top_words['count']],
                         orientation='h'),
                  row=grid_row + 1,
                  col=grid_col + 1)

# Scales with the number of rows; equals the original 1000 px for a 3-row grid.
fig.update_layout(height=round(1000 * n_rows / 3), width=1000, showlegend=False)
fig.update_traces(textposition='inside', textfont_size=24)
fig.show()

### A few takeaways:
1. There are proportionately more Indian and Japanese titles in this dataset with the word 'young' in their description.
2. India and South Korea are the only countries with the word 'woman' among the 10 most common words in their descriptions.
3. Japanese movies and shows are more 'mysterious' than those of other countries.
4. A lot of the action in Indian films and TV shows seems to take place in Mumbai.
5. The US and the UK have proportionately more documentaries in this dataset than the other top countries.