# Analysis of Netflix Data with IMDB Ratings

<table>
    <tr>
        <td>
            <img src="https://media.wearemotto.com/wp-content/uploads/2018/11/11001644/Netflix-Hero-Bingwatch.jpg">
        </td>
        <td>
            <p style="font-size:80px; font-weight:bold;">+</p>
        </td>
        <td>
            <img src="https://www.pngkey.com/png/full/343-3433435_facebook-imdb-icon.png" style="width:50%">
        </td>
    </tr>
</table>

## Data Creation and Cleaning

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import ipywidgets as widgets
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display # For widgets
from wordcloud import WordCloud # For wordcloud visualization

%matplotlib inline


# The following modules will assist in acquiring the IMDb data
from io import StringIO # For creation of string inputs to be read as '.tsv' by pandas
import re # For separation of netflix_df.country strings into lists
import gzip # For extraction of IMDB datasets
import urllib.request

### We'll create our dataframe using the attached data source.

In [None]:
# Load the Netflix catalogue, using each title's 'show_id' as the row index.
ndf = pd.read_csv('../input/netflix-shows/netflix_titles.csv', index_col='show_id', low_memory=False)

In [None]:
# Preview the first five rows of the Netflix dataframe.
ndf.head(5)

In [None]:
# Print the column dtypes and non-null counts of the Netflix dataframe.
ndf.info()

### The Netflix download does not include any information on viewers' ratings; the 'rating' column in the above info refers to MPAA rating.  We can get users' rating info from IMDB.  They make their data available at https://datasets.imdbws.com.

In [None]:
# Download the two IMDb files needed to attach user ratings to titles from the
# official repository.
imdb_files = ('title.ratings.tsv.gz', 'title.basics.tsv.gz')
for imdb_file in imdb_files:
    urllib.request.urlretrieve('https://datasets.imdbws.com/' + imdb_file, imdb_file)

# pandas infers gzip compression from the '.gz' suffix and decodes UTF-8 by
# default, so the archives are parsed straight from disk.  This avoids holding
# each file in memory three times (decoded text, StringIO buffer, dataframe).
imdb_ratings_df_raw = pd.read_csv('title.ratings.tsv.gz', sep='\t', low_memory=False)
imdb_titles_df_raw = pd.read_csv('title.basics.tsv.gz', sep='\t', low_memory=False)

# drop_duplicates() already returns a new dataframe, so the raw frames stay
# untouched without an extra full copy.
imdb_ratings_df = imdb_ratings_df_raw.drop_duplicates()
imdb_titles_df = imdb_titles_df_raw.drop_duplicates()

### Since the IMDB database is so vast, it comes in multiple files.  I only grabbed the two that I need to make a connection to the Netflix data: 'title.ratings.tsv' and 'title.basics.tsv'.  Let's have a look at the two IMDb dataframes separately and make sure the data is valid.

In [None]:
# Show five randomly chosen rows of the IMDb ratings table.
imdb_ratings_df.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the IMDb ratings table.
imdb_ratings_df.info()

### The Dtypes of 'imdb_ratings_df' look correct and there appear to be no null entries.  Let's check 'imdb_titles_df'.

In [None]:
# Show five randomly chosen rows of the IMDb titles table.
imdb_titles_df.sample(5)

In [None]:
# Print the column dtypes (and non-null counts, when pandas reports them) of the IMDb titles table.
imdb_titles_df.info()

### All of the columns are of dtype 'object' and we do not get a null count.  This might present problems later in our analysis, but we will cross that bridge when we come to it.
### Let's combine our two IMDb dataframes into a single dataframe, matching titles with their user ratings.  We will accomplish this by joining them on their 'tconst' column and making this the index of the resulting dataframe.

In [None]:
# Index both IMDb tables by their shared 'tconst' identifier, then keep only
# the titles present in both (inner join on the index).
titles_by_id = imdb_titles_df.set_index('tconst')
ratings_by_id = imdb_ratings_df.set_index('tconst')
idf = pd.merge(titles_by_id, ratings_by_id, how='inner', left_index=True, right_index=True)

# Remove rows whose column values are exact duplicates of an earlier row.
idf = idf.drop_duplicates()
idf.sample(5)

In [None]:
# Print the column dtypes and non-null counts of the combined IMDb dataframe.
idf.info()

### There are a couple of different ways we can choose to join the IMDb and Netflix dataframes so that the data lines up correctly.  I have chosen to combine them where idf['primaryTitle', 'startYear'] and ndf['title', 'release_year'] match; this will minimize mismatched rows.
### Recall (and you can see just above) that idf[startYear] is of Dtype 'object'; we need to drop any non-numeric values and convert the column to 'int64' in order to compare the values with those of the Netflix dataframe.

In [None]:
# Keep only the rows whose 'startYear' consists purely of digits.
has_numeric_year = idf['startYear'].astype(str).str.isnumeric()

# Take an explicit copy of the filtered rows: assigning a column on a filtered
# slice (here and in later cells) otherwise triggers SettingWithCopyWarning.
idf = idf[has_numeric_year].copy()
idf['startYear'] = idf['startYear'].astype(int)

### Let's convert all titles to lowercase to avoid misses and join the two dataframes

In [None]:
# Lower-case the title column of each dataframe so the join below is
# case-insensitive.
for frame, title_column in ((ndf, 'title'), (idf, 'primaryTitle')):
    frame[title_column] = frame[title_column].str.lower()

In [None]:
# Join Netflix titles to IMDb titles on (title, year), drop exact duplicate
# rows, and renumber the rows 0..n-1.
df = (
    ndf.merge(
        idf,
        how='inner',
        left_on=['title', 'release_year'],
        right_on=['primaryTitle', 'startYear'],
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
df.sample(5)

### We should parse the categorical 'date_added' column and make a numerical column from each of its constituent parts.  Let's get rid of any Null values.


In [None]:
# Inspect the Python type of the first 'date_added' value before parsing the column.
type(df['date_added'].iloc[0])

In [None]:
# Keep only the titles that have a 'date_added' value and renumber the rows.
has_date_added = df.date_added.notna()
df = df[has_date_added].reset_index(drop=True)

In [None]:
# Convert 'date_added' to datetime dtype and create a numeric column for each
# of its parts.
# NOTE(review): whitespace is stripped first as a defensive measure.  This
# assumes the raw strings may carry leading/trailing spaces, which strict
# format inference in pd.to_datetime would reject.  Confirm against the data.
df['date_added'] = pd.to_datetime(df['date_added'].astype(str).str.strip())
df['year_added'] = df['date_added'].dt.year.astype(int)
df['month_added'] = df['date_added'].dt.month.astype(int)
df['day_added'] = df['date_added'].dt.day.astype(int)

### Next, getting a count of the number of countries involved in a title's release will help our analysis down the line.

In [None]:
# Build, for every title, the list of country names found in its
# comma-separated 'country' field.
# A missing country yields an empty list (previously str(NaN) == 'nan' was
# counted as one country), and empty fragments such as the one left by a
# trailing comma are discarded.
myList = [
    [name.strip() for name in value.split(',') if name.strip()]
    if isinstance(value, str) else []
    for value in df.country
]

In [None]:
# Wrap the per-title country lists in a Series (default 0..n-1 index, which
# lines up with df's row labels).
country = pd.Series(myList)

# The number of countries for a title is simply the length of its list.
df['country_count'] = country.map(len)

### Let's check the info

In [None]:
# Print the column dtypes and non-null counts of the merged dataframe.
df.info()

### The Null values are all in categorical columns.  I don't want to lose these titles if we don't have to; there may be some interesting patterns to uncover in a future, more in depth, exploration.  Let's change them to the string "Unknown".

In [None]:
# Replace every remaining null with the placeholder string "Unknown" so those titles are kept.
df = df.fillna("Unknown")

# Exploratory Analysis and Visualization

In [None]:
# Global plot appearance: seaborn dark grid plus matplotlib defaults for font
# size, figure size and figure background colour.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 15,
    'figure.figsize': (9, 5),
    'figure.facecolor': '00000000',
})

In [None]:
# Ten countries with the most titles, computed once per content type (the
# original repeated the same groupby/sort chain four times).
top_countries = {}
for content_type in ("Movie", "TV Show"):
    top_countries[content_type] = (
        df[df["type"] == content_type]
        .groupby('country')
        .count()
        .sort_values('title', ascending=False)
        .reset_index()
        .head(10)
    )

xplt = top_countries["Movie"].country
yplt = top_countries["Movie"].title

xplt2 = top_countries["TV Show"].country
yplt2 = top_countries["TV Show"].title

# The former `plt.fontsize = 20` only set an unused attribute on the pyplot
# module and had no effect on the figure, so it has been removed.
plt.figure(figsize=(16, 8))
plt.xticks(rotation=75)
plt.title(label='Number of Titles Available by Country (Movies in Blue)')
plt.xlabel("Country")
plt.ylabel("Number of Titles")
sns.barplot(x=xplt, y=yplt, color="Blue");
sns.barplot(x=xplt2, y=yplt2, color="Red");

### Lots of TV content with "Unknown" as its country...  We'll explore this further in a future analysis.

## We can plot how ratings have changed over time.  We don't have timestamps from IMDb for when users cast their ratings, so let's use the date added to Netflix and the release year instead.

In [None]:
# Split the catalogue by content type once.
movies_only = df[df.type == 'Movie']
shows_only = df[df.type == 'TV Show']

# Mean user rating per year added to Netflix ...
movie_ratings = movies_only.groupby('year_added')['averageRating'].mean()
tv_ratings = shows_only.groupby('year_added')['averageRating'].mean()

# ... and per original release year.
movie_ratings2 = movies_only.groupby('release_year')['averageRating'].mean()
tv_ratings2 = shows_only.groupby('release_year')['averageRating'].mean()

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# One panel per grouping; the last year of every series is left out of the plot.
panels = (
    (axes[0], movie_ratings, tv_ratings,
     'Movie and TV Show Ratings Over Time by Year Added', "Year Added"),
    (axes[1], movie_ratings2, tv_ratings2,
     'Movie and TV Show Ratings Over Time by Release Year', "Release Year"),
)
for ax, movie_series, tv_series, panel_title, x_label in panels:
    sns.lineplot(x=movie_series.index[:-1], y=movie_series[:-1], ax=ax)
    sns.lineplot(x=tv_series.index[:-1], y=tv_series[:-1], ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Average Rating")
    ax.legend(labels=['Movies', 'TV Shows'])

plt.tight_layout(pad=5);

### User ratings for TV shows appear to exceed those for movies, especially in recent years.  Let's see if this observation holds across regions.  We can use ipywidgets to make choosing a region easier.

In [None]:
# Dropdown options: the sorted, distinct 'country' values of titles that list
# exactly one country.
single_country_rows = df[df.country.notna() & (df.country_count == 1)]
country_list = single_country_rows.country.sort_values().unique().tolist()

country_select = widgets.Dropdown(
    options=country_list,
    value='United States',
    description='Countries',
    disabled=False,
)

display(country_select)

# The button triggers the plot; the Output widget is where it is drawn.
button = widgets.Button(description="Plot")
output = widgets.Output()

matplotlib.rcParams["font.size"] = 15

display(button, output)

def on_button_clicked(b):
    """Plot the selected country's average rating per year added.

    Draws one line for movies and one for TV shows into the `output` widget,
    replacing whatever was plotted by a previous click.

    Args:
        b: The button instance supplied by ipywidgets; not used.
    """
    country = country_select.value

    # Match whole country names inside the comma-separated 'country' field.
    # A plain substring/regex test would also select e.g. "Nigeria" when
    # "Niger" is chosen.
    in_country = df.country.map(
        lambda value: country in [name.strip() for name in str(value).split(',')]
    )
    tv_avg = df[in_country & (df.type == 'TV Show')].groupby('year_added').averageRating.mean()
    movie_avg = df[in_country & (df.type == 'Movie')].groupby('year_added').averageRating.mean()

    # Clear the previous plot so repeated clicks do not stack figures.
    output.clear_output(wait=True)
    with output:
        plt.figure(figsize=(16, 8))
        # The last year of each series is left out of the plot.
        sns.lineplot(x=movie_avg.index[:-1], y=movie_avg[:-1])
        sns.lineplot(x=tv_avg.index[:-1], y=tv_avg[:-1])
        plt.title(label=country + ' Ratings Over Time')
        plt.legend(labels=['Movies', 'TV Shows'])
        # NOTE(review): shown explicitly because a widget callback runs outside
        # normal cell execution, where the inline backend would flush figures.
        plt.show()

button.on_click(on_button_clicked)

### Has this trend affected the type of content Netflix has been adding?

In [None]:
# Number of titles added to Netflix per year, by content type.
movies_added = df[df.type == 'Movie'].groupby('year_added').title.count()
tv_added = df[df.type == 'TV Show'].groupby('year_added').title.count()

# Fixed year range for the bar chart; years without additions count as 0.
years = list(range(2008, 2021))
shows = np.column_stack([years, tv_added.reindex(years, fill_value=0).values])
movies = np.column_stack([years, movies_added.reindex(years, fill_value=0).values])

fig, axes = plt.subplots(1, 2, figsize=(20,8))

# Line plot: the last year present in each series is left out.
sns.lineplot(x=movies_added.index[:-1], y=movies_added[:-1], ax=axes[0]);
sns.lineplot(x=tv_added.index[:-1], y=tv_added[:-1], ax=axes[0]);
axes[0].set_title(label='Type of Content Added by Netflix Over Time')
axes[0].set_xlabel("Year");
axes[0].set_ylabel("Number of Titles");
axes[0].legend(labels=['Movies', 'TV Shows']);

blue_bar = movies[:,1]
orange_bar = shows[:,1]

ind = np.arange(2008,2021)

width = 0.3

# Grouped bars: movies at the year position, TV shows shifted right by one
# bar width.  The last year of the range is left out.
axes[1].bar(ind[:-1], blue_bar[:-1], width, label='Movies', color="blue", edgecolor="cyan");
axes[1].bar((ind + width)[:-1], orange_bar[:-1], width, label='TV Shows', color="red", edgecolor="magenta");

axes[1].set_xlabel("Year")
axes[1].set_ylabel('Number of Titles')
axes[1].set_title('Amount of Content Added by Type and Year')
# `minor` in set_xticks is a boolean flag, not a label list: passing the years
# there created unlabeled minor ticks.  Put one major tick between each pair
# of bars and label it with its year.
axes[1].set_xticks((ind + width / 2)[:-1])
axes[1].set_xticklabels(years[:-1])
axes[1].legend();

plt.tight_layout(pad=5)

In [None]:
# Cumulative number of titles available by the end of each year, per type.
years = list(range(2008, 2021))
is_show = df.type == 'TV Show'
is_movie = df.type == 'Movie'
shows = np.array([[year, df[is_show & (df.year_added <= year)].title.count()] for year in years])
movies = np.array([[year, df[is_movie & (df.year_added <= year)].title.count()] for year in years])


# Create axes for subplots
fig, axes = plt.subplots(1, 2, figsize=(20,8));

matplotlib.rcParams["font.size"] = 15

# Set up lineplot on axes[0] (the last year of the range is left out)
sns.lineplot(x=movies[:-1,0], y=movies[:-1,1], ax=axes[0]);
sns.lineplot(x=shows[:-1,0], y=shows[:-1,1], ax=axes[0]);
axes[0].set_title(label='Amount of Content by Type and Year (Cumulative)')
axes[0].set_xlabel("Year");
axes[0].set_ylabel("Number of Titles");
axes[0].legend(labels=['Movies', 'TV Shows']);

# Set up bar graph
blue_bar = movies[:,1]
orange_bar = shows[:,1]
ind = np.arange(2008,2021)
width = 0.3

# Plotting bar graph on axes[1]: movies at the year position, TV shows shifted
# right by one bar width.
axes[1].bar(ind[:-1], blue_bar[:-1] , width, label='Movies', color="Blue", edgecolor="Cyan");
axes[1].bar((ind + width)[:-1], orange_bar[:-1], width, label='TV Shows', color="Red", edgecolor="Magenta");
axes[1].set_xlabel('Year');
axes[1].set_ylabel('Number of Titles');
axes[1].set_title('Amount of Content by Type and Year (Cumulative)');
axes[1].legend(["Movies", "TV Shows"]);

# `minor` in set_xticks is a boolean flag, not a label list: passing the years
# there created unlabeled minor ticks.  Put one major tick between each pair
# of bars and label it with its year.
axes[1].set_xticks((ind + width / 2)[:-1]);
axes[1].set_xticklabels(years[:-1]);

plt.tight_layout(pad=2);

## How much of each type of content is available?

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(18, 10))
matplotlib.rcParams["font.size"] = 15

# Wedge sizes come from a groupby, whose result is sorted by key, so the
# labels must be taken from that same result.  Using df[...].unique()
# (order of first appearance) could attach labels to the wrong wedges.
type_counts = df.groupby("type")["title"].count()
axes[0].set_title("Content Available by Type")
axes[0].pie(type_counts, labels=type_counts.index, autopct="%1.1f%%", startangle=180, shadow=True);

axes[1].set_title("Content Available by MPAA Rating")
matplotlib.rcParams["font.size"] = 12
# Titles whose rating is the "Unknown" placeholder are left out of this chart.
rating_counts = df[df["rating"] != "Unknown"].groupby("rating")["title"].count()
axes[1].pie(rating_counts, labels=rating_counts.index, autopct="%1.1f%%", startangle=180, shadow=True);

plt.tight_layout()

## How many regions is Netflix available in?

In [None]:
# Count the distinct countries among titles that list exactly one country.
# The "Unknown" placeholder used for missing values is not a country and is
# therefore excluded from the total.
single_country_names = df[(df.country_count == 1) & (df.country != "Unknown")].country
print(f"Netflix is available in {single_country_names.nunique()} countries.")

## What are the distribution and density of user ratings?

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# The former `fig.fontsize = 15` only set an unused attribute on the Figure
# and had no effect; the font size is controlled through rcParams.
matplotlib.rcParams["font.size"] = 15

movie_scores = df[df["type"] == "Movie"]["averageRating"]
show_scores = df[df["type"] == "TV Show"]["averageRating"]

# Semi-transparent bars keep the movie histogram visible underneath the
# TV-show histogram drawn on top of it.
axes[0].hist(movie_scores, color="Blue", alpha=0.6);
axes[0].hist(show_scores, color="Red", alpha=0.6);
axes[0].set_title("Distribution of User Ratings")
axes[0].set_xlabel("Average Rating")
axes[0].set_ylabel("Frequency")
axes[0].legend(labels=["Movies", "TV Shows"]);

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because the installed seaborn version is unknown.  Consider
# histplot/kdeplot once the version is confirmed.
sns.distplot(movie_scores, color="Blue", ax=axes[1]);
sns.distplot(show_scores, color="Red", ax=axes[1]);
axes[1].set_title("Density Plot of User Ratings")
axes[1].set_xlabel("Average Rating")
axes[1].set_ylabel("Density")
axes[1].legend(labels=["Movies", "TV Shows"]);

plt.tight_layout(pad=5);

## How do ratings vary by region?

In [None]:
# Countries to report on: the sorted, distinct 'country' values of titles that
# list exactly one country (computed once instead of twice).
countries = df[(df.country_count == 1) & df.country.notna()].country.sort_values().unique()

# Set of whole country names per title.  Membership in this set replaces the
# former substring test, which also matched e.g. "Nigeria" for "Niger".
country_names = df.country.map(
    lambda value: {name.strip() for name in str(value).split(',')}
)

# Mean user rating per country, one dataframe per content type.
mean_ratings = {}
for content_type in ('Movie', 'TV Show'):
    of_type = df.type == content_type
    per_country = {}
    for country in countries:
        in_country = country_names.map(lambda names: country in names)
        per_country[country] = df[of_type & in_country].averageRating.mean()
    mean_ratings[content_type] = pd.DataFrame.from_dict(per_country, orient='index', columns=['ratings'])

movie_ratings = mean_ratings['Movie']
tv_ratings = mean_ratings['TV Show']

fig, axes = plt.subplots(1, 2, figsize=(24, 12))

# Sort each frame once so bar values and bar labels are guaranteed to be in
# the same order.
movie_sorted = movie_ratings.sort_values('ratings', ascending=False)
axes[0].set_title('Average Movie Ratings by Country')
sns.barplot(x=movie_sorted.ratings, y=movie_sorted.index, ax=axes[0]);

tv_sorted = tv_ratings.sort_values('ratings', ascending=False)
axes[1].set_title('Average TV Show Ratings by Country')
sns.barplot(x=tv_sorted.ratings, y=tv_sorted.index, ax=axes[1]);

plt.tight_layout(pad=5)

## How has the prevalence of MPAA ratings and users' opinions of them changed over time?

#### We can use a heatmap to visualize the amount of content uploaded per year for each MPAA rating.

In [None]:
# Mean user rating, title count and total votes per country, MPAA rating and
# year added.
rating_df = df.groupby(["country", "rating", "year_added"]).agg({"averageRating":"mean", "title":"count", "numVotes":"sum"}).reset_index()

# Dataframe for the number-of-votes heatmap: total votes per MPAA rating (rows)
# and year added (columns).  pivot() is called with keyword arguments because
# positional arguments are no longer accepted by recent pandas versions.
heatmap_df = rating_df.groupby(["year_added", "rating"])["numVotes"].sum().reset_index()
ratings_pivot = heatmap_df.pivot(index="rating", columns="year_added", values="numVotes")

# Dataframe for the title-count heatmap, laid out the same way.
heatmap_df2 = rating_df.groupby(["year_added", "rating"])["title"].sum().reset_index()
ratings_pivot2 = heatmap_df2.pivot(index="rating", columns="year_added", values="title")

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

sns.heatmap(ratings_pivot2, cmap="Blues", ax=axes[0]);
axes[0].set_title("Amount of Content Added by MPAA Rating and Year")
axes[0].set_xlabel("Year Added")
axes[0].set_ylabel("MPAA Rating")

sns.heatmap(ratings_pivot, cmap="Blues", ax=axes[1]);
axes[1].set_title("User Engagement (Number of Votes) with Content by Type and Year")
axes[1].set_xlabel("Year Added")
axes[1].set_ylabel("MPAA Rating")

plt.tight_layout(pad=5)

### Who are the ten directors with the most content on Netflix?

In [None]:
# Title counts per (country, rating, year added, director), summed per
# director below.
director_df = df.groupby(["country", "rating", "year_added", "director"]).agg({"averageRating":"mean", "title":"count"}).reset_index()

# Ignore titles whose director is the "Unknown" placeholder, then keep the ten
# directors with the most titles.
known_directors = director_df[director_df["director"] != "Unknown"]
director_df = known_directors.groupby("director")["title"].sum().sort_values(ascending=False).reset_index().head(10)
director_df = director_df.rename({"title":"number_of_titles"}, axis=1)

# Styler.hide_index() was removed in pandas 2.0 in favour of hide(axis="index");
# use whichever the installed pandas provides.
styler = director_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

### Who are the ten most highly rated directors (with at least 5 titles on Netflix) according to IMDb rating?

In [None]:
# Mean rating and number of titles per director, over all of the director's
# titles.  Previously the ">= 5 titles" filter was applied to per-(country,
# rating, year added) sub-groups, which excluded directors whose five or more
# titles were spread over several years, and the result was a mean of
# sub-group means rather than the director's overall mean.
known_directors = df[df["director"] != "Unknown"]
per_director = known_directors.groupby("director").agg({"averageRating":"mean", "title":"count"})

director_df = (
    per_director[per_director["title"] >= 5]["averageRating"]
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

# Styler.hide_index() was removed in pandas 2.0 in favour of hide(axis="index");
# use whichever the installed pandas provides.
styler = director_df.style
styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()

### In which years were the most highly rated movies released?

In [None]:
# Every statistic below is computed over movies only, grouped by release year.
movies_only = df[df["type"] == "Movie"]
movies_by_year = movies_only.groupby("release_year")

# Highest-rated movie of each release year.  Only years that contain at least
# one movie appear here; the previous loop iterated over the release years of
# all content types and raised IndexError for a year with no movies.
max_df = movies_only.loc[movies_by_year["averageRating"].idxmax()].set_index("release_year")

# Mean movie rating per release year, best years first.
test_df = movies_by_year["averageRating"].mean().sort_values(ascending=False).reset_index()

# Attach the details of each year's best movie, aligned on release year.
best_of_year = max_df.reindex(test_df["release_year"])
test_df["best_movie"] = best_of_year["title"].values
test_df["director"] = best_of_year["director"].values
test_df["best_movie_user_rating"] = best_of_year["averageRating"].values
test_df["best_movie_country"] = best_of_year["country"].values
test_df["num_votes"] = best_of_year["numVotes"].values
test_df["num_movies_released_this_year"] = test_df["release_year"].map(movies_by_year["title"].count())

test_df.head(10)

## What are the most popular shows from each region?
### Use the Dropdown menu below to select a country.

In [None]:
# Dropdown options: the sorted, distinct 'country' values of titles that list
# exactly one country.
single_country_rows = df[df.country.notna() & (df.country_count == 1)]
country_list = single_country_rows.country.sort_values().unique().tolist()

country_select = widgets.Dropdown(
    options=country_list,
    value='United States',
    description='Countries',
    disabled=False,
)

display(country_select)

# One button and one output area per content type.
button = widgets.Button(description="TV Shows")
button1 = widgets.Button(description="Movies")
output = widgets.Output()
output1 = widgets.Output()

matplotlib.rcParams["font.size"] = 15

display(button, output)
display(button1, output1)

def _plot_most_voted(content_type, heading, out):
    """Draw the five most-voted titles of one type for the selected country.

    Args:
        content_type: Value of the 'type' column to keep ('TV Show' or 'Movie').
        heading: Plot title prefix; the selected country is appended to it.
        out: Output widget that receives the plot or the fallback message.
    """
    selected = country_select.value

    # Match whole country names inside the comma-separated 'country' field.
    # A plain substring/regex test would also select e.g. "Nigeria" when
    # "Niger" is chosen.
    in_country = df.country.map(
        lambda value: selected in [name.strip() for name in str(value).split(',')]
    )
    ranked = df[in_country & (df.type == content_type)].sort_values('numVotes', ascending=False)

    # Clear the previous result so repeated clicks do not stack output.
    out.clear_output(wait=True)
    with out:
        if ranked.empty:
            print("Cannot Graph.")
            return
        most_voted = ranked.head(5)
        # Start a fresh figure so the two buttons never draw onto each other's axes.
        plt.figure()
        plt.title(label=heading + selected)
        sns.barplot(x=most_voted.averageRating, y=most_voted.title)
        # NOTE(review): shown explicitly because a widget callback runs outside
        # normal cell execution, where the inline backend would flush figures.
        plt.show()


def on_button_clicked(b):
    """Show the most-voted TV shows of the selected country in `output`."""
    _plot_most_voted('TV Show', 'Popular TV Shows in ', output)


def on_button_clicked1(b):
    """Show the most-voted movies of the selected country in `output1`."""
    _plot_most_voted('Movie', 'Popular Movies in ', output1)


button.on_click(on_button_clicked)
button1.on_click(on_button_clicked1)

## What words are most commonly used in the descriptions?

### We can visualize this with a wordcloud

In [None]:
# Concatenate every description into one text for the word cloud.
all_descriptions = " ".join(df.description)

plt.figure(figsize=(20,10))
cloud_builder = WordCloud(background_color='Black', width=1920, height=1080)
wordcloud = cloud_builder.generate(all_descriptions)

# Render the cloud without axes, save it to disk, then display it.
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('cast.png')
plt.show()

##### "Stopwords" are removed and we are left with a visual representation of the frequency of words in the descriptions.

# Future Work

## Visualize things like most prolific director, actor, etc.

## Use NLP for a recommendation system

## Implement N-grams for actor/director combinations to make predictions on user ratings.