In this notebook, I combine IMDb rating data with the films and TV shows in the Netflix dataset. The aim is to visualize Netflix content together with its IMDb ratings. The analysis is restricted to India and the US.
 

In [None]:
import urllib.request
import gzip
import shutil
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IMDb dataset

## Download the dataset files from imdb

In [None]:
# Fetch the three IMDb dump archives used by this notebook into the working
# directory. Each entry is (remote URL, local file name):
#   title.akas    -> title & region of the movie
#   title.basics  -> year of release
#   title.ratings -> rating of the movie
imdb_downloads = (
    ("https://datasets.imdbws.com/title.akas.tsv.gz", "title.akas.tsv.gz"),
    ("https://datasets.imdbws.com/title.basics.tsv.gz", "title.basics.tsv.gz"),
    ("https://datasets.imdbws.com/title.ratings.tsv.gz", "title.ratings.tsv.gz"),
)

for source_url, local_name in imdb_downloads:
    urllib.request.urlretrieve(source_url, local_name)

In [None]:
# Decompress every downloaded archive into a plain .tsv file, then delete the
# .gz archives once all three have been extracted.
archive_pairs = (
    ("title.akas.tsv.gz", "title.akas.tsv"),
    ("title.basics.tsv.gz", "title.basics.tsv"),
    ("title.ratings.tsv.gz", "title.ratings.tsv"),
)

for gz_name, tsv_name in archive_pairs:
    # Stream the decompressed bytes straight into the output file
    with gzip.open(gz_name, "r") as packed, open(tsv_name, "wb") as unpacked:
        shutil.copyfileobj(packed, unpacked)

for gz_name, _ in archive_pairs:
    os.remove(gz_name)

## Preprocessing on IMDb dataset

In [None]:
# Load the extracted title.akas table (tab-separated) into a DataFrame.
# NOTE(review): pandas' default quote handling is left untouched here; if the
# titles can contain bare double quotes, rows may be mis-parsed — confirm.
akas_tsv_path = "title.akas.tsv"
df_imdb_title = pd.read_csv(akas_tsv_path, sep='\t', low_memory=False)

In [None]:
# Load the extracted title.basics table (tab-separated) into a DataFrame.
basics_tsv_path = "title.basics.tsv"
df_imdb_basic = pd.read_csv(basics_tsv_path, sep='\t', low_memory=False)

In [None]:
# Load the extracted title.ratings table (tab-separated) into a DataFrame.
ratings_tsv_path = "title.ratings.tsv"
df_imdb_rating = pd.read_csv(ratings_tsv_path, sep='\t', low_memory=False)

In [None]:
# For the current analysis we limit the region to US and India
in_scope = (df_imdb_title.region == 'IN') | (df_imdb_title.region == 'US')
df_imdb_title = df_imdb_title.loc[in_scope]

# Remove rows whose title is missing.
# dropna has to be applied to the DataFrame (restricted to the 'title'
# column): calling it in place on the `df_imdb_title.title` Series only
# changes that Series object and leaves the NaN rows in the DataFrame.
df_imdb_title = df_imdb_title.dropna(subset=['title'])

# Renumber the remaining rows from 0
df_imdb_title = df_imdb_title.reset_index(drop=True)

In [None]:
# Reduce the title df to the id (needed for the merges) and the title itself;
# every other column is dropped in place.
unwanted_title_cols = df_imdb_title.columns.difference(['titleId', 'title'])
df_imdb_title.drop(columns=unwanted_title_cols, inplace=True)
df_imdb_title

In [None]:
# Keep only the necessary columns in the rating df: the vote count is not used.
# `columns=` is used because passing the axis positionally to DataFrame.drop
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
df_imdb_rating = df_imdb_rating.drop(columns='numVotes')
df_imdb_rating

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the rating df
df_imdb_rating.info()

In [None]:
# Merge title and rating dataframes: IMDb ids are called 'titleId' in the
# title df and 'tconst' in the rating df. The inner join keeps only titles
# that have a rating.
merged = pd.merge(df_imdb_title, df_imdb_rating, how='inner', left_on=['titleId'], right_on=['tconst'])

# 'tconst' duplicates 'titleId' after the join, so drop it.
# `columns=` replaces the positional axis argument, which DataFrame.drop no
# longer accepts from pandas 2.0 on.
merged.drop(columns='tconst', inplace=True)
merged

In [None]:
# Attach the basics table, which carries the release year of the movie.
# The join key is the IMDb id ('titleId' on the left, 'tconst' on the right);
# only ids present in both tables survive the inner join.
merged = pd.merge(
    left=merged,
    right=df_imdb_basic,
    how='inner',
    left_on='titleId',
    right_on='tconst',
)

In [None]:
# Trim the merged IMDb data down to title, year and rating ...
wanted_cols = ['title', 'startYear', 'averageRating']
merged.drop(columns=merged.columns.difference(wanted_cols), inplace=True)

# ... and rename year/rating to the names used in the rest of the notebook
merged = merged.rename(columns={'startYear': 'release_year', 'averageRating' : 'IMDbrating'})
merged

# Netflix dataset

## Preprocessing on Netflix dataset

In [None]:
# Load the Netflix titles csv into the working dataframe `df`
netflix_csv_path = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(netflix_csv_path)
df

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the Netflix data
df.info()

In [None]:
# Rows without a country cannot be assigned to a region: discard them and
# renumber the remaining rows from 0.
df = df.dropna(subset=['country']).reset_index(drop=True)

In [None]:
# Keep a row when its country string mentions the US or India
# (substring match, so multi-country entries are kept as well)
mentions_us = df.country.str.contains('United States')
mentions_india = df.country.str.contains('India')
df = df.loc[mentions_us | mentions_india]
df.reset_index(drop=True, inplace=True)

In [None]:
# Store the Netflix release year as text.
# NOTE(review): presumably done so the dtype matches the IMDb release_year
# used as a merge key further down — confirm.
release_year_as_text = df['release_year'].apply(str)
df['release_year'] = release_year_as_text

In [None]:
# List the titles that occur more than once in the Netflix data
# (the result is expected to be empty: no duplicate titles)
is_repeated_title = df.duplicated(subset=['title'])
df[is_repeated_title]['title'].values

# Merge IMDb and Netflix dataset

In [None]:
# Join IMDb ratings onto the Netflix catalogue. A row is kept only when both
# the title and the release year agree in the two datasets.
df_merged = pd.merge(
    left=merged,
    right=df,
    how='inner',
    on=['title', 'release_year'],
)

In [None]:
# Display the merged IMDb + Netflix dataframe
df_merged

In [None]:
# Collect the titles that appear more than once after the merge, then show
# every row carrying one of those titles (first occurrences included).
is_repeat = df_merged.duplicated(subset=['title'])
dup = df_merged.loc[is_repeat, 'title'].values
df_merged[df_merged['title'].isin(dup)]

In [None]:
# The IMDb dataset uses the same title for different movies, which produced
# duplicate titles in the merged data set during the merge.
# For simplification, keep only the first row of each duplicated title
# and renumber the rows.
df_merged = df_merged.drop_duplicates(subset=['title'], keep='first')
df_merged.reset_index(drop=True, inplace=True)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the merged data
df_merged.info()

In [None]:
# Count how many titles there are per distinct country string
df_merged.country.value_counts()

In [None]:
# For analysis purposes, collapse multi-country entries to a single region:
# a title whose country list mentions India becomes "India", and one that
# mentions the US (e.g. country = "United States, Canada") becomes "United States".
# India is handled first, so an entry listing both countries ends up as
# "India" (after the first pass it no longer contains "United States").
for region_name in ("India", "United States"):
    mentions_region = df_merged.country.str.contains(region_name)
    df_merged.loc[mentions_region, 'country'] = region_name

In [None]:
# Derive two helper columns from 'date_added':
#   month_added -> abbreviated month name ('%b')
#   year_added  -> year, stored as text (via nullable Int64, then str)
added_on = pd.DatetimeIndex(df_merged['date_added'])
df_merged['month_added'] = added_on.strftime('%b')
df_merged['year_added'] = added_on.year
df_merged['year_added'] = df_merged['year_added'].astype('Int64').astype('str')

In [None]:
# Display the merged dataframe with the new month/year columns
df_merged

In [None]:
# Build a copy of the merged data indexed by the date each title was added,
# so it can be grouped by calendar period later on.
added_timestamps = pd.to_datetime(df_merged['date_added']).values
df_merged_ts = df_merged.set_index(pd.DatetimeIndex(added_timestamps))

In [None]:
# Display the date-indexed dataframe
df_merged_ts

In [None]:
# Average IMDb rating of all titles, per region
ratings_per_country = df_merged_ts.groupby('country')['IMDbrating']
ratings_per_country.mean()

In [None]:
# Average IMDb rating of the titles added in each calendar month
# (the frame is indexed by date added, so the Grouper bins on that index)
by_month = pd.Grouper(freq='M')
mean_rating_by_month_year = df_merged_ts['IMDbrating'].groupby(by_month).mean()
mean_rating_by_month_year

In [None]:
# Forward fill NaN values: a month without a mean reuses the last known one.
# Series.ffill() is used because fillna(method='ffill') is deprecated
# (FutureWarning since pandas 2.1).
mean_rating_by_month_year = mean_rating_by_month_year.ffill()
mean_rating_by_month_year

In [None]:
# Split 'duration' on whitespace into two pieces and keep only the first one
# under the original column name.
# NOTE(review): presumably the value looks like "<number> <unit>" — confirm.
duration_pieces = df_merged_ts.duration.str.split(expand=True)
df_merged_ts[['dur','time']] = duration_pieces
df_merged_ts.drop(columns=['duration', 'time'], inplace=True)
df_merged_ts.rename(columns={'dur' : 'duration'}, inplace=True)
df_merged_ts

In [None]:
# Convert the duration strings to numbers, then check the resulting dtypes
numeric_duration = pd.to_numeric(df_merged_ts.duration)
df_merged_ts.duration = numeric_duration
df_merged_ts.info()

# Analysis and visualization

## Time series - Average IMDb rating on Netflix overall

In [None]:
# Line chart of the monthly mean IMDb rating (both regions combined)
fig, ax = plt.subplots(figsize=(18, 8))
mean_rating_by_month_year.plot(
    ax=ax,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=5,
)
ax.set_ylabel("IMDb Rating")
ax.set_xlabel("Year")
ax.set_title("Average IMDb rating for content released on Netflix each month (Region: US & India) over the years")
ax.grid()

In [None]:
# Give the datetime index a name
df_merged_ts.index.name = "Date"

In [None]:
# Creating dataframes for India and USA
df_merged_ts_India = df_merged_ts[df_merged_ts.country == "India"]
df_merged_ts_US = df_merged_ts[df_merged_ts.country == "United States"]

# Monthly mean IMDb rating per region. Months without a mean are forward
# filled from the previous month. Series.ffill() is used because
# fillna(method='ffill') is deprecated (FutureWarning since pandas 2.1).
mean_rating_India = df_merged_ts_India.IMDbrating.groupby(pd.Grouper(freq='M')).mean().ffill()
mean_rating_US = df_merged_ts_US.IMDbrating.groupby(pd.Grouper(freq='M')).mean().ffill()

## Time Series - Average IMDb rating on Netflix region wise

In [None]:
# Monthly mean IMDb rating, one line per region
fig, ax = plt.subplots(figsize=(18, 8))
region_lines = (
    (mean_rating_India, '-g', "India"),
    (mean_rating_US, '--b', "US"),
)
for monthly_mean, line_format, region_label in region_lines:
    ax.plot(monthly_mean, line_format, label=region_label)
ax.set_ylabel("IMDb Rating")
ax.set_xlabel("Year")
ax.set_title("Average IMDb rating for content released on Netflix each month by region")
ax.legend()
ax.grid()

## Distribution of movie duration

In [None]:
# Overlay the density of movie durations for India and the US
fig, ax = plt.subplots(figsize=(12, 8))
duration_panels = (
    (df_merged_ts_India, 'orange', 'India'),
    (df_merged_ts_US, 'blue', 'US'),
)
for region_frame, fill_color, region_label in duration_panels:
    # Only movies are used; TV shows are left out of this plot
    movie_durations = region_frame[region_frame.type == "Movie"]['duration']
    sns.kdeplot(movie_durations, fill=True,
                color=fill_color, alpha=.5, linewidth=0, label=region_label)
plt.title("Distribution of movie duration (in min)")
plt.legend()
plt.show()

- As expected, Indian movies have a mean duration higher than US movies
- Indian movies seem to have a mean duration around 140 min
- While US movies have a mean duration of around 90 min

## Top movie directors

In [None]:
# Rank directors by the mean IMDb rating of their movies (both regions)
movies_only = df_merged_ts[df_merged_ts.type == "Movie"]
df_top_directors = movies_only.groupby(['director'])['IMDbrating'].mean()
df_top_directors = df_top_directors.sort_values(ascending=False)

# Ten best-rated entries in reverse order, so the highest rating is the last
# bar handed to barh
best_ten = df_top_directors[9::-1]

fig = plt.figure(figsize=(10, 5))
plt.barh(best_ten.index, best_ten.values)
plt.xlabel("IMDb Rating")
plt.ylabel("Director")
plt.title("Top 10 directors on netflix with their mean IMDb ratings (Region: India & US)")
plt.show()

In [None]:
import seaborn as sns

# Ten shades of the "Blues" palette as hex strings; used as the line colours
# of the top-10 charts below
col = sns.color_palette(palette="Blues", n_colors=10).as_hex()
col

## Top Movie Directors India

In [None]:
# Top directors in India
india_movies = df_merged_ts_India[df_merged_ts_India.type == "Movie"]
india_ratings_by_director = india_movies.groupby(['director'])['IMDbrating']

# Mean rating per director, best first
df_top_directors_india_all = india_ratings_by_director.mean()
df_top_directors_india_all = df_top_directors_india_all.sort_values(ascending=False)

# Same ranking, restricted to directors with at least 5 movies.
# The aggregations are passed as a list (not a set) so the resulting column
# order is deterministic and the call matches the documented agg inputs.
df_top_directors_india_min5 = india_ratings_by_director.agg(['count', 'mean'])
df_top_directors_india_min5 = df_top_directors_india_min5[df_top_directors_india_min5['count'] >= 5]['mean'].sort_values(ascending=False)

fig, ax = plt.subplots(2, figsize=(10,10))
plt.subplots_adjust(hspace = 0.4)

# One lollipop chart per ranking: (axes, ranking, title)
panels = (
    (ax[0], df_top_directors_india_all,
     "Top 10 movie directors on netflix with their mean IMDb ratings (Region: India)"),
    (ax[1], df_top_directors_india_min5,
     "Top 10 movie directors on netflix (min. 5 movies) with their mean IMDb ratings (Region: India)"),
)
for panel, ranking, panel_title in panels:
    # Ten best entries in reverse order, so the highest rating is plotted last
    top_ten = ranking[9::-1]
    panel.hlines(y=top_ten.index, xmin=0, xmax=top_ten.values,
                 color=col,  linewidth=7)
    panel.plot(top_ten.values, top_ten.index,
               "o", markersize=7, color='red', alpha=0.6)
    panel.set_xlabel("IMDb Rating", fontsize=12)
    panel.set_ylabel("Director", fontsize=12)
    panel.set_title(panel_title, fontsize=12)
    panel.set_xlim([0,10])  # IMDb ratings are shown on a fixed 0-10 axis

plt.show()

- An interesting observation from the plots above: none of the top-rated directors appear in the list of directors with at least 5 movies on Netflix!

In [None]:
# Top movie directors in US
us_movies = df_merged_ts_US[df_merged_ts_US.type == "Movie"]
us_ratings_by_director = us_movies.groupby(['director'])['IMDbrating']

# Mean rating per director, best first
df_top_directors_US_all = us_ratings_by_director.mean()
df_top_directors_US_all = df_top_directors_US_all.sort_values(ascending=False)

# Same ranking, restricted to directors with at least 5 movies.
# The aggregations are passed as a list (not a set) so the resulting column
# order is deterministic and the call matches the documented agg inputs.
df_top_directors_US_min5 = us_ratings_by_director.agg(['count', 'mean'])
df_top_directors_US_min5 = df_top_directors_US_min5[df_top_directors_US_min5['count'] >= 5]['mean'].sort_values(ascending=False)

fig, ax = plt.subplots(2, figsize=(10,10))
plt.subplots_adjust(hspace = 0.4)

# One lollipop chart per ranking: (axes, ranking, title)
panels = (
    (ax[0], df_top_directors_US_all,
     "Top 10 movie directors on netflix with their mean IMDb ratings (Region: US)"),
    (ax[1], df_top_directors_US_min5,
     "Top 10 movie directors on netflix (min. 5 movies) with their mean IMDb ratings (Region: US)"),
)
for panel, ranking, panel_title in panels:
    # Ten best entries in reverse order, so the highest rating is plotted last
    top_ten = ranking[9::-1]
    panel.hlines(y=top_ten.index, xmin=0, xmax=top_ten.values,
                 color=col,  linewidth=7)
    panel.plot(top_ten.values, top_ten.index,
               "o", markersize=7, color='red', alpha=0.6)
    panel.set_xlabel("IMDb Rating", fontsize=12)
    panel.set_ylabel("Director", fontsize=12)
    panel.set_title(panel_title, fontsize=12)
    panel.set_xlim([0,10])  # IMDb ratings are shown on a fixed 0-10 axis

plt.show()

In [None]:
# Top movies: per region, the movies ordered by IMDb rating (best first)
india_is_movie = df_merged_ts_India.type == 'Movie'
top_movies_india = df_merged_ts_India.loc[india_is_movie, ['title', 'IMDbrating']]
top_movies_india = top_movies_india.sort_values(by='IMDbrating', ascending=False)

us_is_movie = df_merged_ts_US.type == 'Movie'
top_movies_us = df_merged_ts_US.loc[us_is_movie, ['title', 'IMDbrating']]
top_movies_us = top_movies_us.sort_values(by='IMDbrating', ascending=False)


fig, ax = plt.subplots(2, figsize=(10,10))
plt.subplots_adjust(hspace = 0.4)

# One lollipop chart per region: (axes, ranking, title)
movie_panels = (
    (ax[0], top_movies_india, "Top 10 movies (Region: India)"),
    (ax[1], top_movies_us, "Top 10 movies (Region: US)"),
)
for panel, ranking, panel_title in movie_panels:
    # Ten best rows in reverse order, so the highest rating is plotted last
    best_titles = ranking.title[9::-1]
    best_scores = ranking.IMDbrating[9::-1]
    panel.hlines(y=best_titles, xmin=0, xmax=best_scores,
                 color=col,  linewidth=7)
    panel.plot(best_scores, best_titles,
               "o", markersize=7, color='red', alpha=0.6)
    panel.set_xlabel("IMDb Rating", fontsize=12)
    panel.set_ylabel("Movie", fontsize=12)
    panel.set_title(panel_title, fontsize=12)
    panel.set_xlim([0,10])

plt.show()

In [None]:
# Top TV Shows: per region, the TV shows ordered by IMDb rating (best first)
top_tv_india = df_merged_ts_India.loc[df_merged_ts_India.type == 'TV Show', ['title', 'IMDbrating']]
top_tv_india = top_tv_india.sort_values(ascending=False, by='IMDbrating')
top_tv_us = df_merged_ts_US.loc[df_merged_ts_US.type == 'TV Show', ['title', 'IMDbrating']]
top_tv_us = top_tv_us.sort_values(ascending=False, by='IMDbrating')


fig, ax = plt.subplots(2, figsize=(10,10))
plt.subplots_adjust(hspace = 0.4)

# One lollipop chart per region: (axes, ranking, title).
# Both panels share the same axis labels; the US panel previously carried the
# misspelled y label "TV SHow".
tv_panels = (
    (ax[0], top_tv_india, "Top 10 TV (Region: India)"),
    (ax[1], top_tv_us, "Top 10 TV (Region: US)"),
)
for panel, ranking, panel_title in tv_panels:
    # Ten best rows in reverse order, so the highest rating is plotted last
    best_titles = ranking.title[9::-1]
    best_scores = ranking.IMDbrating[9::-1]
    panel.hlines(y=best_titles, xmin=0, xmax=best_scores,
                 color=col,  linewidth=7)
    panel.plot(best_scores, best_titles,
               "o", markersize=7, color='red', alpha=0.6)
    panel.set_xlabel("IMDb Rating", fontsize=12)
    panel.set_ylabel("TV Show", fontsize=12)
    panel.set_title(panel_title, fontsize=12)
    panel.set_xlim([0,10])

plt.show()