# Netflix Data Analysis & Data Visualization (w/ IMDb dataset)

### This notebook includes the following parts:
1. Step 1: Loading Data & Preprocessing
2. Step 2: Data Visualization
3. Step 3: IMDb Dataset Import

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import useful libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import urllib.request, urllib.parse

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) for every figure created below.
plt.rcParams["figure.figsize"] = (12,9)

## Step 1: Loading Data and Preprocessing

In [None]:
# Read the Netflix titles CSV, using the `show_id` column as the row index.
NETFLIX_TITLES_CSV = "../input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(NETFLIX_TITLES_CSV, index_col="show_id")

In [None]:
# Preview the first 10 rows of the dataframe.
df.head(10)

In [None]:
# Print the column labels, then the per-column dtype / non-null summary.
print(df.columns)
df.info()

The `date_added` column can be converted to a _datetime_ dtype, so we will apply *pd.to_datetime()* to it below. Moreover, the information table above shows that not all columns are fully populated (some contain null values). Let's apply `dropna()` and see what happens.

In [None]:
# Dry run: drop every row that has at least one missing value and inspect what remains.
temp_df = df.dropna(axis=0, how="any")
temp_df.info()

If we apply `dropna()` directly to the dataframe, **around 3000 records** are dropped. Since we want to look at as much data as possible, and don't want to remove so many records before at least having a glimpse of them, we will keep using the original dataframe for the basic visualizations in order to gain more insight into the dataset.

In [None]:
# Convert `date_added` to datetime values, then re-check the column dtypes.
df["date_added"] = pd.to_datetime(df["date_added"])
df.info()

### Preliminary Imports for Merging with the IMDb Dataset

In [None]:
import urllib.request, urllib.parse
from bs4 import BeautifulSoup



## Step 2: Data Visualization

In [None]:
# Visualize the top-15 countries by number of titles.
# The slice size lives in one constant so the plotted rows and the title cannot drift apart
# (the previous comment said "top-10" while the code and title used 15).
TOP_N_COUNTRIES = 15
# Per-`country` count of non-null values in every column, ranked by the `description` count.
count_df = df.groupby(['country']).count().sort_values(by=["description"], ascending=False)
plot_df = count_df.head(TOP_N_COUNTRIES)
sns.barplot(x=plot_df.index, y=plot_df.description)
plt.xticks(rotation=45, ha="right", size=15)
plt.yticks(size = 15)
plt.xlabel("Country", size=18)
plt.ylabel("Counts", size=18)
plt.title("Top-{} Movies-filming Country".format(TOP_N_COUNTRIES), size= 18)
plt.grid()
plt.show()

In [None]:
# Visualize the number of titles per director for the most prolific directors.
TOP_N_DIRECTORS = 15
# Per-`director` count of non-null values in every column, ranked by the `description` count.
count_df = df.groupby(['director']).count().sort_values(by=["description"], ascending=False)
plot_df = count_df.head(TOP_N_DIRECTORS)
sns.barplot(x=plot_df.index,  y=plot_df.description)
plt.xticks(rotation=45,ha="right", size=15)
# Derive the even-numbered y ticks from the data instead of hard-coding 2..20, so the axis
# stays correct if the dataset changes. Falls back to 20 when there is nothing to plot.
max_count = int(plot_df.description.max()) if len(plot_df) else 20
plt.yticks(np.arange(2, max_count + 2, 2), size = 15)
plt.xlabel("Director", size=18)
plt.ylabel("Counts", size=18)
plt.title("Top-{} Most-Producing Directors".format(TOP_N_DIRECTORS), size= 18)
plt.grid()
plt.show()

In [None]:
# Visualize the amount of filming products in different types
# Left panel: share of each `type`; right panel: titles released per year and type since 2000.
count_df = df.groupby(['type']).count().sort_values(by=["description"], ascending=False)
fig, ax = plt.subplots(1,2)
ax[0].pie(count_df.description, labels=count_df.index, autopct='%1.1f%%', shadow=True, textprops={'fontsize': 15})
ax[0].set_title("Ratio of Filming Products Types", size=20)
ax[0].axis('equal')

# Count titles per (release_year, type) for releases from 2000 onwards.
# `reset_index()` turns the two group keys back into ordinary columns, replacing the
# previous manual rebuild (NaN placeholder columns + loop + `DataFrame.update`), which
# wrote strings into a float column.
year_df = (
    df.loc[df.release_year >= 2000, ["title", "release_year", "type"]]
    .groupby(["release_year", "type"])
    .count()
    .reset_index()
)
# Draw explicitly on the right-hand axes instead of relying on it being the "current" one.
sns.lineplot(x="release_year", y="title", hue="type", data=year_df, markers=True, dashes=False, ax=ax[1])
ax[1].set_xlabel("Year", size=18)
ax[1].set_ylabel("Release Counts", size=18)
ax[1].tick_params(axis = "both", labelsize=15)
ax[1].set_title("Production Amount of Different \nFilming Types, 2000-2020", size= 20)
ax[1].legend(prop={"size":15})
ax[1].grid()
plt.show()

In [None]:
# Using one-hot encoding, compute the amount of filming products under each genre.
# Firstly, get all genre categories: every `listed_in` value is a ", "-separated list.
# Missing values are skipped, and the result is sorted so that the genre order (and hence
# the order of the columns derived from it later) is the same on every run -- iterating a
# plain `set` of strings is not reproducible across kernel restarts.
genera = sorted({
    genre
    for entry in df["listed_in"].dropna()
    for genre in entry.split(sep=", ")
})
print(genera)
print("\nTotal number of genra: {}".format(len(genera)))

In [None]:
# One-hot encode `listed_in`: one 0/1 column per genre.
# `Series.str.get_dummies` splits each ", "-separated string and builds all indicator
# columns in one vectorised pass, replacing the previous Python loop that did one
# `df.loc` lookup per (genre, row) pair.
genre_dummies = (
    df["listed_in"]
    .str.get_dummies(sep=", ")
    # Guarantee exactly one column per entry of `genera`, in that order.
    .reindex(columns=genera, fill_value=0)
)
for genre in genera:
    # Assign positionally (rows are in the same order as `df`) to avoid index alignment.
    df[genre] = genre_dummies[genre].to_numpy()

df.head()

In [None]:
# Total number of titles per genre (column sums of the one-hot genre columns), largest first.
df_temp = (
    df[genera]
    .sum(axis=0)
    .to_frame(name="Counts")
    .sort_values("Counts", ascending=False)
)
sns.barplot(x=df_temp["Counts"], y=df_temp.index)
plt.xticks(size=15)
plt.yticks(size=13)
plt.xlabel("Counts", size=18)
plt.ylabel("Genera", size=18)
plt.title("Genera Counts", size=20)
plt.grid()
plt.show()

In [None]:
# Genre share within each title type: one pie per type, stacked vertically.
# The two pies were previously built by copy-pasted code; a single loop keeps them in sync.
fig, axes = plt.subplots(2, 1)
for ax, (type_name, pie_title) in zip(axes, [("Movie", "Movie Types"), ("TV Show", "TV Show Types")]):
    # Number of titles of this type per genre, largest first.
    genre_counts = df.loc[df.type == type_name, genera].sum(axis=0).sort_values(ascending=False)
    # Genres with no title of this type would only add overlapping "0.0%" labels;
    # dropping them leaves every other percentage unchanged.
    genre_counts = genre_counts[genre_counts > 0]
    ax.pie(genre_counts, labels=genre_counts.index, autopct='%1.1f%%', labeldistance=1, textprops={'fontsize': 10})
    ax.set_title(pie_title, size=18)
plt.show()

We can see (although some of the labels overlap) that, in the "Movies" category, "International Movies" (21.1%) and "Drama" (18.2%) are the top-2 movie types with the most productions, whereas "International TV Shows" (21.7%) and "TV Dramas" are the top-2 in the "TV Show" category.

## Step 3: IMDb Dataset Import

In [None]:
# Now, we will try to make use of IMDb data, the python library `IMDbPY` will be used.
!pip install IMDbPY
import imdb 

In [None]:
# A demonstration of how to use the python library `imdb`.
# Create the IMDb access object.
ia = imdb.IMDb()

# Title to look up.
name = "Tarzan the wonder car"

# Search IMDb for the title; the first hit is taken as the match.
search = ia.search_movie(name)
if not search:
    # Fail with a clear message instead of an opaque IndexError on `search[0]`.
    # (LookupError is the base class of IndexError, so existing handlers still apply.)
    raise LookupError("No IMDb search results for {!r}".format(name))

# IMDb id of the first hit.
code = ia.get_imdbID(search[0])

# Print the resulting id.
print("Search result ID: {}".format(code))

# Fetch the full record for that id.
series = ia.get_movie(code)

# Read the rating; fall back to NaN when the record has no 'rating' entry
# instead of raising KeyError.
rating = series.data.get('rating', np.nan)

# Print the matched record (i.e. its name).
print("Search title: {}".format(series))

# Print the rating.
print("Rating = {}".format(rating))

In [None]:
# Create an IMDb search function for the sake of convenience.
def imdb_rating_search(title, ia=None):
    """Return the IMDb rating of `title`, or NaN when no confident match exists.

    Only the first IMDb search hit is considered, and it is accepted only if its
    title equals `title` exactly, ignoring case.

    Parameters
    ----------
    title : str
        Title to look up. Any non-string value (e.g. a NaN from a missing cell)
        yields NaN without querying IMDb.
    ia : optional
        An existing `imdb.IMDb()` access object to reuse. When omitted, a new one
        is created for this call, as before.

    Returns
    -------
    The value stored under ``'rating'`` in the matched record's data, or ``np.nan``
    when there is no search result, the first hit's title differs, or the record
    carries no rating.
    """
    if not isinstance(title, str):
        return np.nan
    if ia is None:
        ia = imdb.IMDb()
    search = ia.search_movie(title)
    if len(search) == 0 or search[0]["title"].lower() != title.lower():
        return np.nan
    code = ia.get_imdbID(search[0])
    series = ia.get_movie(code)
    try:
        return series.data['rating']
    except (KeyError, AttributeError, TypeError):
        # No usable rating on the record. Deliberately narrower than a bare `except:`
        # so that KeyboardInterrupt / SystemExit are no longer swallowed.
        return np.nan

In [None]:
%%time
# Disabled: this performs one IMDb lookup per row of the dataframe, so the cell runs for a very long time.
# # Now we will use it to iterate over our Netflix dataframe to get the corresponding IMDb rating for each movie/TV show.
# df["imdb_rating"] = np.nan
# t = df.apply(lambda row : imdb_rating_search(row['title']), axis = 1)
# print(t)
# df["imdb_rating"] = t