In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:

# Location of the raw Netflix titles CSV, relative to the notebook's working directory.
dataset_path = '../input/netflix-shows/netflix_titles.csv'
# Read the whole CSV into the notebook-wide `data` frame used by the cells below.
data = pd.read_csv(dataset_path)
# Last expression of the cell, so the first five rows are rendered as a preview.
data.head()

As we can see in the dataset, `show_id` and the Movie type do not help us look up an IMDb rating, but they are still helpful when suggesting titles to the client. Moreover, `release_year` can be used to pinpoint the exact movie when we connect to imdb.com and search for it by **title**.

First, we have to import some useful libraries for connecting to the website.

In [None]:
import urllib.parse
from bs4 import BeautifulSoup
import requests
import numpy as np 
import pandas as pd 
from threading import Thread
import threading
import math
import time
from IPython.display import display # If you want to render pandas local, you should replace data.head() to display(data.head())

# Location of the raw Netflix titles CSV, relative to the notebook's working directory.
dataset_path = '../input/netflix-shows/netflix_titles.csv'
# Alternative output locations, currently disabled:
# suggestion_show_path = '../output/netflix-shows/suggestion.csv'
# imdb_dataset_path = '../output/netflix-shows/imdb_dataset.csv'

# Notebook-wide frame that the crawl functions below read titles and release years from.
data = pd.read_csv(dataset_path)


The request header is important too. If your header looks unreliable, the request will take a long time to get a response.

In [None]:
# Browser-like User-Agent header passed to every requests.get call made by the crawl functions.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

Because the dataset is very large, crawling it quickly requires multi-threading. If you want even more speed, you can run more workers in parallel. For me, it takes about 15 minutes to fetch every IMDb rating from imdb.com. The requests themselves need very little computation; most of the time is spent waiting for responses. Running the requests concurrently is therefore a very effective way to cut that wasted waiting time.

In [None]:
thread_num = 8 # How many worker threads main() splits the crawl across; tune it to your computer.

suggest_imdb = 0 # Minimum IMDb rating (between 0 and 10) a title needs to be written to the suggestion file.

def main():
    """Crawl an IMDb rating for every row of ``data`` and save the results.

    The rows are split into contiguous chunks, one worker thread per chunk.
    Every worker fills its own slice of a shared rating list. When all
    workers have finished, the ratings are inserted into ``data`` as the
    ``IMDb`` column, the full frame is written to ``data_with_imdb.csv`` and
    ``select_top_imdb`` writes the filtered suggestion file.

    Reads the module-level ``data``, ``thread_num`` and ``suggest_imdb``.
    """
    print("Starting crawl data")
    num_shows = len(data.index)
    print(num_shows)
    # Pre-fill with -1, the same "no rating" marker get_imdb_rate writes, so a
    # row that a worker never reaches is not mistaken for a real score.
    imdb_list = [-1] * num_shows
    start_time = time.time()

    # Build half-open [start, end) chunks: the workers iterate with
    # range(start, end), so `end` has to be exclusive or the last row of each
    # chunk is silently skipped. The remainder is spread over the first
    # chunks, and no more chunks than rows are created.
    workers = max(1, min(thread_num, num_shows))
    base, extra = divmod(num_shows, workers)
    steps = []
    start = 0
    for position in range(workers):
        end = start + base + (1 if position < extra else 0)
        if end > start:
            steps.append((start, end))
        start = end
    print(steps)

    # One thread per chunk; each thread only writes the indices of its own chunk.
    lst_thread = []
    for first, last in steps:
        print('Start Thread')
        th = threading.Thread(target=execute_analyze_data, args=(first, last, imdb_list))
        th.start()
        lst_thread.append(th)

    # Wait for every worker before touching the shared result list.
    for th in lst_thread:
        th.join()

    data.insert(2, "IMDb", imdb_list, True)
    data.to_csv('data_with_imdb.csv', sep=',', encoding='utf-8', index=False)
    print("Done crawl data: ", time.time() - start_time, "s")
    select_top_imdb(data, suggest_imdb)

After crawling the data, the system selects the titles that satisfy the user's condition defined above. Don't worry about the titles that do not meet the condition: I also write every record to one more file, "**data_with_imdb**".

In [None]:
def select_top_imdb(data, suggest_imdb):
    """Write the rows rated at least ``suggest_imdb`` to ``suggest_movie.csv``.

    Args:
        data: DataFrame holding an ``IMDb`` column.
        suggest_imdb: Minimum rating a row needs to be kept.
    """
    print("Starting select suggestion show")
    meets_threshold = data["IMDb"] >= suggest_imdb
    data.loc[meets_threshold].to_csv('suggest_movie.csv', sep=',', encoding='utf-8', index=False)

Now, let's create an intermediate function that links the main routine to each worker's task. Each worker crawls the rows between its *start* and *end* values.

In [None]:
def execute_analyze_data(start,end, imdb_list):
    """Run both crawl stages for one chunk of rows of the module-level ``data``.

    First collects the IMDb page link for each row via ``get_list_id``, then
    passes those links to ``get_imdb_rate``, which stores the ratings in
    ``imdb_list``.
    """
    chunk_links = get_list_id(data, start, end)
    get_imdb_rate(imdb_list, data, chunk_links, start, end)

I have split each worker's task into 2 stages:
1. Get hyperlink stage: I want to collect the links from the imdb.com search results. After crawling them, we can follow each link to the title's main page to get its IMDb rating. **But please note that not all TV shows and movies on Netflix have an IMDb rating.**

In [None]:
def get_list_id(data, start, end):
    """Search imdb.com for each title in rows ``range(start, end)``.

    For every row, the first search result whose text contains the row's
    release year in parentheses, e.g. ``(2005)``, is taken as the match.

    Args:
        data: DataFrame with ``title`` and ``release_year`` columns.
        start: First row label to process (inclusive).
        end: Row label to stop at (exclusive).

    Returns:
        A list with one entry per processed row, in row order: the ``href``
        of the matching search result, or ``None`` when the request failed
        or no result matched the release year.
    """
    href_link = []
    for index in range(start,end):
        title = data["title"][index]
        print(index,"-Process get title id: ", title, end="\n")

        query = urllib.parse.urlencode({'q': title})
        query_find = 'https://imdb.com/find?{}'.format(query)
        try:
            # A timeout keeps one stalled connection from blocking the whole chunk.
            response = requests.get(query_find, headers=headers, timeout=30)
        except requests.RequestException as error:
            # Keep the list aligned with the rows: a failed lookup is "no link".
            print(index, "-Search request failed: ", error)
            href_link.append(None)
            continue
        html = BeautifulSoup(response.content, 'html.parser')

        # Compare against this row's year only. A substring test is required:
        # str.find() returns -1 (truthy) on a miss, which used to accept the
        # first result unconditionally.
        year_tag = "({})".format(data["release_year"][index])
        match = None
        for item in html.find_all("td", {"class": "result_text"}):
            if year_tag in item.text and item.a is not None:
                match = item.a["href"]
                break
        href_link.append(match)
    return href_link

Finally, the second stage crawls once more to fetch the IMDb score. All of my page access is based on BeautifulSoup parsing, and the key is the HTML structure of the page.

In [None]:
def get_imdb_rate(imdb_list, data, href_link, start, end):
    """Fetch the IMDb rating for rows ``range(start, end)`` and store it in ``imdb_list``.

    Args:
        imdb_list: Shared list indexed by row; this function overwrites the
            entries ``start`` .. ``end - 1``.
        data: DataFrame with a ``title`` column (used for progress output).
        href_link: Links for the rows of this chunk, in row order; ``None``
            marks a row without a link.
        start: First row label to process (inclusive).
        end: Row label to stop at (exclusive).

    Returns:
        The same ``imdb_list`` object. A row gets ``-1`` when it has no link,
        when the request fails, or when the page carries no readable rating.
    """
    for offset, index in enumerate(range(start,end)):
        print(index,"Process get imdb: ", data["title"][index],  end="\n")
        rating = -1
        link = href_link[offset]
        if link:
            query_find = 'https://imdb.com{}'.format(link)
            try:
                # A timeout keeps one stalled connection from blocking the whole chunk.
                response = requests.get(query_find, headers=headers, timeout=30)
            except requests.RequestException as error:
                print(index, "Rating request failed: ", error)
            else:
                html = BeautifulSoup(response.content, 'html.parser')
                blocks = html.find_all("div", {"class": "ratingValue"})
                # Titles without a rating have no such block; indexing [0]
                # blindly would raise and kill the worker thread.
                if blocks:
                    try:
                        rating = float(blocks[0].strong.span.text)
                    except (AttributeError, ValueError):
                        rating = -1
        imdb_list[index] = rating
    return imdb_list

OK, that's everything we need.
**Note that Kaggle blocks outgoing network requests, so if you want to fetch the IMDb ratings you should run the crawl on your local computer. On my computer, it takes about 15 minutes to crawl all of them.**

In [None]:
# if __name__ == "__main__":
#     main()

Hooray, let's check our new dataset.

In [None]:
# After crawl dataset.
# From here on, `data` is the titles CSV that already contains the crawled IMDb column.
dataset_path = '../input/netflix-title-imdb/netflix_titles_imdb.csv'
data = pd.read_csv(dataset_path)
# Last expression of the cell, so the first five rows are rendered as a preview.
data.head()

Fortunately, it's working and we get the IMDb ratings here.

In [None]:
# Inspect column dtypes and non-null counts to decide whether any rows need to be dropped.
data.info()

Hmm, about 7372 of the 7787 rows have an IMDb rating; that's good enough for me.

In [None]:
# Keep the rows rated at most 10, then drop any row whose rating is missing.
rating_within_scale = data["IMDb"].le(10)
data = data[rating_within_scale].dropna(subset=['IMDb'])
data.info()

First, let's plot the relationship between country and IMDb rating. Maybe we can find some insight here.

In [None]:
from collections import Counter
from matplotlib import gridspec
import seaborn as sns
import matplotlib.pyplot as plt

# Rows without a country cannot be attributed to any producer, so leave them out of this plot.
t_data = data.dropna(subset=['country'])

pivot_columns = "country"
pivot_data = t_data[pivot_columns]

# One cell can list several countries separated by commas, with inconsistent
# spacing. Split each cell into clean names and drop the empty fragments that
# leading or trailing commas leave behind, so every country is counted exactly.
country_lists = pivot_data.str.split(',').apply(
    lambda parts: [part.strip() for part in parts if part.strip()]
)
pivot_count = pd.Series(
    dict(Counter(name for names in country_lists for name in names))
).sort_values(ascending=False)
top15 = pivot_count.head(15)

# Mean rating of the titles that list each country. Membership is tested on
# the split names (not with a substring/regex match), and only the numeric
# IMDb column is averaged: DataFrame.mean() over the text columns raises on
# pandas 2.x.
IMDb_data = [
    t_data.loc[country_lists.apply(lambda names, wanted=country: wanted in names), "IMDb"].mean()
    for country in top15.index
]

# A single figure: bars for the number of titles, a line on a twin axis for the mean rating.
fig, ax1 = plt.subplots(figsize=(30,10))
ax1.set_title('Top 15 countries: number of titles and mean IMDb rating', fontsize=20)
ax1.set_ylabel('Number of contributions', fontsize=18)
ax2 = ax1.twinx()
ax2.set_ylabel('IMDb', fontsize=18)
sns.lineplot(x=top15.index.tolist(), y=IMDb_data, marker='o', sort=False, ax=ax2)
# Plain lists are passed (no `data=`), so seaborn does not try to align them with an unrelated frame.
sns.barplot(x=top15.index.tolist(), y=top15.tolist(), alpha=0.5, ax=ax1, palette="mako");

Next, how about the duration of each **movie**? Does it matter to viewers when they rate a movie? Let's see.

Before doing anything, I will create an integer-typed duration column (the number of minutes for a movie).

In [None]:
# Next code, we're going to groupby the duration, that's why we create more column is use to measure the data
data['duration_int'] = data.apply(lambda row: int(row["duration"].split(' ')[0]) , axis=1)
data.head()

Luckily, the number of non-null `duration` values equals the number of non-null `IMDb` values.

In [None]:
# Compare how many movies exist at each running time with the mean IMDb rating at that running time.
pivot_columns = 'duration'
movies = data[data["type"] == "Movie"]
pivot_data = movies[pivot_columns]

fig, ax1 = plt.subplots(figsize=(30,10))

# Histogram of the numeric running time. This replaces the deprecated
# sns.distplot call that re-parsed the duration text with a regex.
ax1.hist(movies['duration_int'], bins=50, color='red', alpha=0.4)
ax1.set_title('Movies: duration distribution and mean IMDb rating', fontsize=20)
ax1.set_xlabel('Duration (minutes)', fontsize=18)
ax1.set_ylabel('Number of contributions', fontsize=18)

# Mean rating per running time, keyed by the integer number of minutes. Only
# the IMDb column is aggregated: averaging every column raises on pandas 2.x.
nframe = movies.groupby('duration_int')[['IMDb']].mean().sort_index()

# Plot the line against the same numeric minutes as the histogram. Plotting
# it against the duration strings would place the points at category ranks,
# which do not line up with the histogram's x-axis.
ax2 = ax1.twinx()
ax2.set_ylabel('IMDb', fontsize=18)
sns.lineplot(x=nframe.index.to_numpy(), y=nframe['IMDb'].to_numpy(), marker='o', sort=False, ax=ax2);