In [None]:
# default_exp extract

# Data Extraction and Cleaning

> The TMDB dataset is only accessible through an API. Hence, we will write a few functions to extract the dataset and perform some simple preprocessing before saving it to CSV files.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

We first extract the dataset by accessing the TMDB API. Since not all requests will succeed, the requested dataset size should be set somewhat larger than the number of movies actually wanted, so that the final dataset ends up at roughly the desired size.

In [None]:
#exporti

import concurrent
import concurrent.futures
import os

import pandas as pd
import requests
from pandas import DataFrame

def extract_dataset(api_key: str, n: int, timeout: float = 10.0):
    """Fetch the TMDB record for a single movie id.

    Args:
        api_key: TMDB API key, sent as the ``api_key`` query parameter.
        n: Movie id to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` for any other status code, or when the request itself
        fails (connection error, timeout, ...).
    """
    req_url = f"https://api.themoviedb.org/3/movie/{n}"
    # Let requests build the query string so the key is URL-encoded.
    params = {"api_key": api_key, "language": "en-US"}
    try:
        # Without a timeout a stalled connection would block its caller forever.
        response = requests.get(req_url, params=params, timeout=timeout)
    except requests.RequestException:
        # Treat transport failures like any other unsuccessful request so a
        # single bad connection does not abort a whole batch of downloads.
        return None
    if response.status_code == 200:
        return response.json()
    return None

def save_to_csv(df: DataFrame,
                save_path: str,
                fname: str):
    """Write ``df`` as a CSV file named ``fname`` inside ``save_path``.

    The target directory is created first if it does not exist yet. The
    frame's index is written as the first CSV column (``to_csv`` default).

    Args:
        df: Frame to serialise.
        save_path: Directory that will contain the file.
        fname: File name, joined onto ``save_path``.

    Returns:
        None.
    """
    # os.makedirs("") raises, and an empty save_path already resolves to the
    # current directory through os.path.join, so only create non-empty paths.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    df.to_csv(os.path.join(save_path, fname))

In [None]:
#export

def extract_dataset_threaded(api_key: str,
                             max_ds_size: int,
                             max_threads: int,
                             save_path: str,
                             fname: str) -> DataFrame:
    """Download movie records concurrently, save them to CSV and return them.

    Calls ``extract_dataset`` for every id in ``range(max_ds_size)`` on a
    thread pool, drops the ids for which it returned ``None``, builds a
    DataFrame from the remaining records and writes it with ``save_to_csv``.

    Args:
        api_key: TMDB API key forwarded to ``extract_dataset``.
        max_ds_size: Number of ids to request; the returned frame has at most
            this many rows.
        max_threads: Upper bound on the number of worker threads.
        save_path: Directory the CSV file is written to.
        fname: Name of the CSV file.

    Returns:
        DataFrame with one row per successfully downloaded record, in
        ascending id order.
    """
    # Never start more workers than there are ids, but always at least one:
    # ThreadPoolExecutor rejects max_workers <= 0.
    n_workers = max(1, min(max_threads, max_ds_size))
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        # map() submits every request up front and yields results in input
        # order. Calling submit(...).result() per id would wait for each
        # download before starting the next, i.e. run them one at a time.
        results = executor.map(lambda n: extract_dataset(api_key, n),
                               range(max_ds_size))
        records = [rec for rec in results if rec is not None]
    movies = pd.DataFrame.from_records(records)
    save_to_csv(df=movies, save_path=save_path, fname=fname)
    return movies

Let's run the data extraction functions. 

In [None]:
#hide
# Read the TMDB key from the environment so no secret is stored in the notebook.
# NOTE(review): the key that used to be committed here should be treated as
# leaked and rotated. When TMDB_API_KEY is unset this falls back to "" —
# presumably every request is then rejected and the dataset comes back empty.
api_key = os.environ.get("TMDB_API_KEY", "")
# Display settings for the preview below: no row limit, at most 10 columns,
# and let pandas auto-detect the output width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', None)

In [None]:
# api_key = "your_api_key_here"
# Request 20 ids using up to 8 worker threads.
max_ds_size, max_threads = 20, 8
# Download the records and store them as raw_data.csv in the working directory.
movies = extract_dataset_threaded(
    api_key=api_key,
    max_ds_size=max_ds_size,
    max_threads=max_threads,
    save_path='.',
    fname='raw_data.csv',
)
# Fewer rows than max_ds_size means some requests returned no record.
print(f"Total number of examples downloaded is: {len(movies)}")
movies.head()

Total number of examples downloaded is: 15


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,...,tagline,title,video,vote_average,vote_count
0,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",...,,Ariel,False,6.8,128
1,False,/l94l89eMmFKh7na2a1u5q67VgNx.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",...,,Shadows in Paradise,False,7.2,126
2,False,/u0zMKKpEdDWpOKmFW2sLbKKICJH.jpg,,4000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",...,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,5.7,1910
3,False,/5aXp2s4l6g5PcMMesIj63mx8hmJ.jpg,,21000000,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",...,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,6.5,184
4,False,,,42000,"[{'id': 99, 'name': 'Documentary'}]",...,A Megacities remix.,Life in Loops (A Megacities RMX),False,7.4,14


We have extracted the raw data into a dataframe for easy access and saved it to a CSV file for later use. Next, we move on to the EDA and visualization step to see what insights we can draw from the data.