# External data exploration

### Imports

In [1]:
import os

import pandas as pd
import numpy as np

from src.utils.const import DATA_DIR

### Useful paths to data

In [2]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), '..')

# Both data folders sit under ROOT_DIR/DATA_DIR and differ only in their last component.
EXTERNAL_DIR, INTERIM_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, subfolder)
    for subfolder in ('external', 'interim')
)

### Generic useful functions

In [3]:
from src.utils.wrapper import drop, fill_na, rename, replace, convert_to

### title-basics.csv

In [4]:
# Load only the two columns needed from title-basics.csv: the `tconst` key and
# the raw `runtimeMinutes` value. `tconst` is forced to pandas' string dtype;
# no dtype is given for `runtimeMinutes`, so pandas infers it on load.
imdb_path = os.path.join(EXTERNAL_DIR, 'title-basics.csv')
imdb_dtypes = {'tconst': 'string'}

imdb = pd.read_csv(
    imdb_path,
    encoding='utf-8',
    usecols=['tconst', 'runtimeMinutes'],
    dtype=imdb_dtypes,
)

imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8838595 entries, 0 to 8838594
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          string
 1   runtimeMinutes  object
dtypes: object(1), string(1)
memory usage: 134.9+ MB


#### Pipe imdb

In [5]:
# Clean the raw runtime column, cast it to float32 and rename it to `runtime`.
#
# After Python unescapes the literal, the pattern handed to `replace` is
#     ([\]*[a-zA-Z|\-]+)
# NOTE(review): as a regex this parses as ONE character class (`]`, `*`, `[`,
# ASCII letters, `|`, `-`) repeated one or more times, not "optional
# backslashes followed by letters" -- confirm this is what was intended.
# NOTE(review): `replace`, `convert_to` and `rename` are project wrappers not
# shown in this notebook; presumably `replace` turns matching cells into NaN so
# the float cast can succeed -- verify against src.utils.wrapper.
imdb = (
    imdb
    .pipe(replace, 'runtimeMinutes', '([\\]*[a-zA-Z|\\-]+)', np.nan)
    .pipe(convert_to, 'runtimeMinutes', 'float32')
    .pipe(rename, {'runtimeMinutes': 'runtime'})
)

imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8838595 entries, 0 to 8838594
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   tconst   string 
 1   runtime  float32
dtypes: float32(1), string(1)
memory usage: 101.1 MB


### tmdb.csv

In [6]:
# Load the id columns and the runtime from tmdb.csv with explicit, compact dtypes.
# `imdb_id` is read as pandas' string dtype, the same dtype used for
# `imdb.tconst`, which it is later merged against.
tmdb_path = os.path.join(EXTERNAL_DIR, 'tmdb.csv')
tmdb_dtypes = {
    'movieId': 'int32',
    'imdb_id': 'string',
    'tmdbId': 'float32',
    'runtime': 'float32',
}

tmdb = pd.read_csv(
    tmdb_path,
    encoding='utf-8',
    usecols=['movieId', 'tmdbId', 'imdb_id', 'runtime'],
    dtype=tmdb_dtypes,
)

tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int32  
 1   tmdbId   57917 non-null  float32
 2   runtime  57110 non-null  float32
 3   imdb_id  57156 non-null  string 
dtypes: float32(2), int32(1), string(1)
memory usage: 1.1 MB


#### Specific tmdb functions used in pipe

In [7]:
def extract_correct_runtime(df: pd.DataFrame) -> pd.DataFrame:
    """Build a single ``runtime`` column from ``runtime_x`` with ``runtime_y`` as fallback.

    ``runtime_x`` is kept wherever it is present and non-zero; where it is
    missing or equal to 0 the value from ``runtime_y`` is used instead (which
    may itself be missing).

    The input frame is not modified: a new DataFrame carrying the extra
    ``runtime`` column is returned, so the function is safe to use as a
    ``DataFrame.pipe`` step on a frame that is still referenced elsewhere.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns ``runtime_x`` and ``runtime_y``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with a ``runtime`` column added (or overwritten).

    Raises
    ------
    KeyError
        If ``runtime_x`` or ``runtime_y`` is not a column of ``df``.
    """
    primary = df['runtime_x']
    # A runtime of 0 is treated as "unknown", exactly like a missing value.
    unusable = primary.isna() | (primary == 0)
    # assign() returns a new frame instead of writing into the caller's object.
    return df.assign(runtime=primary.mask(unusable, df['runtime_y']))

#### Pipe tmdb

In [8]:
# Attach the IMDb runtime to every tmdb row and collapse the two runtimes into one.
#
# Both frames carry a `runtime` column at this point, so the merge emits
# `runtime_x` (from tmdb) and `runtime_y` (from imdb) via pandas' default
# suffixes; `extract_correct_runtime` combines them into `runtime`.
# NOTE(review): `fill_na` and `drop` are project wrappers not shown here;
# presumably `fill_na(..., 'median')` fills the remaining gaps with the column
# median -- verify against src.utils.wrapper.
tmdb = (
    tmdb
    .merge(imdb, how='left', left_on='imdb_id', right_on='tconst')
    .pipe(extract_correct_runtime)
    .pipe(fill_na, 'runtime', 'median')
    .pipe(drop, ['tmdbId', 'imdb_id', 'tconst', 'runtime_x', 'runtime_y'])
)

tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58098 entries, 0 to 58097
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int32  
 1   runtime  58098 non-null  float32
dtypes: float32(1), int32(1)
memory usage: 907.8 KB
