# External data exploration

### Imports

In [None]:
import os

import pandas as pd
import numpy as np

from src.utils.const import DATA_DIR

### Useful paths to data

In [None]:
# The root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), '..')

# Both data folders share the ROOT_DIR/DATA_DIR prefix.
_data_root = os.path.join(ROOT_DIR, DATA_DIR)
EXTERNAL_DIR = os.path.join(_data_root, 'external')
INTERIM_DIR = os.path.join(_data_root, 'interim')

### Generic useful functions

In [None]:
from src.utils.wrapper import drop, fill_na, rename, replace, convert_to

### title-basics.csv

In [None]:
# Read only the title id and the runtime column from title-basics.csv;
# the id is kept as a pandas string column.
imdb_csv_path = os.path.join(EXTERNAL_DIR, 'title-basics.csv')
imdb_read_options = {
    'encoding': 'utf-8',
    'usecols': ['tconst', 'runtimeMinutes'],
    'dtype': {'tconst': 'string'},
}
imdb = pd.read_csv(imdb_csv_path, **imdb_read_options)

# Print column dtypes and non-null counts of the loaded frame.
imdb.info()

#### Pipe imdb

In [None]:
# Pipeline over the runtime column, applied in order:
#   1. `replace` with the pattern below and NaN as the replacement value,
#   2. `convert_to` float32,
#   3. `rename` 'runtimeMinutes' to 'runtime'.
# NOTE(review): read as a regex, the pattern is a single repeated character
# class (letters, '|', '-', '*', '[' and ']') rather than "optional
# backslashes followed by letters" -- presumably it is meant to catch
# non-numeric placeholders; confirm against how `replace` interprets it.
imdb = (
    imdb
    .pipe(replace, 'runtimeMinutes', '([\\]*[a-zA-Z|\\-]+)', np.nan)
    .pipe(convert_to, 'runtimeMinutes', 'float32')
    .pipe(rename, {'runtimeMinutes': 'runtime'})
)

# Print the resulting dtypes and non-null counts.
imdb.info()

### tmdb.csv

In [None]:
# Read the id columns and the runtime from tmdb.csv with explicit dtypes.
tmdb_csv_path = os.path.join(EXTERNAL_DIR, 'tmdb.csv')
tmdb_dtypes = {
    'movieId': 'int32',
    'imdb_id': 'string',
    'tmdbId': 'float32',
    'runtime': 'float32',
}
tmdb = pd.read_csv(
    tmdb_csv_path,
    encoding='utf-8',
    usecols=['movieId', 'tmdbId', 'imdb_id', 'runtime'],
    dtype=tmdb_dtypes,
)

# Print column dtypes and non-null counts of the loaded frame.
tmdb.info()

#### Specific tmdb functions used in pipe

In [None]:
def extract_correct_runtime(df: pd.DataFrame) -> pd.DataFrame:
    """Add a 'runtime' column built from 'runtime_x' with 'runtime_y' as fallback.

    For every row, 'runtime_x' is kept unless it is missing or equal to 0,
    in which case the value of 'runtime_y' is used instead.

    The column is written onto ``df`` itself, and that same object is
    returned so the function can be used inside ``DataFrame.pipe``.
    """
    primary = df['runtime_x']
    fallback = df['runtime_y']

    # A primary runtime is unusable when it is NaN or exactly zero.
    unusable = primary.isna() | primary.eq(0)

    df['runtime'] = primary.where(~unusable, fallback)
    return df

#### Pipe tmdb

In [None]:
# Left-join the imdb frame onto tmdb by IMDb id. Both frames carry a
# 'runtime' column, so the merge exposes them as 'runtime_x' (tmdb) and
# 'runtime_y' (imdb). Then: build the final 'runtime' column, call `fill_na`
# on it with 'median', and `drop` the id and helper columns.
tmdb = (
    tmdb
    .merge(imdb, how='left', left_on='imdb_id', right_on='tconst')
    .pipe(extract_correct_runtime)
    .pipe(fill_na, 'runtime', 'median')
    .pipe(drop, ['tmdbId', 'imdb_id', 'tconst', 'runtime_x', 'runtime_y'])
)

# Print the resulting dtypes and non-null counts.
tmdb.info()