# External data exploration

### Imports

In [1]:
import os

import pandas as pd
import numpy as np

from src.utils.const import DATA_DIR

### Useful paths to data

In [2]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), '..')

# Data sub-folders, both resolved under ROOT_DIR/DATA_DIR.
EXTERNAL_DIR, INTERIM_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, stage)
    for stage in ('external', 'interim')
)

### Generic utility functions

In [3]:
from src.utils.wrapper import drop, fill_na, rename, replace, convert_to

### title-basics.csv

In [17]:
# Read title-basics.csv, keeping only the two columns used below.
# 'runtimeMinutes' is given no dtype here, so its type is left to pandas' inference.
imdb_source = os.path.join(EXTERNAL_DIR, 'title-basics.csv')
imdb = pd.read_csv(
    imdb_source,
    usecols=['tconst', 'runtimeMinutes'],
    dtype={'tconst': 'string'},
    encoding='utf-8',
)

imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8830142 entries, 0 to 8830141
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          string
 1   runtimeMinutes  object
dtypes: object(1), string(1)
memory usage: 134.7+ MB


#### Pipe imdb

In [18]:
# Clean the runtime column: blank out non-numeric entries, cast to float32 and
# rename 'runtimeMinutes' -> 'runtime'.
#
# The pattern is written as a raw string so that `\-` is no longer an invalid
# string escape; the resulting pattern text is unchanged.
# As written, `\]` inside the brackets is an escaped `]`, so the whole pattern
# is ONE character class: one or more of `]`, `*`, `[`, ASCII letters, `|`, `-`.
# NOTE(review): the intent was probably "optional backslashes followed by
# letters/hyphens" (e.g. a `\N` placeholder) — confirm before tightening it.
# NOTE(review): `replace` / `convert_to` / `rename` are project wrappers;
# presumably thin wrappers over the pandas methods of the same name — verify.
imdb = (
    imdb
    .pipe(replace, 'runtimeMinutes', r'([\]*[a-zA-Z|\-]+)', np.nan)
    .pipe(convert_to, 'runtimeMinutes', 'float32')
    .pipe(rename, {'runtimeMinutes': 'runtime'})
)

imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8830142 entries, 0 to 8830141
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   tconst   string 
 1   runtime  float32
dtypes: float32(1), string(1)
memory usage: 101.1 MB


#### Save imdb.csv to interim

In [6]:
# Export the cleaned imdb frame to the interim folder.
# An export that already exists is left untouched (no overwrite).
filepath = os.path.join(INTERIM_DIR, 'imdb.csv')
if not os.path.exists(filepath):
    imdb.to_csv(
        path_or_buf=filepath,
        index=False,
        encoding='utf-8',
    )

### tmdb-features.csv

In [12]:
# Column -> dtype for the fields read from tmdb-features.csv; the keys double
# as the column selection passed to `usecols`.
# NOTE(review): uint16 tops out at 65535 — confirm no movieId exceeds that.
tmdb_dtypes = {'movieId': 'uint16', 'tmdbId': 'float32', 'runtime': 'float32'}

tmdb = pd.read_csv(
    os.path.join(EXTERNAL_DIR, 'tmdb-features.csv'),
    encoding='utf-8',
    usecols=list(tmdb_dtypes),
    dtype=tmdb_dtypes,
)

tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  uint16 
 1   tmdbId   57917 non-null  float32
 2   runtime  58035 non-null  float32
dtypes: float32(2), uint16(1)
memory usage: 567.5 KB


#### Read links.csv from interim

In [13]:
# Load the movieId <-> imdbId mapping from the interim folder.
# NOTE(review): uint16 tops out at 65535 — confirm no movieId exceeds that.
links_source = os.path.join(INTERIM_DIR, 'links.csv')
links = pd.read_csv(
    links_source,
    dtype={'movieId': 'uint16', 'imdbId': 'string'},
    encoding='utf-8',
)

links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  uint16
 1   imdbId   58098 non-null  string
dtypes: string(1), uint16(1)
memory usage: 567.5 KB


#### Specific tmdb functions used in pipe

In [10]:
def extract_correct_runtime(df: pd.DataFrame) -> pd.DataFrame:
    """Build a single 'runtime' column from 'runtime_x' with 'runtime_y' as fallback.

    For each row, 'runtime_x' is kept unless it is missing or equal to 0, in
    which case the value of 'runtime_y' is used instead (which may itself be
    missing).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns 'runtime_x' and 'runtime_y'.

    Returns
    -------
    pd.DataFrame
        A new frame with the extra (or replaced) column 'runtime'. The input
        frame is not modified, so the function is safe to use in a `.pipe`
        chain and to re-run.

    Raises
    ------
    KeyError
        If 'runtime_x' or 'runtime_y' is absent from `df`.
    """
    primary = df['runtime_x']
    # A runtime of 0 is treated the same as a missing one.
    unusable = primary.isna() | (primary == 0)
    return df.assign(runtime=primary.mask(unusable, df['runtime_y']))

#### Pipe tmdb

In [11]:
# 1. Left-join `links` to pick up each movie's imdbId.
# 2. Left-join `imdb` on imdbId == tconst. Both sides carry a 'runtime' column,
#    so pandas' default suffixes yield 'runtime_x' (tmdb) and 'runtime_y' (imdb).
# 3. Collapse the two runtimes into one column, then drop the helper columns.
# NOTE(review): `fill_na` / `drop` are project wrappers; `fill_na(..., True)`
# presumably fills the remaining gaps in 'runtime' — verify its third argument.
tmdb = (
    tmdb
    .merge(links, how='left', on='movieId')
    .merge(imdb, how='left', left_on='imdbId', right_on='tconst')
    .pipe(extract_correct_runtime)
    .pipe(fill_na, 'runtime', True)
    .pipe(drop, ['tconst', 'runtime_x', 'runtime_y'])
)

tmdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58098 entries, 0 to 58097
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   tmdbId   57917 non-null  float64
 2   imdbId   58098 non-null  string 
 3   runtime  58098 non-null  float64
dtypes: float64(2), int64(1), string(1)
memory usage: 2.2 MB


#### Save tmdb-features.csv to interim

In [9]:
# Export the enriched tmdb frame to the interim folder.
# An export that already exists is left untouched (no overwrite).
filepath = os.path.join(INTERIM_DIR, 'tmdb-features.csv')
if not os.path.exists(filepath):
    tmdb.to_csv(
        path_or_buf=filepath,
        index=False,
        encoding='utf-8',
    )