# External data exploration

### Imports

In [34]:
import os
from typing import List

import pandas as pd
import numpy as np

from src.utils.const import DATA_DIR
from src.utils.util import overview_data

### Path to raw data

In [2]:
ROOT_DIR = os.path.join(os.getcwd(), '..')
EXTERNAL_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'external')
INTERIM_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'interim')
RAW_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'raw')

### title-basics.csv

In [3]:
imdb = pd.read_csv(os.path.join(EXTERNAL_DIR, 'title-basics.csv'), dtype={'tconst': 'string'}, encoding='utf-8',
                   usecols=['tconst', 'runtimeMinutes'])

In [4]:
imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8830142 entries, 0 to 8830141
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          string
 1   runtimeMinutes  object
dtypes: object(1), string(1)
memory usage: 134.7+ MB


In [5]:
imdb['runtimeMinutes'].replace(regex='([\\]*[a-zA-Z|\-]+)', value=np.nan, inplace=True)

In [6]:
imdb[imdb['runtimeMinutes'].isna()].shape[0]

6450086

In [7]:
imdb['runtimeMinutes'].astype('float32')

0           1.0
1           5.0
2           4.0
3          12.0
4           1.0
           ... 
8830137     NaN
8830138     NaN
8830139     NaN
8830140    27.0
8830141    10.0
Name: runtimeMinutes, Length: 8830142, dtype: float32

In [8]:
imdb.rename(columns={'runtimeMinutes': 'runtime'}, inplace=True)

In [9]:
filepath = os.path.join(INTERIM_DIR, 'imdb.csv')
if not os.path.exists(filepath):
    imdb.to_csv(filepath, encoding='utf-8')

In [10]:
imdb = pd.read_csv(os.path.join(INTERIM_DIR, 'imdb.csv'), dtype={'tconst': 'string', 'runtime': 'float32'},
                   encoding='utf-8',
                   usecols=['tconst', 'runtime'])

### tmdb-features.csv

In [11]:
tmdb = pd.read_csv(os.path.join(EXTERNAL_DIR, 'tmdb-features.csv'), encoding='utf-8',
                   usecols=['movieId', 'tmdbId', 'runtime'])

### Add to tmdb-features imdbId

In [17]:
links = pd.read_csv(os.path.join(RAW_DIR, 'links.csv'), encoding='utf-8', dtype={'imdbId': 'string'})
links['imdbId'] = links['imdbId'].apply(lambda x: f"tt{x}")
links = links[['movieId', 'imdbId']]

Shape: (58098, 3)
Columns: ['movieId' 'imdbId' 'tmdbId']


In [18]:
tmdb_new = pd.merge(tmdb, links, on='movieId', how='left')
tmdb_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58098 entries, 0 to 58097
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   tmdbId   57917 non-null  float64
 2   runtime  58035 non-null  float64
 3   imdbId   58098 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 2.2+ MB
Shape: (58098, 4)
Columns: ['movieId' 'tmdbId' 'runtime' 'imdbId']


In [20]:
filepath = os.path.join(INTERIM_DIR, 'tmdb-features.csv')
if not os.path.exists(filepath):
    tmdb_new.to_csv(filepath, encoding='utf-8')

### Pipe tmdb missing runtime

In [35]:
def fill_na(df: pd.DataFrame, column: str, use_median: bool) -> pd.DataFrame:
    if use_median:
        median = df[column].median()
        df[column].fillna(median, inplace=True)
        return df
    mean = df[column].mean()
    df[column].fillna(mean, inplace=True)
    return df

def drop(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    df.drop(columns=columns, inplace=True)
    return df

def extract_runtime(df: pd.DataFrame, imdb: pd.DataFrame) -> pd.DataFrame:
    # Partition of the dataframe with Nan or 0 on runtime column
    missing_runtime = df[(df['runtime'] == 0) | (df['runtime'].isna())].copy()

    # Drop from the tmdb all the rows that are nan since are saved in missing_runtime
    local_tmdb = df.set_index('movieId')
    local_tmdb.drop(index=missing_runtime['movieId'], inplace=True)
    missing_runtime.drop(columns='runtime', inplace=True)
    missing_runtime = pd.merge(missing_runtime, imdb, left_on='imdbId', right_on='tconst',how='left')

    # Merge the missing_runtime with the gathered values to the tmdb dataset
    local_tmdb.reset_index(inplace=True)
    local_tmdb.rename(columns={'index': 'movieId'}, inplace=True)
    df = pd.concat([local_tmdb, missing_runtime])

    # TODO: Apply Standardization on the training set
    return df


tmdb = pd.read_csv(os.path.join(INTERIM_DIR, 'tmdb-features.csv'), encoding='utf-8',
                   usecols=['movieId', 'tmdbId', 'imdbId', 'runtime'])
tmdb.info()
tmdb=tmdb.pipe(extract_runtime, imdb).\
    pipe(fill_na,'runtime',True).\
    pipe(drop,['tconst'])
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   tmdbId   57917 non-null  float64
 2   runtime  58035 non-null  float64
 3   imdbId   58098 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 1.8+ MB
shape before merge1724
shape after merge1724
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58098 entries, 0 to 1723
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   tmdbId   57917 non-null  float64
 2   runtime  58098 non-null  float64
 3   imdbId   58098 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 2.2+ MB
