# Processed data storage

## Imports

In [3]:
import os

import pandas as pd

from src.utils.const import DATA_DIR

### Useful path to data

In [4]:
ROOT_DIR = os.path.join(os.getcwd(), '..')
INTERIM_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'interim')
PROCESSED_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'processed')

## Import all interim .csv

In [5]:
movies = pd.read_csv(
    os.path.join(INTERIM_DIR, 'movies.csv'),
    encoding='utf-8',
    dtype={'movieId': 'uint16', 'year': 'float32', 'title_length': 'int32'}
)

tags = pd.read_csv(
    os.path.join(INTERIM_DIR, 'tags.csv'),
    encoding='utf-8',
    dtype={'movieId': 'uint16', 'tag_count': 'int32'}
)

ratings = pd.read_csv(
    os.path.join(INTERIM_DIR, 'ratings.csv'),
    encoding='utf-8',
    dtype={'movieId': 'uint16', 'rating_count': 'int32', 'rating_mean': 'float32'}
)

imdb = pd.read_csv(
    os.path.join(INTERIM_DIR, 'title-basics.csv'),
    encoding='utf-8',
    usecols=['tconst', 'runtime'],
    dtype={'tconst': 'string', 'runtime': 'float32'}
)

tmdb = pd.read_csv(
    os.path.join(INTERIM_DIR, 'tmdb.csv'),
    encoding='utf-8',
    usecols=['movieId', 'tmdbId', 'imdb_id', 'runtime'],
    dtype={'movieId': 'uint16', 'imdb_id': 'string', 'tmdbId': 'float32', 'runtime': 'float32'}
)

In [9]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83737 entries, 0 to 83736
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       83737 non-null  uint16 
 1   year          83737 non-null  float32
 2   title_length  83737 non-null  int32  
 3   Action        83737 non-null  int64  
 4   Adventure     83737 non-null  int64  
 5   Animation     83737 non-null  int64  
 6   Children      83737 non-null  int64  
 7   Comedy        83737 non-null  int64  
 8   Crime         83737 non-null  int64  
 9   Documentary   83737 non-null  int64  
 10  Drama         83737 non-null  int64  
 11  Fantasy       83737 non-null  int64  
 12  Film-Noir     83737 non-null  int64  
 13  Horror        83737 non-null  int64  
 14  IMAX          83737 non-null  int64  
 15  Musical       83737 non-null  int64  
 16  Mystery       83737 non-null  int64  
 17  Romance       83737 non-null  int64  
 18  Sci-Fi        83737 non-nu

## final.csv

In [8]:
final = movies.copy(). \
    pipe(pd.merge, ratings, on='movieId', how='inner')

final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81551 entries, 0 to 81550
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       81551 non-null  uint16 
 1   year          81551 non-null  float32
 2   title_length  81551 non-null  int32  
 3   Action        81551 non-null  int64  
 4   Adventure     81551 non-null  int64  
 5   Animation     81551 non-null  int64  
 6   Children      81551 non-null  int64  
 7   Comedy        81551 non-null  int64  
 8   Crime         81551 non-null  int64  
 9   Documentary   81551 non-null  int64  
 10  Drama         81551 non-null  int64  
 11  Fantasy       81551 non-null  int64  
 12  Film-Noir     81551 non-null  int64  
 13  Horror        81551 non-null  int64  
 14  IMAX          81551 non-null  int64  
 15  Musical       81551 non-null  int64  
 16  Mystery       81551 non-null  int64  
 17  Romance       81551 non-null  int64  
 18  Sci-Fi        81551 non-nu