# Add genome data

## Imports

In [1]:
import os

import pandas as pd
from pandas import pivot

from src.utils.const import DATA_DIR
from src.utils.wrapper import fill_na, drop, reset_index

### Useful paths to data

In [2]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), '..')

# One sub-folder of <ROOT_DIR>/<DATA_DIR> per data stage.
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, stage)
    for stage in ('raw', 'interim', 'processed')
)

## Import all interim .parquet files

In [3]:
# Read the four interim tables from INTERIM_DIR, one parquet file each.
# The unpacking order matches the order of the file names below.
movies, tags, ratings, tmdb = (
    pd.read_parquet(os.path.join(INTERIM_DIR, filename))
    for filename in (
        'movies.parquet',
        'tags.parquet',
        'ratings.parquet',
        'tmdb.parquet',
    )
)

## genome-*.csv

In [4]:
# Declare the column types up front so the score table is read straight into
# 32-bit columns.
genome_scores_path = os.path.join(RAW_DIR, 'genome-scores.csv')
genome_scores_dtypes = {
    'movieId': 'int32',
    'tagId': 'int32',
    'relevance': 'float32',
}

genome_scores = pd.read_csv(
    genome_scores_path,
    encoding='utf-8',
    dtype=genome_scores_dtypes,
)

# Summarise columns, dtypes and memory footprint.
genome_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14862528 entries, 0 to 14862527
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int32  
 1   tagId      int32  
 2   relevance  float32
dtypes: float32(1), int32(2)
memory usage: 170.1 MB


In [5]:
# Lookup table mapping each tagId to its tag text.
genome_tags_path = os.path.join(RAW_DIR, 'genome-tags.csv')
genome_tags_dtypes = {'tagId': 'int32', 'tag': 'string'}

genome_tags = pd.read_csv(
    genome_tags_path,
    encoding='utf-8',
    dtype=genome_tags_dtypes,
)

# Summarise columns, dtypes and memory footprint.
genome_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int32 
 1   tag     1128 non-null   string
dtypes: int32(1), string(1)
memory usage: 13.3 KB


### Pipe genome-*.csv

In [6]:
# Attach the tag text to every score row, then pivot so that each movieId
# becomes one row and each tag becomes one relevance column.
# NOTE(review): `reset_index` is a project helper; presumably it turns the
# movieId index back into a regular column for the later merge - confirm.
tags_relevance = (
    genome_scores
    .merge(genome_tags, on='tagId', how='left')
    .pivot(index='movieId', columns='tag', values='relevance')
    .pipe(reset_index)
)

## final.csv

In [7]:
# Join every table on movieId. All joins are inner joins, so only movies
# present in every table are kept.
# NOTE(review): `fill_na` and `drop` are project helpers; presumably they fill
# missing `tag_count` values with zero and remove the `movieId` column - confirm.
# NOTE(review): because the `tags` join is an inner join, `tag_count` can only
# be missing if it is already missing in `tags` - confirm a left join was not
# intended.
final = (
    movies
    .merge(ratings, on='movieId', how='inner')
    .merge(tags, on='movieId', how='inner')
    .pipe(fill_na, 'tag_count', 'zero')
    .merge(tmdb, on='movieId', how='inner')
    .merge(tags_relevance, on='movieId', how='inner')
    .pipe(drop, 'movieId')
)

# Summarise the shape, dtypes and memory footprint of the assembled table.
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12987 entries, 0 to 12986
Columns: 1153 entries, year to zombies
dtypes: float32(1132), int32(21)
memory usage: 57.2 MB


In [8]:
# Preview the first rows of the assembled table.
final.head()

Unnamed: 0,year,title_length,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,world politics,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies
0,1995.0,16,0,1,1,1,1,0,0,0,...,0.03775,0.0225,0.04075,0.03175,0.1295,0.0455,0.02,0.0385,0.09125,0.02225
1,1995.0,14,0,1,0,1,0,0,0,0,...,0.04775,0.0205,0.0165,0.0245,0.1305,0.027,0.01825,0.01225,0.09925,0.0185
2,1995.0,23,0,0,0,0,1,0,0,0,...,0.058,0.02375,0.0355,0.02125,0.12775,0.0325,0.01625,0.02125,0.09525,0.0175
3,1995.0,24,0,0,0,0,1,0,0,1,...,0.049,0.03275,0.02125,0.03675,0.15925,0.05225,0.015,0.016,0.09175,0.015
4,1995.0,34,0,0,0,0,1,0,0,0,...,0.05375,0.02625,0.0205,0.02125,0.17725,0.0205,0.015,0.0155,0.08875,0.01575
