# Add genome data

## Imports

In [1]:
import os

import pandas as pd
from pandas import pivot

from src.utils.const import DATA_DIR
from src.utils.wrapper import fill_na, drop, reset_index

### Useful paths to data

In [2]:
# Root is expressed relative to the current working directory (one level up).
ROOT_DIR = os.path.join(os.getcwd(), '..')

# Each data stage lives in its own sub-folder of ROOT_DIR/DATA_DIR.
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, stage)
    for stage in ('raw', 'interim', 'processed')
)

## Import all interim .parquet files

In [3]:
# Load the four interim tables stored as parquet files under INTERIM_DIR.
movies = pd.read_parquet(os.path.join(INTERIM_DIR, 'movies.parquet'))
tags = pd.read_parquet(os.path.join(INTERIM_DIR, 'tags.parquet'))
ratings = pd.read_parquet(os.path.join(INTERIM_DIR, 'ratings.parquet'))
tmdb = pd.read_parquet(os.path.join(INTERIM_DIR, 'tmdb.parquet'))

## genome-*.csv

In [4]:
# Raw genome scores, parsed with explicit 32-bit column dtypes.
scores_path = os.path.join(RAW_DIR, 'genome-scores.csv')
scores_dtypes = {
    'movieId': 'int32',
    'tagId': 'int32',
    'relevance': 'float32',
}
genome_scores = pd.read_csv(scores_path, encoding='utf-8', dtype=scores_dtypes)

# Print the column dtypes and memory footprint of the loaded frame.
genome_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14862528 entries, 0 to 14862527
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int32  
 1   tagId      int32  
 2   relevance  float32
dtypes: float32(1), int32(2)
memory usage: 170.1 MB


In [5]:
# Raw genome tag table (tagId and tag), with explicit column dtypes.
tags_path = os.path.join(RAW_DIR, 'genome-tags.csv')
tags_dtypes = {
    'tagId': 'int32',
    'tag': 'string',
}
genome_tags = pd.read_csv(tags_path, encoding='utf-8', dtype=tags_dtypes)

# Print the column dtypes, non-null counts and memory footprint.
genome_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int32 
 1   tag     1128 non-null   string
dtypes: int32(1), string(1)
memory usage: 13.3 KB


In [6]:
# Show the genome tag table as the cell output.
genome_tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


### Pipe genome-*.csv

In [None]:
# Build a wide table indexed by movie with one relevance column per tag:
#   1. attach the tag label to every score row (left join on tagId),
#   2. pivot so each distinct tag becomes a column holding its relevance,
#   3. apply the project's reset_index wrapper
#      (presumably turns movieId back into a regular column - confirm).
tags_relevance = (
    genome_scores
    .pipe(pd.merge, genome_tags, on='tagId', how='left')
    .pipe(pivot, index='movieId', columns='tag', values='relevance')
    .pipe(reset_index)
)

## final.csv

In [None]:
# Assemble the final table by inner-joining every source on movieId, so only
# movies present in all of the joined tables are kept.
# fill_na / drop are project wrappers: presumably fill missing tag_count
# values with zero and remove the movieId column - confirm against src.utils.
final = (
    movies
    .pipe(pd.merge, ratings, on='movieId', how='inner')
    .pipe(pd.merge, tags, on='movieId', how='inner')
    .pipe(fill_na, 'tag_count', 'zero')
    .pipe(pd.merge, tmdb, on='movieId', how='inner')
    .pipe(pd.merge, tags_relevance, on='movieId', how='inner')
    .pipe(drop, 'movieId')
)

# Print a summary of the assembled frame.
final.info()

In [None]:
# Preview the first rows of the assembled table.
final.head()