# Add genome data

## Imports

In [None]:
import os

import pandas as pd

from src.utils.const import DATA_DIR
from src.utils.wrapper import fill_na, drop, reset_index

### Useful paths to data

In [None]:
# The project root is taken to be the parent of the current working
# directory; each data folder is then resolved relative to it.
ROOT_DIR = os.path.join(os.getcwd(), '..')
RAW_DIR, INTERIM_DIR, PROCESSED_DIR = (
    os.path.join(ROOT_DIR, DATA_DIR, stage)
    for stage in ('raw', 'interim', 'processed')
)

## Import all interim .parquet files

In [None]:
# Load the four interim tables, each stored as a Parquet file directly
# inside INTERIM_DIR. The files are read in the order they are listed.
movies, tags, ratings, tmdb = (
    pd.read_parquet(os.path.join(INTERIM_DIR, filename))
    for filename in (
        'movies.parquet',
        'tags.parquet',
        'ratings.parquet',
        'tmdb.parquet',
    )
)

## genome-*.csv

In [None]:
# Read the raw genome scores, forcing 32-bit dtypes for the id and
# relevance columns instead of pandas' 64-bit defaults.
genome_scores = pd.read_csv(
    os.path.join(RAW_DIR, 'genome-scores.csv'),
    dtype={
        'movieId': 'int32',
        'tagId': 'int32',
        'relevance': 'float32',
    },
    encoding='utf-8',
)

# Print the column dtypes and memory footprint of the loaded table.
genome_scores.info()

In [None]:
# Read the raw genome tag lookup table (tagId -> tag), keeping the tag
# text in pandas' dedicated 'string' dtype.
genome_tags = pd.read_csv(
    os.path.join(RAW_DIR, 'genome-tags.csv'),
    dtype={
        'tagId': 'int32',
        'tag': 'string',
    },
    encoding='utf-8',
)

# Print the column dtypes and memory footprint of the loaded table.
genome_tags.info()

### Pipe genome-*.csv

In [None]:
# Build a wide relevance table: attach each tag's name to its score rows,
# then spread the tags into columns, one row per movieId.
tags_relevance = (
    genome_scores
    .merge(genome_tags, on='tagId', how='left')
    .pivot(index='movieId', columns='tag', values='relevance')
    # reset_index is a project wrapper; presumably it turns the movieId
    # index back into a regular column so it can be used as a merge key
    # later on -- confirm against src.utils.wrapper.
    .pipe(reset_index)
)

## final.csv

In [None]:
# Assemble the final table. Every join is an inner join on movieId, so a
# movie missing from any of the joined tables is dropped from the result.
final = (
    movies
    .merge(ratings, on='movieId', how='inner')
    .merge(tags, on='movieId', how='inner')
    # NOTE(review): an inner join on `tags` introduces no missing values by
    # itself, so this fill only matters if 'tag_count' already contains
    # NaNs -- or a left join was intended above. Confirm with the author.
    # fill_na is a project wrapper; 'zero' presumably selects a fill-with-0
    # strategy.
    .pipe(fill_na, 'tag_count', 'zero')
    .merge(tmdb, on='movieId', how='inner')
    .merge(tags_relevance, on='movieId', how='inner')
    # drop is a project wrapper; presumably it removes the movieId column
    # now that it has served as the join key.
    .pipe(drop, 'movieId')
)

# Print the column dtypes and memory footprint of the merged table.
final.info()

In [None]:
# Preview the first rows of the merged table.
final.head()