# Raw data exploration

### Imports

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

from src.utils.const import DATA_DIR

### Useful paths to data

In [None]:
# The project root is taken to be the parent of the current working directory.
ROOT_DIR = os.path.join(os.getcwd(), os.pardir)
# Raw CSV inputs live under <root>/<DATA_DIR>/raw.
RAW_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'raw')
# Directory for figures: <root>/reports/figures.
FIGURE_DIR = os.path.join(os.path.join(ROOT_DIR, 'reports'), 'figures')

## Generic useful functions

In [None]:
from src.utils.wrapper import drop, drop_na, rename, reset_index, fill_na, convert_to, extract_stat_feature

### movies.csv

In [None]:
# Load the raw movies table with explicit dtypes for every column.
movies = pd.read_csv(
    os.path.join(RAW_DIR, 'movies.csv'),
    dtype={
        'movieId': 'int32',
        'title': 'string',
        'genres': 'category',
    },
    encoding='utf-8',
)

# Print column dtypes, non-null counts and memory usage.
movies.info()

#### Movie-specific functions used in the pipe

In [None]:
def extract_year_from_title(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``year`` column with the four-digit year found in ``title``.

    The year is a four-digit number enclosed in parentheses. Because the
    pattern opens with a greedy ``.*``, the last such group in the title is
    the one captured. Titles without one get a missing value. The captured
    value is kept as text; no numeric conversion happens here.

    Args:
        df: Frame with a string ``title`` column.

    Returns:
        The same ``df`` object, modified in place, so the function can be
        chained with ``DataFrame.pipe``.
    """
    year_pattern = '.*\\((\\d{4})\\).*'
    titles = df['title']
    # expand=False yields a Series (single capture group) rather than a frame.
    df['year'] = titles.str.extract(year_pattern, expand=False)
    return df


def extract_title_length(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``title_length`` column with the character count of each title.

    Args:
        df: Frame with a string ``title`` column.

    Returns:
        The same ``df`` object, modified in place, so the function can be
        chained with ``DataFrame.pipe``.
    """
    title_lengths = df['title'].str.len()
    df['title_length'] = title_lengths
    return df


def encode_genre(df: pd.DataFrame) -> pd.DataFrame:
    """Append one 0/1 indicator column per genre found in ``genres``.

    Each ``genres`` value is split on ``'|'`` and the resulting label lists
    are binarized with ``MultiLabelBinarizer``; the indicator columns are
    named after the fitted classes.

    Args:
        df: Frame with a ``genres`` column of pipe-separated labels.

    Returns:
        A new frame holding the original columns followed by the indicator
        columns, with a fresh ``0..n-1`` index. ``df`` itself is not
        modified.
    """
    # Give the frame a default index so indicator rows can be attached by
    # position; this also matches the default index a key merge would return.
    base = df.reset_index(drop=True)
    genre_lists = base['genres'].str.split('|')

    mlb = MultiLabelBinarizer()
    encoded_genre = pd.DataFrame(
        mlb.fit_transform(genre_lists),
        index=base.index,
        columns=mlb.classes_,
    )

    # The indicator rows are already in the same order as ``base``.  Aligning
    # them row by row (instead of merging on 'movieId') keeps exactly one
    # output row per input row even when a movieId is repeated; a key merge
    # would pair every duplicate with every other duplicate.
    return pd.concat([base, encoded_genre], axis=1)


def remove_no_genres(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the rows whose ``'(no genres listed)'`` indicator equals 1.

    Rows are removed by index label, directly on the frame passed in.

    Args:
        df: Frame with a ``'(no genres listed)'`` indicator column.

    Returns:
        The same ``df`` object, modified in place, so the function can be
        chained with ``DataFrame.pipe``.
    """
    is_unlabelled = df['(no genres listed)'].eq(1)
    unlabelled_rows = df.loc[is_unlabelled].index
    df.drop(unlabelled_rows, axis='index', inplace=True)
    return df

#### Pipe movies

In [None]:
# Feature pipeline for movies. The generic steps (convert_to, fill_na, drop)
# come from src.utils.wrapper; the others are defined in this notebook.
movies = (
    movies
    .pipe(extract_year_from_title)
    .pipe(convert_to, 'year', 'float32')
    .pipe(fill_na, 'year', 'median')
    .pipe(extract_title_length)
    .pipe(convert_to, 'title_length', 'int32')
    .pipe(encode_genre)
    .pipe(remove_no_genres)
    # The source columns and the now-constant indicator are no longer needed.
    .pipe(drop, ['title', 'genres', '(no genres listed)'])
)

movies.info()

### tags.csv

In [None]:
# Load only the movie id and tag text from the raw tags table.
tags = pd.read_csv(
    os.path.join(RAW_DIR, 'tags.csv'),
    usecols=['movieId', 'tag'],
    dtype={
        'movieId': 'int32',
        'tag': 'string',
    },
    encoding='utf-8',
)

# Print column dtypes, non-null counts and memory usage.
tags.info()

#### Pipe tags

In [None]:
# Feature pipeline for tags, built from the src.utils.wrapper helpers;
# the 'count' statistic ends up in an int32 column named 'tag_count'.
tags = (
    tags
    .pipe(drop_na)
    .pipe(extract_stat_feature, ['movieId'], 'tag', ['count'])
    .pipe(reset_index)
    .pipe(rename, {'count': 'tag_count'})
    .pipe(convert_to, 'tag_count', 'int32')
)

tags.info()

### ratings.csv

In [None]:
# Load only the movie id and rating value from the raw ratings table.
ratings = pd.read_csv(
    os.path.join(RAW_DIR, 'ratings.csv'),
    usecols=['movieId', 'rating'],
    dtype={
        'movieId': 'int32',
        'rating': 'float32',
    },
    encoding='utf-8',
)

# Print column dtypes, non-null counts and memory usage.
ratings.info()

#### Pipe ratings

In [None]:
# Feature pipeline for ratings, built from the src.utils.wrapper helpers;
# the statistics are renamed to 'rating_count' (cast to int32) and 'rating_mean'.
ratings = (
    ratings
    .pipe(extract_stat_feature, ['movieId'], 'rating', ['count', 'mean'])
    .pipe(reset_index)
    .pipe(rename, {'count': 'rating_count', 'mean': 'rating_mean'})
    .pipe(convert_to, 'rating_count', 'int32')
)

ratings.info()