# Raw data exploration

### Imports

In [1]:
import os
import pandas as pd
from typing import List, Dict
from sklearn.preprocessing import MultiLabelBinarizer

from src.utils.util import overview_data
from src.utils.const import DATA_DIR, DROP

### Path to raw data

In [2]:
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is started from a direct sub-folder of the repository — TODO confirm).
ROOT_DIR = os.path.join(os.getcwd(), '..')
# Raw CSV files are read from <ROOT_DIR>/<DATA_DIR>/raw; DATA_DIR is imported from src.utils.const.
RAW_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'raw')

### Generic helper functions

In [3]:
def drop_na(df: pd.DataFrame) -> pd.DataFrame:
    """Remove, in place, every row that contains at least one missing value.

    Args:
        df: Frame to clean; it is modified directly.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    df.dropna(axis=0, inplace=True)
    return df


def drop(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Delete the given columns from ``df`` in place.

    Args:
        df: Frame to modify directly.
        columns: Names of the columns to remove.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    df.drop(labels=columns, axis='columns', inplace=True)
    return df


def fill_na(df: pd.DataFrame, column: str, use_median: bool) -> pd.DataFrame:
    """Replace missing values of one column by that column's median or mean.

    Args:
        df: Frame to modify directly.
        column: Name of the column whose missing values are filled.
        use_median: Fill with the median when true, otherwise with the mean.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    fill_value = df[column].median() if use_median else df[column].mean()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form acts on a temporary and
    # leaves ``df`` untouched when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(fill_value)
    return df


def convert_to(df: pd.DataFrame, column: str, _type: str) -> pd.DataFrame:
    """Cast one column of ``df`` to another dtype, overwriting the column.

    Args:
        df: Frame to modify directly.
        column: Name of the column to cast.
        _type: Target dtype, passed as-is to ``Series.astype``.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    converted = df[column].astype(_type)
    df[column] = converted
    return df


def rename(df: pd.DataFrame, _dict: Dict[str, str]) -> pd.DataFrame:
    """Rename columns of ``df`` in place.

    Args:
        df: Frame to modify directly.
        _dict: Mapping from current column name to new column name.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    df.rename(mapper=_dict, axis='columns', inplace=True)
    return df


def reset_index(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the current index of ``df`` into a regular column, in place.

    Args:
        df: Frame to modify directly.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    df.reset_index(drop=False, inplace=True)
    return df


def extract_feature_count(df: pd.DataFrame, by: str, column: str) -> pd.DataFrame:
    """Count the non-missing values of ``column`` within each ``by`` group.

    Args:
        df: Source frame; it is not modified.
        by: Name of the column to group on.
        column: Name of the column whose values are counted.

    Returns:
        A new frame with the ``by`` column and a ``<column>_count`` column.
    """
    per_group = df.groupby(by=by, as_index=False)[column]
    counts = per_group.count()
    new_names = {column: f'{column}_count'}
    return pd.DataFrame(counts.rename(columns=new_names))


def extract_feature_mean(df: pd.DataFrame, by: str, column: str) -> pd.DataFrame:
    """Average the values of ``column`` within each ``by`` group.

    Args:
        df: Source frame; it is not modified.
        by: Name of the column to group on.
        column: Name of the column whose values are averaged.

    Returns:
        A new frame with the ``by`` column and a ``<column>_mean`` column.
    """
    per_group = df.groupby(by=by, as_index=False)[column]
    averages = per_group.mean()
    new_names = {column: f'{column}_mean'}
    return pd.DataFrame(averages.rename(columns=new_names))


def extract_stat_feature(df: pd.DataFrame, by: List[str], column: str, stat: List[str]) -> pd.DataFrame:
    """Compute several aggregate statistics of ``column`` per ``by`` group.

    Args:
        df: Source frame; it is not modified.
        by: Names of the columns to group on.
        column: Name of the column to aggregate.
        stat: Aggregation names understood by pandas (e.g. ``'count'``, ``'mean'``).

    Returns:
        A new frame indexed by the grouping keys, with one column per entry of
        ``stat``. Call ``reset_index`` on it to turn the keys into columns.
    """
    # Keep the group keys in the index explicitly. With as_index=False the
    # result of a list aggregation differs between pandas versions (keys in
    # the index vs. keys as columns), which would make a following
    # reset_index either necessary or harmful depending on the installation.
    return df.groupby(by, as_index=True)[column].agg(stat)

### movies.csv

In [4]:
# Load the raw movies table; 'title' is read with the pandas 'string' dtype
# and 'genres' as a categorical column.
movies = pd.read_csv(os.path.join(RAW_DIR, 'movies.csv'), encoding='utf-8',
                     dtype={'title': 'string', 'genres': 'category'})
# overview_data is a project helper (src.utils.util); judging by the recorded
# output it reports the shape and the column names — confirm in its source.
overview_data(movies)

Shape: (58098, 3)
Columns: ['movieId' 'title' 'genres']


#### Movie-specific functions used in the pipe

In [5]:
def extract_year_from_title(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``year`` column holding the four-digit year found in ``title``.

    The year is the content of a parenthesised four-digit group such as
    ``(1995)``. Because the pattern starts with a greedy ``.*``, the last such
    group is the one captured when a title contains several. Titles without a
    match get a missing value. The extracted year is text, not a number.

    Args:
        df: Frame with a ``title`` column; it is modified directly.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    # Raw string: '\(' and '\d' are regex escapes, not Python string escapes.
    year_pattern = r'.*\((\d{4})\).*'
    df['year'] = df['title'].str.extract(pat=year_pattern, expand=False)
    return df


def extract_title_length(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``title_length`` column with the character count of each title.

    Args:
        df: Frame with a ``title`` column; it is modified directly.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    titles = df['title']
    df['title_length'] = titles.str.len()
    return df


def encode_genre(df: pd.DataFrame) -> pd.DataFrame:
    """Append one 0/1 indicator column per genre found in ``genres``.

    The ``genres`` column is split on ``'|'`` and binarised with
    ``MultiLabelBinarizer``; the indicators are then inner-merged back onto
    ``df`` through ``movieId``.

    Args:
        df: Frame with ``movieId`` and ``genres`` columns.

    Returns:
        A new frame: the columns of ``df`` followed by the genre indicators.
    """
    binarizer = MultiLabelBinarizer()
    genre_lists = df['genres'].str.split('|')
    indicator_matrix = binarizer.fit_transform(genre_lists)
    # Index the indicators by movieId so that the merge key can be resolved.
    one_hot = pd.DataFrame(
        indicator_matrix,
        index=df['movieId'],
        columns=binarizer.classes_,
    )
    return df.merge(one_hot, on='movieId', how='inner')


def remove_no_genres(df: pd.DataFrame) -> pd.DataFrame:
    """Drop, in place, the rows flagged in the ``(no genres listed)`` column.

    The rows are removed unconditionally; the ``DROP`` constant (imported from
    ``src.utils.const``) only decides whether the number of removed rows is
    printed.

    Args:
        df: Frame containing a ``(no genres listed)`` indicator column.

    Returns:
        The same ``df`` object, so the function can be chained with ``pipe``.
    """
    without_genre = df['(no genres listed)'] == 1
    labels_to_drop = df.loc[without_genre].index
    df.drop(index=labels_to_drop, inplace=True)
    if DROP:
        print(f'Number of films without any genres to be dropped: {labels_to_drop.shape[0]}')
    return df

#### Pipe movies

In [6]:
# Movies feature pipeline: derive the release year from the title (cast to
# float32, gaps filled with the median), add the title length, one-hot encode
# the genres, discard films without a genre and finally drop the raw text
# columns.
movies = (
    movies
    .pipe(extract_year_from_title)
    .pipe(convert_to, 'year', 'float32')
    .pipe(fill_na, 'year', True)
    .pipe(extract_title_length)
    .pipe(encode_genre)
    .pipe(remove_no_genres)
    .pipe(drop, ['title', 'genres'])
)

movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53832 entries, 0 to 58097
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             53832 non-null  int64  
 1   year                53832 non-null  float32
 2   title_length        53832 non-null  Int64  
 3   (no genres listed)  53832 non-null  int32  
 4   Action              53832 non-null  int32  
 5   Adventure           53832 non-null  int32  
 6   Animation           53832 non-null  int32  
 7   Children            53832 non-null  int32  
 8   Comedy              53832 non-null  int32  
 9   Crime               53832 non-null  int32  
 10  Documentary         53832 non-null  int32  
 11  Drama               53832 non-null  int32  
 12  Fantasy             53832 non-null  int32  
 13  Film-Noir           53832 non-null  int32  
 14  Horror              53832 non-null  int32  
 15  IMAX                53832 non-null  int32  
 16  Musi

### tags.csv

In [7]:
# Load only the 'movieId' and 'tag' columns of the raw tags table; 'tag' is
# read with the pandas 'string' dtype.
tags = pd.read_csv(os.path.join(RAW_DIR, 'tags.csv'), encoding='utf-8', usecols=['movieId', 'tag'],
                   dtype={'tag': 'string'})
# overview_data is a project helper (src.utils.util); judging by the recorded
# output it reports the shape and the column names — confirm in its source.
overview_data(tags)

Shape: (1108997, 2)
Columns: ['movieId' 'tag']


#### Pipe tags

In [8]:
# Tags feature pipeline: discard rows with missing values, then count the
# tags attached to each movie.
tags = (
    tags
    .pipe(drop_na)
    .pipe(extract_feature_count, 'movieId', 'tag')
)

tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45981 entries, 0 to 45980
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   movieId    45981 non-null  int64
 1   tag_count  45981 non-null  int64
dtypes: int64(2)
memory usage: 718.6 KB


### ratings.csv

In [9]:
# Load only the 'movieId' and 'rating' columns of the raw ratings table.
ratings = pd.read_csv(os.path.join(RAW_DIR, 'ratings.csv'), encoding='utf-8', usecols=['movieId', 'rating'])
# overview_data is a project helper (src.utils.util); judging by the recorded
# output it reports the shape and the column names — confirm in its source.
overview_data(ratings)

Shape: (27753444, 2)
Columns: ['movieId' 'rating']


#### Pipe ratings

In [10]:
# Ratings feature pipeline: compute the number of ratings and the mean rating
# per movie, turn the index into a regular column, and give the two
# statistics descriptive names.
ratings = (
    ratings
    .pipe(extract_stat_feature, ['movieId'], 'rating', ['count', 'mean'])
    .pipe(reset_index)
    .pipe(rename, {'count': 'rating_count', 'mean': 'rating_mean'})
)

ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53889 entries, 0 to 53888
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       53889 non-null  int64  
 1   rating_count  53889 non-null  int64  
 2   rating_mean   53889 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.2 MB


### After

In [None]:
# Load the raw links table with all of its columns; this cell has no recorded
# execution count and nothing in the visible part of the notebook uses `links` yet.
links = pd.read_csv(os.path.join(RAW_DIR, 'links.csv'), encoding='utf-8')