# Raw data exploration

### Imports

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

from src.utils.const import DATA_DIR

### Useful paths to the data

In [None]:
# The project root is taken to be the parent of the current working
# directory; the raw CSV files live under <root>/<DATA_DIR>/raw.
ROOT_DIR = os.path.join(os.getcwd(), os.pardir)
RAW_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'raw')

### movies.csv

In [None]:
# Load movies.csv with explicit column dtypes, then print a summary of the
# resulting frame.
movies_csv = os.path.join(RAW_DIR, 'movies.csv')
movies_dtypes = {'movieId': 'int32', 'title': 'string', 'genres': 'category'}
movies = pd.read_csv(movies_csv, encoding='utf-8', dtype=movies_dtypes)

movies.info()

#### Movie-specific functions used in the pipe

In [None]:
def encode_genre(df: pd.DataFrame) -> pd.DataFrame:
    """Append one indicator column per genre found in ``df['genres']``.

    Each value of the ``genres`` column is split on ``'|'`` and the resulting
    label lists are binarized with ``MultiLabelBinarizer``: one column per
    distinct label, holding 1 where the row carries that label and 0
    otherwise.

    The indicator columns are attached to ``df`` by row position rather than
    by joining on ``movieId``. A key-based join multiplies rows whenever a
    ``movieId`` value is repeated; positional alignment always yields exactly
    one output row per input row.

    Args:
        df: Frame with a ``genres`` column of ``'|'``-separated labels.

    Returns:
        A new frame with the columns of ``df`` followed by the indicator
        columns (in ``mlb.classes_`` order), indexed 0..n-1.
    """
    genres = df['genres'].str.split('|')
    mlb = MultiLabelBinarizer()
    # No explicit index: the default 0..n-1 index lines up with the reset
    # index of ``df`` used in the concat below.
    encoded_genre = pd.DataFrame(
        mlb.fit_transform(genres),
        columns=mlb.classes_,
    )
    return pd.concat([df.reset_index(drop=True), encoded_genre], axis=1)


def remove_no_genres(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without the rows flagged as having no genre.

    A row is removed when its ``'(no genres listed)'`` column equals 1.
    Filtering uses a boolean mask instead of dropping by index label, so only
    the flagged rows themselves are removed even when index labels repeat.

    Args:
        df: Frame containing a ``'(no genres listed)'`` indicator column.

    Returns:
        A new frame holding the remaining rows, with their original index
        labels. ``df`` itself is not modified.

    Raises:
        KeyError: If ``df`` has no ``'(no genres listed)'`` column.
    """
    has_genre = df['(no genres listed)'] != 1
    # Copy so the result is an independent frame, as ``DataFrame.drop`` gave.
    return df.loc[has_genre].copy()

#### Pipe movies

In [None]:
# Preprocessing: derive the release year and title length from the title,
# then add the per-genre indicator columns.
# The leading greedy '.*' makes the pattern capture the last parenthesised
# four-digit group in the title.
regex_year = r'.*\((\d{4})\).*'
title = movies['title']
movies = movies.assign(
    year=title.str.extract(pat=regex_year, expand=False),
    title_length=title.str.len(),
)
movies = movies.astype({'year': 'float32', 'title_length': 'int32'})
movies = encode_genre(movies)

# Cleaning: fill missing years with the median year, drop movies without a
# genre, and remove the columns that are no longer needed.
median_year = movies['year'].median()
movies = movies.fillna({'year': median_year})
movies = remove_no_genres(movies)
movies = movies.drop(columns=['title', 'genres', '(no genres listed)'])

movies.info()

### tags.csv

In [None]:
# Load only the movieId and tag columns of tags.csv with explicit dtypes,
# then print a summary of the resulting frame.
tags_csv = os.path.join(RAW_DIR, 'tags.csv')
tags_dtypes = {'movieId': 'int32', 'tag': 'string'}
tags = pd.read_csv(
    tags_csv,
    encoding='utf-8',
    usecols=list(tags_dtypes),
    dtype=tags_dtypes,
)

tags.info()

#### Pipe tags

In [None]:
# Collapse the tags to one row per movie holding the number of non-missing
# tags attached to it.
tag_counts = tags.groupby('movieId')['tag'].count()
tags = tag_counts.reset_index(name='tag_count').astype(
    {'movieId': 'int32', 'tag_count': 'int32'}
)

tags.info()

### ratings.csv

In [None]:
# Load only the movieId and rating columns of ratings.csv with explicit
# dtypes, then print a summary of the resulting frame.
ratings_csv = os.path.join(RAW_DIR, 'ratings.csv')
ratings_dtypes = {'movieId': 'int32', 'rating': 'float32'}
ratings = pd.read_csv(
    ratings_csv,
    encoding='utf-8',
    usecols=list(ratings_dtypes),
    dtype=ratings_dtypes,
)

ratings.info()

#### Pipe ratings

In [None]:
# Collapse the ratings to one row per movie holding the number of ratings
# and their mean.
rating_stats = ratings.groupby('movieId')['rating'].agg(
    rating_count='count',
    rating_mean='mean',
)
ratings = rating_stats.reset_index().astype(
    {'movieId': 'int32', 'rating_count': 'int32'}
)

ratings.info()