# Raw data exploration

### Imports

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

from src.utils.const import DATA_DIR, DROP

### Useful path to data

In [2]:
ROOT_DIR = os.path.join(os.getcwd(), '..')
RAW_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'raw')
INTERIM_DIR = os.path.join(ROOT_DIR, DATA_DIR, 'interim')

## Generic useful function

In [3]:
from src.utils.wrapper import drop, drop_na, rename, reset_index, fill_na, convert_to, extract_stat_feature, apply

### movies.csv

In [4]:
movies = pd.read_csv(
    os.path.join(RAW_DIR, 'movies.csv'),
    encoding='utf-8',
    dtype={'movieId':'uint16', 'title': 'string', 'genres': 'category'}
)

movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   movieId  58098 non-null  uint16  
 1   title    58098 non-null  string  
 2   genres   58098 non-null  category
dtypes: category(1), string(1), uint16(1)
memory usage: 758.3 KB


#### Specific movies functions used in pipe

In [5]:
def extract_year_from_title(df: pd.DataFrame) -> pd.DataFrame:
    regex = '.*\((\d{4})\).*'
    df['year'] = df['title'].str.extract(pat=regex, expand=False)
    return df


def extract_title_length(df: pd.DataFrame) -> pd.DataFrame:
    df['title_length'] = df['title'].str.len()
    return df


def encode_genre(df: pd.DataFrame) -> pd.DataFrame:
    genres = df['genres'].str.split('|')
    mlb = MultiLabelBinarizer()
    encoded_genre = pd.DataFrame(
        mlb.fit_transform(genres),
        index=df['movieId'],
        columns=mlb.classes_
    )
    df = pd.merge(df, encoded_genre, on='movieId', how='inner')
    return df


def remove_no_genres(df: pd.DataFrame) -> pd.DataFrame:
    df_no_genre = df[df['(no genres listed)'] == 1].index
    df.drop(index=df_no_genre, inplace=True)
    if DROP:
        print(f'Number of films without any genres to be dropped: {df_no_genre.shape[0]}')
    return df

#### Pipe movies

In [6]:
movies = movies. \
    pipe(extract_year_from_title). \
    pipe(convert_to, 'year', 'float32'). \
    pipe(fill_na, 'year', True). \
    pipe(extract_title_length). \
    pipe(convert_to, 'title_length', 'int32'). \
    pipe(encode_genre). \
    pipe(remove_no_genres). \
    pipe(drop, ['title', 'genres'])

movies.info()

Number of films without any genres to be dropped: 6259
<class 'pandas.core.frame.DataFrame'>
Int64Index: 83737 entries, 0 to 89995
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             83737 non-null  uint16 
 1   year                83737 non-null  float32
 2   title_length        83737 non-null  int32  
 3   (no genres listed)  83737 non-null  int32  
 4   Action              83737 non-null  int32  
 5   Adventure           83737 non-null  int32  
 6   Animation           83737 non-null  int32  
 7   Children            83737 non-null  int32  
 8   Comedy              83737 non-null  int32  
 9   Crime               83737 non-null  int32  
 10  Documentary         83737 non-null  int32  
 11  Drama               83737 non-null  int32  
 12  Fantasy             83737 non-null  int32  
 13  Film-Noir           83737 non-null  int32  
 14  Horror              83737 non-null  int32  
 15

#### Save movies.csv to interim

In [7]:
filepath = os.path.join(INTERIM_DIR, 'movies.csv')
if not os.path.exists(filepath):
    movies.to_csv(filepath, encoding='utf-8', index=False)

### tags.csv

In [8]:
tags = pd.read_csv(
    os.path.join(RAW_DIR, 'tags.csv'),
    encoding='utf-8',
    usecols=['movieId', 'tag'],
    dtype={'movieId':'uint16', 'tag': 'string'}
)

tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108997 entries, 0 to 1108996
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   movieId  1108997 non-null  uint16
 1   tag      1108981 non-null  string
dtypes: string(1), uint16(1)
memory usage: 10.6 MB


#### Pipe tags

In [9]:
tags = tags. \
    pipe(drop_na). \
    pipe(extract_stat_feature, ['movieId'], 'tag', ['count']). \
    pipe(reset_index). \
    pipe(rename, {'count':'tag_count'}). \
    pipe(convert_to, 'tag_count', 'int32')

tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36383 entries, 0 to 36382
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movieId    36383 non-null  uint64
 1   tag_count  36383 non-null  int32 
dtypes: int32(1), uint64(1)
memory usage: 426.5 KB


#### Save tags.csv to interim

In [10]:
filepath = os.path.join(INTERIM_DIR, 'tags.csv')
if not os.path.exists(filepath):
    tags.to_csv(filepath, encoding='utf-8', index=False)

### ratings.csv

In [11]:
ratings = pd.read_csv(
    os.path.join(RAW_DIR, 'ratings.csv'),
    encoding='utf-8',
    usecols=['movieId', 'rating'],
    dtype={'movieId':'uint16', 'rating':'float32'}
)

ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   movieId  uint16 
 1   rating   float32
dtypes: float32(1), uint16(1)
memory usage: 158.8 MB


#### Pipe ratings

In [12]:
ratings = ratings. \
    pipe(extract_stat_feature, ['movieId'], 'rating', ['count', 'mean']). \
    pipe(reset_index). \
    pipe(rename, {'count':'rating_count', 'mean':'rating_mean'}). \
    pipe(convert_to, 'rating_count', 'int32')

ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41026 entries, 0 to 41025
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       41026 non-null  uint64 
 1   rating_count  41026 non-null  int32  
 2   rating_mean   41026 non-null  float32
dtypes: float32(1), int32(1), uint64(1)
memory usage: 641.2 KB


#### Save ratings.csv to interim

In [13]:
filepath = os.path.join(INTERIM_DIR, 'ratings.csv')
if not os.path.exists(filepath):
    ratings.to_csv(filepath, encoding='utf-8', index=False)

### links.csv

In [14]:
links = pd.read_csv(
    os.path.join(RAW_DIR, 'links.csv'),
    encoding='utf-8',
    usecols=['movieId', 'imdbId'],
    dtype={'movieId': 'uint16', 'imdbId': 'string'}
)

links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  uint16
 1   imdbId   58098 non-null  string
dtypes: string(1), uint16(1)
memory usage: 567.5 KB


#### Pipe links

In [15]:
links = links. \
    pipe(apply, 'imdbId', lambda x: f'tt{x}'). \
    pipe(convert_to, 'imdbId', 'string')

links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  uint16
 1   imdbId   58098 non-null  string
dtypes: string(1), uint16(1)
memory usage: 567.5 KB


#### Save links.csv to interim

In [16]:
filepath = os.path.join(INTERIM_DIR, 'links.csv')
if not os.path.exists(filepath):
    links.to_csv(filepath, encoding='utf-8', index=False)