# 01 — EDA and Preprocessing
Inspect the raw Kaggle files, decide column names, filter sparse users/items, and create train/test splits.


In [2]:
import sys
from pathlib import Path

import pandas as pd

# Make the project-local `src` package importable regardless of where the
# kernel was started. Walk up from the current working directory to the first
# ancestor that contains a `src/` directory and put that ancestor on sys.path.
# If no such directory is found, nothing is changed and the import below fails
# with the usual ModuleNotFoundError.
for _candidate in (Path.cwd(), *Path.cwd().parents):
    if (_candidate / "src").is_dir():
        if str(_candidate) not in sys.path:
            sys.path.insert(0, str(_candidate))
        break

from src import config
from src.data_loading import load_interactions, load_games_metadata, merge_datasets
from src.preprocessing import filter_users_and_items, train_test_split_by_time

# Short notebook-level aliases for the settings defined in src/config.py.
# Edit the values there, not here, so every notebook stays in sync.
RAW_INTERACTIONS = config.INTERACTIONS_FILE
RAW_GAMES = config.GAMES_METADATA_FILE
USER_COL = config.USER_COL
ITEM_COL = config.ITEM_COL
TIMESTAMP_COL = config.TIMESTAMP_COL
INTERACTION_VALUE_COL = config.INTERACTION_VALUE_COL
GAME_ID_COL_IN_GAMES = config.GAME_ID_COL_IN_GAMES


ModuleNotFoundError: No module named 'src'

## Load raw datasets
Adjust `INTERACTIONS_FILE` / `GAMES_METADATA_FILE` in `src/config.py` (aliased above as `RAW_INTERACTIONS` / `RAW_GAMES`) if your files differ. Inspect heads and column names to decide which columns to use.


In [None]:
# Read both raw tables in one step: the interaction log first, then the
# games metadata, each from the location configured in src/config.py.
interactions_raw, games_raw = (
    load_interactions(RAW_INTERACTIONS),
    load_games_metadata(RAW_GAMES),
)


## Inspect and choose columns
Update `src/config.py` after deciding the correct column names for:
- `USER_COL`, `ITEM_COL`
- `TIMESTAMP_COL` (set to `None` if unavailable)
- `INTERACTION_VALUE_COL` (e.g., hours played / rating / implicit 1)


In [None]:
# Show the first rows of the value-count table for the item column.
# The label ends with an escaped newline so the Series starts on its own line.
print("Interaction column value counts\n", interactions_raw[ITEM_COL].value_counts().head())

# Only summarise timestamps when a timestamp column is configured (not None or
# empty) and is actually present in the loaded data.
if TIMESTAMP_COL and TIMESTAMP_COL in interactions_raw.columns:
    print(interactions_raw[TIMESTAMP_COL].describe())


## Merge and align IDs
Ensure item identifiers match between the interactions and the games metadata. Rename the games-metadata ID column to match `ITEM_COL`.


In [None]:
# Column names the merge helper needs, gathered in one place.
_merge_columns = {
    "user_col": USER_COL,
    "item_col": ITEM_COL,
    "game_id_col_in_games": GAME_ID_COL_IN_GAMES,
}

# Align the two raw tables; the helper hands back the cleaned interactions
# and the cleaned games metadata, in that order.
interactions_clean, games_clean = merge_datasets(interactions_raw, games_raw, **_merge_columns)


## Filter rare users/items
Set thresholds in `config.MIN_USER_INTERACTIONS` / `config.MIN_ITEM_INTERACTIONS`.


In [None]:
# Thresholds and column names for the sparsity filter, all read from config.
_filter_options = {
    "min_user_interactions": config.MIN_USER_INTERACTIONS,
    "min_item_interactions": config.MIN_ITEM_INTERACTIONS,
    "user_col": USER_COL,
    "item_col": ITEM_COL,
}

interactions_filtered = filter_users_and_items(interactions_clean, **_filter_options)

# Report the (rows, columns) that survive the filter.
print("After filtering:", interactions_filtered.shape)


## Train/test split
Time-aware split if timestamp is available; otherwise random with a fixed seed.


In [None]:
# Settings for the split helper. NOTE(review): n_test_items=1 presumably holds
# out one interaction per user; confirm against src/preprocessing.py.
_split_options = {
    "user_col": USER_COL,
    "item_col": ITEM_COL,
    "timestamp_col": TIMESTAMP_COL,
    "n_test_items": 1,
    "random_state": config.RANDOM_STATE,
}

train_df, test_df = train_test_split_by_time(interactions_filtered, **_split_options)

# Quick size check of both partitions.
print("Train size", train_df.shape, "Test size", test_df.shape)


## Save processed data
Saved files feed later notebooks.


In [None]:
# Output directory for processed artifacts; create it (and any missing parents)
# if needed, without failing when it already exists.
processed_dir = config.PROCESSED_DATA_DIR
processed_dir.mkdir(parents=True, exist_ok=True)

train_path = processed_dir / "train_interactions.parquet"
test_path = processed_dir / "test_interactions.parquet"
games_path = processed_dir / "games_metadata.parquet"

# index=False keeps the DataFrame index out of the written files.
# NOTE: DataFrame.to_parquet needs a parquet engine (pyarrow or fastparquet).
train_df.to_parquet(train_path, index=False)
test_df.to_parquet(test_path, index=False)
games_clean.to_parquet(games_path, index=False)

# One item per line; the previous sep="" ran the label and all three paths
# together into a single unreadable token.
print("Saved:", train_path, test_path, games_path, sep="\n")
