In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from src.data_prep.build_ml_training_dataset import (
    load_common,
    build_ml_dataset
)

from src.utils.config import (
    COMMON_DATASET_PATH,
    ML_TRAINING_DATASET_PATH,
    DATA_PROCESSED
)

from src.utils.logger import get_logger

# Notebook-wide logger; every cell below uses it to announce its preprocessing step.
logger = get_logger(__name__)

In [None]:
# Step 1: read the raw "common" dataset through the project loader.
# (load_common is imported from src.data_prep.build_ml_training_dataset;
# where it reads from is not visible in this notebook.)
logger.info("Loading raw common dataset...")
df_raw = load_common()

# Preview of the first five rows, shown as the cell output.
df_raw.head(n=5)

In [None]:
logger.info("Dataset info before preprocessing")

# Plain-text overview: number of rows and the full list of column names.
print("Rows:", df_raw.shape[0])
print("Columns:", df_raw.columns.to_list())

# Per-column dtypes and non-null counts (printed by pandas itself).
df_raw.info()

# Summary statistics for every column, transposed so that each source column
# becomes one row of the displayed table.
df_raw.describe(include="all").T

In [None]:
# Step 2: derive the ML frame from the raw dataset via the project builder.
# NOTE(review): build_ml_dataset is defined outside this notebook; whether it
# copies df_raw or modifies it in place cannot be told from here.
logger.info("Building ML dataset...")
df_ml = build_ml_dataset(df_raw)

# Preview of the first five rows of the result.
df_ml.head(n=5)

In [None]:
logger.info("Cleaning outliers based on EDA results...")

# Limits derived from the EDA: column name -> (lower bound, upper bound).
# Values outside a range are pulled to the nearest bound; no rows are dropped.
eda_clip_bounds = {
    "monetary_90d": (0, 50000),
    "frequency_90d": (0, 200),
    "discounts_used_90d": (0, 50),
}

for feature, (lower, upper) in eda_clip_bounds.items():
    df_ml[feature] = df_ml[feature].clip(lower, upper)

df_ml.head()

In [None]:
# NOTE(review): the log message mentions type casting, but this cell only
# fills missing values; no dtype conversion happens here.
logger.info("Handling missing values and type casting...")


def _fill_missing(frame, columns, value):
    """Replace missing entries with `value` in each listed column of `frame`.

    Columns are processed one at a time and written back into `frame`,
    so the frame passed in is modified in place.
    """
    for name in columns:
        frame[name] = frame[name].fillna(value)


# Categorical columns: missing values are replaced with a placeholder label.
cat_cols = [
    "offer_type",
    "offer_category",
    "favorite_category",
    "visited_category_14d",
    "gender",
    "price_segment",
    "channel",
]
_fill_missing(df_ml, cat_cols, "unknown")

# Numeric columns: missing values -> 0.
# The selection is made after the categorical fill and covers every float/int
# column of df_ml.
# NOTE(review): this is not restricted to feature columns; if the `conversion`
# target is float/int, its missing values are replaced by 0 as well - confirm
# that this is intended.
num_cols = df_ml.select_dtypes(include=["float", "int"]).columns.to_list()
_fill_missing(df_ml, num_cols, 0)

df_ml.head()

In [None]:
logger.info("Target distribution after preprocessing")

# Relative frequency of each value of the target column.
target = df_ml["conversion"]
target.value_counts(normalize=True)

In [None]:
logger.info("Saving processed ML dataset...")

# Keep creating the configured processed-data directory, as before.
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Also create the directory the CSV is actually written to. Nothing in this
# notebook guarantees that ML_TRAINING_DATASET_PATH lives inside
# DATA_PROCESSED, and DataFrame.to_csv does not create missing parent
# directories. Path(...) accepts both str and Path values.
Path(ML_TRAINING_DATASET_PATH).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written as a column.
df_ml.to_csv(ML_TRAINING_DATASET_PATH, index=False)

print("Saved to:", ML_TRAINING_DATASET_PATH)

# Displayed as (row count, (rows, columns)).
len(df_ml), df_ml.shape

In [None]:
logger.info("Final schema check")

# Column dtypes and non-null counts of the frame that was just processed.
df_ml.info()

# First ten rows of the final frame as the cell output.
df_ml.head(n=10)