In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils.config import COMMON_DATASET_PATH, OFFERS_DATASET_PATH
from src.utils.logger import logger

# Apply seaborn's "whitegrid" theme so the plots below share one style.
sns.set_theme(style="whitegrid")

In [None]:
# Load both CSV datasets from the paths configured in src.utils.config.
logger.info("Loading datasets...")

df = pd.read_csv(COMMON_DATASET_PATH)
offers = pd.read_csv(OFFERS_DATASET_PATH)

# Preview the first rows of each frame. `display` (available in
# IPython/Jupyter sessions) renders each frame as its own rich table; a bare
# `a, b` tuple as the last expression falls back to a plain-text tuple repr.
display(df.head(), offers.head())

In [None]:
# Structural overview of `df`: size, column names, dtypes, summary statistics.
logger.info("Dataset info")

print(f"Rows: {len(df)}")
print(f"Columns: {df.columns.to_list()}")

# .info() prints dtypes and non-null counts directly to stdout.
df.info()

# Summary statistics for every column, one row per column; as the last
# expression this is what the cell displays.
df.describe(include="all").T

In [None]:
# Report the number of distinct values in each identifier column.
for id_col in ("client_id", "offer_id"):
    print(f"Unique {id_col}:", df[id_col].nunique())

In [None]:
def plot_mean_conversion_by(frame, group_col):
    """Draw a bar chart of the mean of `conversion` per value of `group_col`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `group_col` and a `conversion` column.
    group_col : str
        Column to group by; also used in the title and x-axis label.

    Bars are sorted ascending by mean conversion. The figure is shown
    immediately via `plt.show()`; nothing is returned.
    """
    mean_conversion = frame.groupby(group_col)["conversion"].mean().sort_values()

    fig, ax = plt.subplots()
    mean_conversion.plot(kind="bar", ax=ax)
    ax.set_title(f"CTR by {group_col}")
    ax.set_xlabel(group_col)
    ax.set_ylabel("Mean conversion")
    plt.show()


# Relative frequency of each value of the `conversion` column.
fig, ax = plt.subplots()
df["conversion"].value_counts(normalize=True).plot(kind="bar", ax=ax)
ax.set_title("Conversion rate (0/1)")
ax.set_xlabel("conversion")
ax.set_ylabel("Share of rows")
plt.show()

# One chart per categorical dimension, in the same order as before.
for group_col in ("channel", "offer_type", "offer_category", "price_segment"):
    plot_mean_conversion_by(df, group_col)

In [None]:
# Columns whose distributions are plotted as histograms.
numeric_cols = [
    "recency_days",
    "frequency_90d",
    "monetary_90d",
    "email_open_rate_30d",
    "age",
]

# One histogram panel per listed column, 20 bins each, on a 12x8 figure.
df.hist(column=numeric_cols, bins=20, figsize=(12, 8))
plt.tight_layout()
plt.show()

In [None]:
# Columns considered for the pairwise correlation matrix.
corr_cols = [
    "conversion", "recency_days", "frequency_90d",
    "monetary_90d", "discounts_used_90d",
    "avg_discount_percent_90d", "category_affinity_top1"
]

# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions dropped them silently). Restrict to numeric/bool columns up front
# and report anything skipped so the omission is visible.
corr_input = df[corr_cols].select_dtypes(include=["number", "bool"])
skipped_cols = [col for col in corr_cols if col not in corr_input.columns]
if skipped_cols:
    print("Skipped non-numeric columns in correlation:", skipped_cols)

corr_matrix = corr_input.corr()

fig, ax = plt.subplots(figsize=(8, 6))
# Correlations are signed, so use a diverging colormap centred at 0 with the
# colour scale fixed to [-1, 1]; annotate each cell with two decimals.
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# Count rows that violate basic range expectations. Each mask is a boolean
# Series, so .sum() gives the number of offending rows.
sanity_checks = {
    "Negative recency:": df["recency_days"].lt(0),
    "Age out of range:": df["age"].lt(15) | df["age"].gt(90),
    "Invalid monetary_90d:": df["monetary_90d"].lt(0),
}

for label, violations in sanity_checks.items():
    print(label, violations.sum())