# EDA (Exploratory Data Analysis)

Dataset: `customers_ecommerce_churn.csv`

This notebook mirrors `src/eda.py`, but is easier to browse on GitHub.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the raw CSV, relative to this notebook.
DATA_PATH = "../data/customers_ecommerce_churn.csv"

# Load the dataset into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head()

In [None]:
# Row/column counts together with the dtype pandas inferred for each column.
df.shape, df.dtypes

In [None]:
# Parse the date columns; values that cannot be parsed become NaT
# (errors='coerce') and therefore show up in the missing-value counts below.
date_columns = ('signup_date', 'last_purchase_date')
for column in date_columns:
    parsed = pd.to_datetime(df[column], errors='coerce')
    df[column] = parsed

# Missing values per column, largest first.
null_counts = df.isna().sum()
missing = null_counts.sort_values(ascending=False)

# Number of rows that exactly repeat an earlier row.
duplicates = df.duplicated().sum()

(missing, duplicates)

In [None]:
# Remove exact duplicate rows (only when some were counted) and renumber
# the index so it is contiguous again.
if duplicates > 0:
    deduplicated = df.drop_duplicates()
    df = deduplicated.reset_index(drop=True)

df.shape

In [None]:
# Columns summarised below with describe().
num_cols = [
    'sessions_last_30d',
    'orders_last_90d',
    'avg_order_value_usd',
    'discount_rate',
    'support_tickets_last_90d',
    'return_rate',
    'nps_score',
    'tenure_months',
    'gross_revenue_12m_usd',
]

# One row per column, one column per summary statistic.
summary = df[num_cols].describe()
summary.transpose()

In [None]:
# Draw one 30-bin histogram per selected column, each in its own figure.
hist_columns = [
    'sessions_last_30d',
    'orders_last_90d',
    'avg_order_value_usd',
    'gross_revenue_12m_usd',
    'nps_score',
]
for column in hist_columns:
    plt.figure()
    observed = df[column].dropna()  # missing values are left out of the plot
    observed.hist(bins=30)
    plt.title(f'Distribution: {column}')
    plt.xlabel(column)
    plt.ylabel('count')
    plt.show()

In [None]:
# For each categorical column: mean of `churned` per category (missing
# categories kept as their own group), shown as a table and as a bar chart.
category_columns = ('region', 'acquisition_channel', 'preferred_device')
for column in category_columns:
    churned_by_group = df.groupby(column, dropna=False)['churned']
    churn_rate = churned_by_group.mean().sort_values(ascending=False)
    display(churn_rate)

    plt.figure()
    churn_rate.plot.bar()
    plt.title(f'Churn rate by {column}')
    plt.ylabel('mean(churned)')
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns and `churned`, then each
# column's correlation with `churned`, from highest to lowest.
corr_columns = [*num_cols, 'churned']
corr = df[corr_columns].corr(numeric_only=True)

churn_corr = corr['churned']
churn_corr.sort_values(ascending=False)

In [None]:
# Count, per column, the values lying more than 1.5 * IQR below the first
# quartile or above the third quartile.
outliers = {}
for column in ('gross_revenue_12m_usd', 'avg_order_value_usd', 'orders_last_90d'):
    values = df[column].dropna()
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # Missing values are excluded; they never compare as below/above a fence.
    beyond_fences = values.lt(lower_fence) | values.gt(upper_fence)
    outliers[column] = int(beyond_fences.sum())
outliers