In [None]:
# --- Imports ---
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# --- Load raw data ---
# Path is relative to the working directory the script/notebook is run from.
df = pd.read_csv('../data/raw/retail_customers.csv')
print(df.head())

# --- Basic Info ---
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

# --- Missing values ---
# Per-column count of null entries.
missing = df.isnull().sum()
# sep="\n" keeps the first row of the Series aligned with the rest; the default
# separator would insert a leading space after the newline.
print("Missing values:", missing, sep="\n")

# --- Exploratory Data Analysis (EDA) ---
# Each seaborn call returns the Axes it drew on; titles are set on that Axes
# directly rather than through the pyplot "current axes" state.

# Number of rows per value of the 'Churn' column.
churn_ax = sns.countplot(data=df, x='Churn')
churn_ax.set_title('Customer Churn Distribution')
plt.show()

# Histogram of the 'Age' column (20 bins) with a KDE curve overlaid.
ages = df['Age']
age_ax = sns.histplot(ages, kde=True, bins=20)
age_ax.set_title('Customer Age Distribution')
plt.show()

# Distribution of 'TotalSpend' within each 'Churn' group.
spend_ax = sns.boxplot(data=df, x='Churn', y='TotalSpend')
spend_ax.set_title('Spending by Churn Status')
plt.show()

# --- Correlation Heatmap ---
# Pairwise correlations between numeric columns only.
corr = df.corr(numeric_only=True)
_, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

# --- Save snapshot for next steps ---
# NOTE(review): nothing above modifies `df`, so the file written here has the
# same contents as the raw input despite the "cleaned" name. Add the actual
# cleaning steps before this point, or confirm that downstream code expects an
# unmodified copy.
processed_path = Path('../data/processed/retail_customers_cleaned.csv')
# to_csv does not create missing directories and raises OSError if the target
# folder is absent, so make sure it exists first.
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
