# Exploratory Data Analysis

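This notebook explores the campaign dataset: the distribution of the response variable, customer demographics, campaign performance trends, correlations between numeric features, and response rates by customer segment.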

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Relative path to the raw campaign CSV.
DATA_PATH = "../data/raw/campaign_data.csv"

# Read the CSV into `df`, then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

In [None]:
# Dataset overview: dimensions, columns with missing values, summary statistics.
print("Shape:", df.shape)

# Count nulls per column once and reuse the result for both the filter and the
# "anything missing at all?" check.
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0]
print("\nMissing values:\n", missing if not missing.empty else "None")

# Transposed so that each dataset column becomes one row of the summary table.
df.describe(include="all").T

In [None]:
# 1. Response distribution
if "response" in df.columns:
    response = df["response"]
    bar_colors = ["#2ecc71", "#3498db"]
    fig, ax = plt.subplots(1, 2, figsize=(10, 4))
    # Left panel shows absolute counts, right panel the same split as proportions.
    panels = [
        (ax[0], False, "Response Distribution (Count)"),
        (ax[1], True, "Response Distribution (Proportion)"),
    ]
    for panel, as_share, panel_title in panels:
        response.value_counts(normalize=as_share).plot(kind="bar", ax=panel, color=bar_colors)
        panel.set_title(panel_title)
        panel.set_xlabel("Response")
    plt.tight_layout()
    plt.show()
    # Mean of the column, rounded to four decimals.
    print("Response rate:", response.mean().round(4))

## 2. Customer demographic distributions

In [None]:
# Age distribution
if "age" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 4))
    df["age"].hist(bins=30, ax=ax, edgecolor="white")
    ax.set_title("Customer Age Distribution")
    ax.set_xlabel("Age")
    ax.set_ylabel("Count")
    plt.tight_layout()
    plt.show()

# Categorical demographics: job, marital, education
# Lay out panels only for the columns that exist, so a missing column does not
# leave a blank subplot (or an entirely empty figure when none are present).
demographic_cols = [col for col in ["job", "marital", "education"] if col in df.columns]
if demographic_cols:
    n_panels = len(demographic_cols)
    # Width scales with the panel count (14 inches for the full set of three).
    # squeeze=False always returns a 2-D array; ravel() gives a flat sequence
    # of axes even when there is a single panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(14 * n_panels / 3, 4), squeeze=False)
    axes = axes.ravel()
    for col, ax in zip(demographic_cols, axes):
        # value_counts sorts by frequency (descending) and excludes NaN.
        counts = df[col].value_counts()
        if not counts.empty:
            # Bars are drawn with matplotlib and coloured from the viridis
            # palette, avoiding seaborn's deprecated "palette without hue" path.
            ax.barh(
                counts.index.astype(str),
                counts.to_numpy(),
                color=sns.color_palette("viridis", n_colors=len(counts)),
            )
            # barh stacks upwards; invert so the most frequent category is on top.
            ax.invert_yaxis()
        ax.set_xlabel("count")
        ax.set_ylabel(col)
        ax.set_title(f"Distribution by {col.title()}")
    plt.tight_layout()
    plt.show()

## 3. Campaign performance trends

In [None]:
# Every chart in this cell needs the `response` column; check for it once.
has_response = "response" in df.columns

# Contact-count buckets with fewer rows than this are left out of the last chart.
MIN_ROWS_PER_CONTACT_BUCKET = 50

# Response rate by contact channel
if has_response and "contact" in df.columns:
    rate_contact = df.groupby("contact")["response"].agg(["mean", "count"])
    rate_contact.columns = ["response_rate", "count"]
    # Ascending sort puts the highest rate at the top of the horizontal bar chart.
    rate_contact = rate_contact.sort_values("response_rate", ascending=True)
    fig, ax = plt.subplots(figsize=(6, 4))
    rate_contact["response_rate"].plot(kind="barh", ax=ax, color="steelblue")
    ax.set_title("Campaign Response Rate by Contact Channel")
    ax.set_xlabel("Response Rate")
    plt.tight_layout()
    plt.show()

# Response rate by month
if has_response and "month" in df.columns:
    month_order = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    # Normalise labels such as "Jan", " JAN " or "January" to the three-letter
    # lower-case form used in month_order, so they are grouped together and
    # placed in calendar order instead of being dropped by the reindex below.
    month_key = df["month"].astype(str).str.strip().str.lower().str[:3]
    rate_month = df.groupby(month_key)["response"].mean()
    # Keep calendar order, restricted to the months actually present.
    rate_month = rate_month.reindex([m for m in month_order if m in rate_month.index])
    if rate_month.empty:
        # Nothing matched (e.g. months stored as numbers): skip the chart
        # rather than attempting to plot an empty series.
        print("No recognisable month labels in 'month'; skipping monthly response chart.")
    else:
        fig, ax = plt.subplots(figsize=(10, 4))
        rate_month.plot(kind="bar", ax=ax, color="coral")
        ax.set_title("Campaign Response Rate by Month")
        ax.set_xlabel("Month")
        ax.set_ylabel("Response Rate")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Response rate by number of campaign contacts
if has_response and "campaign" in df.columns:
    rate_campaign = df.groupby("campaign")["response"].agg(["mean", "count"]).reset_index()
    # Drop sparsely populated buckets whose mean would be noisy.
    rate_campaign = rate_campaign[rate_campaign["count"] >= MIN_ROWS_PER_CONTACT_BUCKET]
    fig, ax = plt.subplots(figsize=(8, 4))
    # Cast to str so the bars are evenly spaced categories rather than positions
    # on a numeric axis.
    ax.bar(rate_campaign["campaign"].astype(str), rate_campaign["mean"], color="seagreen")
    ax.set_title("Response Rate by Number of Campaign Contacts")
    ax.set_xlabel("Number of contacts during campaign")
    ax.set_ylabel("Response Rate")
    plt.tight_layout()
    plt.show()

## 4. Correlation analysis

In [None]:
# Numeric columns only
numeric_df = df.select_dtypes(include=["number"])
has_target = "response" in numeric_df.columns

# Compute the pairwise correlation matrix once; both the heatmap and the
# target ranking below read from it.
if len(numeric_df.columns) > 1 or has_target:
    corr = numeric_df.corr()

# A heatmap is only drawn when there are at least two numeric columns.
if len(numeric_df.columns) > 1:
    fig, ax = plt.subplots(figsize=(10, 8))
    # center=0 places the midpoint of the diverging colormap at zero correlation.
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="RdBu_r", center=0, square=True, ax=ax)
    ax.set_title("Correlation Matrix (Numeric Features)")
    plt.tight_layout()
    plt.show()

# Correlation with target
if has_target:
    target_corr = corr["response"].drop("response", errors="ignore")
    # Order by absolute strength while keeping the signed values.
    target_corr = target_corr.reindex(target_corr.abs().sort_values(ascending=False).index)
    print("Correlation with response:\n", target_corr)

## 5. Response rate by segment

In [None]:
# Jobs with fewer rows than this are excluded from the job chart.
MIN_JOB_CONTACTS = 100

# Response rate by job (only jobs with at least MIN_JOB_CONTACTS rows)
if "job" in df.columns and "response" in df.columns:
    job_rates = df.groupby("job").agg(response_rate=("response", "mean"), count=("response", "count"))
    # Filter out small groups, then sort ascending so the highest rate ends up
    # at the top of the horizontal bar chart.
    job_rates = job_rates[job_rates["count"] >= MIN_JOB_CONTACTS].sort_values("response_rate", ascending=True)
    fig, ax = plt.subplots(figsize=(8, 5))
    job_rates["response_rate"].plot(kind="barh", ax=ax, color="teal")
    ax.set_title(f"Response Rate by Job (min {MIN_JOB_CONTACTS} contacts)")
    ax.set_xlabel("Response Rate")
    plt.tight_layout()
    plt.show()

# Response rate by marital status and education
for col in ["marital", "education"]:
    if col in df.columns and "response" in df.columns:
        seg = df.groupby(col)["response"].mean().sort_values(ascending=True)
        # Create the axes explicitly: Series.plot without `ax` can draw onto a
        # figure that is still open, in which case `figsize` is not applied.
        fig, ax = plt.subplots(figsize=(6, 3))
        seg.plot(kind="bar", ax=ax, title=f"Response Rate by {col.title()}")
        ax.set_ylabel("Response Rate")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()