# Uplift Bank Marketing - Data Exploration

## Load data

In [None]:
%matplotlib inline
import os
import pandas as pd
from ucimlrepo import fetch_ucirepo
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

# Data directory. os.path.dirname("__file__") is applied to a string literal and
# evaluates to "", so the relative path is anchored at the current working directory.
project_root = os.path.abspath(
    os.path.join(os.path.dirname("__file__"), "../src/uplift_bank_marketing/data")
)
os.makedirs(project_root, exist_ok=True)

X_path = os.path.join(project_root, "X_bank_marketing.csv")
y_path = os.path.join(project_root, "y_bank_marketing.csv")

# Download only when at least one of the cached CSV files is missing
cache_missing = not (os.path.exists(X_path) and os.path.exists(y_path))

if cache_missing:
    print("Downloading data from UCI Machine Learning Repository...")
    bank_marketing = fetch_ucirepo(id=222)

    # Keep features and target, then cache both on disk for the next run
    X = bank_marketing.data.features
    y = bank_marketing.data.targets
    X.to_csv(X_path, index=False)
    y.to_csv(y_path, index=False)

    print("Data downloaded and saved successfully.")
else:
    print("Loading data from saved files...")
    X = pd.read_csv(X_path)
    y = pd.read_csv(y_path)
    print("Data loaded successfully.")

# Single analysis frame: all features plus the target in a 'target' column
df = X.copy()
df['target'] = y
df.head()


In [None]:
# Summary statistics of the combined frame (describe() defaults)
df.describe()

In [None]:
# Summary statistics for the `pdays` column
df["pdays"].describe()

In [None]:
# Distinct values observed in `campaign`
df["campaign"].unique()

In [None]:
# Distinct values observed in `previous`
df["previous"].unique()

In [None]:
# Binary flag: 1 when `pdays` differs from -1, otherwise 0
# NOTE(review): presumably -1 marks clients without an earlier contact - confirm against the dataset docs
df['pdays_contacted'] = (df['pdays'] != -1).astype(int)


In [None]:
# Sanity check: distinct values of the new `pdays_contacted` flag
df['pdays_contacted'].unique()

In [None]:
# Frequency table for "pdays_contacted" (missing values counted as their own level)
pdays_counts = (
    df['pdays_contacted']
    .value_counts(dropna=False)
    .rename_axis('pdays_contacted_value')
    .reset_index(name='count')
)
pdays_counts

In [None]:
# Frequency table for "campaign" (missing values counted as their own level)
pdays_counts = (
    df['campaign']
    .value_counts(dropna=False)
    .rename_axis('campaign_value')
    .reset_index(name='count')
)
pdays_counts

In [None]:
# Binary flag: 1 when `campaign` equals exactly 1, otherwise 0
df['campaign_once'] = (df['campaign'] == 1).astype(int)

In [None]:
# Contingency table of the two binary flags, with row/column totals
crosstab_result_full = pd.crosstab(
    index=df["pdays_contacted"],
    columns=df["campaign_once"],
    margins=True,
    dropna=False,
)

crosstab_result_full

In [None]:
# Target counts within each (pdays_contacted, campaign_once) combination, with totals
crosstab_result_full = pd.crosstab(
    index=[df["pdays_contacted"], df["campaign_once"]],
    columns=df["target"],
    margins=True,
    dropna=False,
)

crosstab_result_full

In [None]:
# Frequency table for "previous" (missing values counted as their own level)
pdays_counts = (
    df['previous']
    .value_counts(dropna=False)
    .rename_axis('previous_value')
    .reset_index(name='count')
)
pdays_counts

In [None]:
# Frequency table for "pdays" (missing values counted as their own level)
pdays_counts = (
    df['pdays']
    .value_counts(dropna=False)
    .rename_axis('pdays_value')
    .reset_index(name='count')
)
pdays_counts

In [None]:
# Distinct values observed in `pdays`
df["pdays"].unique()

In [None]:
# Distinct values observed in `poutcome` (unique() also reports a missing value, if present)
df["poutcome"].unique()

In [None]:
# Work on a copy: with a plain alias (`df_na = df`) the helper string columns
# created below would also be added to `df` and leak into every later step
# that consumes `df`.
df_na = df.copy()

# String versions of both columns, so that a missing value can show up as its
# own level in the crosstab (astype(str) renders NaN in an object column as "nan")
df_na['pdays_contacted_str'] = df_na['pdays_contacted'].astype(str)
df_na['poutcome_str'] = df_na['poutcome'].astype(str)

crosstab_result_full = pd.crosstab(
    df_na["pdays_contacted_str"], 
    df_na["poutcome_str"], 
    margins=True, 
    dropna=False
)

crosstab_result_full




In [None]:
# Target counts per (pdays_contacted_str, poutcome_str) pair, using the string
# columns so that a stringified missing value forms its own row level
crosstab_result_full = pd.crosstab(
    index=[df_na["pdays_contacted_str"], df_na["poutcome_str"]],
    columns=df_na["target"],
    margins=True,
    dropna=False,
)

crosstab_result_full

In [None]:
# Crosstab of the pdays_contacted flag vs poutcome (no margins)
crosstab_result = pd.crosstab(
    index=df["pdays_contacted"],
    columns=df["poutcome"],
    dropna=False,
)
crosstab_result


In [None]:
# Print the frame summary (columns, dtypes, non-null counts)
df.info()

In [None]:
# Identifying all numerical variables
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
num_cols = len(numerical_columns)

# Two histograms per row. squeeze=False makes plt.subplots always return a 2-D
# array of axes, so flattening is safe even when the grid has a single row
# (the squeezed form returns a 1-D array there, which must not be wrapped in a list).
n_rows = max(1, (num_cols + 1) // 2)
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, 6 * n_rows), squeeze=False)
axes = axes.flatten()

# Plotting each numerical column in its own subplot
for ax, col in zip(axes, numerical_columns):
    sns.histplot(df[col], kde=True, color='darkorange', ax=ax)
    ax.set_title(f"Distribution of {col} (Numerical Variable)")

# Removing any unused subplots (the last cell of the grid when the count is odd)
for ax in axes[num_cols:]:
    fig.delaxes(ax)

# Lay out after drawing so that titles and tick labels are taken into account
fig.tight_layout(pad=5.0)

plt.show()

In [None]:
def process_contact_column(data_in_function):
    """
    Turns the 'contact' column into a 0/1 'cellular' column.

    - Adds 'cellular_missing' (0/1) when 'contact' has at least one missing value.
    - Renames 'contact' to 'cellular' (in place on the passed frame).
    - Encodes case-insensitively: "cellular" -> 1, "telephone" -> 0; anything
      else, including missing values, becomes 0.

    Returns the same DataFrame object; a frame without 'contact' is returned untouched.
    """
    if "contact" not in data_in_function.columns:
        return data_in_function

    # Flag missing entries before they are collapsed into 0 below
    is_missing = data_in_function["contact"].isnull()
    if is_missing.sum() > 0:
        data_in_function["cellular_missing"] = is_missing.astype(int)

    data_in_function.rename(columns={"contact": "cellular"}, inplace=True)

    channel_codes = {"cellular": 1, "telephone": 0}
    lowered = data_in_function["cellular"].str.lower()
    data_in_function["cellular"] = lowered.map(channel_codes).fillna(0).astype(int)

    return data_in_function

def handle_missing_values(data_in_function):
    """
    Handles missing values in object-dtype columns of the DataFrame.

    - Columns with at most two distinct values: adds a '<col>_missing' 0/1 flag;
      the missing entries themselves are left in place.
    - Columns with more than two distinct values: missing entries are replaced
      with the label "unknown".
    - Non-object columns are not touched.

    The passed DataFrame is modified and returned.
    """
    null_counts = data_in_function.isnull().sum()
    missing_cols = data_in_function.columns[null_counts > 0]

    for col in missing_cols:
        if data_in_function[col].dtype != 'object':
            continue

        if data_in_function[col].nunique() <= 2:
            # Binary column: keep the information that the value was absent
            data_in_function[f"{col}_missing"] = data_in_function[col].isnull().astype(int)
        else:
            # Assign the result back instead of calling fillna(inplace=True) on
            # the selected column: the chained in-place form acts on a temporary
            # object under pandas Copy-on-Write and would leave the frame unchanged.
            data_in_function[col] = data_in_function[col].fillna("unknown")

    return data_in_function


def encode_binary_columns(data_in_function):
    """
    Encodes binary columns as 0/1 (no/yes).
    - Applies to binary columns (two unique values).
    """
    for col in data_in_function.select_dtypes(include=['object']).columns:
        if data_in_function[col].nunique() <= 2:
            data_in_function[col] = data_in_function[col].str.lower().map(
                {"yes": 1, "no": 0, "true": 1, "false": 0, "1": 1, "0": 0}
            )
            data_in_function[col] = data_in_function[col].fillna(0).astype(int)  # Any unknown becomes 0
    
    return data_in_function


def one_hot_encode_multilevel(data_in_function):
    """
    Applies One-Hot Encoding (0/1) to every object-dtype column.

    For each column one reference dummy is removed:
    - normally the second most frequent level;
    - if that level is "unknown", the third most frequent level instead
      (nothing is removed when there is no third level);
    - for a single-level column, that level, unless it is "unknown".
    The "unknown" dummy is therefore always retained.

    A column without any non-missing value yields no dummy columns.
    Returns a new DataFrame with the original object columns replaced by
    their dummy columns (appended at the end).
    """
    categorical_cols = data_in_function.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        # Levels ordered from most to least frequent (missing values are not counted)
        levels = list(data_in_function[col].value_counts().index)

        # Candidate reference levels in order of preference
        candidates = levels[1:3] if len(levels) > 1 else levels
        reference = next((level for level in candidates if level != "unknown"), None)

        # One-Hot Encoding (0/1)
        one_hot = pd.get_dummies(data_in_function[col], prefix=col).astype(int)

        # Remove the reference dummy from the dummy frame itself, so that an
        # unrelated pre-existing column with the same name is never dropped.
        if reference is not None:
            one_hot = one_hot.drop(columns=[f"{col}_{reference}"], errors="ignore")

        data_in_function = pd.concat([data_in_function.drop(columns=[col]), one_hot], axis=1)

    return data_in_function


def process_age_column(data_in_function):
    """
    Processes the 'age' column:
    - Bins 'age' into the groups 18-25, 26-35, 36-45, 46-55, 56-65 and 66+.
    - One-Hot Encodes these groups as 0/1, removing the reference category (18-25).
    - Drops the original 'age' column; no helper column is left behind.

    Ages below 18 (and missing ages) fall into no bin and get 0 in every
    dummy, i.e. they are indistinguishable from the reference category.

    Returns a new DataFrame; a frame without 'age' is returned unchanged.
    """
    if "age" not in data_in_function.columns:
        return data_in_function

    # Left-closed bins (right=False): [18, 26) -> "18-25", [26, 36) -> "26-35", ...
    # The edges sit one above each label's upper bound so that e.g. age 25
    # lands in "18-25" rather than in "26-35".
    bins = [18, 26, 36, 46, 56, 66, float("inf")]
    labels = ["18-25", "26-35", "36-45", "46-55", "56-65", "66+"]

    # Kept as a local Series so the input frame is not given an extra column
    age_group = pd.cut(data_in_function["age"], bins=bins, labels=labels, right=False)

    # One-Hot Encode the age groups (as 0/1, not boolean)
    age_dummies = pd.get_dummies(age_group, prefix="age_group", drop_first=True).astype(int)

    return pd.concat([data_in_function.drop(columns=["age"]), age_dummies], axis=1)


def process_balance_column(data_in_function):
    """
    Replaces 'balance' with two derived columns:
    - 'balance_negative': 1 where the balance is below zero, else 0.
    - 'balance_processed': log(1 + |balance|), standardized with a
      StandardScaler fitted on this same column.

    The original 'balance' column is dropped in place and the same DataFrame
    object is returned; a frame without 'balance' is returned untouched.
    """
    if "balance" not in data_in_function.columns:
        return data_in_function

    balance = data_in_function["balance"]

    # Sign is kept as a separate indicator because the log uses the magnitude only
    data_in_function["balance_negative"] = (balance < 0).astype(int)

    # log(1 + |balance|), shaped as a single-feature matrix for the scaler
    log_magnitude = np.log1p(np.abs(balance))
    feature_matrix = log_magnitude.values.reshape(-1, 1)

    data_in_function["balance_processed"] = StandardScaler().fit_transform(feature_matrix)

    data_in_function.drop(columns=["balance"], inplace=True)
    return data_in_function


def process_day_of_month_column(data_in_function):
    """
    Processes the 'day_of_week' column (actually day of the month).
    - Bins it into early (1-10), mid (11-20) and late (21-31) periods.
    - Creates 0/1 columns 'day_of_month_mid_month' and 'day_of_month_late_month';
      the first category (early_month) is dropped to avoid multicollinearity.
    - Drops the original 'day_of_week' column; no helper column is left behind.

    Returns a new DataFrame; a frame without 'day_of_week' is returned unchanged.
    """
    if "day_of_week" not in data_in_function.columns:
        return data_in_function

    # Right-closed bins: (0, 10], (10, 20], (20, 31]. The lower edge is 0 rather
    # than 1 so that day 1 is explicitly part of early_month instead of
    # falling outside every bin.
    bins = [0, 10, 20, 31]
    labels = ["early_month", "mid_month", "late_month"]

    # Kept as a local Series so the input frame is not given an extra column
    period = pd.cut(data_in_function['day_of_week'], bins=bins, labels=labels, right=True)

    # One-Hot Encode the periods with 0/1 and drop first category
    period_dummies = pd.get_dummies(period, prefix="day_of_month", drop_first=True).astype(int)

    return pd.concat([data_in_function.drop(columns=['day_of_week']), period_dummies], axis=1)


def process_treatment_columns(data_in_function):
    """
    Derives the treatment-related indicators and removes their raw sources.

    - 'treatment': 1 where 'campaign' > 0, else 0 (only if 'campaign' exists).
    - 'previous_contacted': 1 where 'previous' > 0, else 0 (only if 'previous' exists).
    - Removes 'campaign', 'pdays', 'previous' and 'duration' when present
      ('pdays' is treated as redundant with 'previous', 'duration' as leakage).

    The passed DataFrame is modified in place and returned.

    NOTE(review): if 'campaign' counts contacts starting at 1, 'treatment' is 1
    for every row - confirm the intended treatment definition.
    """
    obsolete = []

    if "campaign" in data_in_function.columns:
        # Binary treatment (0 = no contact, 1 = any contact)
        data_in_function["treatment"] = (data_in_function["campaign"] > 0).astype(int)
        obsolete.append("campaign")

    if "pdays" in data_in_function.columns:
        obsolete.append("pdays")

    if "previous" in data_in_function.columns:
        # Binary indicator for prior contact (0 = no, 1 = yes)
        data_in_function["previous_contacted"] = (data_in_function["previous"] > 0).astype(int)
        obsolete.append("previous")

    if "duration" in data_in_function.columns:
        obsolete.append("duration")

    # One in-place drop for all raw columns that were found
    if obsolete:
        data_in_function.drop(columns=obsolete, inplace=True)

    return data_in_function


def prepare_data(data_in_function):
    """
    Complete data preparation function combining all steps, in this order:
    - Processes 'contact' column (cellular).
    - Handles missing values.
    - Encodes binary columns.
    - One-Hot Encodes multi-level categorical columns.
    - Bins and encodes the 'age' column.
    - Processes the 'balance' column (negative flag, log transform, standardize).
    - Processes the 'day_of_week' (actually day of the month) column (binned with 0/1, drop first).
    - Processes the campaign-related columns.

    The input DataFrame is left untouched; a new, fully prepared DataFrame is
    returned, so the function can be called repeatedly on the same frame.
    """
    # Several steps rename, fill or drop columns in place on the frame they
    # receive. Running them on a copy keeps the caller's frame from being left
    # in a half-processed state.
    prepared = data_in_function.copy()

    pipeline = (
        process_contact_column,
        handle_missing_values,
        encode_binary_columns,
        one_hot_encode_multilevel,
        process_age_column,
        process_balance_column,
        process_day_of_month_column,
        process_treatment_columns,
    )
    for step in pipeline:
        prepared = step(prepared)

    return prepared



In [None]:
# Run the full preprocessing pipeline on the combined frame and preview the result
df_prepared = prepare_data(df)
df_prepared.head()

In [None]:
# Summary statistics of the prepared frame (describe() defaults)
df_prepared.describe()

In [None]:
# Identifying all numerical variables
numerical_columns = df_prepared.select_dtypes(include=['int64', 'float64']).columns
num_cols = len(numerical_columns)

# Two histograms per row. squeeze=False makes plt.subplots always return a 2-D
# array of axes, so flattening is safe even when the grid has a single row
# (the squeezed form returns a 1-D array there, which must not be wrapped in a list).
n_rows = max(1, (num_cols + 1) // 2)
fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, 6 * n_rows), squeeze=False)
axes = axes.flatten()

# Plotting each numerical column in its own subplot
for ax, col in zip(axes, numerical_columns):
    sns.histplot(df_prepared[col], kde=True, color='darkorange', ax=ax)
    ax.set_title(f"Distribution of {col} (Numerical Variable)")

# Removing any unused subplots (the last cell of the grid when the count is odd)
for ax in axes[num_cols:]:
    fig.delaxes(ax)

# Lay out after drawing so that titles and tick labels are taken into account
fig.tight_layout(pad=5.0)

plt.show()