# Uplift Bank Marketing - Data Exploration

## Load data

In [None]:
%matplotlib inline
import os
import pandas as pd
from ucimlrepo import fetch_ucirepo
import seaborn as sns
import matplotlib.pyplot as plt

# Resolve the data directory as an absolute path.
# NOTE: "__file__" is a literal string here (not the module attribute), so
# dirname() returns "" and the relative path below is anchored at the current
# working directory before abspath() expands it.
project_root = os.path.abspath(
    os.path.join(os.path.dirname("__file__"), "../src/uplift_bank_marketing/data")
)
os.makedirs(project_root, exist_ok=True)

# Locations of the cached feature / target CSV files
X_path = os.path.join(project_root, "X_bank_marketing.csv")
y_path = os.path.join(project_root, "y_bank_marketing.csv")

# Download only when at least one cached file is missing; otherwise reuse the cache
if not (os.path.exists(X_path) and os.path.exists(y_path)):
    print("Downloading data from UCI Machine Learning Repository...")
    bank_marketing = fetch_ucirepo(id=222)

    # Keep features and target, then persist both for the next run
    X = bank_marketing.data.features
    y = bank_marketing.data.targets

    X.to_csv(X_path, index=False)
    y.to_csv(y_path, index=False)

    print("Data downloaded and saved successfully.")
else:
    print("Loading data from saved files...")
    X = pd.read_csv(X_path)
    y = pd.read_csv(y_path)
    print("Data loaded successfully.")

# Single analysis frame: a copy of the features plus the target as an extra column
df = X.copy()
df['target'] = y
df.head()


## Quick Overview

In [None]:
# Preview the top of the combined frame (5 rows is the pandas default)
df.head(n=5)

In [None]:
# Preview the bottom of the combined frame (5 rows is the pandas default)
df.tail(n=5)

In [None]:
# Basic information about the data: prints a per-column summary
# (non-null counts and dtypes) together with the frame's memory usage
df.info()

In [None]:
# Summary statistics of the numerical features, transposed so that each
# feature is a row and each statistic is a column
df.describe().transpose()

In [None]:
# Count the missing entries per column and print only the affected columns
missing_values = df.isna().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])

In [None]:
# Heatmap of the null mask: one row per record, one column per feature
null_mask = df.isnull()
plt.figure(figsize=(12, 8))
sns.heatmap(null_mask, cbar=False, cmap="viridis")
plt.title("Missing Data Pattern")
plt.show()

# Share of missing values per column (in percent), keeping only the affected
# columns and ordering them from most to least incomplete
missing_percent = null_mask.sum().div(len(df)).mul(100)
missing_percent = missing_percent.loc[missing_percent > 0].sort_values(ascending=False)

# Text report of the percentages
print("Missing Values (%):")
print(missing_percent)

# Horizontal bar chart of the same percentages
plt.figure(figsize=(10, 6))
sns.barplot(x=missing_percent.values, y=missing_percent.index, palette="magma")
plt.title("Percentage of Missing Values by Feature")
plt.xlabel("Percentage of Missing Values")
plt.show()

The heatmap shows that the missing values are not confined to a particular block of rows; they are scattered throughout the dataset. The share of missing values in `poutcome` is strikingly high, however, so we need to consider carefully whether this column is useful.

In [None]:
# Columns with missing values
missing_cols = df.columns[df.isnull().sum() > 0]

# Bar charts for the categorical columns that contain missing values.
# seaborn drops NaN rows when counting, so the missing entries are first
# relabelled as an explicit "Missing" category; otherwise the chart would not
# show what its title promises.
for col in missing_cols:
    if df[col].dtype != 'object':  # Only for categorical variables
        continue

    labelled = df[col].fillna("Missing")

    plt.figure(figsize=(10, 5))
    # Bars are ordered from the most to the least frequent category
    sns.countplot(y=labelled, order=labelled.value_counts().index, palette="magma")
    plt.title(f"Distribution of {col} (Including Missing Values)")
    plt.show()


## Target Variable Distribution


In [None]:
# Count plot of the target classes
target_ax = sns.countplot(x='target', data=df, palette="magma")
target_ax.set_title('Target Variable Distribution')
plt.show()

## Visualize Features

In [None]:
# All object-typed columns are treated as categorical variables
categorical_columns = df.select_dtypes(include=['object']).columns

# One horizontal count plot per categorical variable, bars sorted by frequency
for col in categorical_columns:
    by_frequency = df[col].value_counts().index

    plt.figure(figsize=(10, 5))
    sns.countplot(y=col, data=df, order=by_frequency, palette="magma")
    plt.title(f"Distribution of {col} (Categorical Variable)")
    plt.show()


In [None]:
def process_contact_column(data_in_function):
    """
    Turn the 'contact' column into a 0/1 'cellular' indicator (in place).

    - Adds 'cellular_missing' (1 where 'contact' was null) when at least one
      value is missing.
    - Renames 'contact' to 'cellular'.
    - Encodes "cellular" as 1 and "telephone" as 0 (case-insensitive); every
      other value, including missing ones, becomes 0.

    The frame is returned unchanged when it has no 'contact' column.
    """
    # Guard clause: nothing to do without the source column
    if "contact" not in data_in_function.columns:
        return data_in_function

    # Record where the contact type is missing before it is overwritten below
    contact_is_null = data_in_function['contact'].isnull()
    if contact_is_null.any():
        data_in_function['cellular_missing'] = contact_is_null.astype(int)

    data_in_function.rename(columns={"contact": "cellular"}, inplace=True)

    # cellular -> 1, telephone -> 0; anything unmapped falls back to 0
    lowered = data_in_function["cellular"].str.lower()
    codes = lowered.map({"cellular": 1, "telephone": 0})
    data_in_function["cellular"] = codes.fillna(0).astype(int)

    return data_in_function

def handle_missing_values(data_in_function):
    """
    Handles missing values in the object-typed columns of the DataFrame.

    - Binary columns (at most two distinct non-null values): adds a
      "<col>_missing" 0/1 flag column and leaves the nulls in place.
    - Multi-level categorical columns: replaces nulls with "unknown".
    - Non-object columns are left untouched.

    The DataFrame is modified in place and also returned.
    """
    missing_cols = data_in_function.columns[data_in_function.isnull().sum() > 0]

    for col in missing_cols:
        if data_in_function[col].dtype != 'object':
            continue

        if data_in_function[col].nunique() <= 2:
            # Binary column: every column in missing_cols has nulls, so the
            # flag is always meaningful here
            data_in_function[f"{col}_missing"] = data_in_function[col].isnull().astype(int)
        else:
            # Multi-level column: assign the filled Series back explicitly.
            # `df[col].fillna(..., inplace=True)` operates on a temporary
            # object and is a no-op under pandas Copy-on-Write.
            data_in_function[col] = data_in_function[col].fillna("unknown")

    return data_in_function


def encode_binary_columns(data_in_function):
    """
    Encodes binary columns as 0/1 (no/yes).
    - Applies to binary columns (two unique values).
    """
    for col in data_in_function.select_dtypes(include=['object']).columns:
        if data_in_function[col].nunique() <= 2:
            data_in_function[col] = data_in_function[col].str.lower().map(
                {"yes": 1, "no": 0, "true": 1, "false": 0, "1": 1, "0": 0}
            )
            data_in_function[col] = data_in_function[col].fillna(0).astype(int)  # Any unknown becomes 0
    
    return data_in_function


def one_hot_encode_multilevel(data_in_function):
    """
    Applies One-Hot Encoding (0/1) to every object-typed column.

    For each column one category is dropped as the baseline:
    - normally the second most frequent category (the only category when the
      column has a single level);
    - if that would be "unknown", the third most frequent one instead, or none
      at all when there is no third category, so "unknown" is always retained.

    A column without any non-null value produces no indicator columns and is
    simply removed. The input frame is not modified; a new frame is returned
    (the input itself when it has no object columns).
    """
    categorical_cols = data_in_function.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        # Categories ordered from most to least frequent (nulls excluded)
        ranked = list(data_in_function[col].value_counts().index)

        if not ranked:
            baseline = None  # all-null column: nothing to drop
        elif len(ranked) == 1:
            baseline = ranked[0]
        else:
            baseline = ranked[1]

        if baseline == "unknown":
            baseline = ranked[2] if len(ranked) > 2 else None

        # One-Hot Encoding (0/1)
        one_hot = pd.get_dummies(data_in_function[col], prefix=col).astype(int)

        # Drop the baseline from the new indicator columns only, so that an
        # unrelated existing column with the same name can never be removed.
        # The baseline is never "unknown" at this point, so no name-based
        # check is needed.
        if baseline is not None:
            one_hot = one_hot.drop(columns=[f"{col}_{baseline}"], errors="ignore")

        data_in_function = pd.concat([data_in_function.drop(columns=[col]), one_hot], axis=1)

    return data_in_function


def prepare_data(data_in_function):
    """
    Complete data preparation function combining all steps.
    - Processes 'contact' column (cellular).
    - Handles missing values.
    - Encodes binary columns.
    - One-Hot Encodes multi-level categorical columns.

    The steps are applied to a copy, so the DataFrame passed in is left
    untouched and the prepared frame is returned as a new object.
    """
    # The first three steps modify the frame they receive in place; working
    # on a copy keeps the caller's raw data intact.
    data_in_function = data_in_function.copy()

    data_in_function = process_contact_column(data_in_function)
    data_in_function = handle_missing_values(data_in_function)
    data_in_function = encode_binary_columns(data_in_function)
    data_in_function = one_hot_encode_multilevel(data_in_function)
    return data_in_function


In [None]:
# Run the full preparation pipeline and preview the result (first 5 rows)
df_prepared = prepare_data(df)
df_prepared.head(n=5)

In [None]:
# Remaining missing values per column after preparation
df_prepared.isna().sum()

In [None]:
# Summary statistics of the prepared data
df_prepared.describe()

In [None]:
# Identifying all numerical variables. 'number' matches every numeric dtype,
# so the result does not depend on the platform's default integer width.
numerical_columns = df_prepared.select_dtypes(include='number').columns
num_cols = len(numerical_columns)

if num_cols == 0:
    # plt.subplots() cannot build a grid with zero rows
    print("No numerical columns to plot.")
else:
    # Two plots per row, rounding the number of rows up
    n_rows = (num_cols + 1) // 2

    # squeeze=False always returns a 2-D array of Axes, so flattening works
    # for any number of columns (including a single one)
    fig, axes = plt.subplots(nrows=n_rows, ncols=2, figsize=(14, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Plotting each numerical column in its own subplot
    for ax, col in zip(axes, numerical_columns):
        sns.histplot(df_prepared[col], kde=True, color='darkorange', ax=ax)
        ax.set_title(f"Distribution of {col} (Numerical Variable)")

    # Removing the unused subplot left over when the column count is odd
    for ax in axes[num_cols:]:
        fig.delaxes(ax)

    # Layout is computed after drawing so titles and labels are accounted for
    fig.tight_layout(pad=5.0)
    plt.show()