In [2]:
import os
import pandas as pd
import numpy as np
import random
from typing import Callable, Dict

# One seed shared by the random number generators seeded below,
# so reruns of the notebook draw the same random numbers.
SEED = 42

np.random.seed(SEED)  # NumPy's legacy global generator
random.seed(SEED)     # Python's built-in `random` module

## 1. Load Processed Dataset

Load the processed dataset constructed in the previous notebook.
The dataset contains only decision-relevant state features and a binary reward signal, 
and will be used for offline evaluation of different decision policies.


In [3]:

# Input location: the processed bandit dataset sits inside the data folder,
# addressed relative to the notebook's working directory.
DATA_DIR = "../data"
PROCESSED_PATH = os.path.join(DATA_DIR, "bank_processed_for_bandit.csv")

# Read the processed table into memory
df = pd.read_csv(PROCESSED_PATH)

# Quick sanity check: report the table size, then preview the first five rows
print(f"Dataset shape: {df.shape}")
df.head(5)


Dataset shape: (45211, 18)


Unnamed: 0,row_id,age_group,job,marital,education,default,housing,loan,contact,month,balance_group,campaign,day_group,campaign_group,pdays_group,previous_group,poutcome,reward
0,0,pre_retirement,management,married,tertiary,no,yes,no,unknown,may,high_balance,1,day_1_7,1_10_contacts,never_contacted,0_10_previous,unknown,0
1,1,mid_career,technician,single,secondary,no,yes,no,unknown,may,low_balance,1,day_1_7,1_10_contacts,never_contacted,0_10_previous,unknown,0
2,2,young_adult,entrepreneur,married,secondary,no,yes,yes,unknown,may,low_balance,1,day_1_7,1_10_contacts,never_contacted,0_10_previous,unknown,0
3,3,mid_career,blue-collar,married,unknown,no,yes,no,unknown,may,high_balance,1,day_1_7,1_10_contacts,never_contacted,0_10_previous,unknown,0
4,4,young_adult,unknown,single,unknown,no,no,no,unknown,may,low_balance,1,day_1_7,1_10_contacts,never_contacted,0_10_previous,unknown,0


## 2. Define Action Space

Actions represent different levels of marketing contact intensity that the bank can actively choose 
for each customer. The action space is kept consistent across all experiments.


In [4]:
# Integer codes for the two contact-intensity levels
ACTION_LOW_INTENSITY, ACTION_HIGH_INTENSITY = 1, 2

# Every selectable action, in a fixed order
ACTION_SPACE = [ACTION_LOW_INTENSITY, ACTION_HIGH_INTENSITY]

# Human-readable label for each action code (same order as ACTION_SPACE)
ACTION_NAMES = {
    ACTION_LOW_INTENSITY: "low_intensity_contact",
    ACTION_HIGH_INTENSITY: "high_intensity_contact",
}

# Echo the mapping so the reader can see the codes used in later cells
print("Defined action space:")
for action_code, action_label in ACTION_NAMES.items():
    print(f"{action_code}: {action_label}")


Defined action space:
1: low_intensity_contact
2: high_intensity_contact


## 3. Define offline evaluation rules  
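Offline evaluation is conducted using a replay-based action-matching protocol. For each observation, a policy selects an action based on the customer context. The observed reward is counted only when the selected action matches the historical action inferred from the data. This conservative approach uses only outcomes that were actually observed, so no reward has to be imputed for actions that were never taken, and it evaluates every policy under the same protocol, enabling fair comparison across policies. Note that replay estimates are unbiased only when the logged actions were chosen uniformly at random, independently of the customer context; here the historical actions are inferred from observed contact counts, so the results are best read as relative comparisons between policies rather than exact value estimates.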


In [5]:
def infer_historical_action_from_campaign(campaign: int) -> int:
    """
    Translate a historical campaign contact count into a contact-intensity action.

    Mapping:
    - exactly 1 contact -> ACTION_LOW_INTENSITY
    - anything else     -> ACTION_HIGH_INTENSITY (intended for 2+ contacts)

    Values other than 1 (including 0) fall into the high-intensity branch;
    no-contact rows are noted as rarely observed in the historical data.

    Parameters
    ----------
    campaign : int
        Historical campaign contact count for one observation.

    Returns
    -------
    int
        ACTION_LOW_INTENSITY or ACTION_HIGH_INTENSITY.
    """
    return ACTION_LOW_INTENSITY if campaign == 1 else ACTION_HIGH_INTENSITY

# Derive the logged action of every row from its campaign contact count
df['historical_action'] = df['campaign'].map(infer_historical_action_from_campaign)

# Share of each inferred action in the historical log
df['historical_action'].value_counts(normalize=True)



historical_action
2    0.611953
1    0.388047
Name: proportion, dtype: float64

## 4.1 Baseline: Random Policy
As a sanity check, we first evaluate a random policy using replay-based action matching.
This baseline verifies that the offline evaluation pipeline is functioning correctly
and provides a lower-bound reference for subsequent decision policies.


In [6]:
def replay_evaluate(
    df,
    policy_fn: Callable,
    reward_col: str = "reward",
    historical_action_col: str = "historical_action"
) -> Dict[str, float]:
    """
    Score a policy offline by replaying logged observations.

    Rows are visited in order. The policy is asked for an action on each row;
    a row contributes to the score only when that action equals the logged
    (historical) action, in which case its reward is added to the total.

    Parameters
    ----------
    df : pandas.DataFrame
        Logged data containing `reward_col` and `historical_action_col`.
    policy_fn : Callable
        Called once per row with the row (as yielded by `df.iterrows()`);
        returns the action chosen by the policy.
    reward_col : str
        Name of the column holding the observed reward.
    historical_action_col : str
        Name of the column holding the logged action.

    Returns
    -------
    dict
        - "n_rows": number of rows in `df`
        - "matched_steps": rows where the policy agreed with the log
        - "match_rate": matched_steps / n_rows (0.0 for an empty frame)
        - "total_reward_on_matched": summed reward over matched rows
        - "avg_reward_on_matched": total reward / matched_steps
          (0.0 when nothing matched)
    """
    n_rows = len(df)
    reward_sum = 0
    n_matched = 0

    for _, logged in df.iterrows():
        agrees = policy_fn(logged) == logged[historical_action_col]
        if not agrees:
            # Policy and log disagree: the outcome of the policy's action
            # was never observed, so this row is skipped.
            continue
        reward_sum += logged[reward_col]
        n_matched += 1

    # Guard both ratios against division by zero
    if n_rows > 0:
        match_rate = n_matched / n_rows
    else:
        match_rate = 0.0

    if n_matched > 0:
        avg_reward_on_matched = reward_sum / n_matched
    else:
        avg_reward_on_matched = 0.0

    metrics = {}
    metrics["n_rows"] = n_rows
    metrics["matched_steps"] = n_matched
    metrics["match_rate"] = match_rate
    metrics["total_reward_on_matched"] = reward_sum
    metrics["avg_reward_on_matched"] = avg_reward_on_matched
    return metrics


In [7]:
# Dedicated generator for the random baseline. It is seeded from the
# notebook-wide SEED (instead of a repeated literal) so that changing SEED
# also changes this policy's action stream. Re-running this cell recreates
# the generator, so the cell yields the same result each time it is run.
rng = random.Random(SEED)

def random_policy(_row):
    """
    Random baseline policy.

    Ignores the customer context (`_row`) and draws one action from
    ACTION_SPACE using the cell-level generator `rng`.
    """
    return rng.choice(ACTION_SPACE)

# Evaluate the random baseline with the replay protocol and display its metrics
metrics_random = replay_evaluate(df, random_policy)
metrics_random


{'n_rows': 45211,
 'matched_steps': 22489,
 'match_rate': 0.49742319347061553,
 'total_reward_on_matched': 2624,
 'avg_reward_on_matched': 0.11667926541864912}

## 4.2 Baseline: Greedy Policy

We next evaluate a greedy baseline policy using replay-based action matching.
The greedy policy selects actions deterministically based on interpretable customer signals,
such as prior campaign outcomes and coarse-grained interaction history.

This baseline represents a reasonable business heuristic and serves as a stronger benchmark
than random selection, allowing us to assess whether learning-based policies provide
additional value beyond rule-based decision making.


In [8]:
def greedy_policy(row):
    """
    Rule-based greedy baseline (interpretable heuristic).

    Low-intensity contact is chosen as soon as any of these holds:
      * `poutcome` equals 'success' (compared case-insensitively),
      * `pdays_group` equals 'never_contacted',
      * `balance_group` is 'high_balance' or 'mid_high_balance'.
    In every other case, including when those fields are absent, the policy
    returns high-intensity contact.

    Parameters
    ----------
    row : mapping-like record
        Must support `.get(key, default)`, e.g. a pandas Series row or a dict.

    Returns
    -------
    int
        ACTION_LOW_INTENSITY or ACTION_HIGH_INTENSITY.
    """
    low_intensity_is_enough = (
        # Strong signal: previously successful -> low intensity is sufficient
        str(row.get('poutcome', '')).lower() == 'success'
        # Never contacted before -> start with low intensity
        or str(row.get('pdays_group', '')) == 'never_contacted'
        # Higher balance segments -> prefer low intensity (cost-aware)
        or str(row.get('balance_group', '')) in ('high_balance', 'mid_high_balance')
    )

    # Otherwise, be more aggressive
    return ACTION_LOW_INTENSITY if low_intensity_is_enough else ACTION_HIGH_INTENSITY


In [9]:
# Score the greedy heuristic under the same replay protocol and display its metrics
metrics_greedy = replay_evaluate(df=df, policy_fn=greedy_policy)
metrics_greedy


{'n_rows': 45211,
 'matched_steps': 17775,
 'match_rate': 0.3931565327022185,
 'total_reward_on_matched': 2554,
 'avg_reward_on_matched': 0.14368495077355836}

## 4.3 Random vs. Greedy Policy Comparison



### Comparison Summary
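- The **random policy** achieves a higher match rate (about 49.7% vs. 39.3%). This is expected: a policy that samples uniformly from two actions agrees with the logged action about half of the time, regardless of how the historical actions are distributed.
- The **greedy policy** matches fewer historical actions, reflecting its more selective and deterministic decision rules: three of its four rules return low-intensity contact, whereas about 61% of the logged actions are high intensity.
- However, conditional on matched actions, the greedy policy achieves a **higher average reward** (about 0.144 vs. 0.117), indicating better decision quality on the subset of observations where evaluation is possible.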

This result suggests that while the greedy policy is evaluated on a smaller subset of the data, it selects actions that are more likely to lead to successful outcomes compared to random selection.

Overall, the greedy policy demonstrates stronger decision performance than the random baseline.

### Next Step

In the next notebook, we introduce an **offline contextual bandit approach using Thompson Sampling**.
The policy learns from historical data in a batch manner and accounts for uncertainty in decision making,
with performance evaluated under the same replay-based offline framework.

The goal is to assess whether an uncertainty-aware, learning-based policy can outperform both random and greedy baselines when restricted to observable historical outcomes.
