# data_and_problem_setup.ipynb
## Goal
Load the Bank Marketing dataset (bank-full.csv), perform light EDA, define context (features), design discrete actions for a contextual bandit framing, define reward, and save a processed CSV for later bandit experiments.


In [1]:
import os
import random
import numpy as np
import pandas as pd

# Reproducibility: seed both NumPy's and Python's global RNGs with one shared value.
SEED = 2026
np.random.seed(SEED)
random.seed(SEED)

# Data locations. DATA_DIR is relative to the notebook's working directory,
# so adjust it if the notebook is launched from somewhere else.
DATA_DIR = "../data"
RAW_PATH, PROCESSED_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("bank-full.csv", "bank_processed_for_bandit.csv")
)


In [None]:
# Read the raw semicolon-delimited file, print its (rows, columns) shape,
# and show the first five rows as the cell output.
df = pd.read_csv(RAW_PATH, delimiter=";")
print(df.shape)
df.head(5)

(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
24080,56,technician,married,secondary,no,589,yes,no,unknown,23,oct,518,1,147,2,success,yes
24165,30,admin.,married,secondary,no,873,yes,no,telephone,12,nov,119,1,167,3,success,no
24239,48,admin.,divorced,secondary,no,295,yes,no,cellular,17,nov,123,1,164,2,success,no
24264,49,management,married,tertiary,no,64,no,no,cellular,17,nov,208,1,159,1,success,no
24435,42,technician,married,tertiary,no,14282,yes,no,cellular,17,nov,77,1,103,4,success,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45190,32,blue-collar,married,secondary,no,136,no,no,cellular,16,nov,206,1,188,3,success,yes
45193,28,self-employed,single,tertiary,no,159,no,no,cellular,16,nov,449,2,33,4,success,yes
45195,68,retired,married,secondary,no,1146,no,no,cellular,16,nov,212,1,187,6,success,yes
45201,53,management,married,tertiary,no,583,no,no,cellular,17,nov,226,1,184,4,success,yes


In [3]:
# Show the column labels of the loaded frame as a quick schema check.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [4]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()


age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
# add column: age_group
# Each entry maps the inclusive upper age of a band to its label; bands are
# right-closed, and the first band also includes its lower edge (0).
age_bands = {
    21: "student_young",
    35: "young_adult",
    50: "mid_career",
    65: "pre_retirement",
    100: "retired",
}
bins = [0, *age_bands]             # [0, 21, 35, 50, 65, 100]
labels = list(age_bands.values())

df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True, include_lowest=True)

# Spot-check the mapping on the first few rows.
df[["age", "age_group"]].head()

Unnamed: 0,age,age_group
0,58,pre_retirement
1,44,mid_career
2,33,young_adult
3,47,mid_career
4,33,young_adult


In [6]:
# add column: balance_group
# Quantile-bin the balance into as many equal-frequency buckets as there are
# labels (four, i.e. quartiles), ordered from lowest to highest balance.
balance_labels = ['low_balance', 'mid_low_balance', 'mid_high_balance', 'high_balance']
df['balance_group'] = pd.qcut(df['balance'], q=len(balance_labels), labels=balance_labels)


In [7]:
# add column: day_group
# Seven-day buckets over the day-of-month, right-closed:
# [0, 7], (7, 14], (14, 21], (21, 28], (28, 32].
# The final edge is 32 so that day 31 lands in the last bucket.
day_bins = [0, 7, 14, 21, 28, 32]
day_labels = ['day_1_7', 'day_8_14', 'day_15_21', 'day_22_28', 'day_29_31']

df['day_group'] = pd.cut(df['day'], bins=day_bins, labels=day_labels, include_lowest=True)


In [8]:
# add column: campaign_group
# Bucket the `campaign` contact count into right-closed bands:
# [0, 10], (10, 20], (20, 30], (30, 40], (40, inf).
# The last edge is np.inf rather than `df['campaign'].max() + 1`: a
# data-dependent edge makes pd.cut raise whenever the column's maximum is
# below 40 (e.g. on a subsample), because the edges stop increasing.
# With np.inf every value above 40 still lands in the last bucket.
df['campaign_group'] = pd.cut(
    df['campaign'],
    bins=[0, 10, 20, 30, 40, np.inf],
    labels=[
        '1_10_contacts',
        '11_20_contacts',
        '21_30_contacts',
        '31_40_contacts',
        '40plus_contacts'
    ],
    include_lowest=True
)


In [9]:
# add column: pdays_group
# Rows with pdays == -1 get their own 'never_contacted' label below, so they
# are blanked out (NaN) before the numeric binning.
never_contacted_rows = df['pdays'].eq(-1)
df['pdays_clean'] = df['pdays'].replace(-1, np.nan)

# Right-closed bands over pdays: (0, 30], (30, 90], (90, 180], (180, 365], (365, inf).
recency_edges = [0, 30, 90, 180, 365, np.inf]
recency_labels = ['within_1_month', '1_3_months', '3_6_months', '6_12_months', 'over_1_year']
recency = pd.cut(df['pdays_clean'], bins=recency_edges, labels=recency_labels)

# The extra category must exist on the categorical before it can be assigned.
df['pdays_group'] = recency.cat.add_categories(['never_contacted'])
df.loc[never_contacted_rows, 'pdays_group'] = 'never_contacted'


In [10]:
# add column: previous_group
# Bucket the `previous` count into right-closed bands:
# (-1, 10], (10, 20], (20, 30], (30, 40], (40, inf).
# Starting at -1 keeps 0 inside the first bucket without include_lowest.
# The last edge is np.inf rather than `df['previous'].max() + 1`: a
# data-dependent edge makes pd.cut raise whenever the column's maximum is
# below 40 (e.g. on a subsample), because the edges stop increasing.
df['previous_group'] = pd.cut(
    df['previous'],
    bins=[-1, 10, 20, 30, 40, np.inf],
    labels=[
        '0_10_previous',
        '11_20_previous',
        '21_30_previous',
        '31_40_previous',
        '40plus_previous'
    ]
)


In [11]:
# Frequency table for each engineered bucket column; dropna=False also counts
# rows that fell outside every bucket.
group_cols = ['balance_group', 'day_group', 'campaign_group', 'pdays_group', 'previous_group']

for col in group_cols:
    # One print call emits the header line, then the counts on the lines after it.
    print(f"\n{col}", df[col].value_counts(dropna=False), sep="\n")



balance_group
balance_group
low_balance         11317
mid_high_balance    11306
high_balance        11297
mid_low_balance     11291
Name: count, dtype: int64

day_group
day_group
day_15_21    13900
day_8_14     10442
day_1_7       9798
day_22_28     7117
day_29_31     3954
Name: count, dtype: int64

campaign_group
campaign_group
1_10_contacts      44015
11_20_contacts       952
21_30_contacts       185
31_40_contacts        46
40plus_contacts       13
Name: count, dtype: int64

pdays_group
pdays_group
never_contacted    36954
6_12_months         4416
3_6_months          2480
over_1_year          643
1_3_months           530
within_1_month       188
Name: count, dtype: int64

previous_group
previous_group
0_10_previous      44917
11_20_previous       239
21_30_previous        43
31_40_previous         7
40plus_previous        5
Name: count, dtype: int64


### Quick EDA

In [12]:
# Overall share of each target class.
print(df['y'].value_counts(normalize=True))

# conversion rate by cols
cols = [
    'age_group', 'job', 'marital', 'education', 'default', 'balance_group',
    'housing', 'loan', 'contact', 'day_group', 'month', 'campaign_group',
    'pdays_group', 'previous_group', 'poutcome'
]

# Boolean "subscribed" flag; its group-wise mean is the conversion rate.
# The Series keeps the name 'y', so the displayed output is labelled as before.
converted = df['y'].eq('yes')

for col in cols:
    print(f"\nConversion rate by {col}")
    display(
        # observed=False is passed explicitly: several grouping columns are
        # categorical, and omitting it triggers a pandas FutureWarning about the
        # default changing. False keeps the current behaviour (all categories shown).
        converted.groupby(df[col], observed=False)
                 .mean()
                 .sort_values(ascending=False)
    )




y
no     0.883015
yes    0.116985
Name: proportion, dtype: float64

Conversion rate by age_group


  df.groupby(col)['y']


age_group
retired           0.426099
student_young     0.312500
young_adult       0.127548
pre_retirement    0.116769
mid_career        0.093811
Name: y, dtype: float64


Conversion rate by job


job
student          0.286780
retired          0.227915
unemployed       0.155027
management       0.137556
admin.           0.122027
self-employed    0.118429
unknown          0.118056
technician       0.110570
services         0.088830
housemaid        0.087903
entrepreneur     0.082717
blue-collar      0.072750
Name: y, dtype: float64


Conversion rate by marital


marital
single      0.149492
divorced    0.119455
married     0.101235
Name: y, dtype: float64


Conversion rate by education


education
tertiary     0.150064
unknown      0.135703
secondary    0.105594
primary      0.086265
Name: y, dtype: float64


Conversion rate by default


default
no     0.117961
yes    0.063804
Name: y, dtype: float64


Conversion rate by balance_group


  df.groupby(col)['y']


balance_group
high_balance        0.161547
mid_high_balance    0.125155
mid_low_balance     0.108936
low_balance         0.072369
Name: y, dtype: float64


Conversion rate by housing


housing
no     0.167024
yes    0.077000
Name: y, dtype: float64


Conversion rate by loan


loan
no     0.126557
yes    0.066814
Name: y, dtype: float64


Conversion rate by contact


contact
cellular     0.149189
telephone    0.134205
unknown      0.040707
Name: y, dtype: float64


Conversion rate by day_group


  df.groupby(col)['y']


day_group
day_8_14     0.131871
day_1_7      0.125842
day_22_28    0.124210
day_29_31    0.112797
day_15_21    0.097050
Name: y, dtype: float64


Conversion rate by month


month
mar    0.519916
dec    0.467290
sep    0.464594
oct    0.437669
apr    0.196794
feb    0.166478
aug    0.110133
jun    0.102228
nov    0.101511
jan    0.101212
jul    0.090935
may    0.067195
Name: y, dtype: float64


Conversion rate by campaign_group


  df.groupby(col)['y']


campaign_group
1_10_contacts      0.119096
11_20_contacts     0.045168
31_40_contacts     0.021739
21_30_contacts     0.016216
40plus_contacts    0.000000
Name: y, dtype: float64


Conversion rate by pdays_group


  df.groupby(col)['y']


pdays_group
1_3_months         0.420755
over_1_year        0.292379
3_6_months         0.274194
6_12_months        0.177989
within_1_month     0.148936
never_contacted    0.091573
Name: y, dtype: float64


Conversion rate by previous_group


  df.groupby(col)['y']


previous_group
40plus_previous    0.400000
11_20_previous     0.192469
21_30_previous     0.139535
0_10_previous      0.116548
31_40_previous     0.000000
Name: y, dtype: float64


Conversion rate by poutcome


poutcome
success    0.647253
other      0.166848
failure    0.126097
unknown    0.091615
Name: y, dtype: float64

In [13]:
# conversion rate by job: share of 'yes' outcomes per job, five highest shown
(
    df.groupby('job')['y']
      .apply(lambda outcome: (outcome == 'yes').mean())
      .sort_values(ascending=False)
      .head()
)


job
student       0.286780
retired       0.227915
unemployed    0.155027
management    0.137556
admin.        0.122027
Name: y, dtype: float64

In [14]:
# conversion rate by marital: share of 'yes' outcomes per marital status, highest first
(
    df.groupby('marital')['y']
      .apply(lambda outcome: (outcome == 'yes').mean())
      .sort_values(ascending=False)
      .head()
)

marital
single      0.149492
divorced    0.119455
married     0.101235
Name: y, dtype: float64

In [15]:
# conversion rate by education: share of 'yes' outcomes per education level, highest first
(
    df.groupby('education')['y']
      .apply(lambda outcome: (outcome == 'yes').mean())
      .sort_values(ascending=False)
      .head()
)

education
tertiary     0.150064
unknown      0.135703
secondary    0.105594
primary      0.086265
Name: y, dtype: float64

## Establish processed csv

In [17]:
# Context (state) columns carried into the processed dataset.
# NOTE(review): raw `campaign` is kept alongside its bucketed `campaign_group`;
# presumably the downstream notebooks need the raw count — confirm.
state_cols = [
    'age_group', 'job', 'marital', 'education',
    'default', 'housing', 'loan', 'contact',
    'month', 'balance_group', 'campaign', 'day_group',
    'campaign_group', 'pdays_group', 'previous_group', 'poutcome',
]

# reward: 1 when y == 'yes', else 0. row_id: the raw frame's index, kept so
# rows can be traced back. Final column order: row_id, state columns, reward.
df_processed = df[state_cols].assign(
    reward=df['y'].eq('yes').astype(int),
    row_id=df.index,
)[['row_id', *state_cols, 'reward']]

# Write without the pandas index; row_id already carries it.
df_processed.to_csv(PROCESSED_PATH, index=False)



After completing exploratory data analysis and feature engineering, we construct a processed dataset that contains only decision-relevant state features and a binary reward signal. This dataset serves as the input for offline contextual bandit evaluation in subsequent experiments (next step: the `bandit_baselines.ipynb` notebook).


## Action Definition: Contact Intensity

In this project, actions represent different levels of **marketing contact intensity** that the bank can actively choose for each customer.

### Action Space
We define a discrete action space with three possible actions:

| Action ID | Description |
|---------|-------------|
| 0 | No contact |
| 1 | Low-intensity contact |
| 2 | High-intensity contact |

### Rationale
Marketing outreach naturally involves decisions about **whether to contact a customer and how aggressively to do so**.  
This action design aligns with the historical campaign data, where customers were contacted multiple times with varying effort levels.

The contact intensity abstraction allows us to model realistic business decisions while remaining compatible with offline evaluation using historical data.

### Usage in Contextual Bandit Setting
For each customer context, a policy selects one of the three contact intensity actions.  
The observed reward is based on customer subscription outcomes recorded in the dataset, enabling offline replay-based evaluation of different decision policies.

The actual implementation is shown in the following two notebooks: bandit_baselines.ipynb and contextual_bandit_results.ipynb.