Data Collection:

The dataset is available from the UC Irvine Machine Learning Repository. We followed the repository's "Import in Python" instructions to download it.

In [None]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import os
# Fetch the UCI repository dataset with id 222.
bank_marketing = fetch_ucirepo(id=222)

# Feature and target tables (provided as pandas DataFrames).
dataset_frames = bank_marketing.data
X = dataset_frames.features
y = dataset_frames.targets

# Print the dataset-level metadata first, then the per-variable information.
for summary in (bank_marketing.metadata, bank_marketing.variables):
    print(summary)


KeyboardInterrupt: 

Data Processing

In [8]:
# Combine the features and the target into a single frame.
# The frame is built from the fetched dataset rather than from X itself so the
# cell is idempotent: re-running it no longer appends another 'y' column to X.
X = pd.concat([bank_marketing.data.features, bank_marketing.data.targets], axis=1)
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [9]:
# Inspect each column's dtype and non-null count to spot columns with missing data.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  int64 
 1   job          44923 non-null  object
 2   marital      45211 non-null  object
 3   education    43354 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  int64 
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      32191 non-null  object
 9   day_of_week  45211 non-null  int64 
 10  month        45211 non-null  object
 11  duration     45211 non-null  int64 
 12  campaign     45211 non-null  int64 
 13  pdays        45211 non-null  int64 
 14  previous     45211 non-null  int64 
 15  poutcome     8252 non-null   object
 16  y            45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
X.describe()

Unnamed: 0,age,balance,day_of_week,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [11]:
# Drop exact duplicate rows, keeping the first occurrence, then show the
# resulting shape. Reassigning (instead of inplace=True) avoids mutating a
# frame that earlier cells have already displayed.
X = X.drop_duplicates(keep='first')
X.shape

(45211, 17)

In [13]:
# Handling Null Values: Part 1 -- summarise the columns that contain missing values.

# Number of missing entries per column, restricted to columns that have any.
missing_counts = X.isna().sum()
missing_counts = missing_counts[missing_counts > 0]

# Convert the counts to a percentage of all rows. The column keeps the name
# 'null_count' (although it holds a percentage) and the column names end up
# in 'index', which is what the later cells filter on.
X_null = (missing_counts * 100 / X.shape[0]).rename('null_count').reset_index()
X_null

Unnamed: 0,index,null_count
0,job,0.637013
1,education,4.107407
2,contact,28.798301
3,poutcome,81.747805


In [14]:
# Handling Null Values: Part 2 -- drop columns with more than 80% missing values.
# The threshold can be set based on the dataset.
columns_to_drop = X_null[X_null['null_count'] > 80]['index'].tolist()

# errors='ignore' keeps the cell re-runnable: X_null was computed before the
# drop, so on a second run the listed columns are already gone and a plain
# drop would raise KeyError. Reassigning avoids in-place mutation.
X = X.drop(columns=columns_to_drop, errors='ignore')
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  int64 
 1   job          44923 non-null  object
 2   marital      45211 non-null  object
 3   education    43354 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  int64 
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      32191 non-null  object
 9   day_of_week  45211 non-null  int64 
 10  month        45211 non-null  object
 11  duration     45211 non-null  int64 
 12  campaign     45211 non-null  int64 
 13  pdays        45211 non-null  int64 
 14  previous     45211 non-null  int64 
 15  y            45211 non-null  object
dtypes: int64(7), object(9)
memory usage: 5.5+ MB


In [15]:
# Handling Null Values: Part 3 -- impute every remaining column that still has missing values.

# Step 1: Identify the columns to impute from the current state of X.
# Reading them from X (instead of re-filtering X_null with '< 80') means a
# column sitting exactly on the 80% drop threshold cannot fall between the
# "drop" and "impute" rules and keep its nulls.
impute = X.columns[X.isna().any()].tolist()

# Step 2: Fill each column with a single representative value.
for col in impute:
    if pd.api.types.is_numeric_dtype(X[col]):
        # Numerical column: the median is robust to outliers.
        fill_value = X[col].median()
    else:
        # Categorical column: use the most frequent value (mode).
        fill_value = X[col].mode()[0]
    # Assign the result back to the column. The previous
    # `X[col].fillna(..., inplace=True)` form is chained in-place assignment,
    # which warns on recent pandas and does not update X under Copy-on-Write.
    X[col] = X[col].fillna(fill_value)

print("Missing values imputed successfully!")


Missing values imputed successfully!


In [16]:
# Count the remaining missing values per column; every count should be 0
# once the null handling above has run.
X.isna().sum()

age            0
job            0
marital        0
education      0
default        0
balance        0
housing        0
loan           0
contact        0
day_of_week    0
month          0
duration       0
campaign       0
pdays          0
previous       0
y              0
dtype: int64

In [17]:
# Partition the columns by dtype: numeric columns vs. all remaining columns.
# select_dtypes picks the columns directly, without computing descriptive
# statistics just to read back the column names, and states the selection
# rule explicitly instead of relying on describe()'s default column choice.
X_numerical = X.select_dtypes(include='number')
X_categorical = X.select_dtypes(exclude='number')

In [18]:
# Encoding categorical values
#
# 1) month -- ordinal encoding by calendar position

# Month abbreviations in calendar order; each is mapped to its 1-based position.
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_order = {name: number for number, name in enumerate(month_names, start=1)}

# Translate every month abbreviation into its number.
X['month_encoded'] = X['month'].map(month_order)

print(X[['month', 'month_encoded']].head())

  month  month_encoded
0   may              5
1   may              5
2   may              5
3   may              5
4   may              5


In [None]:
# 2) job, marital, contact -- label encoding.
# Label encoding is used for now because it works well with tree-based
# models; this choice may change in the future.

from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept under its own name so the fitted mapping of
# each column stays available after this cell.
le_job = LabelEncoder()
le_marital = LabelEncoder()
le_contact = LabelEncoder()

label_encoding_plan = (
    ('job', 'job_encoded', le_job),
    ('marital', 'marital_encoded', le_marital),
    ('contact', 'contact_encoded', le_contact),
)

for source_col, encoded_col, encoder in label_encoding_plan:
    # Fit the encoder on the column and store the integer codes next to it.
    X[encoded_col] = encoder.fit_transform(X[source_col])
    # Show the original values beside their codes.
    print(X[[source_col, encoded_col]].head())


            job  job_encoded
0    management            4
1    technician            9
2  entrepreneur            2
3   blue-collar            1
4   blue-collar            1
   marital  marital_encoded
0  married                1
1   single                2
2  married                1
3  married                1
4   single                2
    contact  contact_encoded
0  cellular                0
1  cellular                0
2  cellular                0
3  cellular                0
4  cellular                0


In [20]:
# 3) education -- ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# Education levels from lowest to highest; the position in this list becomes the code.
education_order = [['primary', 'secondary', 'tertiary']]

# Encoder restricted to exactly these categories, in this order.
ordinal_encoder = OrdinalEncoder(categories=education_order)

# Encode the single 'education' column and store the codes beside it.
education_codes = ordinal_encoder.fit_transform(X[['education']])
X['education_encoded'] = education_codes

print(X[['education', 'education_encoded']].head())

   education  education_encoded
0   tertiary                2.0
1  secondary                1.0
2  secondary                1.0
3  secondary                1.0
4  secondary                1.0


In [21]:
# Frequency of each distinct value in the 'loan' column.
X['loan'].value_counts()

loan
no     37967
yes     7244
Name: count, dtype: int64

In [22]:
# 4) Binary yes/no columns -- binary encoding

# yes -> 1, no -> 0
binary_mapping = {'yes': 1, 'no': 0}

# (source column, encoded column) pairs, processed in this order.
yes_no_columns = (
    ('default', 'default_encoded'),
    ('housing', 'housing_encoded'),
    ('loan', 'loan_encoded'),
)

for source_col, encoded_col in yes_no_columns:
    X[encoded_col] = X[source_col].map(binary_mapping)

# Show each original column next to its encoded counterpart.
print(X[['default', 'default_encoded', 'housing', 'housing_encoded', 'loan', 'loan_encoded']].head())





  default  default_encoded housing  housing_encoded loan  loan_encoded
0      no                0     yes                1   no             0
1      no                0     yes                1   no             0
2      no                0     yes                1  yes             1
3      no                0     yes                1   no             0
4      no                0      no                0   no             0


Final Outcome

- The dataset is now clean, structured, and prepared for exploratory data analysis (EDA) and model training.

- All missing values are handled (the `poutcome` column was dropped and the remaining gaps were imputed), and the categorical feature columns are encoded.

- The processed dataset is ready for visualization and further insights in the next phase.

In [48]:
import os

# The output lives under <parent of the current working directory>/data/Data Preprocessing/.
repo_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
output_dir = os.path.join(repo_root, "data", "Data Preprocessing")

# Create the target folder if it is missing; an existing folder is fine.
os.makedirs(output_dir, exist_ok=True)

# Write the processed frame to CSV.
data_path = os.path.join(output_dir, "data_processing.csv")
X.to_csv(data_path)
