In [4]:
import pandas as pd

In [9]:
# Load the bank dataset; the file uses ";" rather than "," as its field delimiter.
df = pd.read_csv(
    "bank.csv",
    sep=";",
)

In [10]:
# Preview the first rows to confirm the file was parsed into separate columns.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [11]:
# (rows, columns) of the loaded frame.
df.shape

(4521, 17)

In [12]:
# Column labels of the loaded frame.
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [13]:
# Per-column dtypes; the numeric/categorical split in the next cell is based on these.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [14]:
# Split the column labels by dtype: 64-bit integer/float columns are treated as
# numeric, object-dtype columns as categorical. The target "y" is still part of
# `df` at this point, so it lands in the categorical list.
numeric_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(df.select_dtypes(include=["object"]).columns)

(numeric_cols, categorical_cols)

(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'poutcome',
  'y'])

In [15]:
# Number of NaN entries per column. Placeholder strings such as "unknown" are
# ordinary values and are not counted as missing here.
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [16]:
# For every categorical column, print its name, its five most frequent values
# with their counts, and a divider line.
divider = "-" * 30
for col in categorical_cols:
    top_counts = df[col].value_counts().head()
    print(col, top_counts, divider, sep="\n")

job
job
management     969
blue-collar    946
technician     768
admin.         478
services       417
Name: count, dtype: int64
------------------------------
marital
marital
married     2797
single      1196
divorced     528
Name: count, dtype: int64
------------------------------
education
education
secondary    2306
tertiary     1350
primary       678
unknown       187
Name: count, dtype: int64
------------------------------
default
default
no     4445
yes      76
Name: count, dtype: int64
------------------------------
housing
housing
yes    2559
no     1962
Name: count, dtype: int64
------------------------------
loan
loan
no     3830
yes     691
Name: count, dtype: int64
------------------------------
contact
contact
cellular     2896
unknown      1324
telephone     301
Name: count, dtype: int64
------------------------------
month
month
may    1398
jul     706
aug     633
jun     531
nov     389
Name: count, dtype: int64
------------------------------
poutcome
poutcome
unknown 

In [17]:
# Cardinality (number of distinct values) of each categorical column.
for col, n_unique in df[categorical_cols].nunique().items():
    print(col, "→", n_unique)


job → 12
marital → 3
education → 4
default → 2
housing → 2
loan → 2
contact → 3
month → 12
poutcome → 4
y → 2


In [20]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder. handle_unknown="ignore" makes transform() tolerate
# categories that were not present during fit instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

In [21]:
# Recompute the numeric column labels (64-bit int/float dtypes) from `df`.
numeric_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)
numeric_cols

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [22]:
from sklearn.preprocessing import StandardScaler
# Standardizing scaler instance.
# NOTE(review): `scaler` is rebound to a MinMaxScaler in the following cell, and
# the preprocessing pipeline further down constructs its own StandardScaler, so
# this object looks unused in the visible cells — confirm before relying on it.
scaler = StandardScaler()

In [23]:
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): this rebinds `scaler`, replacing the StandardScaler assigned in
# the previous cell. The ColumnTransformer pipeline built later instantiates its
# own StandardScaler, so this object appears unused in the visible cells — confirm.
scaler = MinMaxScaler()

In [24]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df[numeric_cols].describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [25]:
# Separate the label column from the predictors.
target_col = "y"

y = df[target_col]                 # target series
X = df.drop(columns=[target_col])  # every other column becomes a feature

In [26]:
# Class balance of the target.
y.value_counts()

y
no     4000
yes     521
Name: count, dtype: int64

In [27]:
# Encode the target as 0/1.
# The mapping is applied to df[target_col] rather than to `y` itself so the cell
# is idempotent: re-running `y = y.map(...)` on an already-encoded series would
# find neither 0 nor 1 among the dict keys and turn every value into NaN.
y = df[target_col].map({"no": 0, "yes": 1})

# Series.map yields NaN for any value missing from the dict; fail loudly rather
# than carry silently-missing labels into the train/test split.
assert y.notna().all(), "unexpected label in target column"

In [28]:
# Redo the dtype-based column split on the feature frame `X`, which no longer
# contains the target, so the categorical list holds predictors only.
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

(numeric_cols, categorical_cols)

(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'poutcome'])

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories not seen during fit are ignored.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column group through its own branch; the step names "num" and
# "cat" become the prefixes of the generated feature names.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [34]:
# Fit the preprocessing steps on the training split only, then apply the
# already-fitted transformer to the test split so nothing is learned from it.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Both matrices are expected to have the same number of columns.
X_train_processed.shape, X_test_processed.shape


((3616, 51), (905, 51))

In [35]:
import joblib
# Persist the fitted preprocessor so the same transformation can be reloaded later.
joblib.dump(value=preprocessor, filename="preprocessor.joblib")


['preprocessor.joblib']

In [36]:
# Output column names produced by the fitted transformer; each is prefixed with
# the branch name ("num__" / "cat__") it came from.
feature_names = preprocessor.get_feature_names_out()
# Show the first 20 names together with the total number of features.
feature_names[:20], len(feature_names)

(array(['num__age', 'num__balance', 'num__day', 'num__duration',
        'num__campaign', 'num__pdays', 'num__previous', 'cat__job_admin.',
        'cat__job_blue-collar', 'cat__job_entrepreneur',
        'cat__job_housemaid', 'cat__job_management', 'cat__job_retired',
        'cat__job_self-employed', 'cat__job_services', 'cat__job_student',
        'cat__job_technician', 'cat__job_unemployed', 'cat__job_unknown',
        'cat__marital_divorced'], dtype=object),
 51)

In [37]:
import pandas as pd

# Wrap the processed arrays in DataFrames labelled with the generated feature names.
# NOTE(review): both frames get a fresh 0..n-1 index, while y_train / y_test keep
# the row labels of the original frame; rows still match by position, but
# index-aligned operations between them would not line up — confirm before joining.
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train_processed, X_test_processed)
)

X_train_df.head()

Unnamed: 0,num__age,num__balance,num__day,num__duration,num__campaign,num__pdays,num__previous,cat__job_admin.,cat__job_blue-collar,cat__job_entrepreneur,...,cat__month_jun,cat__month_mar,cat__month_may,cat__month_nov,cat__month_oct,cat__month_sep,cat__poutcome_failure,cat__poutcome_other,cat__poutcome_success,cat__poutcome_unknown
0,-0.864347,-0.505865,-0.492444,-0.417317,0.065082,-0.411541,-0.329453,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.673204,-0.448232,-0.248716,-0.612989,-0.570967,0.447116,11.136987,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.282513,-0.499461,0.726196,-0.804824,-0.252942,-0.411541,-0.329453,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.673204,2.456534,0.360604,-0.820171,-0.570967,-0.411541,-0.329453,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.951514,1.297118,-0.004988,-0.724253,1.337181,-0.411541,-0.329453,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [38]:
# Sanity check: feature frames and target series must have matching row counts per split.
X_train_df.shape, X_test_df.shape, y_train.shape, y_test.shape

((3616, 51), (905, 51), (3616,), (905,))

In [39]:
# Write each split to its own CSV file, omitting the index column.
for csv_path, split in {
    "X_train.csv": X_train_df,
    "X_test.csv": X_test_df,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}.items():
    split.to_csv(csv_path, index=False)