# 1. Investigate the Data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("data/bank.csv.csv",delimiter=";")

In [91]:
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [92]:
data.head(7)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
5,35,management,single,tertiary,no,747,no,no,cellular,23,feb,141,2,176,3,failure,no
6,36,self-employed,married,tertiary,no,307,yes,no,cellular,14,may,341,1,330,2,other,no


## Which columns should be formated as ordinal, which columns should be formated as nominal?

In [93]:
## suppose we want to scale some numerical variables
## while we also want to format some categorical variables\
## what should we do? => We use pipeline
nominal_cat = ['job','marital','default','contact','month']
ordinal_cat = ['education']
numerical_features = ['balance']
from sklearn.pipeline import Pipeline
from sklearn import preprocessing

pipeline_num = Pipeline([
    ('scaler', preprocessing.StandardScaler())
])
pipeline_nominal = Pipeline([
    ('ohencoder', preprocessing.OneHotEncoder())
])
pipeline_ordinal = Pipeline([
    ('labelencoder', preprocessing.OrdinalEncoder())
])
# include these 3 pipelines together
from sklearn.compose import ColumnTransformer
full_pipeline = ColumnTransformer([
    ('numerical_transform', pipeline_num, numerical_features),
    ('ordinal_transform', pipeline_ordinal, ordinal_cat),
    ('nominal_transform', pipeline_nominal, nominal_cat),
])



# 2. Train Test split

In [97]:
from sklearn.model_selection import train_test_split
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

In [95]:
y_train.value_counts()

no     2682
yes     351
Name: y, dtype: int64

In [96]:
y_test.value_counts()

no     1150
yes     151
Name: y, dtype: int64

# 3. Application of Pipeline

In [98]:
X_train_new = full_pipeline.fit_transform(X_train)

In [50]:
X_train_new.shape

(3033, 35)

In [99]:
X_test_new = full_pipeline.fit_transform(X_test)

In [52]:
X_test_new.shape

(1301, 35)

In [100]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=250,max_depth=10, random_state=0,class_weight='balanced')

In [101]:
clf.fit(X_train_new,y_train)
from sklearn.metrics import classification_report as report
preds = clf.predict(X_test_new)
print(report(y_test,preds,digits=4))

              precision    recall  f1-score   support

          no     0.9072    0.8417    0.8733      1150
         yes     0.2222    0.3444    0.2701       151

    accuracy                         0.7840      1301
   macro avg     0.5647    0.5931    0.5717      1301
weighted avg     0.8277    0.7840    0.8033      1301



# 4. Attach model to the pipeline

In [102]:
clf = RandomForestClassifier(n_estimators=250,max_depth=10, random_state=20,class_weight='balanced')
steps = [('preprocessing',full_pipeline),('model', clf)]
pipeline = Pipeline(steps=steps)


In [103]:
pipeline.fit(X_train,y_train)

In [104]:
preds = pipeline.predict(X_test)
print(report(y_test,preds,digits=4))

              precision    recall  f1-score   support

          no     0.9081    0.8165    0.8599      1150
         yes     0.2097    0.3709    0.2679       151

    accuracy                         0.7648      1301
   macro avg     0.5589    0.5937    0.5639      1301
weighted avg     0.8271    0.7648    0.7912      1301

