# Project Checkpoint

## Data Cleaning

In [1]:
import numpy as np
import pandas as pd

In [53]:
# load data
df = pd.read_csv('bank-full.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [55]:
# Drop redundant and unnesecerry columns
df = df.drop(['contact', 'duration', 'pdays', 'poutcome'], axis=1)

In [56]:
# check the info for attricutes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   day        45211 non-null  int64 
 9   month      45211 non-null  object
 10  campaign   45211 non-null  int64 
 11  previous   45211 non-null  int64 
 12  y          45211 non-null  object
dtypes: int64(5), object(8)
memory usage: 4.5+ MB


In [57]:
# 0=NO and 1=YES
df['y'] = df['y'].replace({'yes': 1, 'no': 0})
df.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,campaign,previous,y
0,58,management,married,tertiary,no,2143,yes,no,5,may,1,0,0
1,44,technician,single,secondary,no,29,yes,no,5,may,1,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,5,may,1,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,5,may,1,0,0
4,33,unknown,single,unknown,no,1,no,no,5,may,1,0,0
5,35,management,married,tertiary,no,231,yes,no,5,may,1,0,0
6,28,management,single,tertiary,no,447,yes,yes,5,may,1,0,0
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,5,may,1,0,0
8,58,retired,married,primary,no,121,yes,no,5,may,1,0,0
9,43,technician,single,secondary,no,593,yes,no,5,may,1,0,0


## EDA

## Feature Engineering & Base Model

In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import sklearn.preprocessing as pp
from sklearn import metrics

In [59]:
# features
X = df.drop('y', axis=1)
# outcome
y = df.y

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [61]:
# Numeric columns and associated transformers
num_feat = ['age', 'balance', 'day', 'campaign', 'previous']
num_transformer = Pipeline(steps=[
    ('scaler', pp.StandardScaler())   # z-scale
])

# Categorical columns and associated transformers
cat_feat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'month']
cat_transformer = Pipeline(steps=[
    ('onehot', pp.OneHotEncoder())     # output from Ordinal becomes input to OneHot
])

# preprocessing pipeline (put them together)
preproc = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_feat),
        ('cat', cat_transformer, cat_feat)
    ])

pl = Pipeline(steps=[('preprocessor', preproc), ('classifier', DecisionTreeClassifier())])

In [62]:
pl.fit(X_train, y_train)

# performance on training data
pred_train = pl.predict(X_train)
rmse_train = np.sqrt(np.mean((pred_train - y_train)**2))
# train accuracy
acc_train = pl.score(X_train, y_train)
# train f1-score
f1_train = metrics.f1_score(y_train, pred_train)

print ("train RMSE: %s" % rmse_train)
print ("train accuracy: %s" % acc_train)
print ("train f1-score: %s" % f1_train)

train RMSE: 0.0
train accuracy: 1.0
train f1-score: 1.0


In [63]:
# performance on test data -- what we really care about
pred_test = pl.predict(X_test)
rmse_test = np.sqrt(np.mean((pred_test - y_test)**2))
# test accuracy
acc_test = pl.score(X_test, y_test)
# test f1-score
f1_test = metrics.f1_score(y_test, pred_test)

print ("test RMSE: %s" % rmse_test)
print ("test accuracy: %s" % acc_test)
print ("test f1-score: %s" % f1_test)

test RMSE: 0.4235231595863282
test accuracy: 0.8206281332940135
test f1-score: 0.28166519043401245


In [64]:
# the importance of each attribute
dict(zip(X.columns, pl.named_steps['classifier'].feature_importances_))

{'age': 0.16109679962000578,
 'job': 0.24186509887760896,
 'marital': 0.12091172796807781,
 'education': 0.06452691873198906,
 'default': 0.04967279928919023,
 'balance': 0.012767192184979603,
 'housing': 0.013075330877955035,
 'loan': 0.006507560011507303,
 'day': 0.004606681754533456,
 'month': 0.012335487965403553,
 'campaign': 0.005342914757529685,
 'previous': 0.005338869105629017}