## Import Modules

In [23]:
import matplotlib.pyplot as plt
import seaborn as sb

In [68]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.metrics import classification_report

In [2]:
# from ml_utils import build_encoders
import sys
sys.path.append('../Solved/')

from ml_utils import (fill_missing, build_encoders, encode_categorical, build_target_encoder, encode_target) 

## Preprocessing

In [3]:
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/datasets/bank_marketing.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,83,retired,divorced,primary,no,0,no,no,telephone,31,may,664,1,77.0,3,success,no
1,32,technician,married,secondary,no,1242,yes,no,,2,jun,183,3,,0,,no
2,38,blue-collar,single,secondary,no,68,no,no,,5,jun,90,2,,0,,no
3,30,services,single,secondary,no,677,yes,no,cellular,21,nov,108,1,,0,,no
4,66,retired,married,primary,no,2173,no,no,cellular,15,jul,178,1,181.0,5,failure,no


In [4]:
y = df['y']
X = df.copy().drop(columns='y', axis=1)

In [5]:
X.isna().sum()/X.shape[0] *100

age           0.000000
job           0.645865
marital       0.000000
education     4.187802
default       0.000000
balance       0.000000
housing       0.000000
loan          0.000000
contact      28.771971
day           0.000000
month         0.000000
duration      0.000000
campaign      0.000000
pdays        81.670992
previous      0.000000
poutcome     81.682789
dtype: float64

In [6]:
# X.drop(columns=['contact', 'pdays'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
X_train.isna().sum()/X_train.shape[0] *100

age           0.000000
job           0.692069
marital       0.000000
education     4.301836
default       0.000000
balance       0.000000
housing       0.000000
loan          0.000000
contact      28.398411
day           0.000000
month         0.000000
duration      0.000000
campaign      0.000000
pdays        81.479297
previous      0.000000
poutcome     81.491094
dtype: float64

In [9]:
X_train['contact'].value_counts()

contact
cellular     16542
telephone     1667
Name: count, dtype: int64

In [10]:
X_train['poutcome'].value_counts()

poutcome
failure    2798
other      1042
success     867
Name: count, dtype: int64

## Missing Values

In [11]:
X_train_filled = fill_missing(X_train)
X_test_filled = fill_missing(X_test)
X_train_filled.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
8667,25,technician,divorced,tertiary,no,174,no,yes,unknown,27,may,350,4,-1.0,0,nonexistent
28232,57,management,married,tertiary,no,1631,yes,no,cellular,2,apr,254,2,261.0,3,failure
28487,40,management,married,secondary,no,643,yes,no,cellular,17,apr,74,2,256.0,1,failure
2532,28,technician,married,secondary,no,1086,yes,no,cellular,20,apr,1112,3,-1.0,0,nonexistent
20867,32,admin.,single,secondary,no,103,yes,no,cellular,17,apr,158,1,337.0,1,other


In [12]:
X_train_filled.isna().sum()/X_train.shape[0] *100

age          0.0
job          0.0
marital      0.0
education    0.0
default      0.0
balance      0.0
housing      0.0
loan         0.0
contact      0.0
day          0.0
month        0.0
duration     0.0
campaign     0.0
pdays        0.0
previous     0.0
poutcome     0.0
dtype: float64

In [13]:
encoders = build_encoders(X_train_filled)
encoders

[{'column': 'job',
  'multi_col_output': True,
  'encoder': OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5,
                sparse_output=False)},
 {'column': 'marital',
  'multi_col_output': True,
  'encoder': OneHotEncoder(handle_unknown='ignore', sparse_output=False)},
 {'column': 'education',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['primary', 'secondary', 'tertiary']],
                 handle_unknown='use_encoded_value', unknown_value=-1)},
 {'column': 'default',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'housing',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'loan',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_valu

In [35]:
# Encode X_train_filled and X_test_filled
X_train_encoded = encode_categorical(X_train_filled, encoders)
X_test_encoded = encode_categorical(X_test_filled, encoders)

X_train_encoded.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_management,...,x0_unknown,x0_failure,x0_nonexistent,x0_other,x0_success,education,default,housing,loan,month
0,25,174,27,350,4,-1.0,0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,4.0
1,57,1631,2,254,2,261.0,3,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,3.0
2,40,643,17,74,2,256.0,1,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0
3,28,1086,20,1112,3,-1.0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0
4,32,103,17,158,1,337.0,1,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0


In [34]:
# mask = np.triu(np.ones_like(X_train_encoded.corr())) 
# dataplot = sb.heatmap(X_train_encoded.corr(), cmap="YlGnBu", annot=True, mask=mask, linewidths=0.4) 

In [49]:
# Encode y_train and y_test
def get_values(row):
    if row=='no':
        return 0
    else:
        return 1
    
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [50]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components = 10)
pca_model.fit(X_train_encoded)

X_train_pca = pd.DataFrame(pca_model.transform(X_train_encoded))
X_test_pca = pd.DataFrame(pca_model.transform(X_test_encoded))
X_train_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1179.039259,93.319120,-41.663520,-15.961038,10.443826,0.443118,-1.190760,0.112364,-0.508880,0.674699
1,277.844985,-5.487803,220.136435,16.902565,-11.473453,0.489596,-1.518628,0.223408,-1.246776,0.650017
2,-710.402635,-184.150807,214.460014,-0.258317,2.708127,-0.677928,-1.832647,-1.592639,-0.233704,0.260640
3,-266.012696,854.092506,-39.111340,-12.953974,4.272100,0.569992,-2.006709,0.214105,0.135506,-0.190152
4,-1250.290735,-99.689319,295.759039,-7.884426,3.175073,-1.395286,-1.566812,-2.477097,-0.065569,-0.238657
...,...,...,...,...,...,...,...,...,...,...
25426,-126.342428,-136.450394,67.630282,-6.746261,1.337399,-1.657159,2.153106,-0.588976,0.073410,-0.157020
25427,-1045.246181,-65.857888,-42.204889,4.103459,6.870915,-1.404416,0.566819,-0.201299,0.861917,-0.734734
25428,600.582910,-194.435104,68.406904,0.025832,2.583952,-1.595431,5.065174,-0.072782,0.523019,-0.142587
25429,-1350.371692,-156.444124,-42.460550,-4.637896,3.662862,0.823209,1.778705,-0.405527,0.025891,-0.433784


In [58]:
np.unique(y_train_encoded, return_counts=True)

(array([0, 1]), array([22372,  3059]))

## Resample

In [1]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE instance 
# Set the sampling_strategy parameter equal to auto
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [60]:
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_pca, y_train_encoded)

In [63]:
y_resampled.shape

(44744,)

In [64]:
np.unique(y_resampled, return_counts=True)

(array([0, 1]), array([22372, 22372]))

## Modeling

### Base Model

In [71]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train_pca, y_train_encoded)

In [72]:
y_test_pred = model.predict(X_test_pca)
print(balanced_accuracy_score(y_test_encoded, y_test_pred))

0.6849127323792965


In [73]:
print(classification_report(y_test_encoded, y_test_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      7477
           1       0.61      0.41      0.49      1000

    accuracy                           0.90      8477
   macro avg       0.77      0.68      0.71      8477
weighted avg       0.89      0.90      0.89      8477



### SMOTE Model

In [74]:
model = RandomForestClassifier(n_estimators=100)
model.fit(X_resampled, y_resampled)

In [75]:
y_test_pred = model.predict(X_test_pca)
print(balanced_accuracy_score(y_test_encoded, y_test_pred))

0.8032062993179082


In [76]:
print(classification_report(y_test_encoded, y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      7477
           1       0.48      0.71      0.57      1000

    accuracy                           0.87      8477
   macro avg       0.72      0.80      0.75      8477
weighted avg       0.90      0.87      0.88      8477



### Grid Search SMOTE

In [89]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [90]:
grid_tuned_model = RandomForestClassifier()

In [91]:
param_grid = {
    'n_estimators':range(20, 100, 10), 
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [10, 12, 14, None],
}

In [92]:
grid_clf = RandomizedSearchCV(grid_tuned_model, param_grid, scoring='recall', cv=3)

In [93]:
grid_clf.fit(X_resampled, y_resampled)

In [94]:
print(grid_clf.best_params_)

{'n_estimators': 70, 'max_depth': None, 'criterion': 'log_loss'}


In [98]:
grid_y_pred = grid_clf.predict(X_test_pca)
print(classification_report(y_test_encoded, y_test_pred))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93      7477
           1       0.48      0.71      0.57      1000

    accuracy                           0.87      8477
   macro avg       0.72      0.80      0.75      8477
weighted avg       0.90      0.87      0.88      8477



In [99]:
print(balanced_accuracy_score(y_test_encoded, grid_y_pred))

0.8028277383977531


## Predict on Held Out test Set

In [113]:
new_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_3/datasets/bank_marketing_new_data.csv')

In [114]:
new_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,34,blue-collar,married,primary,no,5299,yes,no,,26,jun,75,5,,0,,no
1,30,admin.,single,secondary,no,414,no,no,cellular,30,apr,64,1,,0,,no
2,64,retired,married,,no,2923,no,no,cellular,12,mar,120,1,,0,,no
3,54,retired,divorced,secondary,no,2761,no,no,cellular,16,jul,771,1,,0,,no
4,36,services,single,secondary,no,165,yes,no,cellular,18,may,182,2,,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11298,26,student,single,tertiary,no,5169,yes,no,cellular,20,apr,639,2,,0,,no
11299,60,retired,married,primary,no,1664,yes,no,,4,jun,142,2,,0,,no
11300,29,blue-collar,married,primary,no,25,yes,no,,4,jun,188,2,,0,,no
11301,33,entrepreneur,single,tertiary,no,138,no,no,cellular,4,feb,141,2,,0,,no


In [115]:
y = new_df['y']
X = new_df.copy().drop(columns='y', axis=1)

In [117]:
y.shape

(11303,)

In [116]:
X.shape

(11303, 16)

In [118]:
X_filled = fill_missing(X)
X_encoded = encode_categorical(X_filled, encoders)

In [119]:
X_encoded

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_management,...,x0_unknown,x0_failure,x0_nonexistent,x0_other,x0_success,education,default,housing,loan,month
0,34,5299,26,75,5,-1.0,0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0
1,30,414,30,64,1,-1.0,0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0
2,64,2923,12,120,1,-1.0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,54,2761,16,771,1,-1.0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0
4,36,165,18,182,2,-1.0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11298,26,5169,20,639,2,-1.0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,3.0
11299,60,1664,4,142,2,-1.0,0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0
11300,29,25,4,188,2,-1.0,0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,5.0
11301,33,138,4,141,2,-1.0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0


In [120]:
X_encoded_pca = pd.DataFrame(pca_model.transform(X_encoded))
y_encoded = le.transform(y)

In [121]:
y_encoded.shape

(11303,)

In [122]:
y_pred = model.predict(X_encoded_pca)

In [123]:
y_pred.shape

(11303,)

In [124]:
balanced_accuracy_score(y_pred, y_encoded)

0.7021082366906143