In [34]:
import pandas as pd

from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.utils import resample
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv('bank-additional-full.csv',sep = ';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df.shape

(41188, 21)

#### Null Processing

In [4]:
df.replace('unknown',None, inplace = True)

In [5]:
df.replace('nonexistent',None,inplace = True)

In [6]:
df.isna().sum()

age                   0
job                 330
marital              80
education          1731
default            8597
housing             990
loan                990
contact               0
month                 0
day_of_week           0
duration              0
campaign              0
pdays                 0
previous              0
poutcome          35563
emp.var.rate          0
cons.price.idx        0
cons.conf.idx         0
euribor3m             0
nr.employed           0
y                     0
dtype: int64

In [7]:
# Column containing high null values

df.drop('poutcome',axis = 1,inplace = True)

In [8]:
lst = ['job','marital','education','default','housing','loan']

for i in lst:
    df[i].fillna(df[i].mode()[0],inplace = True)

In [9]:
labels = {'yes':1, 'no':0}

df['y']= df['y'].map(labels)

In [10]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,no,no,no,telephone,may,mon,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0


In [11]:
df['durationmin'] = df['duration']/60

In [12]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,durationmin
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,4.35
1,57,services,married,high.school,no,no,no,telephone,may,mon,...,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,2.483333
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,3.766667
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,2.516667
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,5.116667


In [13]:
df['y'].value_counts()

0    36548
1     4640
Name: y, dtype: int64

##### Get dummies

In [14]:
df = pd.get_dummies(df,drop_first = True)

In [15]:
X = df.drop('y',axis = 1)
Y = df['y']

### Handling imbalanced dataset

##### Undersampling

In [20]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state = 42)

# concatenate our training data back together
data = pd.concat([X_train, Y_train], axis=1)

data_majority = data[data.y==0]
data_minority = data[data.y==1]
 
# Downsample majority class
data_majority_downsampled = resample(data_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(data_minority),  # to match minority class
                                 random_state=1234) # reproducible results
 
# Combine minority class with downsampled majority class
data_downsampled = pd.concat([data_majority_downsampled, data_minority])
 
# Display new class counts
data_downsampled.y.value_counts()

0    3705
1    3705
Name: y, dtype: int64

##### Oversampling

In [21]:
data_minority_upsampled = resample(data_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(data_majority),    # to match majority class
                                 random_state=1234) # reproducible results
 
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])
 
# Display new class counts
data_upsampled.y.value_counts()

0    29245
1    29245
Name: y, dtype: int64

##### Model selection

In [22]:
X_downtrain = data_downsampled.drop('y',axis = 1)
Y_downtrain = data_downsampled['y']

In [23]:
lr = LogisticRegressionCV(cv = 10,max_iter = 1000,n_jobs = -1)
lr.fit(X_downtrain,Y_downtrain)

In [24]:
print(classification_report(Y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      7303
           1       0.43      0.90      0.59       935

    accuracy                           0.86      8238
   macro avg       0.71      0.87      0.75      8238
weighted avg       0.92      0.86      0.88      8238



In [25]:
X_uptrain = data_upsampled.drop('y',axis = 1)
Y_uptrain = data_upsampled['y']

In [26]:
# Model for oversampled dataset

lr2 = LogisticRegressionCV(cv = 10, max_iter = 1000,n_jobs = -1)

In [27]:
lr2.fit(X_uptrain,Y_uptrain)

In [28]:
print(classification_report(Y_test, lr2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.85      0.91      7303
           1       0.44      0.90      0.59       935

    accuracy                           0.86      8238
   macro avg       0.71      0.87      0.75      8238
weighted avg       0.92      0.86      0.88      8238



In [None]:
# As we see, accuracy is 0.86

##### Model without sampling

In [38]:
lr = LogisticRegression(max_iter = 1000, n_jobs = -1)
lr.fit(X_train,Y_train)

In [41]:
print(classification_report(Y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7303
           1       0.66      0.42      0.51       935

    accuracy                           0.91      8238
   macro avg       0.80      0.70      0.73      8238
weighted avg       0.90      0.91      0.90      8238



In [19]:
# Support Vector Machine

sv = svm.SVC()
sv.fit(X_train,Y_train)

In [20]:
print(classification_report(Y_test, sv.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7310
           1       0.60      0.22      0.32       928

    accuracy                           0.90      8238
   macro avg       0.76      0.60      0.63      8238
weighted avg       0.87      0.90      0.87      8238



In [21]:
# RandomForest

rd = RandomForestClassifier(n_estimators = 100,random_state=42)
rd.fit(X_train,Y_train)

In [22]:
print(classification_report(Y_test, rd.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      7310
           1       0.64      0.53      0.58       928

    accuracy                           0.91      8238
   macro avg       0.79      0.75      0.77      8238
weighted avg       0.91      0.91      0.91      8238



In [23]:
# LGBM Classifier

lgt = lgb.LGBMClassifier()
lgt.fit(X_train,Y_train)

[LightGBM] [Info] Number of positive: 3712, number of negative: 29238
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003008 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 993
[LightGBM] [Info] Number of data points in the train set: 32950, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.112656 -> initscore=-2.063898
[LightGBM] [Info] Start training from score -2.063898


In [24]:
print(classification_report(Y_test, lgt.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      7310
           1       0.64      0.56      0.59       928

    accuracy                           0.91      8238
   macro avg       0.79      0.76      0.77      8238
weighted avg       0.91      0.91      0.91      8238



### Model tuning on LogisticRegression

In [None]:
# We decided to select LogisticRegression and we will apply model tuning on this model and use it.

In [33]:
LRparam_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'max_iter': list(range(100,800,100)),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

In [42]:
LR_search = GridSearchCV(lr, param_grid=LRparam_grid, refit = True, verbose = 3, cv=5)


In [43]:
LR_search.fit(X_train , Y_train)

Fitting 5 folds for each of 70 candidates, totalling 350 fits
[CV 1/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END C=0.001, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ...C=0.001, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=0.001, penalty=l1, solver=liblinear;, score=0.909 total time=   0.2s
[CV 2/5] END C=0.001, penalty=l1, solve

[CV 1/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.910 total time= 1.1min
[CV 2/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.908 total time= 1.1min
[CV 3/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.907 total time=  54.8s
[CV 4/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.912 total time= 1.2min
[CV 5/5] END C=0.1, penalty=l1, solver=liblinear;, score=0.912 total time= 1.1min
[CV 1/5] END .......C=0.1, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END .......C=0.1, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END .......C=0.1, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END .......C=0.1, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END .......C=0.1, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.907 total time=  23.5s
[CV 2/5] END ....C=0.1, penalty=l1, solver=saga;, score=0.904 total time=  20.7s
[CV 3/5] END ....C=0.1,

[CV 2/5] END C=10, penalty=l1, solver=liblinear;, score=0.908 total time= 1.5min
[CV 3/5] END C=10, penalty=l1, solver=liblinear;, score=0.908 total time= 1.9min
[CV 4/5] END C=10, penalty=l1, solver=liblinear;, score=0.912 total time= 3.3min
[CV 5/5] END C=10, penalty=l1, solver=liblinear;, score=0.912 total time= 2.7min
[CV 1/5] END ........C=10, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ........C=10, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END ........C=10, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ........C=10, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ........C=10, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END .....C=10, penalty=l1, solver=saga;, score=0.907 total time=  37.8s
[CV 2/5] END .....C=10, penalty=l1, solver=saga;, score=0.904 total time=  33.7s
[CV 3/5] END .....C=10, penalty=l1, solver=saga;, score=0.907 total time=  32.9s
[CV 4/5] END .....C=10, pena

[CV 3/5] END C=1000, penalty=l1, solver=liblinear;, score=0.906 total time=   0.6s
[CV 4/5] END C=1000, penalty=l1, solver=liblinear;, score=0.912 total time= 4.1min
[CV 5/5] END C=1000, penalty=l1, solver=liblinear;, score=0.912 total time= 3.3min
[CV 1/5] END ......C=1000, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 2/5] END ......C=1000, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 3/5] END ......C=1000, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 4/5] END ......C=1000, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 5/5] END ......C=1000, penalty=l1, solver=sag;, score=nan total time=   0.0s
[CV 1/5] END ...C=1000, penalty=l1, solver=saga;, score=0.907 total time=  29.5s
[CV 2/5] END ...C=1000, penalty=l1, solver=saga;, score=0.904 total time=  33.7s
[CV 3/5] END ...C=1000, penalty=l1, solver=saga;, score=0.907 total time=  29.7s
[CV 4/5] END ...C=1000, penalty=l1, solver=saga;, score=0.908 total time=  31.7s
[CV 5/5] END ...C=1000

In [44]:
LR_grid = LR_search.best_estimator_

In [45]:
LR_grid.fit(X_train , Y_train)

In [46]:
print(classification_report(Y_test, LR_grid.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7303
           1       0.65      0.42      0.51       935

    accuracy                           0.91      8238
   macro avg       0.79      0.70      0.73      8238
weighted avg       0.90      0.91      0.90      8238



In [1]:
# In conclusion, we get 91% accuracy.