In [None]:
pip install imblearn

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
from matplotlib import pyplot as plt
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_path in input_paths:
    print(input_path)

# Dataset:
Input variables:

   **# bank client data:**
   
   1 - ID : ID of client
   
   2 - age (numeric)
   
   3 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown")
   
   4 - marital : marital status (categorical: "divorced","married","single"; note: "divorced" means divorced or widowed)
   
   5 - education (categorical: "primary","secondary","tertiary","unknown")
   
   6 - default: has credit in default? (categorical: "no","yes")
   
   7 - balance : has money in account? (numeric)
   
   8 - housing: has housing loan? (categorical: "no","yes")
   
   9 - loan: has personal loan? (categorical: "no","yes")
   
   10 - contact: contact communication type (categorical: "cellular","telephone") 
   
   11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
   
   12 - day: last contact day of the week (categorical: "mon","tue","wed","thu","fri")
   
   13 - duration: last contact duration, in seconds (numeric). Important note:  this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
   
   14 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
   
   15 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
   
   16 - previous: number of contacts performed before this campaign and for this client (numeric)
   
   17 - poutcome: outcome of the previous marketing campaign (categorical: "failure","unknown","other","success")
   
  
   **Output variable (desired target):**
   
   18 - y - has the client subscribed a term deposit? (binary: "yes","no")
   
   We can't use test.csv for evaluation because it has no target (y) values, so in this notebook I am going to use only train.csv.

In [None]:
# Load both CSV files shipped with the Kaggle dataset into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/banking-dataset-marketing-targets/train.csv",
        "/kaggle/input/banking-dataset-marketing-targets/test.csv",
    )
)

In [None]:
train.head()

In [None]:
train.shape , test.shape

# **1.Filling NaN(unknown) Values:**

In [None]:
# Per-column count of NaN entries, displayed as a (train, test) tuple.
train.isna().sum() , test.isna().sum()

In [None]:
train.dtypes

* **FILLING UNKNOWN VALUES :**
It looks like the dataset has no NaN values, but it does contain "unknown" values.

In [None]:
train["job"].value_counts()

We could fill the unknown job values with the overall mode, but we can also fill them using education: for each job we find its most common education level, and then assign unknown jobs according to the client's education.

In [None]:
# For every job category, find the most common education level and the
# percentage of clients with that job who have it. The table guides the
# replacement of "unknown" job / education values in the cells below.
#
# The rows are collected in a list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic anyway.
edu_rows = []
for job_name in train["job"].unique():
    # value_counts sorts descending, so the first entry is the mode.
    edu_share = train.loc[train["job"] == job_name, "education"].value_counts(normalize=True)
    edu_rows.append(
        {
            "Job": job_name,
            "top_education": edu_share.index[0],
            "share_pct": edu_share.iloc[0] * 100,
        }
    )
eduRatio = pd.DataFrame(edu_rows, columns=["Job", "top_education", "share_pct"])
eduRatio

In [None]:
# Replace "unknown" jobs with a job chosen from the client's education level;
# clients whose education gives no hint fall back to "blue-collar".
job_by_education = {
    "secondary": "services",
    "primary": "housemaid",
    "tertiary": "management",
}
job_is_unknown = train["job"].eq("unknown")
job_guess = train["education"].map(job_by_education).fillna("blue-collar")
train.loc[job_is_unknown, "job"] = job_guess[job_is_unknown]

In [None]:
train["job"].value_counts()

In [None]:
# Frequency of each marital status, displayed as a (train, test) tuple.
train["marital"].value_counts() , test["marital"].value_counts()

In [None]:
train["education"].value_counts()

We can fill the unknown values in the education column using eduRatio again.

In [None]:
# Replace "unknown" education with a level chosen from the client's job.
# The keys must match the job labels exactly: the previous version compared
# against "technician." and "housemaid." (stray trailing dot), which never
# matched, so unknown-education housemaids silently received the "secondary"
# fallback instead of "primary".
# NOTE(review): the target levels are kept as originally written; confirm
# them against the eduRatio table.
education_by_job = {
    "admin.": "secondary",
    "management": "secondary",
    "services": "tertiary",
    "technician": "secondary",
    "retired": "secondary",
    "blue-collar": "secondary",
    "housemaid": "primary",
    "self-employed": "tertiary",
    "student": "secondary",
    "entrepreneur": "tertiary",
    "unemployed": "secondary",
}
education_is_unknown = train["education"].eq("unknown")
# Any job that is not in the mapping falls back to "secondary".
education_guess = train["job"].map(education_by_job).fillna("secondary")
train.loc[education_is_unknown, "education"] = education_guess[education_is_unknown]

In [None]:
train["education"].value_counts()

In [None]:
test["education"].value_counts()

In [None]:
train["default"].value_counts()

In [None]:
train["housing"].value_counts()

In [None]:
train["contact"].value_counts()

In [None]:
# Replace "unknown" contact types with the most frequent *known* contact type.
# - The mode is computed without the "unknown" rows, so the placeholder can
#   never be chosen as its own replacement.
# - The result is assigned back to the column instead of using inplace=True on
#   a column selection, which is a chained assignment and does not reliably
#   update the parent frame in recent pandas versions.
known_contact = train.loc[train["contact"] != "unknown", "contact"]
if not known_contact.empty:
    train["contact"] = train["contact"].replace("unknown", known_contact.mode().iloc[0])

In [None]:
train["contact"].value_counts()

In [None]:
train["poutcome"].value_counts()

The ID column is not needed for training. The dataset also has a pdays column (number of days that passed since the last contact), so the "day" and "month" columns are not needed for training either.

In [None]:
# Remove the identifier and the calendar columns that are not used as features.
unused_columns = ["ID", "day", "month"]
train = train.drop(columns=unused_columns)

In [None]:
train["subscribed"].value_counts() # Class counts of the target: the dataset is imbalanced, so later we oversample it with SMOTE.


# **2.Encoding:**


We need to transform all categorical columns to numeric columns.

In [None]:
#OneHotEncoding of job column
ohe = OneHotEncoder(sparse = False)
train = pd.concat((train , pd.DataFrame(ohe.fit_transform(train["job"].to_frame()),columns = "job_" + np.sort(train["job"].unique()))),axis = 1)
train.drop(columns = ["job"],inplace = True)
#Marital column has 3 values lets apply OneHotEncoding again.
train = pd.concat((train , pd.DataFrame(ohe.fit_transform(train["marital"].to_frame()),columns = "marital_" + np.sort(train["marital"].unique()))),axis = 1)
train.drop(columns = ["marital"],inplace = True)

In [None]:
train.head()

Good! We can label encode the education column because it is ordinal data. We can also transform the yes/no values of the default column to 1 and 0.

In [None]:
# Label-encode education (ordinal: primary < secondary < tertiary) and the
# yes/no "default" flag. Labels are replaced one at a time, in the listed order.
label_codes = {
    "education": {"tertiary": 2, "secondary": 1, "primary": 0},
    "default": {"yes": 1, "no": 0},
}
for column_name, codes in label_codes.items():
    for label, code in codes.items():
        train.loc[train[column_name] == label, column_name] = code

In [None]:
train.balance.sort_values() # The two largest balances look like outliers; the next cell caps them at 66721, which becomes the new maximum.

In [None]:
# Cap the outlier balances at the chosen maximum, then scale the column by
# that same maximum so the largest value becomes 1.
balance_cap = 66721
train["balance"] = train["balance"].clip(upper=balance_cap) / balance_cap

Housing , loan and contact columns label encoding and dropping duration column(read intro)

In [None]:
# Label-encode the remaining two-valued columns, then drop "duration".
# For "contact": 0 means cellular, 1 means telephone.
binary_codes = {
    "housing": {"yes": 1, "no": 0},
    "loan": {"yes": 1, "no": 0},
    "contact": {"telephone": 1, "cellular": 0},
}
for column_name, codes in binary_codes.items():
    for label, code in codes.items():
        train.loc[train[column_name] == label, column_name] = code

# "duration" is only known after the call has ended (see the dataset notes),
# so it is removed from the features.
train = train.drop(columns=["duration"])

pdays is the number of days that passed since the last contact, so we should change the -1 values (client never contacted before) to something big, like 999.

In [None]:
# -1 marks clients that were never contacted before; recode it as a large
# number of days (999) so it sorts after every real value.
train["pdays"] = train["pdays"].replace(-1, 999)

Two columns are left to encode before training. I'm going to use label encoding for the subscribed column and OneHotEncoding for the poutcome column.

In [None]:
# One-hot encode "poutcome" with the encoder created earlier, naming the new
# columns after the sorted category labels.
poutcome_labels = np.sort(train["poutcome"].unique())
poutcome_onehot = pd.DataFrame(
    ohe.fit_transform(train[["poutcome"]]),
    columns = "poutcome_" + poutcome_labels,
)
train = pd.concat((train, poutcome_onehot), axis = 1).drop(columns = ["poutcome"])

# Label-encode the target: 1 means subscribed ("yes"), 0 means not subscribed ("no").
for label, code in (("yes", 1), ("no", 0)):
    train.loc[train["subscribed"] == label, "subscribed"] = code

In [None]:
train.info()

Before training we should transform object dtypes to int because some classifiers won't work with object dtype.

In [None]:
# The label-encoded columns still have object dtype; cast them all to int in
# a single astype call.
int_columns = ["education", "default", "housing", "loan", "contact", "subscribed"]
train = train.astype({column_name: int for column_name in int_columns})

In [None]:
train.info()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlations = train.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(correlations, annot = True, ax = ax)

Seems like nothing highly correlated with subscribed column

# 3.Training :

**Lets split the data**

In [None]:
# Separate the features from the target (kept as a one-column DataFrame) and
# hold out 25% of the rows for testing, with a fixed seed.
target_column = "subscribed"
X = train.drop(columns = [target_column])
y = train[[target_column]]
X_train , X_test , y_train , y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 10,
)

**LogisticRegression**

In [None]:
# Baseline: logistic regression trained on the original (imbalanced) split.
lr = LogisticRegression().fit(X_train, y_train)
y_predlr = lr.predict(X_test)
# Accuracy and confusion matrix on the held-out rows; both are reused in the
# evaluation section.
acclr = accuracy_score(y_test, y_predlr)
cmlr = confusion_matrix(y_test, y_predlr)
cmlr , acclr

It looks like we have about 90 percent accuracy, but this number is misleading. For clients who did not subscribe (the majority class) the model is right 6927 / (6927 + 63) = 0.99 of the time, whereas for clients who did subscribe (the minority class) it is right only 134 / (788 + 134) = 0.15 of the time. This happens because the model was trained on an imbalanced dataset. Let's balance the dataset with SMOTE.

# 4.Data Augmentation

In [None]:
# Oversample the minority class with SMOTE and show the resulting class counts.
# SMOTE picks neighbours at random when it interpolates synthetic rows, so a
# fixed random_state is needed for the notebook to reproduce on re-run.
sm = SMOTE(random_state = 10)
X_sm , y_sm = sm.fit_resample(X, y)
y_sm.subscribed.value_counts()

# 5.Training after SMOTE

Data balanced. 

In [None]:
# Oversample ONLY the training split.
# Splitting the already-resampled X_sm / y_sm leaks information: synthetic rows
# interpolated from rows that land in the test set end up in the training set,
# and the test set itself contains synthetic rows, which can inflate the scores.
# Instead, SMOTE is fitted on the training part of the original split, and the
# untouched hold-out rows serve as the test set for every "_sm" model below.
smote_train = SMOTE(random_state = 10)
X_train_sm , y_train_sm = smote_train.fit_resample(X_train, y_train)
X_test_sm , y_test_sm = X_test , y_test

# Logistic regression trained on the balanced training data.
lr2 = LogisticRegression()
lr2.fit(X_train_sm,y_train_sm)
y_predlr2 = lr2.predict(X_test_sm)
cmlr2 = confusion_matrix(y_test_sm, y_predlr2)
acclr2 = accuracy_score(y_test_sm, y_predlr2)
cmlr2 , acclr2

This time our accuracy is around 72 percent, but as the confusion matrix shows, the predictions for clients who subscribed (the bottom row of the matrix) improved.

**Support Vector Classifier**

In [None]:
# Support vector classifier with default settings on the SMOTE-balanced data.
svc = SVC().fit(X_train_sm, y_train_sm)
y_predsvc = svc.predict(X_test_sm)
# Accuracy and confusion matrix, reused in the evaluation section.
accsvc = accuracy_score(y_test_sm, y_predsvc)
cmsvc = confusion_matrix(y_test_sm, y_predsvc)
cmsvc , accsvc

**KNN**

In [None]:
# k-nearest-neighbours classifier with default settings on the SMOTE-balanced data.
knn = KNeighborsClassifier().fit(X_train_sm, y_train_sm)
y_predknn = knn.predict(X_test_sm)
# Accuracy and confusion matrix, reused in the evaluation section.
accknn = accuracy_score(y_test_sm, y_predknn)
cmknn = confusion_matrix(y_test_sm, y_predknn)
cmknn , accknn

**Random Forest Classifier**

In [None]:
# Random forest on the SMOTE-balanced data.
# The forest bootstraps rows and samples features at random, so the seed is
# fixed (same value as the train/test split) to make the scores reproducible.
rf = RandomForestClassifier(random_state = 10)
rf.fit(X_train_sm, y_train_sm)
y_predrf = rf.predict(X_test_sm)
cmrf = confusion_matrix(y_test_sm, y_predrf)
accrf = accuracy_score(y_test_sm, y_predrf)
cmrf , accrf

# 6.Evaluation
Sensitivity : True Positive / (True Positive + False Negative) , Specificity : True Negative / (True Negative + False Positive)

In [None]:
def _sensitivity_specificity(cm):
    """Return (sensitivity, specificity) in percent for a 2x2 confusion matrix.

    scikit-learn puts the true classes on the rows and the predicted classes on
    the columns, sorted ascending, so with class 1 ("subscribed") as the
    positive class the layout is [[TN, FP], [FN, TP]].
    """
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp * 100 / (tp + fn)  # share of real subscribers that were found
    specificity = tn * 100 / (tn + fp)  # share of non-subscribers correctly rejected
    return sensitivity, specificity


# One summary line per model. The earlier version computed the class-0 rate
# from row 0 and printed it as sensitivity (and the class-1 rate as
# specificity), i.e. the two labels were swapped relative to the target
# encoding used in this notebook (1 = subscribed).
model_results = [
    ("Logistic Regression accuracy without SMOTE", acclr, cmlr),
    ("Logistic Regression accuracy with SMOTE", acclr2, cmlr2),
    ("Support Vector Classifier accuracy with SMOTE", accsvc, cmsvc),
    ("K Nearest Neighbors Classfier accuracy with SMOTE", accknn, cmknn),
    ("Random Forest Classifier accuracy with SMOTE", accrf, cmrf),
]
for title, accuracy, cm in model_results:
    sensitivity, specificity = _sensitivity_specificity(cm)
    print(f"{title} :{accuracy * 100} Sensitivity :{sensitivity} Specificity : {specificity}")

Without any hyperparameter tuning, the Random Forest Classifier gave us the best result. Before SMOTE, the share of actual subscribers (the minority class) that the model identified correctly was low, around 13%. The dataset was imbalanced, and we balanced it with SMOTE. After SMOTE the overall accuracy looks a little lower, but evaluating results with accuracy alone does not prove anything on imbalanced data: the correct-prediction rate for non-subscribers decreased with SMOTE, while the rate for subscribers increased about 10 times. If you have any questions or suggestions, please comment. Thanks for your time.