In [None]:
#import libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.preprocessing import MinMaxScaler
from pandas.plotting import scatter_matrix 
import plotly.express as px
import os
# Show which data sets are available in the input directory.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Load the bank data set into the working DataFrame.
data_path = "../input/bankfullcsv/bank-full.csv"
df = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Size of the data set as (rows, columns).
df.shape

In [None]:
# Percentage of missing (null) values in each column.
null_counts = df.isnull().sum()
null_counts / len(df) * 100

In [None]:
# Bar plots for the categorical variables in the dataset, starting with job.
# Bars are ordered from the most to the least frequent category.
job_order = df['job'].value_counts().index
sns.countplot(x='job', data=df, color='blue', order=job_order)
# Draw the tick labels vertically; the trailing semicolon keeps the repr of
# the tick objects out of the cell output, like the other plotting cells.
plt.xticks(rotation=90);

In [None]:
# Bar plot of marital status, most frequent category first.
marital_order = df['marital'].value_counts().index
sns.countplot(data=df, x='marital', order=marital_order, color='blue');


In [None]:
# Bar plot of education level, most frequent category first.
education_order = df['education'].value_counts().index
sns.countplot(data=df, x='education', order=education_order, color='blue');

In [None]:
# Bar plot of the housing column, most frequent value first.
housing_order = df['housing'].value_counts().index
sns.countplot(data=df, x='housing', order=housing_order, color='blue');

In [None]:
# Bar plot of the loan column, most frequent value first.
loan_order = df['loan'].value_counts().index
sns.countplot(data=df, x='loan', order=loan_order, color='blue');

In [None]:
# Pairwise correlation between the numeric columns.
# The frame still contains string columns (job, marital, education, ...) at
# this point; pandas >= 2.0 no longer drops them silently inside corr() and
# raises instead, so the numeric columns are selected explicitly.
df.select_dtypes(include=[np.number]).corr()

In [None]:
# Annotated heat map of the correlations between the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string columns in pandas >= 2.0 instead of ignoring them.
numeric_corr = df.select_dtypes(include=[np.number]).corr()
fig = plt.figure(figsize=(18, 6))
sns.heatmap(numeric_corr, annot=True);
plt.xticks(rotation=45);

In [None]:
# Heat map of the row counts for every (job, Target) combination.
job_by_target = pd.crosstab(index=df['job'], columns=df['Target'])
fig = plt.figure(figsize=(18, 3))
sns.heatmap(job_by_target, annot=True, fmt='g')
plt.xticks(rotation=45);

In [None]:
# Heat map of the row counts for every (marital, Target) combination.
marital_by_target = pd.crosstab(index=df['marital'], columns=df['Target'])
fig = plt.figure(figsize=(18, 3))
sns.heatmap(marital_by_target, annot=True, fmt='g')
plt.xticks(rotation=45);

In [None]:
# Heat map of the row counts for every (education, Target) combination.
education_by_target = pd.crosstab(index=df['education'], columns=df['Target'])
fig = plt.figure(figsize=(18, 3))
sns.heatmap(education_by_target, annot=True, fmt='g')
plt.xticks(rotation=45);

In [None]:
# Heat map of the row counts for every (contact, Target) combination.
contact_by_target = pd.crosstab(index=df['contact'], columns=df['Target'])
fig = plt.figure(figsize=(18, 3))
sns.heatmap(contact_by_target, annot=True, fmt='g')
plt.xticks(rotation=45);

In [None]:
# Heat map of the row counts for every (poutcome, Target) combination.
poutcome_by_target = pd.crosstab(index=df['poutcome'], columns=df['Target'])
fig = plt.figure(figsize=(18, 3))
sns.heatmap(poutcome_by_target, annot=True, fmt='g')
plt.xticks(rotation=45);

In [None]:
# Heat map of the row counts for every (loan, Target) combination.
loan_by_target = pd.crosstab(index=df['loan'], columns=df['Target'])
fig = plt.figure(figsize=(18, 3))
sns.heatmap(loan_by_target, annot=True, fmt='g')
plt.xticks(rotation=45);

In [None]:
# Find the columns that use the literal string 'unknown' as a placeholder.
# The comparison is done through pandas rather than `'unknown' in ndarray`:
# on numeric columns the NumPy membership test compares numbers with a string,
# which emits an elementwise-comparison warning on older NumPy versions,
# whereas Series equality simply yields False for those columns.
cols = list(df.columns)
col_missing = [col for col in cols if (df[col] == 'unknown').any()]

print("Columns with Unknown Values -",col_missing)

In [None]:
# Number of 'unknown' entries in each column flagged above.
# Exact equality is used so the count agrees with the exact-match test that
# selected the columns; str.contains would also count values that merely
# contain the substring and fails on a mask with missing values.
print("Unknown values count")
for col in col_missing:
    unknown_total = (df[col] == 'unknown').sum()
    print(col,"-",unknown_total)

In [None]:
# Number of rows in each job category.
job_counts = df.groupby('job')['job'].count()
print(job_counts, "\n")

In [None]:
# Number of rows in each education category (before 'unknown' is filled in).
education_counts = df.groupby('education')['education'].count()
print(education_counts, "\n")

In [None]:
# Fill an unknown education level from the job for three job categories.
# The job groups are disjoint, so the 'unknown' mask can be computed once.
education_by_job = {
    'management': 'tertiary',
    'services': 'secondary',
    'housemaid': 'primary',
}
education_unknown = df['education'] == 'unknown'
for job_name, education_level in education_by_job.items():
    df.loc[education_unknown & (df['job'] == job_name), 'education'] = education_level

In [None]:
# Replace the remaining 'unknown' education values with the most frequent level.
most_common_education = df['education'].mode()[0]
df['education'] = df['education'].replace('unknown', most_common_education)

In [None]:
# Education category counts after the 'unknown' values were filled in.
education_counts = df.groupby('education')['education'].count()
print(education_counts, "\n")

In [None]:
# Class balance of the Target column.
target_counts = df.groupby('Target')['Target'].count()
print(target_counts, "\n")

In [None]:
# Encode the binary Target labels as integers: 'no' -> 0, 'yes' -> 1.
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to re-run on an
# already encoded column. The cast yields a real integer dtype instead of an
# object column holding Python ints, and fails loudly if any other label is
# present (such a label maps to NaN, which cannot be cast to int).
target_codes = {'no': 0, 'yes': 1, 0: 0, 1: 1}
df['Target'] = df['Target'].map(target_codes).astype(int)

In [None]:
# Target has already been encoded as 0/1 by the previous cell, so the former
# mode-based replace('no', ...) / replace('yes', ...) calls matched nothing.
# Had they run on the raw labels, the second one would have overwritten every
# 'yes' with the majority class. Target is therefore left untouched here.
# 'yTarget' is kept as a plain copy because a later cell drops that column.
df['yTarget'] = df['Target']

In [None]:
# Class balance of Target after the 0/1 encoding.
target_counts = df.groupby('Target')['Target'].count()
print(target_counts, "\n")

In [None]:
# Preview the frame after the Target encoding cells.
df.head()

In [None]:
# Number of rows for each housing value.
housing_counts = df.groupby('housing')['housing'].count()
print(housing_counts, "\n")

In [None]:
# Encode housing as integers: 'no' -> 0, 'yes' -> 1.
# The identity entries keep the cell safe to re-run. The cast produces a
# numeric dtype (not an object column of Python ints), so the later
# normalisation works on a real numeric column; any other label maps to NaN
# and makes the cast fail loudly.
housing_codes = {'no': 0, 'yes': 1, 0: 0, 1: 1}
df['housing'] = df['housing'].map(housing_codes).astype(int)

In [None]:
# housing has already been encoded as 0/1 by the previous cell, so the former
# replace('no', ...) / replace('yes', ...) calls matched nothing (and the
# second one used the mode of Target, not of housing, as the replacement).
# What is still useful at this point is making sure the column has an integer
# dtype rather than an object dtype holding Python ints.
df['housing'] = df['housing'].astype(int)

In [None]:
# Housing value counts after the 0/1 encoding.
housing_counts = df.groupby('housing')['housing'].count()
print(housing_counts, "\n")

In [None]:
# Number of rows for each value of the default column.
default_counts = df.groupby('default')['default'].count()
print(default_counts, "\n")

In [None]:
# Encode default as integers: 'yes' -> 1, 'no' -> 0.
# Series.map turns every value missing from the mapping into NaN, so running
# this cell twice with only the string keys would wipe the column. The
# identity entries for 0 and 1 make a second run a no-op.
col1 = {'yes':1 , 'no':0}
df['default'] = df['default'].map({**col1, 0: 0, 1: 1})

In [None]:
# Default value counts after the 0/1 encoding.
default_counts = df.groupby('default')['default'].count()
print(default_counts, "\n")

In [None]:
# Number of rows for each value of the loan column.
loan_counts = df.groupby('loan')['loan'].count()
print(loan_counts, "\n")

In [None]:
# Encode loan as integers: 'yes' -> 1, 'no' -> 0.
# Series.map turns every value missing from the mapping into NaN, so the
# identity entries for 0 and 1 are added to keep a re-run of this cell from
# wiping the already encoded column.
col1 = {'yes':1 , 'no':0}
df['loan'] = df['loan'].map({**col1, 0: 0, 1: 1})

In [None]:
# Loan value counts after the 0/1 encoding.
loan_counts = df.groupby('loan')['loan'].count()
print(loan_counts, "\n")

In [None]:
# Preview the frame after the binary columns were encoded.
df.head()

In [None]:
# Number of rows in each contact category.
contact_counts = df.groupby('contact')['contact'].count()
print(contact_counts, "\n")

In [None]:
# Remove the columns that are not used as model features: the remaining
# categorical columns and the helper 'yTarget' column.
unused_columns = [
    'job', 'marital', 'education',
    'contact', 'month', 'poutcome',
    'yTarget',
]
df = df.drop(columns=unused_columns)
df.head()

In [None]:
# Pull the target labels out of the frame as an integer NumPy array.
target = df['Target'].values.astype('int')

In [None]:
# Wrap the label array in a single-column frame named 'y' for inspection.
t = pd.DataFrame({'y': target})
t.head()

In [None]:
# Feature matrix: every remaining column except the target.
x_d = df.drop(columns=['Target'])

In [None]:
# Min-max normalise every feature column to the [0, 1] range.
# DataFrame.min()/max() are used instead of np.min/np.max: the NumPy wrappers
# call the reduction with axis=None, which pandas >= 2.0 treats as a reduction
# over the whole frame (one scalar) rather than per column.
# NOTE(review): the statistics are computed on the full data set, before the
# train/test split, so the test rows influence the scaling.
x_num = x_d.astype(float)
col_min = x_num.min()
col_range = x_num.max() - col_min
# A constant column has a zero range; divide by 1 there so it becomes 0
# instead of NaN.
x = (x_num - col_min) / col_range.replace(0, 1)

In [None]:
# Split the dataset into a training set and a test set, used to train and to
# evaluate the model respectively.

from sklearn.model_selection import train_test_split

# 30% of the rows are held out; the fixed random_state makes the split repeatable.
split_parts = train_test_split(x, target, test_size=0.30, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default settings, fitted on the training split.
lr = LogisticRegression().fit(X_train, y_train)
lr

In [None]:
# Predict class labels for the test split with the baseline model.

y_predict = lr.predict(X_test)

In [None]:
# Plot the confusion matrix of the baseline predictions on the test split.

from sklearn.metrics import confusion_matrix

baseline_cm = confusion_matrix(y_test, y_predict)
sns.heatmap(baseline_cm, annot=True, fmt='0.0f');

In [None]:
# Print the classification report of the baseline model on the test split.

from sklearn.metrics import classification_report

baseline_report = classification_report(y_test, y_predict)
print(baseline_report)

In [None]:
# Oversample the training split with SMOTE (imblearn) and compare the class
# counts before and after resampling. Only the training data is resampled.

from imblearn.over_sampling import SMOTE
from collections import Counter

smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_smote, y_smote = resampled

class_counts_before = Counter(y_train)
class_counts_after = Counter(y_smote)
print('Original dataset shape %s' % class_counts_before)
print('Resampled dataset shape %s' % class_counts_after)

In [None]:
# Train a logistic regression model on the SMOTE-resampled training data.

lr_smote = LogisticRegression().fit(X_smote, y_smote)
lr_smote

In [None]:
# Predict the test-split classes with the model trained on SMOTE data and
# plot the resulting confusion matrix.

y_predict_smote = lr_smote.predict(X_test)
smote_cm = confusion_matrix(y_test, y_predict_smote)

sns.heatmap(smote_cm, annot=True, fmt='0.0f');

In [None]:
# Print the classification report of the SMOTE-trained model on the test split.

smote_report = classification_report(y_test, y_predict_smote)
print(smote_report)

In [None]:
!pip install plot_metric

In [None]:
# Plot the ROC curve of the model trained on SMOTE data.

from plot_metric.functions import BinaryClassification

# Second predict_proba column: the predicted probability of label 1.
smote_test_proba = lr_smote.predict_proba(X_test)[:,1]
bc = BinaryClassification(y_test, smote_test_proba, labels=[0, 1])

plt.figure(figsize=(16, 8))
bc.plot_roc_curve()
plt.show()

In [None]:
# Redraw the ROC curve with a probability threshold of 0.4, chosen to get
# closer to the elbow of the curve plotted above.

smote_test_proba = lr_smote.predict_proba(X_test)[:,1]
bc = BinaryClassification(y_test, smote_test_proba, threshold=0.4, labels=[0, 1])

plt.figure(figsize=(16, 8))
bc.plot_roc_curve()
plt.show()

In [None]:
# Compute the predicted probability of label 1 (second predict_proba column)
# for every test observation, using the lr_smote logistic regression model.

y_pred_proba = lr_smote.predict_proba(X_test)[:,1]

In [None]:
# Compare the probabilities against a threshold of 40% rather than the
# default 50%; this is the same 0.4 used for the thresholded ROC plot.
# The boolean mask is cast to 0/1 integers so the predicted labels have the
# same type as the integer labels in y_test.

y_pred_labels = (y_pred_proba >= 0.4).astype(int)

In [None]:
# Plot the confusion matrix of the thresholded predictions on the test split.

threshold_cm = confusion_matrix(y_test, y_pred_labels)
sns.heatmap(threshold_cm, annot=True, fmt='0.0f');

In [None]:
# Print the classification report of the thresholded predictions.

threshold_report = classification_report(y_test, y_pred_labels)
print(threshold_report)