In [None]:
#importing libraries 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sns

In [None]:
#reading the data
# Load the CSV at this relative input path into the DataFrame `data`.
data=pd.read_csv('../input/bank-marketing-dataset/bank.csv')

In [None]:
# (rows, columns) of the loaded frame
data.shape

In [None]:
# Preview the first five rows
data.head()

In [None]:
# Number of missing values in each column
data.isnull().sum()

In [None]:
# Distinct values present in the `job` column
data['job'].unique()

In [None]:
# Data type of every column
data.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
data.describe()

In [None]:
import seaborn as sns
plt.figure(figsize=(15,8))
sns.boxplot(x='default',y='balance',hue='deposit',data=data)

In [None]:
import seaborn as sns
plt.figure(figsize=(15,8))
sns.barplot(x='job',y='balance',hue='deposit',data=data)

In [None]:
import seaborn as sns
plt.figure(figsize=(15,8))
sns.barplot(x='education',y='balance',hue='deposit',data=data,estimator=lambda x: len(x) / len(data)*100),plt.ylabel('(% balance)')


In [None]:
# Side-by-side strip plots of `balance` against `housing` and against `loan`.
# The size is set on this figure only: assigning to plt.rcParams would
# silently change the size of every figure created later in the notebook.
fig, (ax_housing, ax_loan) = plt.subplots(1, 2, figsize=(20,10))
sns.stripplot(x='housing',y='balance',data=data,ax=ax_housing)
sns.stripplot(x='loan',y='balance',data=data,ax=ax_loan);

In [None]:
# Number of rows per `deposit` value.
# Both the bar labels and the bar heights are taken from the same
# value_counts() Series. Using unique() for the labels is unsafe because
# unique() returns values in order of appearance while value_counts()
# sorts by frequency, so the two orders need not match.
deposit_counts = data['deposit'].value_counts()
plt.figure(figsize=(8,6))
sns.barplot(x=deposit_counts.index, y=deposit_counts.values)
plt.xlabel('deposit')
plt.ylabel('count')

In [None]:
# Pie chart of the `deposit` distribution.
# value_counts() orders categories by frequency, so the wedge labels are
# derived from its index instead of being hard-coded in an assumed order.
deposit_counts = data['deposit'].value_counts()
label_text = {'yes': 'Have deposit', 'no': 'No deposit'}
# Fall back to the raw value for any category without a display name.
labels = [label_text.get(value, str(value)) for value in deposit_counts.index]
deposit = plt.pie(deposit_counts, labels=labels, autopct='%1.1f%%', shadow=True)
plt.title('Deposit',fontsize=24)
plt.show()

In [None]:
data['deposit'] = data['deposit'].map({'yes':  1, 'no': 0})
data['housing'] = data['housing'].map({'yes':  1, 'no': 0})
data['loan'] = data['loan'].map({'yes':  1, 'no': 0})
data['contact'] = data['contact'].map({'cellular':  1, 'unknown': 0,'telephone' : 1})
data['month'] = data['month'].map({'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec':12})

In [None]:
data.head()

In [None]:
data['default'] = data['default'].map({'yes':  1, 'no': 0})

In [None]:
data.head()

In [None]:
data = pd.get_dummies(data, columns=['marital','poutcome','education'])
data.head().T

In [None]:
from sklearn.model_selection import train_test_split

X = data.drop('deposit', axis=1).drop('job',axis = 1)
y = data['deposit']

In [None]:
X.shape, y.shape

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.3, random_state=12)

In [None]:
print(X_train.shape, y_train.shape)

In [None]:
# distribution in training set
y_train.value_counts(normalize=True)

In [None]:
# distribution in validation set
y_valid.value_counts(normalize=True)

In [None]:
#importing decision tree classifier 
from sklearn.tree import DecisionTreeClassifier

In [None]:
#fitting the model
# Baseline tree with no depth or leaf limit; random_state fixes the seed.
dt_model = DecisionTreeClassifier(random_state=10)
dt_model.fit(X_train, y_train)

In [None]:
#checking the training score
# score() of a classifier is the mean accuracy on the given data.
dt_model.score(X_train, y_train)

In [None]:
#checking the validation score
# Compare with the training score above to judge over-fitting.
dt_model.score(X_valid, y_valid)

In [None]:
#predictions on validation set
dt_predict=dt_model.predict(X_valid)
dt_predict

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the stored validation predictions
# (should equal dt_model.score(X_valid, y_valid) above).
accuracy_score(y_valid,dt_predict)

In [None]:
train_accuracy = []
validation_accuracy = []
for depth in range(1,15):
    dt_model = DecisionTreeClassifier(max_depth=depth, random_state=6)
    dt_model.fit(X_train, y_train)
    train_accuracy.append(dt_model.score(X_train, y_train))
    validation_accuracy.append(dt_model.score(X_valid, y_valid))

In [None]:
frame = pd.DataFrame({'max_depth':range(1,15), 'train_acc':train_accuracy, 'Valid_acc':validation_accuracy})
frame.head()

In [None]:
# Train vs. validation accuracy as a function of tree depth.
fig, ax = plt.subplots(figsize=(14,6))
for score_column in ('train_acc', 'Valid_acc'):
    ax.plot(frame['max_depth'], frame[score_column], marker='o')
ax.set_xlabel('Depth of tree')
ax.set_ylabel('performance')
ax.legend(['train_acc','validation_acc'])

In [None]:
# Regularised tree: depth capped at 10 and at most 70 leaves.
# NOTE(review): presumably chosen from the depth sweep above - confirm.
dt_model = DecisionTreeClassifier(max_depth=10, max_leaf_nodes=70, random_state=10)

In [None]:
#fitting the model
dt_model.fit(X_train, y_train)

In [None]:
#Training score
# Mean accuracy on the training split.
dt_model.score(X_train, y_train)

In [None]:
#Validation score
# Mean accuracy on the held-out validation split.
dt_model.score(X_valid, y_valid)

In [None]:
# Validation predictions of the tuned tree; reused by the confusion matrix below.
dt_predict1=dt_model.predict(X_valid)
dt_predict1

In [None]:
# Accuracy of the stored validation predictions.
accuracy_score(y_valid,dt_predict1)

In [None]:
from sklearn import tree

In [None]:
plt.figure(figsize = (20,10))
tree.plot_tree(dt_model,max_depth=2);

In [None]:
plt.figure(figsize = (20,10))
tree.plot_tree(dt_model);

In [None]:
# Get the confusion Matrix of the Model
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_valid,dt_predict1)
cnf_matrix

In [None]:
# Plot the Confusion Matrix as a HeatMap
plt.figure(figsize=(4,4))
class_names=[0,1] # Name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
# Precision/recall/F1 on the held-out validation split only. Scoring on the
# full X/y would include the rows the tree was trained on and overstate
# performance.
print(metrics.classification_report(y_valid, dt_model.predict(X_valid), zero_division=1))

# Logistic Regression 

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [None]:
cols = X_train.columns
cols

In [None]:
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=cols)
X_train_scaled.head()

In [None]:
X_valid_scaled = scaler.transform(X_valid)
X_valid_scaled = pd.DataFrame(X_valid_scaled, columns=cols)
X_valid_scaled.head()

In [None]:
#importing Logistic Regression and metric F1-score
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.metrics import f1_score

In [None]:
logreg = LogReg()

# Fitting the model
logreg.fit(X_train, y_train)

In [None]:
# Predicting over the Train
train_predict = logreg.predict(X_train)
train_predict

In [None]:
# Calculating f1-score
k = f1_score(train_predict, y_train)
print('Training f1_score', k )

In [None]:
# Predicting over the Test Set and f1-score
test_predict = logreg.predict(X_valid)
k = f1_score(test_predict, y_valid)
print('Test f1_score    ', k )

In [None]:
accuracy_score(y_valid,test_predict)

In [None]:
cnf = metrics.confusion_matrix(y_valid,test_predict)
cnf

In [None]:
class_names=[0,1] # Name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import classification_report as rep
print(rep( y , logreg.predict(X),zero_division=1))

In [None]:
# printing the coefficients
logreg.coef_

In [None]:
plt.figure(figsize=(8, 6), dpi=120, facecolor='w', edgecolor='b')
x = range(len(X_train.columns))
c = logreg.coef_.reshape(-1)
plt.bar( x, c )
plt.xlabel( "Variables")
plt.ylabel('Coefficients')
plt.title('Coefficient plot')

In [None]:
Coefficients = pd.DataFrame({
    'Variable'    : X_train.columns,
    'coefficient' : abs(c)
})
Coefficients.head()

In [None]:
#selecting variables with high coefficient
sig_var = Coefficients[Coefficients.coefficient > 0.01]

In [None]:
subset = data[sig_var['Variable'].values]
subset.head()

In [None]:
# Using train test split function
train_x,test_x,train_y,test_y = train_test_split(subset, y, random_state = 56)

In [None]:
# Creating instance of Logistic Regresssion
logreg = LogReg()

# Fitting the model
logreg.fit(train_x, train_y)

In [None]:
# Predicting over the Train
train_predict = logreg.predict(train_x)
k = f1_score(train_predict, train_y)
print('Training f1_score', k )

In [None]:
# Predicting over the Test Set and f1-score
test_predict_1 = logreg.predict(test_x)
k = f1_score(test_predict_1, test_y)
print('Test f1_score    ', k )

In [None]:
cnf = metrics.confusion_matrix(test_y,test_predict_1)
cnf

In [None]:
# Coefficients of the reduced model, one bar per selected variable
# in the column order of train_x.
c = logreg.coef_.ravel()
x = range(train_x.shape[1])
plt.figure(figsize=(8, 6), dpi=120, facecolor='w', edgecolor='b')
plt.bar(x, c)
plt.xlabel("Variables")
plt.ylabel('Coefficients')
plt.title('Coefficient plot')

# **SVM**