In [None]:

# Objective:
# Run 4 models (KNN, Logistic Regression, SVM and Random Forest)
# on the data, pick the best model, then tune the hyperparameters of the selected model
# and present analysis and recommendations to the client.
# The analysis and recommendations are in the PowerPoint presentation.
import pandas as pd
import numpy as np
from sklearn import datasets
import seaborn as sns
sns.set()
# Sklearn related imports
from sklearn import datasets
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt


In [None]:
import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
 #       print(os.path.join(dirname, filename))

# Data loading and preprocessing

In [None]:

# Read the bank-marketing training set from the Kaggle input directory.
TRAIN_CSV = '/kaggle/input/bank-term-deposit/Train.csv'
df = pd.read_csv(TRAIN_CSV)

# Show the loaded frame as the cell output.
df

In [None]:
# Number of rows per class of the target column.
df['term_deposit_subscribed'].value_counts()

In [None]:
# Share of the minority class in the target, computed from the data rather
# than from hand-copied counts so it stays correct if the dataset changes.
df['term_deposit_subscribed'].value_counts(normalize=True).min()

In [None]:
# Bar chart of how many rows fall in each target class.
sns.countplot(data=df, x='term_deposit_subscribed')
plt.show()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Is any value missing (Null / NA) anywhere in the frame?
# This is not the last expression of the cell, so its result is not displayed.
df.isnull().any().any()
# Number of missing values per feature
df_na = df.isnull().sum()
df_na
# To list only the features that actually have missing values:
# df_na[df_na!=0]

In [None]:
# Fill missing values in the numeric columns with each column's mean.
# numeric_only=True is required: df still contains object (string) columns
# here, and on pandas >= 2.0 df.mean() raises a TypeError on them.
# Missing values in non-numeric columns are left untouched by this step.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Recount the missing values per column after the mean fill.
df_na = df.isna().sum()
df_na

In [None]:
# Forward-fill the remaining gaps: each missing value takes the last valid
# value above it in the same column.
df = df.ffill(axis=0)

In [None]:
# Remove the days_since_prev_campaign_contact column from the frame.
df = df.drop(columns=['days_since_prev_campaign_contact'])

In [None]:
# Recount the missing values per column after the forward fill and column drop.
df_na = df.isna().sum()
df_na

In [None]:
# Drop three irrelevant columns (1 of 3): the row identifier.
df = df.drop(columns=['id'])

In [None]:
# Drop the month column.
df = df.drop(columns=['month'])

In [None]:
# Drop the day_of_month column.
df = df.drop(columns=['day_of_month'])

In [None]:
# Show the frame after the column drops above.
df


In [None]:
# Keep only the object-dtype (categorical) columns and count the distinct
# values in each.
df_cat = df.select_dtypes(include='object')
df_cat.nunique()

In [None]:
# Keep only the numeric columns and count the distinct values in each.
df_num = df.select_dtypes(include='number')
df_num.nunique()

In [None]:
# Names of the object-dtype columns selected into df_cat.
df_cat.columns

# EDA

In [None]:
# Summary statistics of the frame.
df.describe() 

In [None]:
# Pairwise plots of the frame's variables; corner=True draws only the lower triangle.
_ = sns.pairplot(data=df, corner=True)

In [None]:
# Same pairwise plots, coloured by the dependent variable.
_ = sns.pairplot(data=df, hue='term_deposit_subscribed', corner=True)

In [None]:
# Names of the non-object (numeric) columns.
# The list was previously called `categorical_var`, but the filter keeps the
# columns whose dtype is NOT 'object'; the categorical list is built in the
# countplot cell further down.
numerical_var = [i for i in df.columns if df[i].dtypes != 'object']
numerical_var

In [None]:
# Histogram of every column df.hist() can plot, on a 10x3 grid.
df.hist(
    bins=40,
    layout=(10, 3),
    figsize=(20, 20),
    color="#FA5858",
)
plt.show()

In [None]:
# Distribution of the balance column.
df.hist(
    column='balance',
    bins=50,
    figsize=(5, 5),
    color="#FA5858",
)
plt.show()

In [None]:
# Distribution of the num_contacts_in_campaign column.
df.hist(
    column='num_contacts_in_campaign',
    bins=40,
    figsize=(5, 5),
    color="#FA5858",
)
plt.show()

In [None]:
# Distribution of the num_contacts_prev_campaign column.
df.hist(
    column='num_contacts_prev_campaign',
    bins=80,
    figsize=(5, 5),
    color="#FA5858",
)
plt.show()

In [None]:
# Distribution of the last_contact_duration column.
df.hist(
    column='last_contact_duration',
    bins=40,
    figsize=(5, 5),
    color="#FA5858",
)
plt.show()

In [None]:
# Look at every categorical variable and how it splits across the target
# (term_deposit_subscribed).

# Object-dtype columns only. The copy below removes nothing; the variable
# name is kept from an earlier "churn" analysis for compatibility.
# NOTE(review): this assumes the target column is numeric and therefore not
# in this list - confirm.
categorical_var = [i for i in df.columns if df[i].dtypes =='object']
catVars_noChurn = categorical_var[:]

# Size the grid from the number of variables: with a hard-coded 4x2 grid,
# zip() would silently skip any variable beyond the eighth.
n_cols = 2
n_rows = max(1, -(-len(catVars_noChurn) // n_cols))  # ceiling division
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
for axi, var in zip(ax.flat, catVars_noChurn):
    sns.countplot(x=df.term_deposit_subscribed, hue=df[var], ax=axi)
    axi.set_title(var)

# Hide any panel left over when the variable count does not fill the grid.
for axi in ax.flat[len(catVars_noChurn):]:
    axi.set_visible(False)

In [None]:
# Correlation-matrix heatmap of the numeric columns
sns.set(style="white")

# Compute the correlations once. numeric_only=True is required because df
# still holds object columns here and, on pandas >= 2.0, DataFrame.corr()
# raises on them instead of silently skipping them.
corr = df.corr(numeric_only=True)

# Boolean mask for the strictly upper triangle (k=1 keeps the diagonal
# visible). The mask used to be computed but never passed to the heatmap.
matrix = np.triu(np.ones_like(corr, dtype=bool), k=1)

# Set up the matplotlib figure to control the size of the heatmap
fig, ax = plt.subplots(figsize=(10,10))

_ = sns.heatmap(corr, mask=matrix, annot=True, annot_kws={"size": 12}, square=True,
                cmap='coolwarm', vmin=-1, vmax=1, fmt='.2f', ax=ax)



# convert categorical data to numeric

In [None]:
# One-hot encode every object-dtype column; drop_first=True omits the first
# level of each encoded variable.
object_cols = [col for col in df.columns if df[col].dtypes == 'object']
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [None]:
# Show the frame after one-hot encoding.
df

In [None]:
# Annotated correlation heatmap of the encoded frame.
plt.figure(figsize=(16, 8))
encoded_corr = df.corr()
_ = sns.heatmap(encoded_corr, annot_kws={"size": 9}, annot=True)

# Data preparation for training and testing


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score,f1_score,make_scorer,mean_squared_error, mean_absolute_error,r2_score
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE

In [None]:
# Separate the target column (y) from the feature matrix (X).
target_col = 'term_deposit_subscribed'
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# 2) Split the data into training (80%) and test (20%) sets.
# stratify=y keeps the class ratio of the imbalanced target the same in both
# splits, so the test-set scores are not distorted by a random split that
# happens to over- or under-sample the minority class.
X_y_train_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0, stratify=y
)
X_train, X_test, y_train, y_test = X_y_train_test


In [None]:
# Show the feature matrix.
X

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the test features.
X_test.shape

In [None]:
# Import the imbalanced-learn library
import imblearn
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is left untouched.
# random_state makes the synthetic samples (and every score that depends on
# them) reproducible between runs.
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train.values, y_train.values)

# Restore the column names on the resampled features. Without this the
# models are fit on a bare array but predict on the X_test DataFrame, which
# makes scikit-learn warn about missing feature names.
X_resampled = pd.DataFrame(X_resampled, columns=X_train.columns)

# Same (X_train, X_test, y_train, y_test) order as X_y_train_test
X_y_resampled_test = X_resampled, X_test, y_resampled, y_test

In [None]:
# Shape of the SMOTE-resampled training features, to confirm the training
# data has been resampled.
X_resampled.shape

In [None]:
# Shape of the SMOTE-resampled training labels.
y_resampled.shape

In [None]:
# Show the resampled training features.
X_resampled

In [None]:
# Shape of the test features (SMOTE was fit on the training split only).
X_test.shape

In [None]:
# Shape of the test labels.
y_test.shape

In [None]:
# Class counts of the resampled training labels.
pd.Series(y_resampled).value_counts()

In [None]:
# Instantiate the four candidate classifiers.
# Each model cell below builds its own fresh instance under the same name.
knn = KNeighborsClassifier(n_neighbors=3)
log_res = LogisticRegression(max_iter=2000)
svm_clf = SVC()
rf = RandomForestClassifier()


In [None]:
# Manual pipeline

def train_predict_F1score(model, X_y_train_test):
    """Fit ``model`` on the training split and return its macro-averaged F1 on the test split.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_y_train_test : sequence of four items, unpacked in the order
        ``(X_train, X_test, y_train, y_test)``.

    Returns
    -------
    The value of ``f1_score(y_test, predictions, average='macro')``.
    """
    train_X, test_X, train_y, test_y = X_y_train_test
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    return f1_score(test_y, predictions, average='macro')

In [None]:
# KNN (k=3) trained on the original training split - macro F1 on the test split
knn = KNeighborsClassifier(n_neighbors=3)
knn_f1 = train_predict_F1score(knn, X_y_train_test)
knn_f1

In [None]:
# Random Forest trained on the original training split - macro F1 on the test split.
# random_state fixes the forest's bootstrap and feature sampling so the
# score is the same on every run (0 matches the seed used for the split).
rf = RandomForestClassifier(random_state=0)
train_predict_F1score(rf, X_y_train_test)

In [None]:
# Logistic Regression trained on the original training split - macro F1 on the test split
log_res = LogisticRegression(max_iter=2000)
log_res_f1 = train_predict_F1score(log_res, X_y_train_test)
log_res_f1

In [None]:
# SVM (default settings) trained on the original training split - macro F1 on the test split
svm_clf = SVC()
svm_f1 = train_predict_F1score(svm_clf, X_y_train_test)
svm_f1

# USING SMOTE "RESAMPLED" for training and testing

In [None]:
# Manual pipeline

def train_predict_F1score(model, X_y_resampled_test):
    """Fit ``model`` and score it with macro-averaged F1 on the held-out split.

    This repeats the helper of the same name defined earlier in the notebook
    (same logic, different parameter name) and replaces it once this cell runs.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_y_resampled_test : sequence of four items, unpacked in the order
        ``(X_train, X_test, y_train, y_test)``.

    Returns
    -------
    The value of ``f1_score(y_test, predictions, average='macro')``.
    """
    fit_X, eval_X, fit_y, eval_y = X_y_resampled_test
    model.fit(fit_X, fit_y)
    return f1_score(eval_y, model.predict(eval_X), average='macro')

In [None]:
# KNN (k=3) trained on the SMOTE-resampled training data - macro F1 on the test split
knn = KNeighborsClassifier(n_neighbors=3)
knn_smote_f1 = train_predict_F1score(knn, X_y_resampled_test)
knn_smote_f1

In [None]:
# Random Forest trained on the SMOTE-resampled training data - macro F1 on the test split.
# random_state fixes the forest's bootstrap and feature sampling so the
# score is the same on every run (0 matches the seed used for the split).
rf = RandomForestClassifier(random_state=0)
train_predict_F1score(rf, X_y_resampled_test)

In [None]:
# Logistic Regression trained on the SMOTE-resampled training data - macro F1 on the test split
log_res = LogisticRegression(max_iter=2000)
log_res_smote_f1 = train_predict_F1score(log_res, X_y_resampled_test)
log_res_smote_f1

In [None]:
# SVM (default settings) trained on the SMOTE-resampled training data - macro F1 on the test split
svm_clf = SVC()
svm_smote_f1 = train_predict_F1score(svm_clf, X_y_resampled_test)
svm_smote_f1

In [None]:
# Evaluate the KNN model (last fitted on the resampled training data) on the test split
y_pred = knn.predict(X_test)

# Classification Report
print("Classification report for KNN Model with resampled train data:")
print()
print(classification_report(y_test, y_pred))
print()

# Plot the confusion matrix using the Seaborn library.
# The printed heading used to read "Correlation Matrix", which mislabelled the plot.
print("Confusion Matrix for KNN Model with resampled train data:")
plt.figure(figsize=(5,5))
_ = sns.heatmap(confusion_matrix(y_test, y_pred), 
                annot=True,fmt='', annot_kws={"size": 18},
                cmap=plt.cm.Blues)
_ = plt.ylabel('Actual', fontweight='bold')
_ = plt.xlabel('Predicted', fontweight='bold')

In [None]:
# Evaluate the Random Forest model (last fitted on the resampled training data) on the test split
y_pred = rf.predict(X_test)

# Classification Report
print("Classification report for Random Forest Model with resampled train data:")
print()
print(classification_report(y_test, y_pred))
print()

# Plot the confusion matrix using the Seaborn library.
# The printed heading used to read "Correlation Matrix", which mislabelled the plot.
print("Confusion Matrix for Random Forest Model with resampled train data:")
plt.figure(figsize=(5,5))
_ = sns.heatmap(confusion_matrix(y_test, y_pred), 
                annot=True,fmt='', annot_kws={"size": 18},
                cmap=plt.cm.Blues)
_ = plt.ylabel('Actual', fontweight='bold')
_ = plt.xlabel('Predicted', fontweight='bold')

In [None]:
# Evaluate the Logistic Regression model (last fitted on the resampled training data) on the test split
y_pred = log_res.predict(X_test)

# Classification Report
print("Classification report for Logistic Regression Model with resampled train data:")
print()
print(classification_report(y_test, y_pred))
print()

# Plot the confusion matrix using the Seaborn library.
# The printed heading used to read "Correlation Matrix", which mislabelled the plot.
print("Confusion Matrix for Logistic Regression Model with resampled train data:")
plt.figure(figsize=(5,5))
_ = sns.heatmap(confusion_matrix(y_test, y_pred), 
                annot=True,fmt='', annot_kws={"size": 18},
                cmap=plt.cm.Blues)
_ = plt.ylabel('Actual', fontweight='bold')
_ = plt.xlabel('Predicted', fontweight='bold')

In [None]:
# Evaluate the SVM model (last fitted on the resampled training data) on the test split
y_pred = svm_clf.predict(X_test)

# Classification Report
print("Classification report for SVM Model with resampled train data:")
print()
print(classification_report(y_test, y_pred))
print()


# Plot the confusion matrix using the Seaborn library.
# The printed heading used to read "Correlation Matrix", which mislabelled the plot.
print("Confusion Matrix for SVM Model with resampled train data:")
plt.figure(figsize=(5,5))
_ = sns.heatmap(confusion_matrix(y_test, y_pred), 
                annot=True,fmt='', annot_kws={"size": 18},
                cmap=plt.cm.Blues)
_ = plt.ylabel('Actual', fontweight='bold')
_ = plt.xlabel('Predicted', fontweight='bold')

# Hyperparameters for Random Forest (RESAMPLED)

In [None]:
# List the hyperparameter names a random forest exposes for tuning.
# Inspect the estimator created in this cell; the original queried `rf`,
# a different object fitted in an earlier cell.
rf_classifier = RandomForestClassifier()
print(rf_classifier.get_params().keys())

In [None]:
# Grid search over Random Forest hyperparameters (5-fold CV, F1 of the positive class)
from sklearn.model_selection import GridSearchCV

# random_state makes the forests, and therefore the selected parameters, reproducible.
rf_classifier = RandomForestClassifier(random_state=0)

param_grid = { 
    'n_estimators': [800,900,1000,1100,1200],
    # 'auto' was removed in scikit-learn 1.3 and now raises; for classifiers
    # it meant 'sqrt', so this keeps the same search space.
    'max_features':  ['sqrt'],
    'max_depth' : [22,24,26,28,30],
    'criterion' :['gini', 'entropy']
}
 # other valid max_features options: 'log2', None
gs_clf = GridSearchCV(rf_classifier,
                      param_grid, 
                      cv=5,
                      scoring='f1',
                      n_jobs=-1)
# NOTE(review): the section heading says "RESAMPLED", but the search is fit on
# the original training split. Confirm which is intended; resampling inside
# each CV fold would need an imblearn Pipeline.
gs_clf.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score.
gs_clf.best_params_

In [None]:
# Score the best grid-search configuration: macro F1 on the test split after
# training on the SMOTE-resampled data.
# train_predict_F1score calls fit() on the estimator it receives, so
# gs_clf.best_estimator_ is refit in place here.
# (The unused `rf_classifier = RandomForestClassifier()` assignment was removed.)
train_predict_F1score(gs_clf.best_estimator_, X_y_resampled_test)

# Post-mortem analysis
- What went wrong? Any unexpected results?
- Show feature importance (What are the most important predictors? Why? Were they expected?)
- Steps needed to improve predictions (e.g. data collection, data preprocessing, feature engineering, chosen model, chosen hyperparameters)?



In [None]:
# Build the feature-importance table, sorted from most to least important.
# NOTE(review): `rf` is the default forest fitted in an earlier cell, not the
# tuned gs_clf.best_estimator_ - confirm which model should be reported.
feature_importance = np.array(rf.feature_importances_)
feature_names = np.array(X.columns)

# Build the frame directly from the two arrays. Packing (float, str) pairs
# into one NumPy array, as before, coerced every importance to a string.
feat_imp = (
    pd.DataFrame({'feature_names': feature_names,
                  'feature_importance': feature_importance})
    .sort_values('feature_importance', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Print a heading, then display the feature-importance table as the cell output.
print("Sorted Feature Importance for Random Forest:")
feat_imp

In [None]:
# Creating the feature importances dataframe
#feature_importance = np.array(rf.feature_importances_)
#feature_names = np.array(X.columns)
#sorted_importance = np.array(sorted(list(zip(feature_importance, feature_names)), reverse=True))
#sorted_importance.shape
#feat_imp = pd.DataFrame({'feature_names':sorted_importance[:,1],'feature_importance':sorted_importance[:,0]})

In [None]:
# Build the feature-importance table used by the bar chart.
feature_importance = np.array(rf.feature_importances_)
feature_names = np.array(X.columns)

# Sort by importance, largest first, so the bar chart is ranked. The unsorted
# frame built here before overwrote the sorted table from the earlier cell.
feat_imp = (
    pd.DataFrame({'feature_names': feature_names,
                  'feature_importance': feature_importance})
    .sort_values('feature_importance', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Horizontal bar chart of feature importance, one bar per feature.
plt.figure(figsize=(10, 8))
sns.barplot(data=feat_imp, x='feature_importance', y='feature_names')