# 1. Importing Libraries & Analyzing Dataframe

In [None]:
# Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Reading Dataset
# Load the bank marketing CSV from the relative '../input' directory into `df`
# and preview its first rows.
df = pd.read_csv('../input/bank-marketing-dataset/bank.csv')
df.head()

In [None]:
# Summary statistics produced by pandas' default describe()
df.describe()

In [None]:
# Looking for NaNs: number of missing values in each column
df.isnull().sum()

# 2. Exploratory Data Analysis

## Balance of Dataset

In [None]:
# Bar chart of how many customers fall into each 'deposit' class
fig, ax = plt.subplots()
sns.countplot(data = df, x = 'deposit', palette = 'viridis')

# Title and axis labels are set on the axes the count plot was drawn into
ax.set_title('Deposit Distribution of Bank Customers', fontsize = 16)
ax.set_xlabel('Deposit', fontsize = 14)
ax.set_ylabel('Total Customers', fontsize = 14)
plt.xticks(fontsize = 12)

# Render the figure
plt.show()

Dataset is well balanced

## Balance vs Deposits of Customers

In [None]:
from matplotlib.patches import Patch

# Two panels: overlaid balance histograms (left) and per-class boxplots (right)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle("Balance vs Deposits of Customers", size = 16)
hist_ax, box_ax = ax

# Subplot 1: one translucent histogram per deposit class, non-depositors drawn first
for answer, shade, tag in (("no", "green", "Non-Depositors"), ("yes", "blue", "Depositors")):
    hist_ax.hist(df.loc[df["deposit"] == answer, "balance"], bins=30, alpha=0.5, color=shade, label=tag)

hist_ax.set_xlabel("Balance", fontsize=14)
hist_ax.set_ylabel("Total Customers", fontsize=14)
hist_ax.legend(fontsize = 11)

# Subplot 2: horizontal boxplot of balance for each deposit class
box_palette = {'no': "#80e880", 'yes': "#2626ff"}
sns.boxplot(x="balance", y="deposit", data=df, orient="h", palette=box_palette, ax = box_ax)
box_ax.get_yaxis().set_visible(False)   # class names are carried by the legend instead
box_ax.set_xlabel('Balance', fontsize=14)

# Legend handles built by hand because the y axis (and its class labels) is hidden
color_patches = [
    Patch(facecolor=box_palette[answer], label=tag)
    for answer, tag in (('no', "Non-Depositors"), ('yes', "Depositors"))
]
box_ax.legend(handles=color_patches, fontsize=11);

* Most of the bank's customers have a <b>small balance</b>. 
* Customers across the whole range of balances <b>make deposits</b>. 
* However, most <b>deposits</b> are received from customers with a balance in the range <b>(0, 1250)</b>.

## Age vs Deposits of Customers

In [None]:
# Two panels: overlaid age histograms (left) and per-class boxplots (right)
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle("Age vs Deposits of Customers", size = 16)
hist_ax, box_ax = ax

# Subplot 1: one translucent histogram per deposit class, non-depositors drawn first
for answer, shade, tag in (("no", "green", "Non-Depositors"), ("yes", "blue", "Depositors")):
    hist_ax.hist(df.loc[df["deposit"] == answer, "age"], bins=7, alpha=0.5, color=shade, label=tag)

hist_ax.set_xlabel("Age", fontsize=14)
hist_ax.set_ylabel("Total Customers", fontsize=14)
hist_ax.legend(fontsize = 11)

# Subplot 2: horizontal boxplot of age for each deposit class
box_palette = {'no': "#80e880", 'yes': "#2626ff"}
sns.boxplot(x="age", y="deposit", data=df, orient="h", palette=box_palette, ax = box_ax)
box_ax.get_yaxis().set_visible(False)   # class names are carried by the legend instead
box_ax.set_xlabel('Age', fontsize=14)

# Legend handles built by hand because the y axis (and its class labels) is hidden
color_patches = [
    Patch(facecolor=box_palette[answer], label=tag)
    for answer, tag in (('no', "Non-Depositors"), ('yes', "Depositors"))
]
box_ax.legend(handles=color_patches, fontsize=11);

* Customers of <b>all age</b> groups make <b>deposits</b>.
* Most of the customers are in the age group of <b>(25, 55) years</b>. A larger part of them are <b>non-depositors</b>. 
* Most of the customers <b>above 60</b> years make <b>deposits</b>. 
* Most of the customers <b>below 25</b> years make <b>deposits</b>. 

## Job vs Deposit of Customers

In [None]:
# Grouped count plot of job type, split by deposit outcome.
# catplot is figure-level and creates its own figure, so no placeholder
# figure has to be opened (and closed again) around it. The column is passed
# as the keyword `x`, which is what current seaborn releases require.
job_grid = sns.catplot(x="job", hue='deposit', data=df, kind="count",
                       palette={'no':"#80e880", 'yes':"#2626ff"}, legend = False)

# Manual legend handles, because the built-in legend is switched off above
color_patches = [
    Patch(facecolor="#80e880", label="Non-Depositors"),
    Patch(facecolor="#2626ff", label="Depositors")
]

job_ax = job_grid.ax
job_ax.set_title("Job vs Deposit of Customers", size = 18, y=1.08)
job_ax.set_xlabel("Job", size = 14)          # x axis shows the 'job' column
job_ax.set_ylabel("Count", size = 14)
job_ax.tick_params(axis='x', labelsize=12, labelrotation=90)   # vertical labels
job_ax.legend(handles = color_patches, fontsize = 12,  bbox_to_anchor=(1.4,1.05))

plt.show()

* Customers with management, retired, unemployed and student <b>job types</b> prefer to <b>deposit</b>. 
* Customers in <b>services and blue-collar</b> jobs deposit considerably <b>less</b>. 

## Marital vs Deposit of Customers

In [None]:
# Grouped count plot of marital status, split by deposit outcome.
# catplot is figure-level and creates its own figure, so no placeholder
# figure has to be opened (and closed again) around it. The column is passed
# as the keyword `x`, which is what current seaborn releases require.
marital_grid = sns.catplot(x="marital", hue='deposit', data=df, kind="count",
                           palette={'no':"#80e880", 'yes':"#2626ff"}, legend = False)

# Manual legend handles, because the built-in legend is switched off above
color_patches = [
    Patch(facecolor="#80e880", label="Non-Depositors"),
    Patch(facecolor="#2626ff", label="Depositors")
]

marital_ax = marital_grid.ax
marital_ax.set_title("Marital vs Deposit of Customers", size = 18, y=1.08)
marital_ax.set_xlabel("Marital", size = 14)
marital_ax.set_ylabel("Count", size = 14)
marital_ax.tick_params(axis='x', labelsize=12)
marital_ax.legend(handles = color_patches, fontsize = 12,  bbox_to_anchor=(1.4,1.05))

plt.show()

* A <b>smaller</b> share of married customers prefer to <b>deposit</b>. 
* A <b>larger</b> share of single customers prefer to <b>deposit</b>.

## Education vs Deposit of Customers

In [None]:
# Grouped count plot of education level, split by deposit outcome.
# catplot is figure-level and creates its own figure, so no placeholder
# figure has to be opened (and closed again) around it. The column is passed
# as the keyword `x`, which is what current seaborn releases require.
education_grid = sns.catplot(x="education", hue='deposit', data=df, kind="count",
                             palette={'no':"#80e880", 'yes':"#2626ff"}, legend = False)

# Manual legend handles, because the built-in legend is switched off above
color_patches = [
    Patch(facecolor="#80e880", label="Non-Depositors"),
    Patch(facecolor="#2626ff", label="Depositors")
]

education_ax = education_grid.ax
education_ax.set_title("Education vs Deposit of Customers", size = 18, y=1.08)
education_ax.set_xlabel("Education", size = 14)
education_ax.set_ylabel("Count", size = 14)
education_ax.tick_params(axis='x', labelsize=14)
education_ax.legend(handles = color_patches, fontsize = 12,  bbox_to_anchor=(1.4,1.05))

plt.show()

* <b>Tertiary</b> educated customers prefer to <b>deposit</b>. 
* Customers with other education type <b>deposit less</b>. 

# 3. Converting Columns to Categorical Numbers

In [None]:
# Columns that are converted to pandas categoricals and then replaced by
# their integer category codes
col_list = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'month', 'poutcome', 'deposit', 'contact']

for col in col_list:
    df[col] = df[col].astype('category').cat.codes

# Preview the encoded frame
df.head()

## Column Distribution

In [None]:
# One 50-bin histogram per dataframe column, drawn on a single large figure
%matplotlib inline
df.hist(bins = 50, figsize=(20,16), color = '#00A86B') 
plt.show()


pdays, duration, previous, campaign and balance seem to contain outliers.

# 4. Removing Outliers 

## Grubbs Function for Outlier Detection

In [None]:
import scipy
import scipy.stats as stats

def outlier_cols(x):
    """Flag a column as containing an outlier using a two-sided Grubbs test.

    Parameters
    ----------
    x : pandas.Series
        Numeric values of a single column. Its ``name`` is what gets returned
        when an outlier is detected.

    Returns
    -------
    object
        ``x.name`` when the Grubbs statistic exceeds the critical value at
        the 5% significance level, otherwise ``0``. ``0`` is also returned
        when the test cannot be evaluated (fewer than three values, or zero
        spread).
    """
    n = len(x)
    if n < 3:
        # The critical value uses a t distribution with n - 2 degrees of freedom
        return 0

    mean_x = np.mean(x)
    sd_x = np.std(x)
    if sd_x == 0:
        # A constant column has no extreme value; avoid dividing by zero
        return 0

    # Grubbs statistic: largest absolute deviation from the mean, in units of sd
    g_calculated = np.max(np.abs(x - mean_x)) / sd_x

    t_value = stats.t.ppf(1 - 0.05 / (2 * n), n - 2)
    g_critical = ((n - 1) * np.abs(t_value)) / (np.sqrt(n) * np.sqrt(n - 2 + np.square(t_value)))

    # Report the label of the series that was tested rather than relying on a
    # loop variable from the calling scope
    return getattr(x, "name", None) if g_critical < g_calculated else 0

## Finding Columns with Outliers

In [None]:
import numpy as np

# Run the Grubbs check on every column and keep only the labels it reports;
# a result of 0 means "no outlier" and is skipped.
cols_with_outliers = []
for col in df.columns:
    flagged = outlier_cols(df[col])
    if flagged != 0:
        cols_with_outliers.append(flagged)

print(f'Columns with outliers are: {cols_with_outliers}')

## Removing Outliers Using Scaling

In [None]:
# pdays and balance have negative values, so the fractional power would
# result in NaNs for them; they are left untouched.
untransformed = ('pdays', 'balance')
for col in cols_with_outliers:
    if col in untransformed:
        continue
    # Compress large values by raising the column to the power 1/3.7
    df[col] = df[col] ** (1/3.7)

# 5. Columns that Still have Outliers

In [None]:
# Columns still possessing outliers: repeat the Grubbs check on the columns
# flagged earlier and keep those that are reported again (0 means "no outlier").
any_outlier_col = []
for col in cols_with_outliers:
    flagged = outlier_cols(df[col])
    if flagged != 0:
        any_outlier_col.append(flagged)

any_outlier_col

## Grubbs Function for Validating Outlier Removal

In [None]:
def grubbs_test(x):
    """Print the result of a two-sided Grubbs outlier test at the 5% level.

    Prints the calculated Grubbs statistic, the critical value, and a sentence
    stating whether the null hypothesis (no outlier) is accepted or rejected.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to test.

    Returns
    -------
    None
    """
    n = len(x)
    centre = np.mean(x)
    spread = np.std(x)

    # Largest absolute deviation from the mean, expressed in standard deviations
    g_calculated = max(abs(x-centre))/spread
    print("Grubbs Calculated Value:",g_calculated)

    # Critical value derived from the t distribution with n - 2 degrees of freedom
    t_value = stats.t.ppf(1 - 0.05 / (2 * n), n - 2)
    t_squared = np.square(t_value)
    g_critical = ((n - 1) * np.sqrt(t_squared)) / (np.sqrt(n) * np.sqrt(n - 2 + t_squared))
    print("Grubbs Critical Value:",g_critical)

    if g_critical > g_calculated:
        verdict = "From grubbs_test we observe that calculated value is lesser than critical value, Accept null hypothesis and conclude that there is no outlier\n"
    else:
        verdict = "From grubbs_test we observe that calculated value is greater than critical value, Reject null hypothesis and conclude that there is an outliers\n"
    print(verdict)

## Removing Cubical Transform from Columns with Outliers

In [None]:
# Raise the columns that still report outliers to the power 3.7, reversing
# the earlier 1/3.7 power; pdays and balance were never transformed.
for col in any_outlier_col:
    if col in ('pdays', 'balance'):
        continue
    df[col] = df[col] ** 3.7

## Individually Removing Outliers from Columns

* ###  'balance' Column

In [None]:
# Plotting histogram
# 50-bin view of 'balance', used to pick the cap applied in the next cell
%matplotlib inline
df['balance'].hist(bins = 50, figsize=(10,7), color = '#00A86B') 
plt.show()

In [None]:
# Removing outliers
# Cap every balance at or above the cut-off to the cut-off value in a single
# vectorised step instead of replacing values one row at a time.
cut_off = 11000
df['balance'] = df['balance'].clip(upper=cut_off)

# Re-run the Grubbs test on the capped column and print its verdict
grubbs_test(df['balance'])

* ### 'campaign' Column

In [None]:
# Plotting histogram
# 50-bin view of 'campaign', used to pick the cap applied in the next cell
%matplotlib inline
df['campaign'].hist(bins = 50, figsize=(10,7), color = '#00A86B') 
plt.show()

In [None]:
# Removing outliers
# Cap every campaign value at or above the cut-off to the cut-off value in a
# single vectorised step instead of replacing values one row at a time.
cut_off = 12
df['campaign'] = df['campaign'].clip(upper=cut_off)

# Re-run the Grubbs test on the capped column and print its verdict
grubbs_test(df['campaign'])

* ### 'previous' Column

In [None]:
# Plotting histogram
# 50-bin view of 'previous', used to pick the cap applied in the next cell
%matplotlib inline
df['previous'].hist(bins = 50, figsize=(10,7), color = '#00A86B') 
plt.show()

In [None]:
# Removing Outliers
# Cap every 'previous' value at or above the cut-off to the cut-off value in
# a single vectorised step instead of replacing values one row at a time.
cut_off = 8
df['previous'] = df['previous'].clip(upper=cut_off)

# Re-run the Grubbs test on the capped column and print its verdict
grubbs_test(df['previous'])

* ### 'pdays' Column

In [None]:
# Plotting histogram
# 50-bin view of 'pdays', used to pick the cap applied in the next cell
%matplotlib inline
df['pdays'].hist(bins = 50, figsize=(10,7), color = '#00A86B') 
plt.show()

In [None]:
# Removing Outliers
# Cap every pdays value at or above the cut-off to the cut-off value in a
# single vectorised step instead of replacing values one row at a time.
cut_off = 500
df['pdays'] = df['pdays'].clip(upper=cut_off)

# Re-run the Grubbs test on the capped column and print its verdict
grubbs_test(df['pdays'])

## Verifying no NaN inclusions

In [None]:
# Missing-value count per column after the transforms and capping above
df.isnull().sum()

# 6. Computing Correlation

## Plotting Correlation Heatmap

In [None]:
# correlation matrix, computed once so the mask and the heatmap share it
heatmap_corr = df.dropna().corr()

# fig
fig= plt.figure(figsize=(12, 12))

# mask: boolean upper triangle (diagonal included). Built from the shape only,
# so a correlation that happens to be exactly zero is still hidden.
mask = np.triu(np.ones(heatmap_corr.shape, dtype=bool))

# axes
axes = fig.add_axes([0, 0, 1, 1])
sns.heatmap(heatmap_corr, annot=True, mask=mask, square=True,fmt='.2g',vmin=-1, vmax=1, center= 0, cmap='viridis',
            linecolor='white', cbar_kws= {'orientation': 'vertical'}, ax=axes)

# title
axes.text(-1, -1.5, 'Correlation', color='black', fontsize=24, fontweight='bold')

# plt.show() renders through the active backend, inline notebooks included
plt.show()

## Printing Feature Correlations

In [None]:
# Correlation of every column with 'deposit', sorted in ascending order
corr_matrix = df.corr()
corr_matrix['deposit'].sort_values()

# 7. Adding Features

In [None]:
# feature -I
# Built from 'duration', 'previous' and 'contact'. Note that despite the name
# 'dur_pdays', the 'pdays' column is not used here.
# The outer (...**2)**0.25 squares the sum before taking the fourth root, so
# the result stays real even if the inner sum were negative.
df['dur_pdays'] = ((df['duration'] + df['previous']**0.15 - (df['contact']**2)**0.0003 )**2)**0.25

# feature -II
# Sum of fractional powers of 'housing', 'campaign' and 'contact' (the name
# mentions only contact and housing, but 'campaign' contributes as well).
df['contact_housing'] = (df['housing'])**(0.2) + (df['campaign'])**(0.35)  + df['contact']**(0.2)

In [None]:
# Preview the frame with the two added feature columns
df.head()

In [None]:
# Recompute the correlations with 'deposit' now that the new features exist
corr_matrix = df.corr()
corr_matrix['deposit'].sort_values()

## Computing Correlation 

Added features have good correlation with deposit column.

# 8. Train-Test Splitting

## Splitting Train-Test Set

In [None]:
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 shuffle split on the 'deposit' target
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 65)

# split() yields positional row indices, so rows are selected with .iloc;
# this stays correct even when the dataframe index is not 0..n-1.
train_index, test_index = next(split.split(df, df['deposit']))
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

print(f"Rows in train set : {len(train_set)}\nRows in test set: {len(test_set)}\n")

## Preparing Train-Test Sets and Labels

In [None]:
# Columns removed from both feature matrices: the target itself, plus
# 'default' and 'job', which are dropped to improve accuracy
non_feature_cols = ["deposit", 'default', 'job']

# Keep each split's target as a separate label series before it is dropped
train_labels, test_labels = train_set["deposit"].copy(), test_set["deposit"].copy()

# train_set / test_set now hold features only
train_set = train_set.drop(columns = non_feature_cols)
test_set = test_set.drop(columns = non_feature_cols)

# 9. Classification Using ML Models

## ExtraTreesClassifier

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import ExtraTreesClassifier  

# Declaring and fitting classifier model
# random_state is fixed so that repeated runs build the same forest and
# report the same scores
etc = ExtraTreesClassifier(random_state = 65)
etc.fit(train_set, train_labels)

# Test set prediction
etc_predict = etc.predict(test_set)

# Evaluating predictions
etc_accuracy = accuracy_score(test_labels, etc_predict)
etc_cm = confusion_matrix(test_labels, etc_predict)

# Printing classification report
print('ExtraTreesClassifier Report:\n')
print(classification_report(test_labels, etc_predict))
print(f'Accuracy of ExtraTreesClassifier is: {etc_accuracy}')

## AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Declaring and fitting classifier model
# random_state is fixed so that repeated runs build the same ensemble and
# report the same scores
abc = AdaBoostClassifier(random_state = 65)
abc.fit(train_set, train_labels)

# Test set prediction
abc_predict = abc.predict(test_set)

# Evaluating predictions
abc_accuracy = accuracy_score(test_labels, abc_predict)
abc_cm = confusion_matrix(test_labels, abc_predict)

# Printing classification report
print('AdaBoostClassifier Report:\n')
print(classification_report(test_labels, abc_predict))
print(f'Accuracy of AdaBoostClassifier is: {abc_accuracy}')

## BaggingClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier

# Declaring and fitting classifier model
# random_state is fixed so that repeated runs draw the same bootstrap samples
# and report the same scores
bc = BaggingClassifier(random_state = 65)
bc.fit(train_set, train_labels)

# Test set prediction
bc_predict = bc.predict(test_set)

# Evaluating predictions
bc_accuracy = accuracy_score(test_labels, bc_predict)
bc_cm = confusion_matrix(test_labels, bc_predict)

# Printing classification report
print('BaggingClassifier Report:\n')
print(classification_report(test_labels, bc_predict))
print(f'Accuracy of BaggingClassifier is: {bc_accuracy}')

## GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Declaring and fitting classifier model
# random_state is fixed so that repeated runs build the same ensemble and
# report the same scores
gbc = GradientBoostingClassifier(random_state = 65)
gbc.fit(train_set, train_labels)

# Test set prediction
gbc_predict = gbc.predict(test_set)

# Evaluating predictions
gbc_accuracy = accuracy_score(test_labels, gbc_predict)
gbc_cm = confusion_matrix(test_labels, gbc_predict)

# Printing classification report
print('GradientBoostingClassifier Report:\n')
print(classification_report(test_labels, gbc_predict))
print(f'Accuracy of GradientBoostingClassifier is: {gbc_accuracy}')

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Declaring and fitting classifier model
# random_state is fixed so that repeated runs build the same forest and
# report the same scores
rfc = RandomForestClassifier(random_state = 65)
rfc.fit(train_set, train_labels)

# Test set prediction
rfc_predict = rfc.predict(test_set)

# Evaluating predictions
rfc_accuracy = accuracy_score(test_labels, rfc_predict)
rfc_cm = confusion_matrix(test_labels, rfc_predict)

# Printing classification report
print('RandomForestClassifier Report:\n')
print(classification_report(test_labels, rfc_predict))
print(f'Accuracy of RandomForestClassifier is: {rfc_accuracy}')

## DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Declaring and fitting classifier model
# random_state is fixed so that repeated runs build the same tree and report
# the same scores
dtc = DecisionTreeClassifier(random_state = 65)
dtc.fit(train_set, train_labels)

# Test set prediction
dtc_predict = dtc.predict(test_set)

# Evaluating predictions
dtc_accuracy = accuracy_score(test_labels, dtc_predict)
dtc_cm = confusion_matrix(test_labels, dtc_predict)

# Printing classification report
print('DecisionTreeClassifier Report:\n')
print(classification_report(test_labels, dtc_predict))
print(f'Accuracy of DecisionTreeClassifier is: {dtc_accuracy}')

## Catboost Classifier 

In [None]:
# !pip install catboost

In [None]:
from catboost import CatBoostClassifier

# Training configuration for the baseline CatBoost model
cbc_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=200,
    random_seed=1,
)

# Build the classifier from the configuration above
cbc = CatBoostClassifier(**cbc_params)

# Train on the training split.
# NOTE(review): the test split is passed as eval_set together with
# use_best_model=True, so the held-out data also takes part in fitting -
# confirm this is intended, or evaluate on a separate validation split.
cbc.fit(
    train_set, train_labels,
    eval_set=(test_set, test_labels),
    use_best_model=True,
    plot=True,
);

# Class predictions for the test split
cbc_predict = cbc.predict(test_set)

In [None]:
# Score the baseline CatBoost predictions against the test labels
cbc_cm = confusion_matrix(test_labels, cbc_predict)
cbc_accuracy = accuracy_score(test_labels, cbc_predict)

# Report header, per-class metrics, then the overall accuracy
print('CatBoostClassifier Report:\n', classification_report(test_labels, cbc_predict), sep='\n')
print(f'Accuracy of CatBoostClassifier is: {cbc_accuracy}')

This is one of the <b>best results</b> obtained from the different models. It may be further <b>improved</b> by passing the <b>categorical features</b> to CatBoostClassifier. 

## XGBoostClassifier

https://towardsdatascience.com/a-beginners-guide-to-xgboost-87f5d4c30ed7

In [None]:
# !pip install xgboost

In [None]:
# Declaring classifier parameters
import xgboost as xbg
from xgboost import XGBClassifier

# Classifier parameters
# 'deposit' is a two-class target (no / yes), so the binary logistic objective
# is used; a multiclass objective with num_class=3 would model a class that
# does not exist in the data.
xgb_param = {
    'eta': 0.3,
    'max_depth': 3,
    'objective': 'binary:logistic'}

# Declaring classifier model
xgb = XGBClassifier(**xgb_param)

# Fitting classifier to training set
xgb.fit(train_set, train_labels)

# Predicting test set
xgb_predict = xgb.predict(test_set)

In [None]:
# Score the XGBoost predictions against the test labels
xgb_cm = confusion_matrix(test_labels, xgb_predict)
xgb_accuracy = accuracy_score(test_labels, xgb_predict)

# Report header, per-class metrics, then the overall accuracy
print('XGBoostClassifier Report:\n', classification_report(test_labels, xgb_predict), sep='\n')
print(f'Accuracy of XGBClassifier is: {xgb_accuracy}')

In [None]:
# !pip install lightgbm

## LGBM Classifier

In [None]:
# Declaring classifier parameters
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Declaring classifier parameters
# The model is a classifier on a two-class target, so the binary objective and
# a matching metric are used instead of regression / rmse. The bagging
# frequency is passed under LightGBM's parameter name `bagging_freq`, so that
# bagging_fraction actually takes effect.
# NOTE(review): learning_rate 0.004 with the default number of boosting rounds
# may underfit - consider tuning it together with n_estimators.
lgbm_param = {
        "objective" : "binary",
        "metric" : "binary_logloss",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_freq" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }

# Declaring classifier model
lgbm = lgb.LGBMClassifier(**lgbm_param)

# Fitting classifier to training set
lgbm.fit(train_set, train_labels)

# Predicting test set
lgbm_predict = lgbm.predict(test_set)

In [None]:
# Score the LightGBM predictions against the test labels
lgbm_cm = confusion_matrix(test_labels, lgbm_predict)
lgbm_accuracy = accuracy_score(test_labels, lgbm_predict)

# Report header, per-class metrics, then the overall accuracy
print('LGBMClassifier Report:\n', classification_report(test_labels, lgbm_predict), sep='\n')
print(f'Accuracy of LGBMClassifier is: {lgbm_accuracy}')

# 10. Performance Comparison of All Models

In [None]:
# Model names and their test accuracies, listed in the same order
model_names = ['ExtraTreesClassifier', 'AdaBoostClassifier', 'BaggingClassifier', 'GradientBoostingClassifier', 'RandomForestClassifier', 'DecisionTreeClassifier', 'CatBoostClassifier', 'XGBClassifier', 'LGBMClassifier']
accuracies = [etc_accuracy, abc_accuracy, bc_accuracy, gbc_accuracy, rfc_accuracy, dtc_accuracy, cbc_accuracy, xgb_accuracy, lgbm_accuracy]

# Leaderboard: best accuracy first, re-indexed so the rank starts at 1
accuracy_table = (
    pd.DataFrame({'Model':model_names, 'Accuracy':accuracies})
    .sort_values(by='Accuracy', ascending = False)
    .reset_index(drop=True)
)
accuracy_table.index += 1
accuracy_table.head(15)

## Top #3 Models Evaluation

In [None]:
# Defining custom color map
import matplotlib.colors

# Light-to-dark green colour ramp, anchored at four values in [-1, 1]
norm = matplotlib.colors.Normalize(-1,1)
anchor_points = ((-1.0, "#e9fcdc"), (-0.6, "#d9f0c9"), (0.6, "#4CBB17"), (1.0, "#0B6623"))
colors = [[norm(level), shade] for level, shade in anchor_points]

cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", colors)

# Preview: ten points spanning [-1, 1], coloured with the new map
fig, ax=plt.subplots()
y = np.linspace(-1,1,10)
x = np.arange(10)
sc = ax.scatter(x, y, c=y, norm=norm, cmap=cmap)
fig.colorbar(sc, orientation="horizontal")
plt.show()

In [None]:
import pylab

fig = plt.figure(figsize=(14,4.5))
plt.suptitle("Comparison of Top #3 Classifiers", family='Serif', size=16, ha='center')

# (panel title, confusion matrix, accuracy) for each panel, left to right
top_models = [
    ('CatBoost Classifier', cbc_cm, cbc_accuracy),
    ('XGBoost Classifier', xgb_cm, xgb_accuracy),
    ('GradientBoosting Classifier', gbc_cm, gbc_accuracy),
]

for position, (panel_title, matrix, accuracy) in enumerate(top_models, start=1):
    plt.subplot(1, 3, position)
    plt.title(panel_title, size = 15)

    # Cell annotation: raw count on the first line, share of all samples below
    counts = matrix.flatten()
    shares = counts/np.sum(matrix)
    labels = np.asarray([f"{count:0.0f}\n{share:.2%}" for count, share in zip(counts, shares)]).reshape(2,2)

    # Heatmap of the confusion matrix with the custom annotations
    sns.heatmap(matrix, annot=labels, annot_kws={"size": 15}, fmt = '', cmap=cmap)

    # Axis labels; the accuracy is appended beneath the x label
    plt.ylabel('Actual Values')
    plt.xlabel('Predicted Values \n \n Accuracy: {}'.format(round(accuracy, 4)))

fig.tight_layout()
plt.show()

# 11. Best Model Optimization

## Feature Importance for CatBoostClassifier

In [None]:
# Feature importances of the baseline CatBoost model, most important first
feature_importances = pd.DataFrame(
    {'importance': cbc.feature_importances_},
    index = train_set.columns,
).sort_values(by='importance', ascending=False)

# Line plus markers, one point per feature
_, importance_ax = plt.subplots(figsize=(20,8))
importance_ax.plot(feature_importances.index, feature_importances['importance'])
importance_ax.scatter(feature_importances.index, feature_importances['importance'])
importance_ax.set_title('Importance of Features in Dataframe', fontsize = 16)
importance_ax.set_ylabel('Importance', fontsize=14)
importance_ax.set_xlabel('Features', fontsize = 14)
importance_ax.grid()
plt.show()

<b>month</b>, <b>dur_pdays</b> and <b>duration</b>  are important features

In [None]:
# Show the training feature names alongside the frequency of each 'month' value
train_set.columns, train_set.month.value_counts()

## CatBoost with Categorical Features

In [None]:
# Positions (within train_set's columns) of the features CatBoost should
# treat as categorical.
# NOTE(review): these are positional indices - verify them against
# train_set.columns, since dropping or adding columns shifts the positions.
cat_features = [1, 2, 3, 7, 8]

# Training configuration: the baseline settings plus categorical features,
# an explicit iteration count and a tree depth
cbc_params = dict(
    loss_function='Logloss',
    eval_metric='AUC',
    cat_features=cat_features,
    verbose=200,
    random_seed=1,
    iterations=1000,
    max_depth=7,
)

# Build the classifier from the configuration above
cbc_improved = CatBoostClassifier(**cbc_params)

# Train on the training split.
# NOTE(review): the test split is passed as eval_set together with
# use_best_model=True, so the held-out data also takes part in fitting -
# confirm this is intended, or evaluate on a separate validation split.
cbc_improved.fit(
    train_set, train_labels,
    eval_set=(test_set, test_labels),
    use_best_model=True,
    plot=True,
);

# Class predictions for the test split
cbc_improved_predict = cbc_improved.predict(test_set)

There is a slight improvement in the <b>bestTest score</b>.  

In [None]:
# Score the tuned CatBoost predictions against the test labels
cbc_improved_cm = confusion_matrix(test_labels, cbc_improved_predict)
cbc_improved_accuracy = accuracy_score(test_labels, cbc_improved_predict)

# Report header, per-class metrics, then the overall accuracy
print('CatBoostClassifier Report:\n', classification_report(test_labels, cbc_improved_predict), sep='\n')
print(f'Accuracy of CatBoostClassifier is: {cbc_improved_accuracy}')

Model accuracy improved to <b>0.8773</b> 

## Evaluating Improvement

In [None]:
fig = plt.figure(figsize=(14,4.5))
title = plt.suptitle("Evaluating Improvement in CatBoost Classifier", family='Serif', size=16, ha='center')
title.set_position([0.48, 1.05])

# (panel title, confusion matrix, accuracy): tuned model on the left,
# baseline model on the right
panels = [
    ('Improved CatBoost Classifier', cbc_improved_cm, cbc_improved_accuracy),
    ('Previous CatBoost Classifier', cbc_cm, cbc_accuracy),
]

for position, (panel_title, matrix, accuracy) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.title(panel_title, size = 15)

    # Cell annotation: raw count on the first line, share of all samples below
    counts = matrix.flatten()
    shares = counts/np.sum(matrix)
    labels = np.asarray([f"{count:0.0f}\n{share:.2%}" for count, share in zip(counts, shares)]).reshape(2,2)

    # Heatmap of the confusion matrix with the custom annotations
    sns.heatmap(matrix, annot=labels, annot_kws={"size": 15}, fmt = '', cmap=cmap)

    # Axis labels; the accuracy is appended beneath the x label
    plt.ylabel('Actual Values', size = 12)
    plt.xlabel('Predicted Values \n \n Accuracy: {}'.format(round(accuracy, 4)), size = 12)

plt.show()

* There is an increase in <b>true positives</b>.
* False positives have reduced by optimizing the parameters of CatBoostClassifier. 

# 12. References: 

##### <b>I admire the content shared in the following links. These have significantly helped in completing this notebook.</b>  

* Data Visualization: https://medium.com/analytics-vidhya/tutorial-exploratory-data-analysis-eda-with-categorical-variables-6a569a3aea55

* Plotly: https://www.kaggle.com/subinium/basic-of-statistical-viz-plotly-seaborn/notebook

* Outliers: https://www.kaggle.com/nareshbhat/outlier-the-silent-killer

* Outliers: https://www.kaggle.com/vikram92/units-sold-prediction-for-e-commerce

* Notebook Inspiration: https://www.kaggle.com/kurazh/diabetes-prediction-score-0-92-and-eda

* Notebook Inspiration: https://www.kaggle.com/emdemor/brazilian-predictions-of-deaths-and-recoveries

* Notebook Inspiration: https://www.kaggle.com/dorianvoydie/eda-modelling-heart-attack-90-accuracy-score

* Notebook Inspiration: https://www.kaggle.com/mohamedzayton/drug-classification-rf-nn#Building-Model-1--%3E-RF

* Hyperparameter Optimization: https://www.kaggle.com/sh0wmaker/compare-algorithms-in-drug-classification#Model-Implementation

* Catboost: https://www.kaggle.com/mitribunskiy/tutorial-catboost-overview

* Model Evaluation: https://www.kaggle.com/gcmadhan/bank-campaign-eda-classification-83-accu

* LGBM: https://towardsdatascience.com/a-quick-guide-to-lightgbm-library-ef5385db8d10

* Classification Models: https://www.kaggle.com/samratp/lightgbm-xgboost-catboost

* Confusion Matrix: https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea

* Confusion Matrix: https://www.kaggle.com/agungor2/various-confusion-matrix-plots

* Subplots: https://www.kaggle.com/asimislam/tutorial-python-subplots

* Accuracy Improvement: https://www.kaggle.com/avelinocaio/top-5-voting-classifier-in-python