In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory
# and print the paths one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement

Find the best strategies to improve the next marketing campaign. How can the financial institution make its future marketing campaigns more effective? To answer this, we analyze the last marketing campaign the bank performed and identify the patterns that will help us draw conclusions and develop future strategies.

# Please note the analysis is in progress.

# Read Dataset

In [None]:
# Load the bank marketing CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('../input/bank-marketing-dataset/bank.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

# Feature/column description
1. Age - Age of the customer - Integer value
2. job - Job of the customer - Categorical feature
3. marital - Marital status of the customer- Categorical feature
4. education - education status - categorical feature
5. default - whether the customer is a defaulter or not - categorical feature
6. balance - yearly account balance of the customer - continuous feature
7. housing - housing status of the customer - categorical feature
8. loan - whether the customer availed any loans - categorical feature
9. contact - contact communication type (e.g. cellular, telephone) - categorical feature
10. day - day of the month of the last contact - discrete feature
11. month - month of the last contact - categorical feature.
12. duration - duration of the last contact in seconds - continuous feature
13. campaign - number of contacts made to this customer during this campaign - discrete feature


# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
# Five-colour palette (green through red) shared by several plots in this notebook.
cols= ['#00876c','#85b96f','#f7e382','#f19452','#d43d51']

In [None]:
# Show the shared palette as a row of colour swatches.
sns.palplot(cols)

In [None]:
# Age distribution by deposit outcome.
#   top-left  : age histogram for both outcomes
#   top-right : age histogram for depositors only (shares the y axis with the left panel)
#   bottom    : box plot of depositor age, annotated with mean / median / mode
depositors = df[df['deposit']=='yes']  # filter once instead of once per call

fig=plt.figure(figsize=(15,8), facecolor=(0.2,0.0,0.0,0.0), edgecolor='black')
plt.suptitle("Compare the deposit by Age", family='Serif', size=15,weight='bold')
plt.figtext(0.5,0.93,"Histogram and boxplot to identify the mid value of Age by deposits", family='Serif', size=12, ha='center')
gs = GridSpec(nrows=2, ncols=4, figure=fig)

ax1=plt.subplot(gs[0,:3])
sns.histplot(data=df, x='age', bins=10, ax=ax1, kde=True, hue='deposit', multiple='layer', element='bars', palette=['#00876c','#d43d51'])

ax2=plt.subplot(gs[0,3:4], sharey=ax1)
# `color` takes one colour; the original passed a one-element list.
sns.histplot(data=depositors, x='age', bins=10, ax=ax2, kde=True, color='#00876c')
ax2.yaxis.set_visible(False)

ax4=plt.subplot(gs[1,:4])
# A single box has no hue grouping, so give it a plain colour rather than a palette.
sns.boxplot(data=depositors, x='age', ax=ax4, color='#00876c')
ax4.yaxis.set_visible(False)
ax4.text(60,0.15,"Mean value: {:.2f}".format(depositors['age'].mean()))
ax4.text(60,0.20,"Median value: {:.2f}".format(depositors['age'].median()))
# mode() can return several values; the largest one is reported.
ax4.text(60,0.25,"Frequent age : {:.2f}".format(depositors['age'].mode().max()))

# Remove the frame around every panel.
for panel in (ax1, ax2, ax4):
    for side in ['left','right','bottom','top']:
        panel.spines[side].set_visible(False)

**Observation:** Interesting outcome from the above graph. Deposits start at age 20+ and peak between 30 and 50; the most frequent depositor age is 32. In addition, non-depositors are mostly between 30 and 50 years old, with a mean of around 35-40 years.

In [None]:
# Age by marital status and deposit outcome.
#   top row    : one panel per marital status, age split by deposit outcome
#   bottom row : age of depositors only, one box per marital status
marital_statuses = df['marital'].unique()
# Keep the deposit categories in the same order (and colour) in every panel.
deposit_order = list(df['deposit'].unique())

fig = plt.figure(figsize=(12,8))
gs = GridSpec(ncols=len(marital_statuses), nrows=2, figure=fig)
plt.suptitle("Box plot to compare the age by marital status and deposits", family='Serif', weight='bold', size=15)
for i,c in enumerate(marital_statuses):
    ax=plt.subplot(gs[0,i])
    # Plot from the filtered frame so x and y come from the same rows.
    sns.boxplot(data=df[df['marital']==c], x='deposit', y='age', order=deposit_order,
                palette=['#00876c','#d43d51'], ax=ax)
    # Label each panel; without a title the panels cannot be told apart.
    ax.set_title("Marital : {}".format(c))
    for side in ['left','right','top','bottom']:
        ax.spines[side].set_visible(False)

ax=plt.subplot(gs[1,:])
sns.boxplot(data=df[df['deposit']=='yes'],y='marital',x='age', palette=['#00876c','#d43d51','#f7e382'], ax=ax)
for side in ['left','right','top','bottom']:
    ax.spines[side].set_visible(False)

# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.figtext(0.05,-0.05,"Observation:\n Obivous that the single's deposit is less compared to married & divorced.\n Married average age starts from 35+ to 60.\n divorced ages is between 40 to 60 & singel ages is between 28 to 35.\n Reason could be that the single might get married after 35 years approximately",
           family='sans-serif', size=12, ha='left');


In [None]:
# Number of customers per loan value, split by deposit outcome.
fig = plt.figure(figsize=(12,8))
ax=sns.countplot(data=df, x='loan', hue='deposit', palette=['#00876c','#d43d51'])
# Use `family=` for the title font, like the other figures in this notebook.
ax.set_title('Comparison of Loan and deposit', family='Serif', weight='bold', size=15)
for side in ['left','right','top','bottom']:
    ax.spines[side].set_visible(False)
# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.figtext(0.05,-0.05,"Observation: People who has loan, have not deposited",
           family='sans-serif', size=12, ha='left');

In [None]:
# Age by loan status and deposit outcome.
#   top row    : one panel per loan value, age split by deposit outcome
#   bottom row : age of depositors only, one box per loan value
loan_values = df['loan'].unique()
# Keep the deposit categories in the same order (and colour) in every panel.
deposit_order = list(df['deposit'].unique())

fig = plt.figure(figsize=(12,8))
# One column per loan value (the original reserved a third, empty column).
gs = GridSpec(ncols=len(loan_values), nrows=2, figure=fig)
# The title was copied from the marital-status figure; this figure is about loans.
plt.suptitle("Box plot to compare the age by loan status and deposits", family='Serif', weight='bold', size=15)
for i,c in enumerate(loan_values):
    ax=plt.subplot(gs[0,i])
    # Plot from the filtered frame so x and y come from the same rows.
    sns.boxplot(data=df[df['loan']==c], x='deposit', y='age', order=deposit_order,
                palette=['#00876c','#d43d51'], ax=ax)
    ax.set_title("Loan : {}".format(c))
    for side in ['left','right','top','bottom']:
        ax.spines[side].set_visible(False)

ax=plt.subplot(gs[1,:])
sns.boxplot(data=df[df['deposit']=='yes'],y='loan',x='age', palette=['#00876c','#d43d51'], ax=ax)
for side in ['left','right','top','bottom']:
    ax.spines[side].set_visible(False)

# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.figtext(0.05,-0.05,"Observation:\n Loan has bigger impact on the deposits, people who has loans has less deposits.\n people deposited with loan is between the age of 30 to 50. people deposited without loan is from 30 to 60 year. so people getting olde preferes deposits than the loan",
           family='sans-serif', size=12, ha='left');

In [None]:
# Summary statistics of age for each loan value.
df.groupby(['loan']).describe()['age']

In [None]:
# Age against account balance, coloured by deposit outcome.
fig = plt.figure(figsize=(12,8))
ax=sns.scatterplot(data=df, x='age',y='balance', hue='deposit',palette=['#00876c','#d43d51'])
ax.set_title("Age vs balance by deposit", family='Serif', weight='bold', size=15)
# Zoom in on balances between 0 and 4000; points outside this range are not drawn.
ax.set_ylim(0,4000)
# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.figtext(0.05,-0.05,"Observation: There is no significant relationship in age and balance. ",
           family='sans-serif', size=12, ha='left');


In [None]:
# Balance by age, split by deposit outcome.
#   top    : mean balance per age as a line, one line per outcome
#   middle : mean balance per age for depositors
#   bottom : mean balance per age for non-depositors
fig=plt.figure(figsize=(15,8))

plt.suptitle("Comparision of Age & Balance by Deposit", family='Serif', size=15, ha='center', weight='bold')
plt.figtext(0.5,0.93,"Line plot shows that the increase in balance after 80 year", family='Serif', size=12, ha='center')
gs = GridSpec(nrows=3, ncols=1, height_ratios=[5,2,2], figure=fig)

ax1=plt.subplot(gs[0,0])
sns.lineplot(data=df,y='balance',x='age', hue='deposit',palette=['#00876c','#d43d51'], ax=ax1)

# `ci=None` is the documented way to switch error bars off; `ci=False` is
# treated as a confidence level rather than "no interval".
ax2=plt.subplot(gs[1,0])
sns.barplot(data=df[df['deposit']=='yes'],y='balance',x='age', hue='deposit', palette=['#00876c'], ci=None, ax=ax2)
ax3=plt.subplot(gs[2,0])
sns.barplot(data=df[df['deposit']=='no'],y='balance',x='age', hue='deposit', palette=['#d43d51'], ci=None, ax=ax3)

for panel in (ax1, ax2, ax3):
    for side in ['left','right','bottom','top']:
        panel.spines[side].set_visible(False)

# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.figtext(0.05,-0.05,"Observation: Balance in deposit increases by Age",
           family='sans-serif', size=12, ha='left');

In [None]:
# The five ages with the highest mean balance, shown as a single-row table.
df.groupby(['age'])['balance'].mean().nlargest(5).to_frame().T

In [None]:
# Density of the account balance with the mean (red) and median (blue) marked.
fig=plt.figure(figsize=(12,8))
# No `palette` here: there is no hue variable, so it had no effect.
ax=sns.kdeplot(df['balance'], fill=True)

mean_balance = df['balance'].mean()
median_balance = df['balance'].median()

ax.axvline(mean_balance,c='r',ls='--')
ax.text(x=mean_balance,y=0.0002,s="mean value", rotation=90)
# The original opened a second, empty figure at this point and labelled the
# median line "mean value"; both are fixed here.
ax.axvline(median_balance,c='blue',ls='--')
ax.text(x=median_balance,y=0.0002,s="median value", rotation=90)
# Restrict the x axis to the range -5000..20000.
ax.set_xlim(-5000,20000);

In [None]:
# Count customers per `campaign` value, split by deposit outcome, to see
# whether the campaign value goes along with a change in deposits.
deposit_colours = ['#00876c', '#d43d51']
fig = plt.figure(figsize=(12, 8))
sns.countplot(x='campaign', hue='deposit', data=df, palette=deposit_colours)


In [None]:
# Share of each education level among depositors (left) and non-depositors (right).
fig = plt.figure(figsize=(12,8))
plt.suptitle("Comparision of Education with deposit", family='Serif', size=15, ha='center', weight='bold')
plt.figtext(0.5,0.93,"comparing the ecucation impact on deposit", family='Serif', size=12, ha='center')
gs = GridSpec(nrows=1, ncols=2, width_ratios=[5,2], figure=fig)

# One fixed category order for both pies so a level keeps the same colour.
education_levels = df['education'].value_counts().index

for position, outcome in enumerate(['yes', 'no']):
    counts = (
        df[df['deposit']==outcome]['education']
        .value_counts()
        .reindex(education_levels, fill_value=0)
    )
    ax = plt.subplot(gs[0, position])
    # Take labels from the same Series as the sizes. The original used
    # `.unique()` for labels, whose order differs from `.value_counts()`,
    # so slices were labelled with the wrong education level.
    ax.pie(counts, labels=counts.index, autopct='%1.0f%%', colors=cols)
    ax.set_title("Deposit : {}".format(outcome))

In [None]:
# Total balance per education level as a horizontal bar chart.
fig = plt.figure(figsize=(12,8))
df1=df.groupby('education')['balance'].sum().reset_index()
plt.suptitle("Comparison of balance with education", family='Serif', size=15, ha='center', weight='bold')
plt.figtext(0.5,0.93,"compare and see if the eduction increases the balance", family='Serif', size=12, ha='center')
# df1 holds one pre-aggregated row per level, so there is nothing to estimate:
# `ci=None` switches the error bars off (`ci=False` is treated as a level).
ax=sns.barplot(data=df1, y='education',x='balance', palette=cols[:len(df1)], ci=None)
for side in ['left','right','bottom','top']:
    ax.spines[side].set_visible(False)

# Write each total in the middle of its bar; bars are drawn in df1 row order.
for y,x in enumerate(df1['balance']):
    ax.text(x=x/2,y=y, s=x)

# 'San' is not a font family matplotlib knows; 'sans-serif' is the generic family name.
plt.figtext(0.05,-0.05,"Observation: Education level Secondary & Tertiary has higher balance",
           family='sans-serif', size=12, ha='left');

In [None]:
# Pair plot of the dataset, coloured by deposit outcome.
sns.pairplot(df, hue='deposit')

In [None]:
# Correlation heat map of the numeric columns.
# Only numeric columns are passed to corr(): on pandas >= 2.0, calling corr()
# on a frame that still contains string columns raises an error instead of
# silently dropping them.
fig = plt.figure(figsize=(12,8))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap=cols, annot=True, linewidths=0.5);

# Data preparation

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Look at the first rows again.
df.head()

# One hot encoding

In [None]:
# Encode the target column: 'yes' -> 1, 'no' -> 0.
# The mapping also accepts the already-encoded values 1 and 0, so re-running
# this cell leaves the column unchanged instead of turning every value into NaN.
# NOTE: this overwrites df['deposit'] in place; cells that filter on 'yes'/'no'
# must be run before this one.
deposit_codes = {'yes': 1, 'no': 0, 1: 1, 0: 0}
df['deposit']=df['deposit'].map(deposit_codes)

In [None]:
# One-hot encode df into df2; drop_first=True omits the first level of each encoded column.
df2=pd.get_dummies(df,drop_first=True)

In [None]:
# Separate the target column from the feature matrix.
y = df2['deposit']
X = df2.drop(columns=['deposit'])

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardise the Variables

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model Creation

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from yellowbrick.classifier import ROCAUC, ClassificationReport, ClassificationScoreVisualizer

# Fit a logistic regression on the training split and evaluate it on the test split.
model = LogisticRegression()
model.fit(X_train, y_train)
pred=model.predict(X_test)
# Compute the accuracy once and reuse it below, so the observation text
# always matches the printed score.
accuracy = accuracy_score(y_test,pred)
print(accuracy)
print(classification_report(y_test,pred))

## Yellow brick reports
fig = plt.figure(figsize=(20,8))
gs=GridSpec(nrows=1, ncols=2, figure=fig)
plt.suptitle("Classification Reports", family='Serif', size=15, ha='center', weight='bold')
plt.figtext(0.5,0.93,"Classification report based on the Logisitic regression model", family='Serif', size=12, ha='center')

# Each visualizer is bound to its own axes explicitly instead of relying on
# whichever axes happens to be current.
ax1=plt.subplot(gs[0,0])
ax1.set(title='ROC Curve')
visual = ROCAUC(model, classes=[0,1], ax=ax1)
visual.fit(X_train,y_train)
visual.score(X_test,y_test)

ax2=plt.subplot(gs[0,1])
ax2.set(title='Classification report')
report = ClassificationReport(model,classes=[0,1], support=True, ax=ax2)
report.fit(X_train,y_train)
report.score(X_test,y_test)

plt.figtext(0.05,-0.05,"Observation: Logistic Regression performed well with Accuracy score of {:.0%}".format(accuracy),
           family='Serif', size=14, ha='left', weight='bold');

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from yellowbrick.classifier import ROCAUC, ClassificationReport, ClassificationScoreVisualizer

# Fit a k-nearest-neighbours classifier on the training split and evaluate it on the test split.
model = KNeighborsClassifier()
model.fit(X_train, y_train)
pred=model.predict(X_test)
# Compute the accuracy once and reuse it below, so the observation text
# always matches the printed score.
accuracy = accuracy_score(y_test,pred)
print(accuracy)
print(classification_report(y_test,pred))

## Yellow brick reports
fig = plt.figure(figsize=(20,8))
gs=GridSpec(nrows=1, ncols=2, figure=fig)
plt.suptitle("Classification Reports", family='Serif', size=15, ha='center', weight='bold')
plt.figtext(0.5,0.93,"Classification report based on the KNeighborsClassifier model", family='Serif', size=12, ha='center')

# Each visualizer is bound to its own axes explicitly instead of relying on
# whichever axes happens to be current.
ax1=plt.subplot(gs[0,0])
ax1.set(title='ROC Curve')
visual = ROCAUC(model, classes=[0,1], ax=ax1)
visual.fit(X_train,y_train)
visual.score(X_test,y_test)

ax2=plt.subplot(gs[0,1])
ax2.set(title='Classification report')
report = ClassificationReport(model,classes=[0,1], support=True, ax=ax2)
report.fit(X_train,y_train)
report.score(X_test,y_test)

plt.figtext(0.05,-0.05,"Observation: KNeighborsClassifier performed well with Accuracy score of {:.0%}".format(accuracy),
           family='Serif', size=14, ha='left', weight='bold');

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from yellowbrick.classifier import ROCAUC, ClassificationReport, ClassificationScoreVisualizer

# Fit a decision tree on the training split and evaluate it on the test split.
# The tree is seeded (same seed as the train/test split) so that repeated runs
# report the same score.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
pred=model.predict(X_test)
# Compute the accuracy once and reuse it below, so the observation text
# always matches the printed score.
accuracy = accuracy_score(y_test,pred)
print(accuracy)
print(classification_report(y_test,pred))

## Yellow brick reports
fig = plt.figure(figsize=(20,8))
gs=GridSpec(nrows=1, ncols=2, figure=fig)
plt.suptitle("Classification Reports", family='Serif', size=15, ha='center', weight='bold')
plt.figtext(0.5,0.93,"Classification report based on the DecisionTreeClassifier", family='Serif', size=12, ha='center')

# Each visualizer is bound to its own axes explicitly instead of relying on
# whichever axes happens to be current.
ax1=plt.subplot(gs[0,0])
ax1.set(title='ROC Curve')
visual = ROCAUC(model, classes=[0,1], ax=ax1)
visual.fit(X_train,y_train)
visual.score(X_test,y_test)

ax2=plt.subplot(gs[0,1])
ax2.set(title='Classification report')
report = ClassificationReport(model,classes=[0,1], support=True, ax=ax2)
report.fit(X_train,y_train)
report.score(X_test,y_test)

plt.figtext(0.05,-0.05,"Observation: DecisionTreeClassifier performed well with Accuracy score of {:.0%}".format(accuracy),
           family='Serif', size=14, ha='left', weight='bold');