In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

**Load the dataset**

In [None]:
# Load the bank-marketing CSV into `data`, the DataFrame every later cell reads from.
data = pd.read_csv('../input/bank-marketing-dataset/bank.csv')
# Preview only the first rows; displaying the whole frame adds bulk without adding information.
data.head()

In [None]:
# Report how many records the dataset holds.
print(f"The provided dataset consists of {len(data)} rows.")

**Checking for null values in the dataset**

In [None]:
# Number of missing values in each column.
data.isna().sum()

**Fortunately there are no null values in any of the columns**

**Basic statistical description of the data**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments describe() reports on the numeric columns only.
data.describe()

The mean age is approximately 41 years old (minimum: 18 years old, maximum: 95 years old).

The mean balance is 1,528. However, the standard deviation (std) is large, which tells us that the balance values are widely spread across the dataset.

**Correlation Heatmap**

In [None]:
# Pearson correlation between the numeric columns, drawn as an annotated heatmap.
# The frame still holds string columns (job, marital, ...) at this point, and
# pandas >= 2.0 raises on them instead of silently dropping them, so restrict
# the frame to numeric columns before calling corr().
plt.figure(figsize=(16,10))
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(method='pearson'), annot=True);

**Univariate Analysis of the Columns**

**First, let's look at the categorical variables**

In [None]:
# Frequency of each job category.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[15,10])
sns.countplot(x='job', data=data);

In [None]:
# Frequency of each marital status.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[6,6])
sns.countplot(x='marital', data=data);

In [None]:
# Frequency of each education level.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[6,6])
sns.countplot(x='education', data=data);

In [None]:
# Frequency of each value in the 'housing' column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[5,5])
sns.countplot(x='housing', data=data);

In [None]:
# Frequency of each value in the 'loan' column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[5,5])
sns.countplot(x='loan', data=data);

In [None]:
# Frequency of each value in the 'default' column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[5,5])
sns.countplot(x='default', data=data);

In [None]:
# Frequency of each contact type.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[5,5])
sns.countplot(x='contact', data=data);

In [None]:
# Frequency of each value in the 'month' column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[15,10])
sns.countplot(x='month', data=data);

In [None]:
# Frequency of each value in the 'poutcome' column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
plt.figure(figsize=[7,7])
sns.countplot(x='poutcome', data=data);

**Now for the Numerical / continuous variables**

The most convenient way to study the continuous variables is through histograms.

In [None]:
import matplotlib.pyplot as plt
# Histograms of the numeric columns.
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever name this installation
# provides and fall back to the default style if neither exists.
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)

data.hist(bins=20, figsize=(14,10), color='#E14906')
plt.show()

As we can see from the above plots, most of the numerical columns have outliers and are not normally distributed. So we need to take a closer look at these columns to find any noisy data present in them.

In [None]:
# Summary statistics for the three columns inspected for extreme values below.
data[['pdays', 'campaign', 'previous']].describe()

Percentage of 'pdays' values above 400

In [None]:
# Share (in percent) of rows whose 'pdays' value exceeds 400.
data[data['pdays'] > 400].shape[0] / data.shape[0] * 100

'pdays' holds the number of days that passed after the client was last contacted in a previous campaign. Looking closer at the 'pdays' data, we can see that only about 1.2% of the values are above 400.

Percentage of 'campaign' values above 34:

In [None]:
# Share (in percent) of rows whose 'campaign' value exceeds 34.
data[data['campaign'] > 34].shape[0] / data.shape[0] * 100

Percentage of 'previous' values above 34:

In [None]:
# Share (in percent) of rows whose 'previous' value exceeds 34.
data[data['previous'] > 34].shape[0] / data.shape[0] * 100

**let's look at our target column**

In [None]:
# Class balance of the target column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
sns.countplot(x='deposit', data=data);

There is no high imbalance in the deposit column so there is no need to do anything for treating imbalance

**Balance by default status, job and education, split by deposit outcome**

In [None]:
# Three-panel figure: two boxplots on the top row and one violin plot spanning
# the bottom row, all showing 'balance' split by deposit outcome.
fig = plt.figure(figsize=(20,20))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(212)

g = sns.boxplot(x="default", y="balance", hue="deposit",
                data=data, palette="muted", ax=ax1)
g.set_title("Amount of Balance by Term Suscriptions")

g1 = sns.boxplot(x="job", y="balance", hue="deposit",
                 data=data, palette="RdBu", ax=ax2)
# Rotate the tick labels seaborn already assigned instead of overwriting them
# with data["job"].unique(); overwriting would mislabel the boxes whenever the
# plotted category order differs from first-appearance order.
for tick_label in g1.get_xticklabels():
    tick_label.set_rotation(90)
    tick_label.set_rotation_mode("anchor")
g1.set_title("Type of Work by Term Suscriptions")

# Target the bottom axes explicitly rather than relying on it being the current axes.
g2 = sns.violinplot(data=data, x="education", y="balance", hue="deposit",
                    palette="RdBu_r", ax=ax3)
g2.set_title("Distribution of Balance by Education");

In [None]:
# Admin and management are treated as the same role here, so fold "admin."
# into "management".
# NOTE: this rewrites `data` in place, so every later cell sees the merged category.
data.loc[data["job"] == "admin.", "job"] = "management"

**let's see how 'deposit' column value varies depending on other categorical columns' values:**

In [None]:
# Bar chart of 'duration' per education level, split by deposit outcome.
sns.catplot(
    data=data,
    kind="bar",
    x="education",
    y="duration",
    hue="deposit",
);


In [None]:
# Bar chart of 'duration' per marital status, split by deposit outcome.
sns.catplot(
    data=data,
    kind="bar",
    x="marital",
    y="duration",
    hue="deposit",
);

In [None]:
# Bar chart of 'balance' per marital status, split by deposit outcome.
sns.catplot(
    data=data,
    kind="bar",
    x="marital",
    y="balance",
    hue="deposit",
);

In [None]:
# Bar chart of 'balance' per education level, split by deposit outcome.
sns.catplot(
    data=data,
    kind="bar",
    x="education",
    y="balance",
    hue="deposit",
);

In [None]:
# Job counts split by deposit outcome, drawn as a grouped bar chart.
subscribed = data['deposit'] == 'yes'
declined = data['deposit'] == 'no'

df = pd.DataFrame()
df['yes'] = data.loc[subscribed, 'job'].value_counts()
df['no'] = data.loc[declined, 'job'].value_counts()

df.plot.bar(title='Job and deposit')

In [None]:
# Class balance of the target column.
# From seaborn 0.12 the only valid positional argument is `data`, so the column
# is named explicitly with x= and the frame passed with data=.
sns.countplot(x='deposit', data=data);

In [None]:
# Descriptive statistics of 'balance', computed separately for each deposit outcome.
balance_data = pd.DataFrame()
for outcome in ('yes', 'no'):
    balance_data[f'balance_{outcome}'] = data.loc[data['deposit'] == outcome, 'balance'].describe()

balance_data

In [None]:
# Drop the count and quartile rows, then plot the remaining statistics.
balance_data.drop(index=['count', '25%', '50%', '75%']).plot.bar(title='Balance and deposit statistics')

In [None]:
# Descriptive statistics of 'age', computed separately for each deposit outcome.
age_data = pd.DataFrame()
for outcome in ('yes', 'no'):
    age_data[f'age_{outcome}'] = data.loc[data['deposit'] == outcome, 'age'].describe()

age_data

In [None]:
# Drop the count and quartile rows, then plot the remaining statistics.
age_data.drop(index=['count', '25%', '50%', '75%']).plot.bar(title='Age and deposit statistics')

In [None]:
# Descriptive statistics of 'campaign' (number of contacts performed during this
# campaign), computed separately for each deposit outcome.
contact_data = pd.DataFrame()
for outcome in ('yes', 'no'):
    contact_data[f'campaign_{outcome}'] = data.loc[data['deposit'] == outcome, 'campaign'].describe()

contact_data

In [None]:
# Drop the count and quartile rows, then plot the remaining statistics.
contact_data.drop(index=['count', '25%', '50%', '75%']).plot.bar(
    title='Number of contacts performed during this campaign and deposit statistics'
)

In [None]:
# Descriptive statistics of 'previous' (number of contacts performed during the
# previous campaign), computed separately for each deposit outcome.
p_data = pd.DataFrame()
for outcome in ('yes', 'no'):
    p_data[f'previous_{outcome}'] = data.loc[data['deposit'] == outcome, 'previous'].describe()

p_data

In [None]:
# Drop the count and quartile rows, then plot the remaining statistics.
p_data.drop(index=['count', '25%', '50%', '75%']).plot.bar(
    title='Number of contacts performed during previous campaign and deposit statistics'
)

**So what story do the above plots tell us?**
* Customers with 'blue-collar' and 'services' jobs are less likely to subscribe for term deposit.
* Married customers are less likely to subscribe for term deposit.
* Customers with 'cellular' type of contact are less likely to subscribe for term deposit.
* People who subscribed for term deposit tend to have greater balance and age values.
* People who subscribed for term deposit tend to have had fewer contacts during this campaign.

**Let's prepare our dataset for applying machine learning algorithm**

First we need to convert the categorical columns to numeric ones, because we cannot feed columns with string values to our ML model. Here we use label encoding to convert the string values to numbers.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace every string-valued column with integer codes so the model can consume it.
# One fitted encoder is kept per column, so codes can later be mapped back to
# the original category names with label_encoders[column].inverse_transform(...).
# NOTE: this overwrites the columns of `data` in place; cells above that filter
# on string values (e.g. deposit == 'yes') must be run before this one.
categorical_columns = ['job', 'marital', 'education', 'default', 'housing',
                       'loan', 'contact', 'month', 'poutcome', 'deposit']
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Keep the original name bound to the last-fitted encoder (the one for 'deposit').
labelencoder = label_encoders['deposit']

Let's see our dataset once after encoding 

In [None]:
# Preview the first rows after encoding; the full frame is too large to display usefully.
data.head()

Splitting dataset into features and label

In [None]:
# List the column names to pick features and label from.
data.columns

In [None]:
# Columns used as model inputs.
features = [
    'age', 'job', 'marital', 'education',
    'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome',
]
# Target column, kept as a one-element list.
label = ['deposit']

In [None]:
# Separate the feature matrix from the target; both are DataFrames because
# `features` and `label` are lists of column names.
X = data.loc[:, features]
y = data.loc[:, label]

Splitting our dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


Scaling of the data

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training split only and
# then applied, unchanged, to the test split.
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Inspect the scaled training features (the output of sc.fit_transform).
X_train

In [None]:
# Inspect the scaled test features (the output of sc.transform).
X_test

**XGboost classifier algorithm**

In [None]:
#train XGBoost model
import xgboost
from sklearn.metrics import accuracy_score
# XGBoost classifier with explicitly chosen hyperparameters.
xgb = xgboost.XGBClassifier(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
# y_train is a one-column DataFrame; squeeze it to a 1-D array for fitting.
xgb.fit(X_train, y_train.squeeze().values)

# Predict on both splits (all features are used) so train and test accuracy can be compared.
y_train_preds = xgb.predict(X_train)
y_test_preds = xgb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_preds)
test_accuracy = accuracy_score(y_test, y_test_preds)
print('XGB accuracy score for train: %.3f: test: %.3f' % (train_accuracy, test_accuracy))

In [None]:
from sklearn.metrics import f1_score, roc_auc_score,accuracy_score,confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report 
# Per-class precision, recall and F1 on the test split.
# The text is stored under its own name: assigning it to `classification_report`
# would replace the imported function with a string, so re-running this cell
# would fail with "'str' object is not callable".
report = classification_report(y_test, y_test_preds)
print(report)

so we get an accuracy of **85%**. F1 scores show that the model is performing quite well. So it's good.

In [None]:
# Confusion matrix on the test split; in scikit-learn's layout rows are the
# true classes and columns are the predicted classes.
cm = confusion_matrix(y_test, y_test_preds)
cm

In [None]:
# Take the second probability column (the class encoded as 1) for each test
# row, then compute the ROC curve points from it.
predicted_probab_log = xgb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab_log)

**ROC-AUC curve**

In [None]:
from matplotlib import pyplot
# ROC curve with its area under the curve in the legend, plus the chance
# diagonal for reference. An empty label would leave legend() with nothing to show.
# `auc` is the sklearn.metrics function imported in the evaluation cell above.
roc_auc = auc(fpr, tpr)
pyplot.plot(fpr, tpr, marker='.', label='XGBoost (AUC = %.3f)' % roc_auc)
pyplot.plot([0, 1], [0, 1], linestyle='--', label='Chance')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

**Some recommendations for future marketing campaigns**
* The customer's account balance has a huge influence on the campaign's outcome. People with an account balance above $1,490 are more likely to subscribe for a term deposit, so future campaigns should target those customers.

* The customer's age affects the campaign outcome as well. Future campaigns should concentrate on customers in the age categories below 30 years old and above 50 years old.

* Number of contacts with the customer during the campaign is also very important. The number of contacts with the customer shouldn't exceed 4.