In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

try:
    # Current location of train_test_split.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, removed in scikit-learn 0.20.
    from sklearn.cross_validation import train_test_split

In [None]:
%matplotlib inline
rcParams['figure.figsize'] = 20, 8
sns.set_style('whitegrid')
plt.rc("font", size = 18)

In [None]:
# Read the bank marketing records; the path is relative to the working directory.
data = pd.read_csv('bank.csv')
# Preview the first rows to confirm the file parsed as expected.
data.head()

In [None]:
# (rows, columns) on the first line, column labels on the second.
print(data.shape, data.columns, sep='\n')

### Input variables

- age (numeric)
- job : type of job (categorical: “admin”, “blue-collar”, “entrepreneur”, “housemaid”, “management”, “retired”, “self-employed”, “services”, “student”, “technician”, “unemployed”, “unknown”)
- marital : marital status (categorical: “divorced”, “married”, “single”, “unknown”)
- education (categorical: “basic.4y”, “basic.6y”, “basic.9y”, “high.school”, “illiterate”, “professional.course”, “university.degree”, “unknown”)
- default: has credit in default? (categorical: “no”, “yes”, “unknown”)
- housing: has housing loan? (categorical: “no”, “yes”, “unknown”)
- loan: has personal loan? (categorical: “no”, “yes”, “unknown”)
- contact: contact communication type (categorical: “cellular”, “telephone”)
- month: last contact month of year (categorical: “jan”, “feb”, “mar”, …, “nov”, “dec”)
- day_of_week: last contact day of the week (categorical: “mon”, “tue”, “wed”, “thu”, “fri”)
- duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y=’no’). The duration is not known before a call is performed, also, after the end of the call, y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model
- campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
- pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
- previous: number of contacts performed before this campaign and for this client (numeric)
- poutcome: outcome of the previous marketing campaign (categorical: “failure”, “nonexistent”, “success”)
- emp.var.rate: employment variation rate — (numeric)
- cons.price.idx: consumer price index — (numeric)
- cons.conf.idx: consumer confidence index — (numeric)
- euribor3m: euribor 3 month rate — (numeric)
- nr.employed: number of employees — (numeric)


### Predict variable (desired target):

- y: has the client subscribed to a term deposit? (binary: “1” means “Yes”, “0” means “No”)

In [None]:
# Distinct education levels before the basic.* levels are merged.
data.education.unique()

In [None]:
# Fold the three "basic.*" schooling levels into a single 'basic' category.
basic_levels = ['basic.4y', 'basic.6y', 'basic.9y']
data.loc[data['education'].isin(basic_levels), 'education'] = 'basic'

In [None]:
# Distinct education levels after merging; only one 'basic' level should remain.
data.education.unique()

## Data Exploration

In [None]:
# Number of rows in each class of the target.
data.y.value_counts()

In [None]:
# Bar chart of the class balance of the target.
sns.countplot(x = 'y', data = data, palette = 'hls')
# Save before showing: with the inline backend, show() renders and closes the
# figure, so a later savefig() would write an empty canvas.
plt.savefig('count_plot')
plt.show()

There are 36548 no’s and 4640 yes’s in the outcome variable.

Let’s get a sense of the numbers across the two classes.

In [None]:
# Per-class means of the numeric columns. numeric_only=True is needed because
# the frame still holds string columns, which recent pandas refuses to average.
data.groupby('y').mean(numeric_only=True)

#### Observations:

- The average age of customers who bought the term deposit is higher than that of the customers who didn’t.
- The pdays (days since the customer was last contacted) is understandably lower for the customers who bought it. The lower the pdays, the better the memory of the last call and hence the better chances of a sale.
- Surprisingly, campaigns (number of contacts or calls made during the current campaign) are lower for customers who bought the term deposit.

We can calculate categorical means for other categorical variables such as education and marital status to get a more detailed sense of our data.

In [None]:
# Means of the numeric columns per job; string columns are skipped explicitly
# so the call does not raise on recent pandas.
data.groupby('job').mean(numeric_only=True)

In [None]:
# Means of the numeric columns per marital status; string columns are skipped
# explicitly so the call does not raise on recent pandas.
data.groupby('marital').mean(numeric_only=True)

In [None]:
# Means of the numeric columns per education level; string columns are skipped
# explicitly so the call does not raise on recent pandas.
data.groupby('education').mean(numeric_only=True)

## Visualizations

In [None]:
%matplotlib inline

pd.crosstab(data.job, data.y).plot(kind='bar')

plt.title('Purchase Frequency for Job Title')
plt.xlabel('Job')
plt.ylabel('Frequency of Purchase')

plt.savefig('purchase_fre_job')

The frequency of purchase of the deposit depends a great deal on the job title. Thus, the job title can be a good predictor of the outcome variable.

In [None]:
# Share of each outcome (y) within every marital status. Rows are normalised
# and stacked so the chart matches its "Stacked" title and "Proportion" axis;
# the original drew unstacked raw counts.
pd.crosstab(data.marital, data.y, normalize='index').plot(kind='bar', stacked=True)

plt.title('Stacked Bar Chart of Marital Status vs Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Customers')
plt.savefig('mariral_vs_pur_stack')

In [None]:
# Share of each outcome (y) within every education level, as stacked bars.
# Labels and output file are specific to education; the original reused the
# marital-status labels and overwrote the marital-status image.
pd.crosstab(data.education, data.y, normalize='index').plot(kind='bar', stacked=True)

plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')

In [None]:
# Counts of each outcome (y) per day of the week, as grouped bars.
# Labels and output file describe this chart; the original reused the
# marital-status labels and overwrote the marital-status image.
pd.crosstab(data.day_of_week, data.y).plot(kind='bar')

plt.title('Purchase Frequency for Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_dayofweek_bar')

In [None]:
# Counts of each outcome (y) per contact month, as grouped bars.
# Labels and output file describe this chart; the original reused the
# marital-status labels and overwrote the marital-status image.
pd.crosstab(data.month, data.y).plot(kind='bar')

plt.title('Purchase Frequency for Month')
plt.xlabel('Month')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_fre_month_bar')

In [None]:
# Distribution of customer ages.
age_ax = data['age'].hist()
age_ax.set_title('Histogram of Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
age_ax.get_figure().savefig('hist_age')

Most of the customers of the bank in this dataset are in the age range of 30–40.

In [None]:
# Counts of each outcome (y) per result of the previous campaign.
poutcome_ax = pd.crosstab(data['poutcome'], data['y']).plot(kind='bar')
poutcome_ax.set_title('Purchase Frequency for Poutcome')
poutcome_ax.set_xlabel('Poutcome')
poutcome_ax.set_ylabel('Frequency of Purchase')
poutcome_ax.get_figure().savefig('pur_fre_pout_bar')

## Create dummy variables

In [None]:
# Count missing values per column before encoding the categorical features.
data.isnull().sum()

In [None]:
# Indicator columns for job; the first level is dropped and acts as the baseline.
job = pd.get_dummies(data['job'], prefix='job', drop_first=True)
job.head(n=5)

In [None]:
# Indicator columns for marital status; the first level is dropped as the baseline.
marital = pd.get_dummies(data['marital'], prefix='marital', drop_first=True)
marital.head(n=5)

In [None]:
# Indicator columns for education; the first level is dropped as the baseline.
education = pd.get_dummies(data['education'], prefix='education', drop_first=True)
education.head(n=5)

In [None]:
# Indicator columns for credit default; the first level is dropped as the baseline.
default = pd.get_dummies(data['default'], prefix='default', drop_first=True)
default.head(n=5)

In [None]:
# Indicator columns for housing loan; the first level is dropped as the baseline.
housing = pd.get_dummies(data['housing'], prefix='housing', drop_first=True)
housing.head(n=5)

In [None]:
# Indicator columns for personal loan; the first level is dropped as the baseline.
loan = pd.get_dummies(data['loan'], prefix='loan', drop_first=True)
loan.head(n=5)

In [None]:
# Indicator columns for contact type; the first level is dropped as the baseline.
contact = pd.get_dummies(data['contact'], prefix='contact', drop_first=True)
contact.head(n=5)

In [None]:
# Indicator columns for contact month; the first level is dropped as the baseline.
month = pd.get_dummies(data['month'], prefix='month', drop_first=True)
month.head(n=5)

In [None]:
# Indicator columns for contact weekday; the first level is dropped as the baseline.
day_of_week = pd.get_dummies(data['day_of_week'], prefix='day_of_week', drop_first=True)
day_of_week.head(n=5)

In [None]:
# Indicator columns for the previous campaign outcome; the first level is dropped as the baseline.
poutcome = pd.get_dummies(data['poutcome'], prefix='poutcome', drop_first=True)
poutcome.head(n=5)

In [None]:
# Remove the raw categorical columns now that each has a dummy-encoded frame.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe
# to re-run; the in-place version raised KeyError on a second execution.
encoded_columns = ['poutcome', 'day_of_week', 'month', 'contact', 'loan',
                   'housing', 'default', 'education', 'marital', 'job']
data = data.drop(encoded_columns, axis=1, errors='ignore')
data.head()

In [None]:
# Place the dummy-encoded frames side by side, followed by the remaining
# columns of `data`.
data_final = pd.concat(
    [
        poutcome, day_of_week, month, contact, loan,
        housing, default, education, marital, job,
        data,
    ],
    axis='columns',
)
data_final.head(n=5)

In [None]:
# List the columns of the assembled modelling frame.
data_final.columns

## Checking for independence between features

In [None]:
# Pairwise correlations between all columns of the modelling frame.
sns.heatmap(data=data_final.corr())

In [None]:
# Select the target by name. The original took column 0 as y, which is the
# first poutcome dummy (also present in X), so the model was predicting one of
# its own inputs.
# NOTE(review): 'duration' is still among the features; the variable notes say
# to drop it when a realistic predictive model is intended.
X = data_final.drop('y', axis=1).values
y = data_final['y'].values

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=25
)

## Deploying and evaluating the model

In [None]:
# Fit a logistic regression with default settings; fit() returns the estimator itself.
LogReg = LogisticRegression().fit(X_train, y_train)
LogReg

In [None]:
# Predicted class labels for the held-out rows.
y_pred = LogReg.predict(X_test)

In [None]:
# Rows are the true classes, columns the predicted classes.
# The result gets its own name so the imported confusion_matrix function is
# not overwritten by the array it returns.
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

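In the confusion matrix, the diagonal entries are the numbers of correct predictions and the off-diagonal entries are the numbers of incorrect predictions. In the recorded run, 1100 and 7138 predictions were correct, and 0 and 0 were incorrect. A result with no errors at all is suspicious: it usually means the target has leaked into the features, so check how `X` and `y` are constructed before trusting it.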

## Training and Testing accuracy

In [None]:
# Mean accuracy on the training rows and on the held-out rows.
train_accuracy = LogReg.score(X_train, y_train)
test_accuracy = LogReg.score(X_test, y_test)
print('Training score {}'.format(train_accuracy))
print('Testing score {}'.format(test_accuracy))