In [2]:
# Jupyter Notebook with Matplotlib Inline
%matplotlib notebook

# Importing necessary modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split

In [4]:
# Reading the csv file.
df = pd.read_csv("bank-additional-full.csv", delimiter=';')

In [5]:
# Columns in the data
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [6]:
# Renaming columns
df.columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'personal', 'contact', 'month', 'day_of_week', 
              'duration', 'campaign', 'pdays', 'pcontacts', 'poutcome', 'evr', 'cpi', 'cci', 'euribor', 'employees', 'y']

In [7]:
# Top 5 rows of the data 
df.head()

Unnamed: 0,age,job,marital,education,default,housing,personal,contact,month,day_of_week,...,campaign,pdays,pcontacts,poutcome,evr,cpi,cci,euribor,employees,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
# New columns
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'personal',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'pcontacts', 'poutcome', 'evr', 'cpi', 'cci', 'euribor', 'employees',
       'y'],
      dtype='object')

In [9]:
# Dividing the data into train and test sets
train, test = train_test_split(df, test_size=0.2) 

In [10]:
# Shape of the training dataset
train.shape

(32950, 21)

In [3]:
# Exporting the files
out_train = 'train.csv'
train.to_csv(out_train)
out_test = 'test.csv'
test.to_csv(out_test)

In [11]:
# Reading newly created test and training data as train_sample and test_sample variables
train_sample = pd.read_csv("train.csv", delimiter=',') 
test_sample = pd.read_csv("test.csv", delimiter=',')

In [12]:
#New data set contains 22 columns as opposed to 21
train_sample.shape

(32950, 21)

In [297]:
#unnamed:0 is the extra column
train_sample.columns

Index(['Unnamed: 0', 'age', 'job', 'marital', 'education', 'default',
       'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration',
       'campaign', 'pdays', 'pcontacts', 'poutcome', 'evr', 'cpi', 'cci',
       'euribor', 'employees', 'y'],
      dtype='object')

In [298]:
train_sample.head()

Unnamed: 0.1,Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,pcontacts,poutcome,evr,cpi,cci,euribor,employees,y
0,29169,41,admin.,single,professional.course,no,yes,yes,cellular,apr,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
1,16663,55,management,married,basic.9y,no,yes,no,cellular,jul,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no
2,28623,55,admin.,married,high.school,no,yes,no,cellular,apr,...,3,999,1,failure,-1.8,93.075,-47.1,1.415,5099.1,no
3,26979,36,admin.,divorced,university.degree,no,no,yes,cellular,nov,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8,no
4,543,32,entrepreneur,single,university.degree,no,yes,no,telephone,may,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [299]:
#Dropping extra column inplace for test_sample
train_sample.drop('Unnamed: 0', axis = 1, inplace = True)

In [None]:
# train = pd.read_csv("train.csv", delimiter=',', index_col = 0).reset_index().drop('index',axis=1)
# test = pd.read_csv("test.csv", delimiter=',', index_col = 0).reset_index().drop('index',axis=1)

In [300]:
train_sample.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,pcontacts,poutcome,evr,cpi,cci,euribor,employees,y
0,41,admin.,single,professional.course,no,yes,yes,cellular,apr,fri,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
1,55,management,married,basic.9y,no,yes,no,cellular,jul,wed,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no
2,55,admin.,married,high.school,no,yes,no,cellular,apr,wed,...,3,999,1,failure,-1.8,93.075,-47.1,1.415,5099.1,no
3,36,admin.,divorced,university.degree,no,no,yes,cellular,nov,thu,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8,no
4,32,entrepreneur,single,university.degree,no,yes,no,telephone,may,tue,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [301]:
#Dropping extra column inplace for train_sample 
test_sample.drop('Unnamed: 0', axis = 1, inplace = True)

In [302]:
train_sample.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,pcontacts,poutcome,evr,cpi,cci,euribor,employees,y
0,41,admin.,single,professional.course,no,yes,yes,cellular,apr,fri,...,1,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,no
1,55,management,married,basic.9y,no,yes,no,cellular,jul,wed,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no
2,55,admin.,married,high.school,no,yes,no,cellular,apr,wed,...,3,999,1,failure,-1.8,93.075,-47.1,1.415,5099.1,no
3,36,admin.,divorced,university.degree,no,no,yes,cellular,nov,thu,...,2,999,0,nonexistent,-0.1,93.2,-42.0,4.076,5195.8,no
4,32,entrepreneur,single,university.degree,no,yes,no,telephone,may,tue,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [303]:
#Export the newly cleaned datasets 
out_train = 'train.csv'
train_sample.to_csv(out_train)
out_test = 'test.csv'
test_sample.to_csv(out_test)

In [306]:
#................Data split into Train and Test. Dropped unnecessary columns and load the file..................

In [13]:
# Read the cleaned datasets
train = pd.read_csv("train.csv", delimiter=',') 
test = pd.read_csv("test.csv", delimiter=',')

In [14]:
# Shape of the Training dataset 
train.shape

(32950, 21)

In [15]:
# Shape of test dataset
test.shape

(8238, 21)

In [32]:
# Change the column names
train.columns = ['age', 'job', 'marital', 'education', 'default', 'housing', 'personal',
       'contact_type', 'month', 'day', 'duration', 'dcontacts', 'pdays',
       'pcontacts', 'poutcome', 'evr', 'cpi', 'cci', 'euribor', 'employees',
       'y']

In [33]:
# Counts of each outcome
train.groupby('y')['y'].count()

y
no     29208
yes     3742
Name: y, dtype: int64

In [34]:
train['y'].value_counts()

no     29208
yes     3742
Name: y, dtype: int64

In [112]:
# EDA with respect to Categorical Variables.

In [113]:
# 'job' variable

In [74]:
pd.crosstab(train.job, train.y)

y,no,yes
job,Unnamed: 1_level_1,Unnamed: 2_level_1
admin.,7290,1066
blue-collar,6846,506
entrepreneur,1095,102
housemaid,741,89
management,2073,267
retired,1027,355
self-employed,1023,125
services,2902,259
student,492,232
technician,4784,590


In [75]:
# Countplot for job vs 'y'
sns.countplot(y="job", hue="y", data=train, palette="RdBu")
# palette = (None, "muted", "RdBu", "Blues_d", "husl" )
# https://seaborn.pydata.org/generated/seaborn.color_palette.html#seaborn.color_palette

<IPython.core.display.Javascript object>

In [None]:
# Since the number of entries for each category in a categorical variable is quite varying, percentage of each category would
# give a better look at the data (Hence we normalize the index in 'crosstab')
# The proportions shown in the output are row proportions

In [36]:
# Normalizing index because we are analyzing @ "category-level"
#1-0 Encoding Creating dummy varz (https://www.youtube.com/watch?v=0s_1IsROgDc)
pd.crosstab(train.job, train.y, normalize='index') 

y,no,yes
job,Unnamed: 1_level_1,Unnamed: 2_level_1
admin.,0.872427,0.127573
blue-collar,0.931175,0.068825
entrepreneur,0.914787,0.085213
housemaid,0.892771,0.107229
management,0.885897,0.114103
retired,0.743126,0.256874
self-employed,0.891115,0.108885
services,0.918064,0.081936
student,0.679558,0.320442
technician,0.890212,0.109788


In [37]:
pd.crosstab(train.job, train.y, normalize='columns') 

y,no,yes
job,Unnamed: 1_level_1,Unnamed: 2_level_1
admin.,0.249589,0.284874
blue-collar,0.234388,0.135222
entrepreneur,0.03749,0.027258
housemaid,0.02537,0.023784
management,0.070974,0.071352
retired,0.035162,0.094869
self-employed,0.035025,0.033405
services,0.099356,0.069214
student,0.016845,0.061999
technician,0.163791,0.15767


In [76]:
# 'Marital' variable 

In [79]:
pd.crosstab(train.marital, train.y)

y,no,yes
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,3262,390
married,17940,2044
single,7954,1298
unknown,52,10


In [78]:
sns.countplot(y="marital", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10111eb8>

In [80]:
pd.crosstab(train.marital, train.y, normalize='index')

y,no,yes
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,0.893209,0.106791
married,0.897718,0.102282
single,0.859706,0.140294
unknown,0.83871,0.16129


In [114]:
pd.crosstab(train.marital, train.y, normalize='columns')

y,no,yes
marital,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,0.111682,0.104222
married,0.614215,0.546232
single,0.272323,0.346873
unknown,0.00178,0.002672


In [None]:
# 'Education' variable

In [115]:
pd.crosstab(train.education, train.y)

y,no,yes
education,Unnamed: 1_level_1,Unnamed: 2_level_1
basic.4y,2974,356
basic.6y,1672,143
basic.9y,4474,372
high.school,6780,855
illiterate,9,3
professional.course,3697,497
university.degree,8396,1320
unknown,1206,196


In [116]:
sns.countplot(y="education", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10110c18>

In [117]:
pd.crosstab(train.education, train.y, normalize='index') 

y,no,yes
education,Unnamed: 1_level_1,Unnamed: 2_level_1
basic.4y,0.893093,0.106907
basic.6y,0.921212,0.078788
basic.9y,0.923236,0.076764
high.school,0.888016,0.111984
illiterate,0.75,0.25
professional.course,0.881497,0.118503
university.degree,0.864142,0.135858
unknown,0.8602,0.1398


In [118]:
pd.crosstab(train.education, train.y, normalize='columns')

y,no,yes
education,Unnamed: 1_level_1,Unnamed: 2_level_1
basic.4y,0.101821,0.095136
basic.6y,0.057245,0.038215
basic.9y,0.153177,0.099412
high.school,0.232128,0.228487
illiterate,0.000308,0.000802
professional.course,0.126575,0.132817
university.degree,0.287455,0.352753
unknown,0.04129,0.052378


In [120]:
# default

In [121]:
pd.crosstab(train.default, train.y)

y,no,yes
default,Unnamed: 1_level_1,Unnamed: 2_level_1
no,22666,3381
unknown,6539,361
yes,3,0


In [122]:
sns.countplot(y="default", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xe606630>

In [123]:
pd.crosstab(train.default, train.y, normalize='index') 

y,no,yes
default,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.870196,0.129804
unknown,0.947681,0.052319
yes,1.0,0.0


In [124]:
pd.crosstab(train.default, train.y, normalize='columns') 

y,no,yes
default,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.77602,0.903528
unknown,0.223877,0.096472
yes,0.000103,0.0


In [None]:
# 'Housing' variable

In [82]:
pd.crosstab(train.housing, train.y)

y,no,yes
housing,Unnamed: 1_level_1,Unnamed: 2_level_1
no,13254,1640
unknown,704,81
yes,15250,2021


In [83]:
sns.countplot(y="housing", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1053e668>

In [84]:
pd.crosstab(train.housing, train.y, normalize='index') 

y,no,yes
housing,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.889889,0.110111
unknown,0.896815,0.103185
yes,0.882983,0.117017


In [85]:
pd.crosstab(train.housing, train.y, normalize='columns') 

y,no,yes
housing,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.45378,0.438268
unknown,0.024103,0.021646
yes,0.522117,0.540086


In [86]:
# Personal

In [18]:
pd.crosstab(train.personal, train.y)

y,no,yes
personal,Unnamed: 1_level_1,Unnamed: 2_level_1
no,24044,3110
unknown,704,81
yes,4460,551


In [87]:
sns.countplot(y="personal", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10549b38>

In [88]:
pd.crosstab(train.personal, train.y, normalize='index') 

y,no,yes
personal,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.885468,0.114532
unknown,0.896815,0.103185
yes,0.890042,0.109958


In [89]:
pd.crosstab(train.personal, train.y, normalize='columns') 

y,no,yes
personal,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.823199,0.831106
unknown,0.024103,0.021646
yes,0.152698,0.147247


In [126]:
train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'personal',
       'contact_type', 'month', 'day', 'duration', 'dcontacts', 'pdays',
       'pcontacts', 'poutcome', 'evr', 'cpi', 'cci', 'euribor', 'employees',
       'y'],
      dtype='object')

In [90]:
# 'Contact_type' variable

In [91]:
pd.crosstab(train.contact_type, train.y)

y,no,yes
contact_type,Unnamed: 1_level_1,Unnamed: 2_level_1
cellular,17784,3115
telephone,11424,627


In [92]:
sns.countplot(y="contact_type", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xf460d30>

In [93]:
pd.crosstab(train.contact_type, train.y, normalize='index') 

y,no,yes
contact_type,Unnamed: 1_level_1,Unnamed: 2_level_1
cellular,0.85095,0.14905
telephone,0.947971,0.052029


In [94]:
pd.crosstab(train.contact_type, train.y, normalize='columns') 

y,no,yes
contact_type,Unnamed: 1_level_1,Unnamed: 2_level_1
cellular,0.608874,0.832443
telephone,0.391126,0.167557


In [95]:
# 'Month' variable

In [96]:
pd.crosstab(train.month, train.y)

y,no,yes
month,Unnamed: 1_level_1,Unnamed: 2_level_1
apr,1681,428
aug,4433,530
dec,75,69
jul,5191,532
jun,3812,458
mar,211,218
may,10302,725
nov,2930,319
oct,329,253
sep,244,210


In [97]:
sns.countplot(y="month", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xde6dcc0>

In [98]:
pd.crosstab(train.month, train.y, normalize='index') 

y,no,yes
month,Unnamed: 1_level_1,Unnamed: 2_level_1
apr,0.79706,0.20294
aug,0.89321,0.10679
dec,0.520833,0.479167
jul,0.907042,0.092958
jun,0.89274,0.10726
mar,0.491841,0.508159
may,0.934252,0.065748
nov,0.901816,0.098184
oct,0.565292,0.434708
sep,0.537445,0.462555


In [100]:
pd.crosstab(train.month, train.y, normalize='columns') 

y,no,yes
month,Unnamed: 1_level_1,Unnamed: 2_level_1
apr,0.057553,0.114377
aug,0.151773,0.141635
dec,0.002568,0.018439
jul,0.177725,0.14217
jun,0.130512,0.122394
mar,0.007224,0.058258
may,0.352712,0.193747
nov,0.100315,0.085249
oct,0.011264,0.067611
sep,0.008354,0.05612


In [101]:
# 'Day' variable

In [21]:
pd.crosstab(train.day, train.y)

y,no,yes
day,Unnamed: 1_level_1,Unnamed: 2_level_1
fri,5549,690
mon,6117,686
thu,6028,853
tue,5753,763
wed,5761,750


In [102]:
sns.countplot(y="day", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0xe305198>

In [103]:
pd.crosstab(train.day, train.y, normalize='index') 

y,no,yes
day,Unnamed: 1_level_1,Unnamed: 2_level_1
fri,0.889405,0.110595
mon,0.899162,0.100838
thu,0.876035,0.123965
tue,0.882904,0.117096
wed,0.88481,0.11519


In [105]:
pd.crosstab(train.day, train.y, normalize='columns') 

y,no,yes
day,Unnamed: 1_level_1,Unnamed: 2_level_1
fri,0.189982,0.184393
mon,0.209429,0.183324
thu,0.206382,0.227953
tue,0.196967,0.203902
wed,0.19724,0.200428


In [106]:
# 'Poutcome' variable

In [107]:
pd.crosstab(train.poutcome, train.y)

y,no,yes
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1
failure,2902,483
nonexistent,25936,2546
success,370,713


In [108]:
sns.countplot(y="poutcome", hue="y", data=train, palette="RdBu")

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x10117588>

In [109]:
pd.crosstab(train.poutcome, train.y, normalize='index') 

y,no,yes
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1
failure,0.857312,0.142688
nonexistent,0.91061,0.08939
success,0.341644,0.658356


In [110]:
pd.crosstab(train.poutcome, train.y, normalize='columns') 

y,no,yes
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1
failure,0.099356,0.129075
nonexistent,0.887976,0.680385
success,0.012668,0.19054


In [193]:
# varz = [age, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed]

In [206]:
# varz = train[['age','duration','campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']]
# #           [, duration, campaign, pdays, previous, emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed]
# type(varz)

In [212]:
#Summary statistics of Quantitative variables
train.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0,32950.0
mean,40.004917,258.300698,2.569833,962.262671,0.174476,0.079411,93.575734,-40.514847,3.617256,5166.856995
std,10.446447,258.261566,2.756195,187.437908,0.498745,1.571101,0.578937,4.625622,1.73592,72.36693
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,320.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4199.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [223]:
#............................EDA with Numeric Variables............................

In [225]:
sns.boxplot(x="y", y = "age", data = train) 
#binning the age ...plot hist of age....bin according to job category.....bin hosuld of the same magnitude
#binning the age ...plot hist of age....bin hosuld of the same magnitude....bin according to job category

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x13df4ba8>

In [337]:
sns.boxplot(x="y", y="age", data=train)
sns.violinplot(x="y", y="age", data=train, size=6)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x22ce8c18>

In [331]:
# train.boxplot(by="y", figsize=(12, 6))

In [359]:
# ax = sns.boxplot(x="y", y="age", data=train)
# ax = sns.stripplot(x="y", y="age", data=train, jitter=True, edgecolor="gray")
categorical = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

In [338]:
sns.boxplot(x="y", y = "duration", data = train)
sns.violinplot(x="y", y="duration", data=train, size=6) 
# include upper bound, also try bins  

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x22d6ebe0>

In [339]:
sns.boxplot(x="y", y = "campaign", data = train)
sns.violinplot(x="y", y="campaign", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x231c0ac8>

In [340]:
sns.boxplot(x="y", y = "pdays", data = train)
sns.violinplot(x="y", y="pdays", data=train, size=6) # replace the ouliers by lower and upper bounds

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x236042b0>

In [343]:
sns.boxplot(x="y", y = "pcontacts", data = train)
sns.violinplot(x="y", y="pcontacts", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x23a64ba8>

In [342]:
train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'pcontacts', 'poutcome', 'evr', 'cpi', 'cci', 'euribor', 'employees',
       'y'],
      dtype='object')

In [346]:
sns.boxplot(x="y", y = "evr", data = train)
sns.violinplot(x="y", y="evr", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x22945400>

In [347]:
sns.boxplot(x="y", y = "cpi", data = train)
sns.violinplot(x="y", y="cpi", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2298e400>

In [348]:
sns.boxplot(x="y", y = "cci", data = train)
sns.violinplot(x="y", y="cci", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x22aa4b00>

In [349]:
sns.boxplot(x="y", y = "euribor", data = train)
sns.violinplot(x="y", y="euribor", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x24318748>

In [350]:
sns.boxplot(x="y", y = "employees", data = train)
sns.violinplot(x="y", y="employees", data=train, size=6) 

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x24ceb9e8>

In [354]:
sns.pairplot(train, hue = "y", size = 1);

<IPython.core.display.Javascript object>

In [357]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "age").add_legend();

<IPython.core.display.Javascript object>

In [367]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'pcontacts', 'poutcome', 'evr', 'cpi', 'cci', 'euribor', 'employees',
       'y'],
      dtype='object')

In [361]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "duration").add_legend();

<IPython.core.display.Javascript object>

In [365]:
sns.FacetGrid(train, hue = "y", size = 6).map(sns.distplot, "campaign").add_legend();

<IPython.core.display.Javascript object>

In [364]:
sns.FacetGrid(train, hue = "y", size = 6).map(sns.distplot, "pdays").add_legend();

<IPython.core.display.Javascript object>

In [369]:
sns.FacetGrid(train, hue = "y", size = 6).map(sns.distplot, "pcontacts").add_legend();

<IPython.core.display.Javascript object>

In [370]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "evr").add_legend();

<IPython.core.display.Javascript object>

In [371]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "cpi").add_legend();

<IPython.core.display.Javascript object>

In [372]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "cci").add_legend();

<IPython.core.display.Javascript object>

In [373]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "euribor").add_legend();

<IPython.core.display.Javascript object>

In [374]:
sns.FacetGrid(train, hue = "y", size = 5).map(sns.distplot, "employees").add_legend();

<IPython.core.display.Javascript object>

In [59]:
sns.boxplot(data=train, orient="h");
# We notice that there is a huge disparity among the variables which requires us to bring all the variables to a comman scale.

<IPython.core.display.Javascript object>