Installing the CatBoost

In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 77kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


Importing the libraries

In [2]:
import numpy as np
import pandas as pd

Importing the dataset

In [3]:
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Bank Marketing/bank-additional-full.csv", sep=';')

In [4]:
dataset.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


Input variables:
# bank client data:
1 - age (numeric)

2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')

3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)

4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')

5 - default: has credit in default? (categorical: 'no','yes','unknown')

6 - housing: has housing loan? (categorical: 'no','yes','unknown')

7 - loan: has personal loan? (categorical: 'no','yes','unknown')

# related with the last contact of the current campaign:

8 - contact: contact communication type (categorical: 'cellular','telephone')

9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')

10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')

11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

# other attributes:

12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)

14 - previous: number of contacts performed before this campaign and for this client (numeric)

15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

# social and economic context attributes

16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)

17 - cons.price.idx: consumer price index - monthly indicator (numeric)

18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)

19 - euribor3m: euribor 3 month rate - daily indicator (numeric)

20 - nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):

21 - y: has the client subscribed a term deposit? (binary: 'yes','no')

In [5]:
dataset.drop(['duration','contact','month','day_of_week','default','pdays',],axis=1,inplace=True)

In [6]:
dataset.head()

Unnamed: 0,age,job,marital,education,housing,loan,campaign,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,no,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,yes,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,yes,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [7]:
dataset.shape

(41188, 15)

In [8]:
dataset.isnull().sum()

age               0
job               0
marital           0
education         0
housing           0
loan              0
campaign          0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [9]:
dataset.replace(['basic.6y','basic.4y', 'basic.9y'], 'basic', inplace=True)

In [10]:
dataset.head()

Unnamed: 0,age,job,marital,education,housing,loan,campaign,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic,no,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,no,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,yes,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic,no,no,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,yes,1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [11]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

Data preprocessing

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [13]:
y

array([0, 0, 0, ..., 0, 1, 0])

In [14]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 2, 3, 4, 5, 8])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [15]:
X

array([[0.0, 0.0, 0.0, ..., -36.4, 4.857, 5191.0],
       [0.0, 0.0, 0.0, ..., -36.4, 4.857, 5191.0],
       [0.0, 0.0, 0.0, ..., -36.4, 4.857, 5191.0],
       ...,
       [0.0, 0.0, 0.0, ..., -50.8, 1.028, 4963.6],
       [0.0, 0.0, 0.0, ..., -50.8, 1.028, 4963.6],
       [0.0, 0.0, 0.0, ..., -50.8, 1.028, 4963.6]], dtype=object)

Splitting the dataset into training and testing set

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

Feature Scalling

In [17]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Training the CatBoost model on the training set 

In [18]:
from catboost import CatBoostClassifier
classifier = CatBoostClassifier()
classifier.fit(X_train, y_train)

Learning rate set to 0.04582
0:	learn: 0.6464934	total: 66.7ms	remaining: 1m 6s
1:	learn: 0.6046788	total: 84.7ms	remaining: 42.3s
2:	learn: 0.5677298	total: 101ms	remaining: 33.6s
3:	learn: 0.5351034	total: 117ms	remaining: 29.2s
4:	learn: 0.5064314	total: 133ms	remaining: 26.5s
5:	learn: 0.4814414	total: 150ms	remaining: 24.8s
6:	learn: 0.4592051	total: 165ms	remaining: 23.5s
7:	learn: 0.4405248	total: 181ms	remaining: 22.5s
8:	learn: 0.4237037	total: 200ms	remaining: 22s
9:	learn: 0.4083313	total: 216ms	remaining: 21.4s
10:	learn: 0.3947682	total: 232ms	remaining: 20.9s
11:	learn: 0.3826443	total: 248ms	remaining: 20.4s
12:	learn: 0.3719561	total: 264ms	remaining: 20.1s
13:	learn: 0.3621270	total: 282ms	remaining: 19.9s
14:	learn: 0.3539639	total: 298ms	remaining: 19.6s
15:	learn: 0.3465489	total: 316ms	remaining: 19.4s
16:	learn: 0.3401295	total: 332ms	remaining: 19.2s
17:	learn: 0.3337446	total: 349ms	remaining: 19s
18:	learn: 0.3284595	total: 365ms	remaining: 18.8s
19:	learn: 0.3

<catboost.core.CatBoostClassifier at 0x7fba97335198>

Making Confusing Matrix

In [19]:
from sklearn.metrics import confusion_matrix
y_pred =  classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[7179  140]
 [ 692  227]]


Accuracy

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8990046127700898

Applying K fold cross validation

In [21]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
6:	learn: 0.4654359	total: 110ms	remaining: 15.7s
7:	learn: 0.4467855	total: 125ms	remaining: 15.5s
8:	learn: 0.4299640	total: 140ms	remaining: 15.5s
9:	learn: 0.4160991	total: 155ms	remaining: 15.3s
10:	learn: 0.4022223	total: 170ms	remaining: 15.3s
11:	learn: 0.3900686	total: 185ms	remaining: 15.3s
12:	learn: 0.3789634	total: 213ms	remaining: 16.2s
13:	learn: 0.3687257	total: 230ms	remaining: 16.2s
14:	learn: 0.3602215	total: 246ms	remaining: 16.1s
15:	learn: 0.3524493	total: 263ms	remaining: 16.2s
16:	learn: 0.3457538	total: 278ms	remaining: 16.1s
17:	learn: 0.3389547	total: 293ms	remaining: 16s
18:	learn: 0.3333462	total: 309ms	remaining: 15.9s
19:	learn: 0.3282636	total: 325ms	remaining: 15.9s
20:	learn: 0.3232668	total: 340ms	remaining: 15.8s
21:	learn: 0.3189457	total: 353ms	remaining: 15.7s
22:	learn: 0.3154018	total: 369ms	remaining: 15.7s
23:	learn: 0.3124301	total: 384ms	remaining: 15.6s
24:	learn: 0.3093303	to

Predicting the test set result

In [22]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 1]
 [0 0]
 [0 0]
 ...
 [0 1]
 [0 0]
 [0 0]]


In [23]:
# print(classifier.predict(sc.transform([[1, 152, 79, 36, 0, 33.7, 0.783, 36]])))