In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import KFold
import xgboost as xgb
from xgboost import XGBClassifier
import sklearn.metrics as metrics
import math
import matplotlib.pyplot as plt

In [2]:
import warnings; warnings.simplefilter('ignore')

First, let's read our dataset and check the attributes.

In [3]:
# Load the raw data from the working directory and preview the first five rows.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')
dataset.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


We have 13 input attributes plus the target column y.
Here we can see both numerical and nominal attributes. We can label-encode the nominal attributes.
We should also consider categorising (binning) the numerical attributes, because their raw values may not be very informative as they are now. And we need to check for 'unknown' values.

In [4]:
# Turn the literal 'unknown' placeholder into NaN so pandas treats it as missing.
NanDf = dataset.replace(to_replace='unknown', value=np.nan)

In [5]:
# Preview the first five rows after the 'unknown' -> NaN replacement.
NanDf.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,no
4,33,,single,,no,1,no,no,,5,may,198,1,no


In [6]:
# Share of missing values per column, as a percentage (one row per column of NanDf).
NAN = (
    NanDf.isna()
    .mean()
    .mul(100)
    .rename_axis("column_name")
    .reset_index(name="percentage")
)

In [7]:
# Keep only the columns where more than half of the values are missing, worst first.
NAN = NAN.loc[NAN["percentage"] > 50]
NAN.sort_values(by="percentage", ascending=False)

Unnamed: 0,column_name,percentage


No attribute has more than 50% unknown values, so we can use all of our attributes for model training.

In [8]:
# Split the frame by dtype: object (string) columns on one side, everything else on the other.
object_columns_df, numerical_columns_df = (
    NanDf.select_dtypes(include='object'),
    NanDf.select_dtypes(exclude='object'),
)

Categorical Features:

In [9]:
# Show the dtype of every column in the categorical (object) subset.
object_columns_df.dtypes

job          object
marital      object
education    object
default      object
housing      object
loan         object
contact      object
month        object
y            object
dtype: object

Numerical Features:

In [10]:
# Show the dtype of every column in the non-object (numerical) subset.
numerical_columns_df.dtypes


age         int64
balance     int64
day         int64
duration    int64
campaign    int64
dtype: object

We need to find the null values in the categorical features (the numerical columns are all int64, so they cannot contain NaN).

In [11]:
# Count the missing entries in every categorical column.
null_counts = object_columns_df.isna().sum()
print(f"Number of null values in each column:\n{null_counts}")

Number of null values in each column:
job            235
marital          0
education     1531
default          0
housing          0
loan             0
contact      12765
month            0
y                0
dtype: int64


We have 3 features (job, education and contact) that need to be filled. Since they have many missing values, I will fill them with the string 'None' so that "missing" becomes its own category.

In [12]:
# Give a missing job/education/contact its own 'None' category; other columns are left as they are.
columns_None = ['job', 'education', 'contact']
object_columns_df = object_columns_df.fillna(dict.fromkeys(columns_None, 'None'))

Let's recheck our null values.

In [13]:
# Re-count the missing entries in the categorical columns after the fill.
null_counts = object_columns_df.isna().sum()
print("Number of null values in each column:\n" + str(null_counts))

Number of null values in each column:
job          0
marital      0
education    0
default      0
housing      0
loan         0
contact      0
month        0
y            0
dtype: int64


Dealing with categorical features

In [14]:
# Per-column tally of missing values in the categorical frame
# (repeats the check from the previous cell before encoding starts).
null_counts = object_columns_df.isna().sum()
print(f"Number of null values in each column:\n{null_counts!s}")

Number of null values in each column:
job          0
marital      0
education    0
default      0
housing      0
loan         0
contact      0
month        0
y            0
dtype: int64


We don't have any null values left in the categorical features.

We will encode categorical data.

In [15]:
# List the distinct values of each categorical column so the encoding can cover all of them.
for col, values in object_columns_df.items():
    print('\n', col, ':')
    print(values.unique())


 job :
['management' 'technician' 'entrepreneur' 'blue-collar' 'None' 'retired'
 'admin' 'services' 'self-employed' 'unemployed' 'housemaid' 'student']

 marital :
['married' 'single' 'divorced']

 education :
['tertiary' 'secondary' 'None' 'primary']

 default :
['no' 'yes']

 housing :
['yes' 'no']

 loan :
['no' 'yes']

 contact :
['None' 'cellular' 'telephone']

 month :
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr']

 y :
['no' 'yes']


I will create a single mapping that covers every unique value.

In [16]:
# Shared lookup that turns every categorical string into an integer code.
# All columns use this one flat dict, so a key carries the same code in every
# column it appears in ('no'/'yes' for the binary columns and the target, 'None'
# for a filled-in missing job/education/contact).
bin_map = {
    # binary flags (default, housing, loan) and the target y
    'no': 0, 'yes': 1,
    # placeholder written by the fillna step for missing values
    'None': 0,
    # contact
    'cellular': 1, 'telephone': 2,
    # education
    'primary': 1, 'secondary': 2, 'tertiary': 3,
    # marital
    'single': 0, 'married': 1, 'divorced': 2,
    # job
    'management': 1, 'technician': 2, 'entrepreneur': 3, 'blue-collar': 4,
    'retired': 5, 'admin': 6, 'services': 7, 'self-employed': 8,
    'unemployed': 9, 'housemaid': 10, 'student': 11,
    # month, encoded as its calendar number; 'sep' is included so that a
    # September row maps to 9 instead of silently becoming NaN
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}

In [17]:
# Encode every categorical column with bin_map.
# Only columns that still hold strings (object dtype) are processed, so re-running
# this cell is a no-op instead of mapping the already-encoded integers to NaN.
# Series.map yields NaN for any value that is not a key of bin_map, so an
# unmapped category is reported immediately rather than leaking NaN downstream.
for col in object_columns_df.select_dtypes(include=['object']):
    encoded = object_columns_df[col].map(bin_map)
    missing = encoded.isna()
    if missing.any():
        unmapped = object_columns_df.loc[missing, col].unique().tolist()
        raise ValueError(f"bin_map has no code for {unmapped} in column '{col}'")
    object_columns_df[col] = encoded

In [18]:
# Preview the first five rows of the encoded categorical columns.
object_columns_df.head(n=5)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,y
0,1,1,3,0,1,0,0,5,0
1,2,0,2,0,1,0,0,5,0
2,3,1,2,0,1,1,0,5,0
3,4,1,0,0,1,0,0,5,0
4,0,0,0,0,0,0,0,5,0


Here I didn't categorize the numerical attributes such as age to young/middle aged/aged.
According to success ratio, I can change age, balance and duration features to be more useful.

In [19]:
# Put the encoded categorical columns and the numerical columns side by side
# (categorical first), then preview the first five rows.
df_final = pd.concat(
    objs=[object_columns_df, numerical_columns_df],
    axis='columns',
    sort=False,
)
df_final.head(n=5)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,y,age,balance,day,duration,campaign
0,1,1,3,0,1,0,0,5,0,58,2143,5,261,1
1,2,0,2,0,1,0,0,5,0,44,29,5,151,1
2,3,1,2,0,1,1,0,5,0,33,2,5,76,1
3,4,1,0,0,1,0,0,5,0,47,1506,5,92,1
4,0,0,0,0,0,0,0,5,0,33,1,5,198,1


As we can see, the age, balance and duration features are not in great shape (they could be improved), but for now I am not going to touch them. If we don't reach the wanted accuracy at the end, I will come back and make changes.

In [20]:
# Convert the combined frame to a plain NumPy array for positional slicing.
# NOTE(review): this rebinds `dataset`, which held the raw DataFrame until this cell.
dataset = df_final.to_numpy(copy=False)

In [21]:
# Target vector: the encoded 'y' column of the array built from df_final, located
# by its label rather than a hard-coded position so a column reorder cannot
# silently select the wrong column.
Y = dataset[:, df_final.columns.get_loc('y')]

In [22]:
# Feature matrix: every column of the array built from df_final except the target
# 'y', kept in the original column order. The split point comes from the column
# label instead of fixed slice bounds, so it follows the frame's layout.
target_idx = df_final.columns.get_loc('y')
X1 = dataset[:, :target_idx]       # columns before the target
X2 = dataset[:, target_idx + 1:]   # columns after the target
X = np.concatenate((X1, X2), axis=1)

Here I created the feature and output arrays. Then I combine them into a DMatrix, XGBoost's own data format, which works faster with xgboost.

In [23]:
# Wrap the feature matrix and the labels in an XGBoost DMatrix.
data_dmatrix = xgb.DMatrix(X, label=Y)

After finishing preprocessing, I selected my model's parameters. I want the model to give a binary output because we have two possible outputs, 0 and 1, which correspond to no and yes.

In [24]:
# Booster settings handed to xgb.cv.
# 'binary:hinge' makes the model emit hard 0/1 predictions rather than probabilities.
params = dict(
    objective='binary:hinge',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)

After specifying the model parameters, I wanted to test the model with 5-fold CV. To do this, I used xgb.cv, which works similarly to sklearn's k-fold CV. I selected error as the output metric because error = 1 - accuracy.

In [25]:
# 5-fold cross-validation for at most 50 boosting rounds, reporting the
# classification error, with early stopping set to 10 rounds and a fixed seed.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=5,
    metrics='error',
    early_stopping_rounds=10,
    as_pandas=True,
    seed=0,
)

In [26]:
# Display the per-round train/test error mean and standard deviation returned by xgb.cv.
cv_results

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.9276,0.000281,0.9276,0.001125
1,0.9276,0.000281,0.9276,0.001125
2,0.9276,0.000281,0.9276,0.001125
3,0.9276,0.000281,0.9276,0.001125
4,0.9276,0.000281,0.9276,0.001125
5,0.140581,0.016081,0.13855,0.013503
6,0.075869,0.007324,0.076,0.003371
7,0.068881,0.003186,0.06965,0.004165


In [27]:
# Print the last five cross-validated test error values.
print(cv_results["test-error-mean"].tail(n=5))

3    0.92760
4    0.92760
5    0.13855
6    0.07600
7    0.06965
Name: test-error-mean, dtype: float64


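Here the cross-validated error rate (WrongCases/AllCases) is 0.06965, which means

our accuracy RightCases/AllCases = 1 - WrongCases/AllCases ≈ 0.93, which is better than the wanted accuracy of 0.81. Note, however, that the constant 0.9276 error in the first boosting rounds suggests that about 93% of the rows belong to a single class; in that case always predicting that class would already reach about 0.93 accuracy, so this result should be compared against that baseline and not only against 0.81.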

In [28]:
# Write the feature matrix and the target vector to comma-separated text files.
for filename, array in (("X.csv", X), ("Y.csv", Y)):
    np.savetxt(filename, array, delimiter=",")

I saved the features and the output as CSV files so they can be reused later.

In [29]:
# Persist the fully encoded frame. The row index is written as the first, unnamed
# column, so read it back with index_col=0 to avoid an extra 'Unnamed: 0' column.
df_final.to_csv('df_final.csv', index=True)