### Decision Tree Induction

#### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Data Preparation

In [2]:
# Read the Adult census CSV from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='./adult.csv')
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
# Count NaN cells per column. The '?' placeholders visible in the value_counts
# output of later cells are ordinary strings, so they are not counted here.
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [4]:
# Frequency table for workclass; this is where the '?' placeholder category shows up.
df['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [5]:
# Drop the rows where workclass is the '?' placeholder.
# .copy() makes the filtered frame an independent object, so the column
# assignments in later cells (df[col] = ...) do not raise SettingWithCopyWarning.
df = df[df['workclass'] != '?'].copy()

In [6]:
# Frequency table for education before it is replaced by integer codes.
df['education'].value_counts()

HS-grad         9969
Some-college    6777
Bachelors       5182
Masters         1675
Assoc-voc       1321
11th            1057
Assoc-acdm      1020
10th             833
7th-8th          574
Prof-school      558
9th              463
Doctorate        398
12th             393
5th-6th          303
1st-4th          156
Preschool         46
Name: education, dtype: int64

In [7]:
from sklearn import preprocessing

# A single LabelEncoder instance is created here and re-fitted on every
# categorical column in the cells below. Each re-fit replaces classes_, so only
# the most recently encoded column can be mapped back with inverse_transform.
label_encoder = preprocessing.LabelEncoder()
# Replace the education strings with integer codes. In the recorded output the
# codes follow the sorted label order (e.g. '7th-8th' -> 5, 'HS-grad' -> 11),
# so they are nominal identifiers, not an ordering of schooling level.
# NOTE(review): education.num appears to be an ordered numeric version of the
# same attribute — confirm both columns are meant to stay in the feature set.
df['education'] = label_encoder.fit_transform(df['education'])

In [8]:
# Frequency table for marital.status ahead of label encoding.
df['marital.status'].value_counts()

Married-civ-spouse       14340
Never-married             9917
Divorced                  4259
Separated                  959
Widowed                    840
Married-spouse-absent      389
Married-AF-spouse           21
Name: marital.status, dtype: int64

In [9]:
# Re-fit the shared encoder on marital.status and overwrite the column with its integer codes.
df['marital.status'] = label_encoder.fit_transform(df['marital.status'])

In [10]:
# Drop the rows where occupation is the '?' placeholder.
# .copy() detaches the filtered frame from its parent so that the in-place
# column encodings that follow do not raise SettingWithCopyWarning.
df = df[df['occupation'] != '?'].copy()

In [11]:
# Frequency table for occupation after the '?' rows were filtered out.
df['occupation'].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64

In [12]:
# Re-fit the shared encoder on occupation and overwrite the column with its integer codes.
df['occupation'] = label_encoder.fit_transform(df['occupation'])

In [13]:
# Preview the frame after education, marital.status and occupation were encoded.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,11,9,6,3,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,54,Private,140359,5,4,0,6,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,15,10,5,9,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,11,9,0,7,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,0,6,5,0,Unmarried,White,Male,0,3770,40,United-States,<=50K


In [14]:
# Re-check workclass after both '?' filters: the placeholder category should no longer appear.
df['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Name: workclass, dtype: int64

In [15]:
# Re-fit the shared encoder on workclass and overwrite the column with its integer codes.
df['workclass'] = label_encoder.fit_transform(df['workclass'])

In [16]:
# Frequency table for relationship ahead of label encoding.
df['relationship'].value_counts()

Husband           12704
Not-in-family      7865
Own-child          4525
Unmarried          3271
Wife               1435
Other-relative      918
Name: relationship, dtype: int64

In [17]:
# Re-fit the shared encoder on relationship and overwrite the column with its integer codes.
df['relationship'] = label_encoder.fit_transform(df['relationship'])

In [18]:
# Frequency table for race ahead of label encoding.
df['race'].value_counts()

White                 26301
Black                  2909
Asian-Pac-Islander      974
Amer-Indian-Eskimo      286
Other                   248
Name: race, dtype: int64

In [19]:
# Re-fit the shared encoder on race and overwrite the column with its integer codes.
df['race'] = label_encoder.fit_transform(df['race'])

In [20]:
# Frequency table for sex ahead of label encoding.
df['sex'].value_counts()

Male      20788
Female     9930
Name: sex, dtype: int64

In [21]:
# Re-fit the shared encoder on sex; in the recorded output 'Female' becomes 0 and 'Male' becomes 1.
df['sex'] = label_encoder.fit_transform(df['sex'])

In [22]:
# Frequency table for native.country; note that a '?' category is still present at this point.
df['native.country'].value_counts()

United-States                 27504
Mexico                          610
?                               556
Philippines                     188
Germany                         128
Puerto-Rico                     109
Canada                          107
El-Salvador                     100
India                           100
Cuba                             92
England                          86
Jamaica                          80
South                            71
China                            68
Italy                            68
Dominican-Republic               67
Vietnam                          64
Guatemala                        63
Japan                            59
Poland                           56
Columbia                         56
Iran                             42
Haiti                            42
Taiwan                           42
Portugal                         34
Nicaragua                        33
Peru                             30
Greece                      

In [23]:
# Re-fit the shared encoder on native.country and overwrite the column with its integer codes.
# NOTE(review): unlike workclass and occupation, rows with native.country == '?'
# are not dropped before encoding, so '?' is encoded as a category of its own —
# confirm this difference in missing-value handling is intentional.
df['native.country'] = label_encoder.fit_transform(df['native.country'])


In [24]:
# Preview the frame once every categorical feature column holds integer codes.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,2,132870,11,9,6,3,1,4,0,0,4356,18,39,<=50K
3,54,2,140359,5,4,0,6,4,4,0,0,3900,40,39,<=50K
4,41,2,264663,15,10,5,9,3,4,0,0,3900,40,39,<=50K
5,34,2,216864,11,9,0,7,4,4,0,0,3770,45,39,<=50K
6,38,2,150601,0,6,5,0,4,4,1,0,3770,40,39,<=50K


In [25]:
# Class balance of the target before encoding (the recorded output is roughly 75% '<=50K').
df['income'].value_counts()

<=50K    23068
>50K      7650
Name: income, dtype: int64

In [26]:
# Encode the target with the shared encoder; in the recorded output '<=50K' becomes 0.
# After this cell label_encoder.classes_ holds the income labels only.
df['income'] = label_encoder.fit_transform(df['income'])

In [27]:
# Preview the fully numeric frame, target included.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,2,132870,11,9,6,3,1,4,0,0,4356,18,39,0
3,54,2,140359,5,4,0,6,4,4,0,0,3900,40,39,0
4,41,2,264663,15,10,5,9,3,4,0,0,3900,40,39,0
5,34,2,216864,11,9,0,7,4,4,0,0,3770,45,39,0
6,38,2,150601,0,6,5,0,4,4,1,0,3770,40,39,0


In [28]:
# (rows, columns) remaining after the '?' filters on workclass and occupation.
df.shape

(30718, 15)

In [30]:
# Pairwise Pearson correlation between all columns (every column is numeric by now).
# Label-encoded nominal columns carry an arbitrary code order, so their
# coefficients should be interpreted with caution.
corr = df.corr(method='pearson')
corr

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
age,1.0,0.081046,-0.07654,-0.002256,0.043567,-0.276793,-0.006037,-0.246038,0.025219,0.082117,0.080392,0.060409,0.101879,-0.003487,0.242431
workclass,0.081046,1.0,-0.032241,0.017182,0.038323,-0.035143,0.015053,-0.06694,0.043371,0.075101,0.034439,0.008347,0.050181,0.003923,0.019128
fnlwgt,-0.07654,-0.032241,1.0,-0.026684,-0.043509,0.031702,0.000677,0.007957,-0.023431,0.026569,-0.000189,-0.010011,-0.02281,-0.053417,-0.009446
education,-0.002256,0.017182,-0.026684,1.0,0.345762,-0.040591,-0.037854,-0.012304,0.011293,-0.028155,0.029772,0.015088,0.058821,0.06665,0.078255
education.num,0.043567,0.038323,-0.043509,0.345762,1.0,-0.063863,0.089105,-0.092244,0.029823,0.007443,0.124247,0.079321,0.151241,0.053229,0.33464
marital.status,-0.276793,-0.035143,0.031702,-0.040591,-0.063863,1.0,0.022375,0.179141,-0.067836,-0.12036,-0.042994,-0.035608,-0.18677,-0.029265,-0.194582
occupation,-0.006037,0.015053,0.000677,-0.037854,0.089105,0.022375,1.0,-0.053,-0.00125,0.062068,0.021418,0.013041,0.018296,-0.011085,0.050144
relationship,-0.246038,-0.06694,0.007957,-0.012304,-0.092244,0.179141,-0.053,1.0,-0.115378,-0.585792,-0.056754,-0.062581,-0.257288,-0.005068,-0.251255
race,0.025219,0.043371,-0.023431,0.011293,0.029823,-0.067836,-0.00125,-0.115378,1.0,0.086147,0.011526,0.020965,0.04481,0.141011,0.070927
sex,0.082117,0.075101,0.026569,-0.028155,0.007443,-0.12036,0.062068,-0.585792,0.086147,1.0,0.047757,0.04701,0.230321,-0.010061,0.216626


In [32]:
# Drop 'fnlwgt', 'marital.status', 'relationship' and the target 'income' from x; income is y.
# NOTE(review): the next cell reassigns x to every non-target column, and the
# recorded train/test shapes show 14 features, so this reduced feature set is
# not what the model below is trained on.
x = df.drop(['fnlwgt', 'marital.status', 'relationship', 'income'], axis=1)
y = df['income']

In [42]:
# Feature matrix: every column except the target. Target vector: the encoded income.
x = df.drop(columns=['income'])
y = df['income']

#### Model Preparation

In [43]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the target is imbalanced
# (about 75% / 25% in the value_counts output) — confirm that is acceptable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Sanity check: row and feature counts of the train and test feature matrices.
x_train.shape, x_test.shape

((24574, 14), (6144, 14))

In [45]:
# Sanity check: the target vectors must have the same row counts as the feature matrices.
y_train.shape, y_test.shape

((24574,), (6144,))

In [46]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree: Gini impurity as the split criterion, depth capped at 3,
# and a fixed seed so repeated fits build the same tree.
gini_clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=42,
)

In [47]:
# Fit the tree on the training partition (the cell output is the fitted estimator's repr).
gini_clf.fit(x_train, y_train)

In [48]:
# Predict the encoded income class for every held-out row.
y_pred = gini_clf.predict(x_test)

In [49]:
# Print the predicted class codes (numpy abbreviates long arrays with '...').
print(y_pred)

[0 0 0 ... 0 0 0]


#### Accuracy Scores

In [50]:
# Share of held-out rows whose predicted class matches the true class.
# For context: in the recorded run, always predicting class 0 would already
# score 4639/6144 (about 0.755), per the support counts in the classification report.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8347981770833334


In [51]:
# Per-class precision, recall and F1 on the held-out rows; class 1 recall is the
# figure to watch given the class imbalance.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.96      0.90      4639
           1       0.78      0.45      0.57      1505

    accuracy                           0.83      6144
   macro avg       0.81      0.71      0.74      6144
weighted avg       0.83      0.83      0.82      6144

