# Applying Decision Trees

The data is from the 1994 census, and contains information on an individual's marital status, age, type of work, and more. The target column, or what we want to predict, is whether individuals make less than or equal to 50k a year, or more than 50k a year.

In [1]:
import math
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load the census extract; index_col=False tells pandas not to treat the first column as the row index.
income = pd.read_csv(filepath_or_buffer="income.csv", index_col=False)

In [3]:
# Preview the first five rows of the raw data.
income.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Converting the categorical variables to numeric variables

In [4]:
# String-valued columns (including the target) that must become integers before
# scikit-learn can consume them.
categorical_cols = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'high_income',
]

# Replace every label with its integer category code.
for col in categorical_cols:
    income[col] = income[col].astype("category").cat.codes

### Using Decision Trees With scikit-learn

In [5]:
# Feature columns handed to the classifier.
columns = [
    "age",
    "workclass",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

In [6]:
# Fixed random_state so that repeated runs build the same tree.
clf = DecisionTreeClassifier(random_state=1)

In [7]:
# First fit uses every row of `income` (no hold-out yet); the fitted estimator is the cell output.
clf.fit(X=income[columns], y=income["high_income"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

### Splitting our data into training and testing sets

In [8]:
# Seed NumPy's global RNG so the row order below is reproducible.
np.random.seed(1)
# Draw a random ordering of the existing index labels and rearrange the rows to match it.
shuffled_index = np.random.permutation(income.index)
income = income.reindex(index=shuffled_index)

In [9]:
# Preview the first five rows of the current frame.
income.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
9646,62,6,26911,5,4,6,8,1,4,0,0,0,66,39,0
709,18,4,208103,1,7,4,8,2,4,1,0,0,25,39,0
7385,25,4,102476,9,13,4,5,3,4,1,27828,0,50,39,1
16671,33,4,511517,11,9,2,10,0,4,1,0,0,40,39,0
21932,36,4,292570,1,7,4,7,4,4,0,0,0,40,39,0


In [10]:
# Position of the 80/20 boundary. int() is used instead of math.floor() because
# math.floor() returns a float on Python 2, which is not a valid .iloc slice bound;
# for a non-negative value int() truncates to the same integer on every version.
split_rows = int(0.8 * income.shape[0])

# First 80% of the rows train the model; the remaining 20% are held out for evaluation.
train = income.iloc[:split_rows]
test = income.iloc[split_rows:]

In [11]:
# Peek at the first two training rows.
train.head(n=2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
9646,62,6,26911,5,4,6,8,1,4,0,0,0,66,39,0
709,18,4,208103,1,7,4,8,2,4,1,0,0,25,39,0


In [12]:
# Peek at the first two held-out rows.
test.head(n=2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
22528,30,4,329425,11,9,2,3,0,4,1,0,0,40,39,0
31546,35,4,48779,11,9,2,5,0,4,1,0,0,35,39,0


### Evaluating predictions using AUC (higher is better)

In [13]:
# Build a fresh tree with default settings, trained on the training rows only
# (fit() returns the estimator itself, so construction and fitting can be chained).
clf = DecisionTreeClassifier(random_state=1).fit(train[columns], train["high_income"])

# Class labels predicted for the held-out rows.
predictions = clf.predict(test[columns])
predictions

array([0, 0, 0, ..., 0, 1, 1], dtype=int8)

In [14]:
# Despite its name, `error` holds the ROC AUC on the held-out rows, so higher is better.
# NOTE(review): `predictions` holds hard class labels from clf.predict(), not probabilities,
# so the ROC curve has a single operating point; clf.predict_proba(...)[:, 1] would score
# the ranking instead. Left unchanged so the value stays comparable with the other AUC cells.
error = roc_auc_score(y_true=test["high_income"], y_score=predictions)
error

0.69346563247461923

In [15]:
# AUC on the rows the tree was trained on, for comparison with the held-out AUC;
# despite the variable name, this is a score (higher is better), not an error.

predictions_train = clf.predict(train[columns])
error_train = roc_auc_score(y_true=train["high_income"], y_score=predictions_train)
error_train

0.94712445014374547

### Reducing Overfitting

In [16]:
# Raise the minimum number of rows a node must hold before it may be split
# (13 here, versus the default of 2) so the tree cannot keep carving out tiny groups.

clf = DecisionTreeClassifier(random_state=1, min_samples_split=13).fit(
    train[columns], train["high_income"]
)
predictions = clf.predict(test[columns])

In [17]:
# Held-out AUC for the current model (a score, not an error, despite the name).
test_error = roc_auc_score(y_true=test["high_income"], y_score=predictions)
test_error

0.6995617145150872

In [18]:
# Training-set AUC for the current model (a score, not an error, despite the name).
predictions_train = clf.predict(train[columns])
train_error = roc_auc_score(y_true=train["high_income"], y_score=predictions_train)
train_error

0.84214318492754126

In [19]:
# Cap the tree at 7 levels while keeping the 13-row minimum for splitting a node.

clf = DecisionTreeClassifier(
    random_state=1,
    max_depth=7,
    min_samples_split=13,
).fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])

In [20]:
# Held-out AUC for the current model (a score, not an error, despite the name).
test_error = roc_auc_score(y_true=test["high_income"], y_score=predictions)
test_error

0.7436344996725136

In [21]:
# Training-set AUC for the current model (a score, not an error, despite the name).
predictions_train = clf.predict(train[columns])
train_error = roc_auc_score(y_true=train["high_income"], y_score=predictions_train)
train_error

0.74803770830920902

In [22]:
# Much tighter limits: at most 2 levels deep, and a node needs 100 rows before it may split.

clf = DecisionTreeClassifier(
    random_state=1,
    max_depth=2,
    min_samples_split=100,
).fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])

In [23]:
# Held-out AUC for the current model (a score, not an error, despite the name).
test_error = roc_auc_score(y_true=test["high_income"], y_score=predictions)
test_error

0.65531384818764993

In [24]:
# Training-set AUC for the current model (a score, not an error, despite the name).
predictions_train = clf.predict(train[columns])
train_error = roc_auc_score(y_true=train["high_income"], y_score=predictions_train)
train_error

0.66245080421614833

### Inducing variance by adding noise

In [37]:
# Reset NumPy's global RNG so the random draws that follow are reproducible.
np.random.seed(seed=1)

In [38]:
# One random integer from {0, 1, 2, 3} per row, drawn independently of every other column.
income["noise"] = np.random.randint(0, 4, size=len(income))

In [39]:
# Feature columns for the next experiment: the random "noise" column plus the original features.
columns = [
    "noise",
    "age",
    "workclass",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

In [40]:
# Redo the 80/20 positional split so that train and test are sliced from the current `income`.
# int() is used instead of math.floor() because math.floor() returns a float on Python 2,
# which is not a valid .iloc slice bound; for a non-negative value the result is the same integer.
split = int(0.8 * income.shape[0])
train = income.iloc[:split]
test = income.iloc[split:]

In [41]:
# Unconstrained tree again, this time trained on whatever feature list `columns` currently names.
clf = DecisionTreeClassifier(random_state=1).fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])

In [42]:
# Held-out AUC for the current model (a score, not an error, despite the name).
test_error = roc_auc_score(y_true=test["high_income"], y_score=predictions)
test_error

0.69140600139413477

In [43]:
# Training-set AUC for the current model (a score, not an error, despite the name).
predictions_train = clf.predict(train[columns])
train_error = roc_auc_score(y_true=train["high_income"], y_score=predictions_train)
train_error

0.97507616143508014