# Introduction to Decision Trees

The data comes from the 1994 census and contains information on each individual's marital status, age, type of work, and more. The target column, `high_income`, is what we want to predict: whether an individual makes at most 50k a year (`<=50K`) or more than 50k a year.

In [1]:
import pandas as pd
import numpy as np
import math

### Exploring the Data

In [2]:
# Load the census data; index_col = False tells pandas not to use the first
# column of the file as the row index.
income = pd.read_csv("income.csv", index_col = False)

In [3]:
income.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
income.shape

(32561, 15)

In [5]:
income.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'high_income'],
      dtype='object')

### Converting the categorical variables to numeric variables

In [6]:
# Text-valued columns (including the target) that are converted to integer
# codes in the next step.
categorical_cols = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
    'high_income',
]

In [7]:
# Overwrite every categorical column with the integer code of each label.
for cat in categorical_cols:
    as_category = income[cat].astype("category")
    income[cat] = as_category.cat.codes

In [8]:
print(income['sex'].head())

0    1
1    1
2    1
3    1
4    0
Name: sex, dtype: int8


In [9]:
print(income['education'].head())

0     9
1     9
2    11
3     1
4     9
Name: education, dtype: int8


### Splitting the Data

In [10]:
# 4 is the integer code the encoding step assigned to the "Private" workclass
# (compare row 2 in the raw and encoded previews).
private_incomes = income.loc[income["workclass"].eq(4)]
private_incomes.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0


In [11]:
# Every row whose workclass code is not 4 ("Private"). Despite the name, this
# keeps all other workclasses, e.g. State-gov (7) and Self-emp-not-inc (6).
public_incomes = income.loc[income["workclass"].ne(4)]
public_incomes.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1
11,30,7,141297,9,13,2,10,0,1,1,0,0,40,19,1
16,25,6,176756,11,9,4,5,3,4,1,0,0,35,39,0


In [12]:
private_incomes.shape

(22696, 15)

In [13]:
public_incomes.shape

(9865, 15)

### Computing the Entropy

In [14]:
# Fraction of all rows that fall in each encoded income class
# (0 is the "<=50K" class in the encoded preview; 1 is the other class).
low_income_prob = len(income[income["high_income"] == 0]) / len(income)
high_income_prob = len(income[income["high_income"] == 1]) / len(income)

In [15]:
# Entropy of the target in bits: -(p_low * log2(p_low) + p_high * log2(p_high)).
income_entropy = -(
    low_income_prob * math.log(low_income_prob, 2)
    + high_income_prob * math.log(high_income_prob, 2)
)
income_entropy

0.7963839552022132

### Information Gain

In [16]:
# A function to calculate entropy

def calc_entropy(column):
    """
    Calculate the entropy (in bits) of the values in a column.

    Parameters
    ----------
    column : pandas Series, list, or numpy array
        The class labels. Any values np.unique can sort are accepted:
        non-negative integer codes, negative codes (such as the -1 that
        pd.Categorical uses for missing values), strings, and so on.

    Returns
    -------
    float
        -sum(p * log2(p)) over the distinct values in the column, where p is
        the relative frequency of each value. Returns 0.0 for an empty column.
    """
    # An empty column has no uncertainty; return before dividing by zero.
    if len(column) == 0:
        return 0.0

    # np.unique counts each distinct label, whatever its type. np.bincount
    # would only accept non-negative integers and would allocate
    # max(value) + 1 bins.
    _, counts = np.unique(np.asarray(column), return_counts=True)
    # Divide by the total column length to get a probability per label
    probabilities = counts / len(column)

    # Every count returned by np.unique is at least 1, so each probability
    # is > 0 and the logarithm is always defined.
    entropy = 0
    for prob in probabilities:
        entropy += prob * math.log(prob, 2)

    return -entropy

In [17]:
# Split the rows at the median age: at-or-below goes left, above goes right.
left = income[income["age"].le(income["age"].median())]
right = income[income["age"].gt(income["age"].median())]

In [18]:
# Information gain = entropy of the whole target minus the size-weighted
# entropies of the two sides of the age split.
age_information_gain = calc_entropy(income["high_income"]) - (
    (len(left) / len(income)) * calc_entropy(left["high_income"])
    + (len(right) / len(income)) * calc_entropy(right["high_income"])
)
age_information_gain

0.047028661304691965

### Finding the best variable to split a node on

In [19]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate the information gain of splitting a data set at the median of
    one column.

    Parameters
    ----------
    data : pandas DataFrame
        The rows to split.
    split_name : str
        Name of the column to split on. Rows with a value <= the column
        median go to the left subset, rows with a value > the median go to
        the right subset.
    target_name : str
        Name of the column whose entropy is measured.

    Returns
    -------
    float
        Entropy of the target over all rows minus the size-weighted entropy
        of the two subsets. Returns 0.0 when `data` has no rows.

    Notes
    -----
    When no value is greater than the median (for example a two-valued
    column whose median equals its maximum), the right subset is empty and
    the gain is 0.
    """
    total_rows = data.shape[0]
    # With no rows there is nothing to split, and the weights below would
    # divide by zero.
    if total_rows == 0:
        return 0.0

    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])

    # Find the median of the column we're splitting
    column = data[split_name]
    median = column.median()

    # Make two subsets of the data, based on the median
    left_split = data[column <= median]
    right_split = data[column > median]

    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in (left_split, right_split):
        subset_rows = subset.shape[0]
        # An empty subset has weight 0 and contributes nothing; skipping it
        # avoids asking the entropy helper about an empty column.
        if subset_rows == 0:
            continue
        to_subtract += (subset_rows / total_rows) * calc_entropy(subset[target_name])

    # Return information gain
    return original_entropy - to_subtract

In [20]:
calc_information_gain(income, "age", "high_income")

0.047028661304691965

In [24]:
# Candidate columns to evaluate as the first split.
cols = [
    "age",
    "workclass",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

# Information gain of a median split on each candidate, in the same order as cols.
information_gains = [
    calc_information_gain(income, col, "high_income") for col in cols
]

information_gains

[0.047028661304691965,
 0.006810984054396618,
 0.06501298413277423,
 0.1114272573715438,
 0.0015822303843424645,
 0.04736241665026941,
 0.0,
 0.0,
 0.04062246867123487,
 0.00013457344495848567]

In [25]:
# Position of the largest information gain (the first one if there are ties).
highest_gain_index = max(enumerate(information_gains), key=lambda pair: pair[1])[0]
highest_gain_index

3

In [26]:
# Translate the winning position back into its column name.
highest_gain = cols[highest_gain_index]
highest_gain

'marital_status'