# Decision Trees vs Random Forests - Practical Example

We'll use both methods to predict whether a person's income exceeds 50K, based on census data.

The data comes in two CSV files: one is the training dataset and the other is the test dataset.

In [None]:
# import the relevant packages
import pandas as pd

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

## Preprocessing the data

### Training dataset

In [None]:
# load the dataset
train_data = pd.read_csv("data/Census_income_train.csv")

In [None]:
# inspect the dataset
# all columns are features, except the 'income' column, which will be the target
train_data.head()

In [None]:
train_data.shape

In [None]:
# No null or NaN values
train_data.isnull().sum()

#### Removing rows with unknown values ('?')

In [None]:
# All missing or unknown values, however, are marked with a question mark (?)
# There are 3 columns which contain '?' - Workclass, Occupation, Native-country

In [None]:
# Let's start with the Workclass column
# We can obtain a boolean Series that is True on every row whose value contains a '?'
# regex=False treats '?' as a literal character, so no backslash escaping is needed
train_data["Workclass"].str.contains("?", regex=False)

In [None]:
# Let's invert the boolean values: True now marks the rows WITHOUT a '?'
# regex=False matches '?' literally; na=True counts a missing value as unknown,
# so such a row would come out as False here
~train_data["Workclass"].str.contains("?", regex=False, na=True)

In [None]:
# Take the subset of the dataframe rows whose Workclass doesn't contain '?'
# regex=False matches '?' literally; na=True makes a missing value count as unknown,
# so after the inversion (~) such a row is filtered out as well
clean_train_data = train_data[~train_data["Workclass"].str.contains("?", regex=False, na=True)]

In [None]:
len(clean_train_data)

In [None]:
# Let's do the same for 'Occupation': keep only the rows without a literal '?'
# (na=True makes a missing value count as unknown, so it is filtered out too)
clean_train_data = clean_train_data[
    ~clean_train_data["Occupation"].str.contains("?", regex=False, na=True)
]

In [None]:
len(clean_train_data)

In [None]:
# And for 'Native-country': keep only the rows without a literal '?'
# (na=True makes a missing value count as unknown, so it is filtered out too)
clean_train_data = clean_train_data[
    ~clean_train_data["Native-country"].str.contains("?", regex=False, na=True)
]

In [None]:
len(clean_train_data)

In [None]:
# Finally, let's reset the index
clean_train_data = clean_train_data.reset_index(drop=True)

#### Creating dummy variables and separating inputs and targets

In [None]:
# In the original data, there are both categorical and numerical data
# Decision trees and random forests can, in general, work with categorical data directly
# However, this is not implemented in sklearn
# So, we need to convert the categorical data to numerical
# We will do that with one hot encoding

In [None]:
# Pandas can automatically do that for us with '.get_dummies'
train_dummies = pd.get_dummies(clean_train_data)

In [None]:
train_dummies.head()

In [None]:
# After one-hot encoding, the final two columns flag income <=50K and income >50K
# The two flags are complementary, so one is enough - discard the '<=50K' flag
train_dummies = train_dummies.drop(columns=['Income_ <=50K'])

In [None]:
train_dummies.head()

In [None]:
# Column-wise split of the encoded training frame:
#   train_input  -> every column except the last one (the features)
#   train_target -> the last column only (the target/output)
train_input, train_target = train_dummies.iloc[:, :-1], train_dummies.iloc[:, -1]

In [None]:
train_input.head()

In [None]:
train_target.head()

### Test dataset

In [None]:
# Let's do the same preprocessing on the test dataset

In [None]:
# Load test data
test_data = pd.read_csv("data/Census_income_test.csv")

In [None]:
test_data.head()

In [None]:
len(test_data)

#### Cleaning unknown ('?') values

In [None]:
# Keep only the test rows whose Workclass doesn't contain a literal '?'
# regex=False avoids regex escaping; na=True makes a missing value count as unknown
clean_test_data = test_data[~test_data["Workclass"].str.contains("?", regex=False, na=True)]

In [None]:
len(clean_test_data)

In [None]:
# Keep only the test rows whose Occupation doesn't contain a literal '?'
# regex=False avoids regex escaping; na=True makes a missing value count as unknown
clean_test_data = clean_test_data[
    ~clean_test_data["Occupation"].str.contains("?", regex=False, na=True)
]

In [None]:
len(clean_test_data)

In [None]:
# Keep only the test rows whose Native-country doesn't contain a literal '?'
# regex=False avoids regex escaping; na=True makes a missing value count as unknown
clean_test_data = clean_test_data[
    ~clean_test_data["Native-country"].str.contains("?", regex=False, na=True)
]

In [None]:
len(clean_test_data)

In [None]:
clean_test_data = clean_test_data.reset_index(drop=True)

#### Creating dummy variables and separating inputs and targets

In [None]:
test_dummies = pd.get_dummies(clean_test_data)

In [None]:
test_dummies.head()

In [None]:
# The two income flags are complementary, so discard the '<=50K' one
# Note the trailing period in this column name
test_dummies = test_dummies.drop(columns=['Income_ <=50K.'])

In [None]:
test_dummies.head()

In [None]:
# Same positional split as for the training data: all but the last column are inputs
test_input = test_dummies.iloc[:, :-1]

# The training and test files are one-hot encoded independently, so a category that
# occurs in only one of them produces different dummy columns.
# Align the test inputs to the training features (same names, same order):
# dummy columns absent from the test data are filled with 0, unseen extra ones are dropped
test_input = test_input.reindex(columns=train_input.columns, fill_value=0)

# The target/output is just the last column
test_target = test_dummies.iloc[:, -1]

In [None]:
test_target.head()

## Decision Tree Model

### Creating and visualizing the tree

In [None]:
# Define the model as a decision tree classifier
# A fixed random_state makes the fitted tree and the reported metrics reproducible between runs
clf = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Train the model
clf.fit(train_input,train_target)

In [None]:
# Draw the fitted tree on a 10x10 inch figure with colour-filled nodes
# NOTE: Rendering the whole tree may take a long time (~3 min)
plt.figure(figsize=(10, 10))
tree.plot_tree(decision_tree=clf, filled=True, ax=plt.gca())
plt.show()

In [None]:
# This picture is very complicated, indicating a tree of enormous proportions
# Such a large tree suggests a high probability that the model has overfitted

### Testing the model

In [None]:
# get the predictions based on the test inputs
test_pred = clf.predict(test_input)

In [None]:
# print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

### Create the tree with pruning

In [None]:
# Define the model as a decision tree classifier with pruning in order to avoid overfitting
# The value of 0.001 for the pruning seems like a good spot for this particular model
# A fixed random_state makes the fitted tree and the reported metrics reproducible between runs
clf = tree.DecisionTreeClassifier(ccp_alpha=0.001, random_state=42)

In [None]:
# Train the tree
clf.fit(train_input,train_target)

In [None]:
# Draw the pruned tree on a 15x10 inch figure with colour-filled nodes,
# labelling the two classes in each node
plt.figure(figsize=(15, 10))
tree.plot_tree(decision_tree=clf, class_names=["<=50k", ">50k"], filled=True, ax=plt.gca())
plt.show()

In [None]:
# This time the tree looks much more manageable

### Testing the model

In [None]:
# get the predictions based on the test inputs
test_pred = clf.predict(test_input)

In [None]:
# print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

In [None]:
# The accuracy jumped from 80% to 85% after pruning
# This confirms our suspicions that the first tree has overfitted

## Random Forest Model

### Creating and training the model

In [None]:
# Initialize the model as a random forest classifier
# A fixed random_state makes the forest and the reported metrics reproducible between runs
clf = RandomForestClassifier(random_state=42)

In [None]:
# Train the model
clf.fit(train_input,train_target)

### Testing the model

In [None]:
# Obtain the model's predictions on the test dataset
test_pred = clf.predict(test_input)

In [None]:
# Print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

### Creating and training the model with more trees

In [None]:
# Initialize the model as a random forest classifier with 150 trees (default is 100 trees)
# A fixed random_state makes the run reproducible, so differences from the default forest
# are not just run-to-run noise
clf = RandomForestClassifier(n_estimators=150, random_state=42)

In [None]:
# Train the model
clf.fit(train_input,train_target)

### Testing the model

In [None]:
# Obtain the model's predictions on the test dataset
test_pred = clf.predict(test_input)

In [None]:
# Print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

In [None]:
# The result is basically the same as before, so the additional trees didn't help at all

### Creating and training the model with pruning

In [None]:
# Initialize the model as a random forest classifier with pruning
# A fixed random_state makes the run reproducible, so differences from the unpruned forest
# are not just run-to-run noise
clf = RandomForestClassifier(ccp_alpha=0.0001, random_state=42)

In [None]:
# Train the model
clf.fit(train_input,train_target)

### Testing the model

In [None]:
# Obtain the model's predictions on the test dataset
test_pred = clf.predict(test_input)

In [None]:
# Print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

In [None]:
# A slight increase in accuracy; however, it is insignificant
# This appears to be the limit of the performance these models reach on this dataset