# Random Forest Independence Test v2 

In this notebook, we modify our algorithm to average over H(Y|Xi) instead of averaging over trees. 
We will try out two methods: 
1. manually calculating the posterior distribution
2. using random forest's approximation of the class probabilities

In [57]:
# have it so it splits training for you
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import graphviz
from scipy.stats import entropy
#TODO: clean up code better
#TODO: modularize and other stuff

def estimate_conditional_entropy_hard_voting(X, y, n_trees = 10, max_depth = None, bootstrap = True):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = RandomForestClassifier(bootstrap = bootstrap, n_estimators =n_trees, max_depth = max_depth, random_state = 0)
    model.fit(X_train, y_train)
    print(X_test.flatten())
    print(y_test)
    class_counts = np.zeros((X_test.shape[0], model.n_classes_))
    for tree_in_forest in model:
        # get number of training elements in each partition
        node_counts = tree_in_forest.tree_.n_node_samples
        # get counts w.r.t. testing data now
        partition_counts = np.asarray([node_counts[x] for x in tree_in_forest.apply(X_test)])
        # get probability
        class_probs = tree_in_forest.predict_proba(X_test)
        # why are there decimals?!
        # bootstrap approximation in sklearn
        elems = np.multiply(class_probs, partition_counts[:, np.newaxis])
        class_counts += elems
    probs = class_counts/class_counts.sum(axis=1, keepdims=True)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    return np.mean(entropies)

def estimate_conditional_entropy_soft_voting(X, y, n_trees = 10, max_depth = None, bootstrap = True):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = RandomForestClassifier(n_estimators = n_trees, max_depth = max_depth, random_state = 0, bootstrap = bootstrap)
    model.fit(X_train, y_train)
    probs = model.predict_proba(X_test)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    return np.mean(entropies)

SKlearn Bootstrapping uses some weird approximation thing:
    

In [33]:
x = [0]*20 + [1]*20 + [2]*20 + [3]*20
y = [0, 1, 0, 1, 0]*4 + [ 1, 1, 1, 1, 0]*4 + [1, 0, 1, 0, 1]*4 + [0, 0, 0, 0, 1]*4
X = np.array(x).reshape(-1, 1)
y = np.array(y)

# Hand calculations  
H(X) = 1.386294  
H(Y) = 0.693147  
H(X, Y) = 1.97300  
H(Y|X) = .5867  

In [60]:
estimate_conditional_entropy_hard_voting( X, y, 10, bootstrap = True)

[1 0 1 1 0 1 0 3 0 0 2 1 3 1 3 2 3 3 2 2 0 3 0 2]
[1 0 1 1 1 1 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 1]


0.6143977929957609

Testing data Entropy calculation:
H(X) = -(7/24log(7/24) + 6/24log(6/24) + 6/24log(6/24) + 5/24log(5/24)) = 1.397  
H(y) = -(13/24log(13/24) + 11/24log(11/24)) = .68967  
H(X, Y) = 6/24log(6/24) + 6/24log(6/24) + 1/24log(1/24) + 5/24log(5/24) + 1/24log(1/24) + 5/24log(5/24) = 1.61157  
H(Y|X) =  

In [28]:
estimate_conditional_entropy_soft_voting(X, y, 10, bootstrap = True)

0.619873263400197

You can adjust the size of the data. The more data the better it does. However, it doesn't do as well as the previoius algorithm which uses weighted conditional entropy and first averages across trees.

# Improvement 1
We can turn bootstrapping off because there is some approximation going on:
https://stats.stackexchange.com/questions/130206/sklearn-tree-export-graphviz-values-do-not-add-up-to-samples

In [34]:
estimate_conditional_entropy_soft_voting( X, y, 100, bootstrap = False)

0.624708868541473

In [35]:
estimate_conditional_entropy_hard_voting(X, y, 100, bootstrap = False)

0.6247088685414731

# Improvement 2
We can use all the data. This is by far where most of the error is coming from. Makes no sense to compare conditional entropy of test dataset to entire dataset. If we want to measure conditional entrop of our sample dataset, we should just use everything. What is important is just that random forest was able to capture dependences.

How does this affect robustness? I.e. sample data is dependent but actually not dependent.

In [67]:
# manual one
def estimate_conditional_entropy_hard_voting(X, y, n_trees = 10, max_depth = None, bootstrap = True):
    model = RandomForestClassifier(bootstrap = bootstrap, n_estimators =n_trees, max_depth = max_depth, random_state = 0)
    model.fit(X, y)
    class_counts = np.zeros((X.shape[0], model.n_classes_))
    for tree_in_forest in model:
        # get number of training elements in each partition
        node_counts = tree_in_forest.tree_.n_node_samples
        # get counts w.r.t. testing data now
        partition_counts = np.asarray([node_counts[x] for x in tree_in_forest.apply(X)])
        # get probability
        class_probs = tree_in_forest.predict_proba(X)
        # why are there decimals?!
        class_counts += elems
    probs = class_counts/class_counts.sum(axis=1, keepdims=True)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    return np.mean(entropies)

def estimate_conditional_entropy_soft_voting(X, y, n_trees = 10, max_depth = None, bootstrap = True):
    model = RandomForestClassifier(n_estimators = n_trees, max_depth = max_depth, random_state = 0, bootstrap = bootstrap)
    model.fit(X, y)
    probs = model.predict_proba(X)
    entropies = -np.sum(np.log(probs)*probs, axis = 1)
    return np.mean(entropies)

In [49]:
estimate_conditional_entropy_soft_voting( X, y, 10, bootstrap = False)

0.5867070452737222

In [50]:
estimate_conditional_entropy_hard_voting(X, y, 10, bootstrap = False)

0.6247088685414731

In [51]:
estimate_conditional_entropy_soft_voting( X, y, 10, bootstrap = True)

0.5747883706093118

In [52]:
estimate_conditional_entropy_rf_hard_voting(X, y, 10, bootstrap = True)

0.5746704869940018

In [61]:
from sklearn import datasets
import pandas as pd

iris = datasets.load_iris()
# take first 2 features
X = iris.data[:100, :2]
# take first 100 (only two classes)
y = iris.target[:100]

In [62]:
import matplotlib.pyplot as plt
plt.scatter(X[:50, 1], X[:50, 0], c = "b", label = "setosa")
plt.scatter(X[50:, 1], X[50:, 0], c = "g", label = "vericolour")
plt.legend()
plt.title('Iris Dataset Scatter Plot')
plt.xticks([])
plt.yticks([])

([], <a list of 0 Text yticklabel objects>)

In [68]:
import scipy

def calculate_entropy(X):
    probs = np.bincount(X)/len(X)
    return scipy.stats.entropy(probs)
entropy = calculate_entropy(y)
cond_entropy = estimate_conditional_entropy_hard_voting(X, y, 10, bootstrap = False)
print("H(Y):", entropy)
print("Conditional Entropy:", cond_entropy)
print("Mutual Information:", entropy - cond_entropy)

H(Y): 0.6931471805599453
Conditional Entropy: nan
Mutual Information: nan


