In [1]:
%load_ext line_profiler

import csv,operator,sys,os
import numpy as np
import sklearn
import json

from functools import reduce

sys.path.append('../arch-forest/data/adult/')
sys.path.append('../arch-forest/data/')
sys.path.append('../arch-forest/code/')
import trainForest
import Tree

import DecisionSnippetFeatures

In [2]:
# grab adult training data in the format used by the random forests in forests/adult/text/

X = []
Y = []

f = open("../data/adult/adult.data")
for row in f:
    if (len(row) > 1):
        entries = row.replace("\n", "").replace(" ", "").split(",")

        x = trainForest.getFeatureVector(entries)

        if (entries[-1] == "<=50K"):
            y = 0
        else:
            y = 1

        Y.append(y)
        X.append(x)

X = np.array(X).astype(dtype=np.int32)
Y = np.array(Y)
f.close()

In [3]:
# load one of the readily available decision trees
f = open('../forests/adult/text/DT_5.json')
dt = json.load(f)
f.close()


In [4]:
# this creates a transformation of the dataset, assigning each example the vertex id of the leaf vertex 
# the example turns out at. So far, so boring
fts = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt).fit_transform(X[:10, :])
print(fts)

[[32]
 [47]
 [29]
 [43]
 [ 9]
 [28]
 [28]
 [47]
 [35]
 [46]]


# Something New, Something Interesting

Let's get funky.

For this, we need

- [ ] Frequent Patterns with split values
    - [x] New Data transformator JSON -> GRAPH
    - [x] Transform to GRAPH
    - [x] Mining frequent patterns ('Initial Rooted Frequent Subtree Mining (without embedding computation) -- With Split Values in Labels.ipynb')
    - [x] New Data transformator CSTRING -> JSON (cString2json.py updated)
    - [x] Transform to JSON ('Find All Occurrences of All Frequent Patterns of Size up to 6.ipynb')
    - [ ] fix missing member problem in SubtreeFeatures

In [5]:
f = open('../forests/rootedFrequentTrees/adult/WithLeafEdgesWithSplitValues/leq6/RF_15_t2.json')
frequentpatterns = json.load(f)
f.close()

In [6]:
%time dsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(map(lambda x: x['pattern'], frequentpatterns[-100:]))

CPU times: user 2.79 ms, sys: 333 µs, total: 3.13 ms
Wall time: 3.06 ms


In [7]:
%time fts = dsf.fit_transform(X)


CPU times: user 16.7 s, sys: 18.3 ms, total: 16.7 s
Wall time: 16.7 s


In [8]:
from sklearn.preprocessing import OneHotEncoder
%time fts_onehot = OneHotEncoder(n_values=dsf.get_n_values()).fit_transform(fts)

CPU times: user 85.9 ms, sys: 52.1 ms, total: 138 ms
Wall time: 137 ms


In [9]:
print(X.shape)
print(fts_onehot.shape)
print(fts.shape)
print(Y.shape)

(32561, 64)
(32561, 590)
(32561, 100)
(32561,)


# Classification Performance of Decision Tree Snippet Features vs. Normal Features 

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### Linear Regression

In [11]:
model = LinearRegression()
%time fts_onehot_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='neg_mean_squared_error')
print(fts_onehot_cv_score)

CPU times: user 15.3 s, sys: 17.8 s, total: 33.1 s
Wall time: 10.2 s
[-0.1138037  -0.1156247  -0.11062547 -0.11274397 -0.11262436]


In [None]:
model = LinearRegression()
%time normalfeatures_cv_score = cross_val_score(model, X, Y, cv=5, scoring='neg_mean_squared_error')
print(normalfeatures_cv_score)

### Naive Bayes

In [12]:
model = GaussianNB()
%time fts_onehot_nb_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='f1')
print(fts_onehot_nb_cv_score)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [13]:
model = GaussianNB()
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 179 ms, sys: 949 µs, total: 180 ms
Wall time: 182 ms
[0.4222318  0.41802575 0.42875817 0.40732665 0.43421624]


### Thresholded Linear Regression

In [14]:
class LinRegClassifier(LinearRegression):
    def __init__(self, threshold=0.5):
        super(LinRegClassifier, self).__init__()
        self.threshold = threshold
    
    def predict(self, X):
        p = super(LinRegClassifier, self).predict(X)
        return (p > self.threshold)


In [15]:
model = LinRegClassifier()
%time fts_onehot_nb_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='f1')
print(fts_onehot_nb_cv_score)

CPU times: user 16 s, sys: 19 s, total: 35 s
Wall time: 11 s
[0.62030905 0.62056213 0.63053097 0.61715559 0.62558223]


In [16]:
model = LinRegClassifier()
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 1.93 s, sys: 1.36 s, total: 3.29 s
Wall time: 1.01 s
[0.58562691 0.59600614 0.60770111 0.59946133 0.61109044]


## Comparison to DT and RF on Train

In [17]:
model = DecisionTreeClassifier(max_depth=15)
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 1.11 s, sys: 2.38 ms, total: 1.12 s
Wall time: 1.14 s
[0.64101706 0.64496644 0.66332916 0.68626796 0.67036554]


In [18]:
model = RandomForestClassifier(max_depth=15)
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 1.28 s, sys: 0 ns, total: 1.28 s
Wall time: 1.28 s
[0.65053172 0.65441176 0.67101449 0.67328188 0.6734031 ]


## Comparison of LinearRegression on DT features -> Should be more or less identical

In [19]:
# load one of the readily available decision trees
f = open('../forests/adult/text/DT_5.json')
dt = json.load(f)
f.close()
dt_fsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)
dt_fts = dt_fsf.fit_transform(X)
dt_fts_onehot = OneHotEncoder(n_values=dt_fsf.get_n_values()).fit_transform(dt_fts)
print(dt_fts_onehot.shape)

(32561, 55)


In [20]:
from sklearn.base import BaseEstimator
class UntrainableDTClassifier(BaseEstimator):
    def __init__(self):
        super(UntrainableDTClassifier, self).__init__()
        self.decisionTreeModel = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)

        tree = self.decisionTreeModel.patterns[0]
        self.linreg_weights = np.zeros(self.decisionTreeModel.get_n_values())
        for i in range(dt_fsf.get_n_values()):
            try:
                self.linreg_weights[i] = tree.nodes[i].prediction[0]
            except TypeError:
                self.linreg_weights[i] = 0
        
    def fit(self, X, y):
        pass

    def predict(self, X):
        return np.dot(X, self.linreg_weights)


model = UntrainableDTClassifier()
print(dt_fts_onehot.shape)
%time 
dtfeatures_lr_cv_score = cross_val_score(model, dt_fts_onehot, Y, cv=5, scoring='f1')
print(dtfeatures_lr_cv_score)

(32561, 55)
CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 4.29 µs


ValueError: Found input variables with inconsistent numbers of samples: [6513, 55]

In [None]:
a = np.array([1,2,3])
b = np.array([[1,2,3], [1,2,3]])
print(a)
print(b)
print(np.dot(b,a))

## Comparison of NaiveBayes on DT features -> Should be more or less identical

In [21]:
# load one of the readily available decision trees
f = open('../forests/adult/text/DT_5.json')
dt = json.load(f)
f.close()
dt_fsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)
dt_fts = dt_fsf.fit_transform(X)
# dt_fts_onehot = OneHotEncoder(n_values=dt_fsf.get_n_values()).fit_transform(fts)
print(dt_fsf.get_n_values())

55


In [22]:
model = GaussianNB()
%time dtfeatures_nb_cv_score = cross_val_score(model, dt_fts, Y, cv=5, scoring='f1')
print(dtfeatures_nb_cv_score)

CPU times: user 38.7 ms, sys: 0 ns, total: 38.7 ms
Wall time: 38 ms
[0.57308176 0.57518322 0.57555049 0.57035553 0.579     ]
