In [1]:
%load_ext line_profiler

import csv,operator,sys,os
import numpy as np
import sklearn
import json

from functools import reduce

sys.path.append('../arch-forest/data/adult/')
sys.path.append('../arch-forest/data/')
sys.path.append('../arch-forest/code/')
import trainForest
import Tree

import DecisionSnippetFeatures

In [2]:
# Load the adult training data in the format used by the random forests in
# forests/adult/text/.

X = []
Y = []

# The context manager closes the file even if parsing a row raises.
with open("../data/adult/adult.data") as f:
    for row in f:
        # Skip blank lines (a line holding only a newline has length 1).
        if len(row) > 1:
            # Drop the newline and all spaces, then split the CSV record.
            entries = row.replace("\n", "").replace(" ", "").split(",")

            x = trainForest.getFeatureVector(entries)

            # Binary label: 0 for "<=50K", 1 for any other value of the last column.
            y = 0 if entries[-1] == "<=50K" else 1

            Y.append(y)
            X.append(x)

X = np.array(X).astype(dtype=np.int32)
Y = np.array(Y)

In [3]:
# Load one of the readily available decision trees.
# `with` guarantees the file is closed even if json.load raises.
with open('../forests/adult/text/DT_5.json') as f:
    dt = json.load(f)


In [None]:
# Transform the first ten examples: each example is assigned the vertex id of
# the leaf vertex it ends up in.
leaf_id_transformer = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)
fts = leaf_id_transformer.fit_transform(X[:10, :])
print(fts)

In [None]:
# Same transformation of the first ten examples, but one-hot encoded.
onehot_leaf_transformer = DecisionSnippetFeatures.OneHotFrequentSubtreeFeatures(dt)
fts_onehot = onehot_leaf_transformer.fit_transform(X[:10, :])
print(fts_onehot)

# Something New, Something Interesting

Let's get funky.

For this, we need

- [ ] Frequent Patterns with split values
    - [x] New data transformer JSON -> GRAPH
    - [x] Transform to GRAPH
    - [x] Mining frequent patterns ('Initial Rooted Frequent Subtree Mining (without embedding computation) -- With Split Values in Labels.ipynb')
    - [x] New data transformer CSTRING -> JSON (cString2json.py updated)
    - [x] Transform to JSON ('Find All Occurrences of All Frequent Patterns of Size up to 6.ipynb')
    - [ ] fix missing member problem in SubtreeFeatures

In [3]:
# Load the mined frequent patterns from JSON.
# `with` guarantees the file is closed even if json.load raises.
with open('../forests/rootedFrequentTrees/adult/WithLeafEdgesWithSplitValues/leq6/RF_15_t2.json') as f:
    frequentpatterns = json.load(f)

In [None]:
# Peek at the first ten entries of the loaded frequent patterns.
frequentpatterns[:10]

In [None]:
# Peek at the first ten rows of the feature matrix X.
X[:10, :]

In [4]:
# Build the one-hot snippet-feature transformer from the 'pattern' entry of each
# of the last 100 loaded frequent patterns; %time measures construction only.
%time dsf_onehot = DecisionSnippetFeatures.OneHotFrequentSubtreeFeatures(map(lambda x: x['pattern'], frequentpatterns[-100:]))


CPU times: user 5 ms, sys: 86 µs, total: 5.08 ms
Wall time: 5.16 ms


In [5]:
# Optional line-level profiling of the feature extraction (uses the line_profiler
# extension loaded in the first cell); left disabled.
# NOTE(review): the profiled call references `dsf`, which is only created in a
# later cell -- presumably `dsf_onehot` is meant here; confirm before enabling.
# %lprun -f DecisionSnippetFeatures.OneHotFeatureGeneratingTree.get_features dsf.fit_transform(X)
# Timed one-hot feature extraction on the full feature matrix X.
%time fts_onehot = dsf_onehot.fit_transform(X)

CPU times: user 16.3 s, sys: 112 ms, total: 16.4 s
Wall time: 16.4 s


In [8]:
# Build a FrequentSubtreeFeatures transformer (the non-one-hot variant) from the
# 'pattern' entry of each of the last 100 loaded frequent patterns.
%time dsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(map(lambda x: x['pattern'], frequentpatterns[-100:]))

CPU times: user 4.23 ms, sys: 2 µs, total: 4.23 ms
Wall time: 4.24 ms


In [9]:
# Timed feature extraction with `dsf` on the full feature matrix X.
%time fts = dsf.fit_transform(X)


CPU times: user 8.15 s, sys: 4 ms, total: 8.15 s
Wall time: 8.15 s


In [10]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode `fts` with scikit-learn, passing dsf.get_n_values() as `n_values`.
# NOTE(review): `n_values` was deprecated in scikit-learn 0.20 and removed in 0.22;
# newer versions need `categories=` instead -- confirm the scikit-learn version in use.
%time fts_onehot_transform = OneHotEncoder(n_values=dsf.get_n_values()).fit_transform(fts)

CPU times: user 89.7 ms, sys: 36 ms, total: 126 ms
Wall time: 124 ms


In [11]:
# Report the shapes of the raw features, the two snippet encodings, the
# OneHotEncoder output and the labels, one per line.
for arr in (X, fts_onehot, fts, fts_onehot_transform, Y):
    print(arr.shape)

(32561, 64)
(32561, 590)
(32561, 100)
(32561, 590)
(32561,)


In [19]:
# Sanity check: subtract the OneHotEncoder output (converted with .toarray())
# from the directly computed one-hot features and list the distinct values of
# the difference; a single value 0 means both encodings agree everywhere.
diff = fts_onehot - fts_onehot_transform.toarray()
np.unique(diff)

array([0.])

# Classification Performance of Decision Tree Snippet Features vs. Normal Features 

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### Linear Regression

In [13]:
# Timed 5-fold cross-validation of plain linear regression on the directly
# computed one-hot snippet features, scored by negative mean squared error.
model = LinearRegression()
%time fts_onehot_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='neg_mean_squared_error')
print(fts_onehot_cv_score)

CPU times: user 51.9 s, sys: 828 ms, total: 52.8 s
Wall time: 52.6 s
[-0.11414485 -0.11708251 -0.11171276 -0.11307106 -0.11351519]


In [16]:
# Same timed 5-fold evaluation, but on the OneHotEncoder output
# `fts_onehot_transform` instead of the directly computed one-hot features.
model = LinearRegression()
%time fts_onehot_transform_cv_score = cross_val_score(model, fts_onehot_transform, Y, cv=5, scoring='neg_mean_squared_error')
print(fts_onehot_transform_cv_score)

CPU times: user 5.24 s, sys: 84 ms, total: 5.33 s
Wall time: 5.33 s
[-0.11380369 -0.1156247  -0.11062547 -0.11274396 -0.11262437]


In [15]:
# Baseline: 5-fold cross-validated linear regression on the raw feature matrix X,
# scored by negative mean squared error.
model = LinearRegression()
normalfeatures_cv_score = cross_val_score(
    model, X, Y,
    scoring='neg_mean_squared_error',
    cv=5,
)
print(normalfeatures_cv_score)

[-0.11673837 -0.11713078 -0.11536309 -0.11525609 -0.11559877]


### Naive Bayes

In [29]:
# Gaussian Naive Bayes on the one-hot snippet features: 5-fold CV, F1 score.
model = GaussianNB()
fts_onehot_nb_cv_score = cross_val_score(
    model, fts_onehot, Y,
    scoring='f1',
    cv=5,
)
print(fts_onehot_nb_cv_score)

[0.62390533 0.61197548 0.61228317 0.61614266 0.61734814]


In [30]:
# Gaussian Naive Bayes baseline on the raw feature matrix X: 5-fold CV, F1 score.
model = GaussianNB()
normalfeatures_nb_cv_score = cross_val_score(
    model, X, Y,
    scoring='f1',
    cv=5,
)
print(normalfeatures_nb_cv_score)

[0.4222318  0.41802575 0.42875817 0.40732665 0.43421624]


### Thresholded Linear Regression

In [35]:
class LinRegClassifier(LinearRegression):
    """Linear regression used as a binary classifier via a decision threshold.

    Fitting is inherited unchanged from ``LinearRegression``; only ``predict``
    differs, returning whether the regression output exceeds ``threshold``.

    NOTE(review): the class still inherits from a regressor, so scikit-learn
    presumably does not treat it as a classifier (e.g. when choosing the
    cross-validation splitter for an integer ``cv``) -- confirm this is intended.

    Parameters
    ----------
    threshold : float, default 0.5
        Regression outputs strictly greater than this value are predicted True.
    """

    def __init__(self, threshold=0.5):
        super().__init__()
        self.threshold = threshold

    def predict(self, X):
        """Return a boolean array: regression prediction > ``self.threshold``."""
        scores = super().predict(X)
        return scores > self.threshold


In [36]:
# Thresholded linear regression on the one-hot snippet features: 5-fold CV, F1.
# The result gets its own name so the Naive Bayes scores held in
# `fts_onehot_nb_cv_score` are not overwritten by a different model's scores.
model = LinRegClassifier()
fts_onehot_linregclf_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='f1')
print(fts_onehot_linregclf_cv_score)

[0.63606072 0.63023952 0.64129616 0.62804878 0.65166909]


In [38]:
# Thresholded linear regression baseline on the raw feature matrix X: 5-fold CV, F1.
# The result gets its own name so the Naive Bayes scores held in
# `normalfeatures_nb_cv_score` are not overwritten by a different model's scores.
model = LinRegClassifier()
normalfeatures_linregclf_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_linregclf_cv_score)

[0.58562691 0.59600614 0.60770111 0.59946133 0.61109044]


## Comparison to DT and RF on Train

In [39]:
# Decision tree (depth <= 15) on the raw feature matrix X: 5-fold CV, F1.
# random_state is fixed so a re-run reproduces the same scores, and the result
# gets its own name instead of overwriting `normalfeatures_nb_cv_score`.
model = DecisionTreeClassifier(max_depth=15, random_state=0)
normalfeatures_dt_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_dt_cv_score)

[0.63925109 0.65052136 0.66291432 0.68250251 0.66883328]


In [40]:
# Random forest (tree depth <= 15) on the raw feature matrix X: 5-fold CV, F1.
# random_state is fixed so a re-run reproduces the same scores, and the result
# gets its own name instead of overwriting `normalfeatures_nb_cv_score`.
model = RandomForestClassifier(max_depth=15, random_state=0)
normalfeatures_rf_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_rf_cv_score)

[0.65478356 0.65532544 0.67052441 0.68114368 0.66642495]
