In [1]:
%load_ext line_profiler

import csv,operator,sys,os
import numpy as np
import sklearn
import json

from functools import reduce

sys.path.append('../arch-forest/data/adult/')
sys.path.append('../arch-forest/data/')
sys.path.append('../arch-forest/code/')
import trainForest
import Tree

import DecisionSnippetFeatures

In [2]:
# grab adult training data in the format used by the random forests in forests/adult/text/

X = []
Y = []

f = open("../data/adult/adult.data")
for row in f:
    if (len(row) > 1):
        entries = row.replace("\n", "").replace(" ", "").split(",")

        x = trainForest.getFeatureVector(entries)

        if (entries[-1] == "<=50K"):
            y = 0
        else:
            y = 1

        Y.append(y)
        X.append(x)

X = np.array(X).astype(dtype=np.int32)
Y = np.array(Y)
f.close()

In [3]:
# load one of the readily available decision trees
f = open('../forests/adult/text/DT_5.json')
dt = json.load(f)
f.close()


In [None]:
# this creates a transformation of the dataset, assigning each example the vertex id of the leaf vertex 
# the example turns out at. So far, so boring
fts = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt).fit_transform(X[:10, :])
print(fts)

# Something New, Something Interesting

Let's get funky.

For this, we need

- [ ] Frequent Patterns with split values
    - [x] New Data transformator JSON -> GRAPH
    - [x] Transform to GRAPH
    - [x] Mining frequent patterns ('Initial Rooted Frequent Subtree Mining (without embedding computation) -- With Split Values in Labels.ipynb')
    - [x] New Data transformator CSTRING -> JSON (cString2json.py updated)
    - [x] Transform to JSON ('Find All Occurrences of All Frequent Patterns of Size up to 6.ipynb')
    - [ ] fix missing member problem in SubtreeFeatures

In [3]:
f = open('../forests/rootedFrequentTrees/adult/WithLeafEdgesWithSplitValues/leq6/RF_15_t2.json')
frequentpatterns = json.load(f)
f.close()

In [4]:
%time dsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(map(lambda x: x['pattern'], frequentpatterns[-100:]))

CPU times: user 1.81 ms, sys: 131 µs, total: 1.94 ms
Wall time: 1.95 ms


In [5]:
%time fts = dsf.fit_transform(X)


CPU times: user 8.5 s, sys: 7.77 ms, total: 8.51 s
Wall time: 8.51 s


In [6]:
from sklearn.preprocessing import OneHotEncoder
%time fts_onehot = OneHotEncoder(n_values=dsf.get_n_values()).fit_transform(fts)

CPU times: user 81.1 ms, sys: 64.1 ms, total: 145 ms
Wall time: 144 ms


In [7]:
print(X.shape)
print(fts_onehot.shape)
print(fts.shape)
print(Y.shape)

(32561, 64)
(32561, 590)
(32561, 100)
(32561,)


# Classification Performance of Decision Tree Snippet Features vs. Normal Features 

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

### Linear Regression

In [13]:
model = LinearRegression()
%time fts_onehot_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='neg_mean_squared_error')
print(fts_onehot_cv_score)

CPU times: user 51.9 s, sys: 828 ms, total: 52.8 s
Wall time: 52.6 s
[-0.11414485 -0.11708251 -0.11171276 -0.11307106 -0.11351519]


In [22]:
model = LinearRegression()
%time normalfeatures_cv_score = cross_val_score(model, X, Y, cv=5, scoring='neg_mean_squared_error')
print(normalfeatures_cv_score)

CPU times: user 934 ms, sys: 0 ns, total: 934 ms
Wall time: 933 ms
[-0.11673837 -0.11713078 -0.11536309 -0.11525609 -0.11559877]


### Naive Bayes

In [23]:
model = GaussianNB()
%time fts_onehot_nb_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='f1')
print(fts_onehot_nb_cv_score)

CPU times: user 1.32 s, sys: 712 ms, total: 2.03 s
Wall time: 2.03 s
[0.58989373 0.59108781 0.59519168 0.59317585 0.59445407]


In [24]:
model = GaussianNB()
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 178 ms, sys: 3.86 ms, total: 182 ms
Wall time: 180 ms
[0.4222318  0.41802575 0.42875817 0.40732665 0.43421624]


### Thresholded Linear Regression

In [28]:
class LinRegClassifier(LinearRegression):
    def __init__(self, threshold=0.5):
        super(LinRegClassifier, self).__init__()
        self.threshold = threshold
    
    def predict(self, X):
        p = super(LinRegClassifier, self).predict(X)
        return (p > self.threshold)


In [26]:
model = LinRegClassifier()
%time fts_onehot_nb_cv_score = cross_val_score(model, fts_onehot, Y, cv=5, scoring='f1')
print(fts_onehot_nb_cv_score)

CPU times: user 47.8 s, sys: 754 ms, total: 48.6 s
Wall time: 48.5 s
[0.61959335 0.62084257 0.62017804 0.62280702 0.62026686]


In [25]:
model = LinRegClassifier()
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 964 ms, sys: 0 ns, total: 964 ms
Wall time: 960 ms
[0.58562691 0.59600614 0.60770111 0.59946133 0.61109044]


## Comparison to DT and RF on Train

In [27]:
model = DecisionTreeClassifier(max_depth=15)
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 917 ms, sys: 0 ns, total: 917 ms
Wall time: 1.12 s
[0.6408286  0.65229111 0.6622807  0.68045364 0.6668846 ]


In [28]:
model = RandomForestClassifier(max_depth=15)
%time normalfeatures_nb_cv_score = cross_val_score(model, X, Y, cv=5, scoring='f1')
print(normalfeatures_nb_cv_score)

CPU times: user 1.05 s, sys: 0 ns, total: 1.05 s
Wall time: 1.05 s
[0.65703971 0.66981818 0.67585207 0.66642012 0.6750179 ]


## Comparison of LinearRegression on DT features -> Should be more or less identical

In [47]:
# load one of the readily available decision trees
f = open('../forests/adult/text/DT_5.json')
dt = json.load(f)
f.close()
dt_fsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)
dt_fts = dt_fsf.fit_transform(X)
dt_fts_onehot = OneHotEncoder(n_values=dt_fsf.get_n_values()).fit_transform(dt_fts)
print(dt_fts_onehot.shape)

(32561, 55)


In [51]:
from sklearn.base import BaseEstimator
class UntrainableDTClassifier(BaseEstimator):
    def __init__(self):
        super(UntrainableDTClassifier, self).__init__()
        self.decisionTreeModel = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)

        tree = self.decisionTreeModel.patterns[0]
        self.linreg_weights = np.zeros(self.decisionTreeModel.get_n_values())
        for i in range(dt_fsf.get_n_values()):
            try:
                self.linreg_weights[i] = tree.nodes[i].prediction[0]
            except TypeError:
                self.linreg_weights[i] = 0
        
    def fit(self, X, y):
        pass

    def predict(self, X):
        return np.dot(X, self.linreg_weights)


model = UntrainableDTClassifier()
print(dt_fts_onehot.shape)
%time 
dtfeatures_lr_cv_score = cross_val_score(model, dt_fts_onehot, Y, cv=5, scoring='f1')
print(dtfeatures_lr_cv_score)

(32561, 55)
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


ValueError: Found input variables with inconsistent numbers of samples: [6513, 55]

In [34]:
a = np.array([1,2,3])
b = np.array([[1,2,3], [1,2,3]])
print(a)
print(b)
print(np.dot(b,a))

[1 2 3]
[[1 2 3]
 [1 2 3]]
[14 14]


## Comparison of NaiveBayes on DT features -> Should be more or less identical

In [11]:
# load one of the readily available decision trees
f = open('../forests/adult/text/DT_5.json')
dt = json.load(f)
f.close()
dt_fsf = DecisionSnippetFeatures.FrequentSubtreeFeatures(dt)
dt_fts = dt_fsf.fit_transform(X)
# dt_fts_onehot = OneHotEncoder(n_values=dt_fsf.get_n_values()).fit_transform(fts)
print(dt_fsf.get_n_values())

55


In [12]:
model = GaussianNB()
%time dtfeatures_nb_cv_score = cross_val_score(model, dt_fts, Y, cv=5, scoring='f1')
print(dtfeatures_nb_cv_score)

CPU times: user 39.3 ms, sys: 120 µs, total: 39.4 ms
Wall time: 39 ms
[0.57308176 0.57518322 0.57555049 0.57035553 0.579     ]
