In [1]:
import numpy as np
from preproc import *
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
import pyximport
pyximport.install(language_level=3)
from FastDecisionTree import *

In [2]:
# Load the training table, the test table and the sample-submission template.
# In each file the first CSV column is used as the DataFrame index.
# NOTE(review): test_df and sub_df are not used by any cell visible here.
df = pd.read_csv('resources/train.csv', index_col=0)
test_df = pd.read_csv('resources/test.csv', index_col=0)
sub_df = pd.read_csv('resources/sampleSubmission.csv', index_col=0)

# Split train.csv by column position: column 0 -> delta, columns 1..400 -> Y,
# column 401 onward -> X, reshaped to one 20x20 grid per row.
# NOTE(review): 400 presumably equals 20 * 20 (one Y value per grid cell) — confirm.
delta = df.iloc[:, 0]
Y = df.iloc[:, 1:401].values
X = df.iloc[:, 401:].values.reshape(-1, 20, 20)

kernel_size = 3
# Keep only the rows whose delta value equals 1.
X_i = X[delta == 1]
Y_i = Y[delta == 1]

# prepare_data is not defined in this notebook; presumably it comes from the
# `from preproc import *` star import — TODO confirm what shape it returns.
X_train, Y_train = prepare_data(X_i, Y_i, kernel_size=kernel_size)

In [3]:
# Cast the features to float64 (rebinding X_train; re-running the cell is harmless).
# NOTE(review): presumably the dtype the Cython FastDecisionTree expects — confirm.
X_train = X_train.astype(np.float64)

In [4]:
# Cast the targets to float64 as well, so the class labels become floats.
Y_train = Y_train.astype(np.float64)

In [5]:
# Short aliases used by every model cell below; these are the same arrays, not copies.
Xt, Yt = X_train, Y_train

In [6]:
# Fit the Cython FastDecisionTree (depth 3, fixed seed).
fdt = FastDecisionTree(max_depth=3, random_state=42)
fdt.fit(Xt, Yt)
# Displayed value: (accuracy, ROC AUC). Both are measured on the same data the
# tree was fitted on, so they are training scores, not held-out scores.
accuracy_score(Yt, fdt.predict(Xt)), roc_auc_score(Yt, fdt.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8257417033215595)

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from sklearn.tree import DecisionTreeClassifier

# Reference model: scikit-learn tree with the same max_depth and random_state
# as the FastDecisionTree cell, fitted on the same arrays.
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(Xt, Yt)
# Displayed value: (accuracy, ROC AUC) on the training data, for comparison with
# the FastDecisionTree scores.
accuracy_score(Yt, dt.predict(Xt)), roc_auc_score(Yt, dt.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8257417033215595)

In [8]:
%%timeit
# Timed statement: re-fit the scikit-learn tree on the full training arrays.
dt.fit(Xt, Yt)

2.53 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
# Timed statement: re-fit the FastDecisionTree on the same arrays, for comparison
# with the scikit-learn timing cell.
fdt.fit(Xt, Yt)

8.28 s ± 302 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [94]:
class RandomForestClassifier:
    """Random forest built from ``FastDecisionTree`` base learners.

    Every tree is trained on a bootstrap sample of the rows (drawn with
    replacement, same size as the input) restricted to ``max_features``
    columns drawn without replacement. Class probabilities are the plain
    average of the per-tree probabilities.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of trees to build.
    max_depth : int, default=4
        Forwarded unchanged to every ``FastDecisionTree``.
    max_features : int or None, default=None
        Number of columns each tree is trained on; ``None`` means all columns.
    random_state : int, default=42
        Base seed. Tree ``i`` uses ``random_state + i`` both for its row/column
        sampling and as the ``random_state`` of its ``FastDecisionTree``.
    verbose : bool, default=False
        Print a progress line per tree and a summary when fitting ends.

    Attributes
    ----------
    unique_labels : ndarray or None
        Sorted distinct values of ``y`` seen by the last ``fit`` call.
    trees : list
        Fitted base learners, one per estimator.
    feat_ids_by_tree : list of ndarray
        Column indices each tree was trained on, aligned with ``trees``.
    """

    def __init__(self, n_estimators=10, max_depth=4, max_features=None,
                 random_state=42, verbose=False):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.verbose = verbose

        self.unique_labels = None
        self.trees = []
        self.feat_ids_by_tree = []

    def fit(self, X, y):
        """Build the forest from scratch on ``X`` (n_samples, n_features) and ``y``.

        Any trees left over from a previous ``fit`` call are discarded, so
        fitting twice yields the same forest as fitting once.

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n, m = X.shape
        nrange = np.arange(n)
        mrange = np.arange(m)
        max_features = m if self.max_features is None else self.max_features

        # Reset the fitted state: without this, calling fit() again (e.g. when
        # re-running a notebook cell) would keep appending to the old forest.
        self.trees = []
        self.feat_ids_by_tree = []
        self.unique_labels = np.unique(y)

        for i in range(self.n_estimators):
            seed = self.random_state + i
            # A private generator yields the same draws as seeding the global
            # one with `seed`, but leaves the caller's np.random state alone.
            rng = np.random.RandomState(seed)

            if self.verbose:
                print(f'[{i+1}/{self.n_estimators}]')

            fi = rng.choice(mrange, replace=False, size=max_features)
            si = rng.choice(nrange, replace=True, size=n)
            X_train = X[si, :][:, fi]
            y_train = y[si]

            dt = FastDecisionTree(max_depth=self.max_depth, random_state=seed)
            dt.fit(X_train, y_train)

            # Record the tree and its columns together, and only once the fit
            # succeeded, so the two lists can never get out of step.
            self.trees.append(dt)
            self.feat_ids_by_tree.append(fi)

        if self.verbose:
            print(f"Training is finished: {len(self.trees)} trees have beed built | {len(self.unique_labels)} labels in target")
        return self

    def predict_proba(self, X):
        """Return the mean of the per-tree class probabilities.

        The result has shape (n_samples, n_labels).
        NOTE(review): this assumes every tree's ``predict_proba`` returns one
        column per entry of ``unique_labels``; a bootstrap sample that misses
        a label entirely may violate that — confirm against FastDecisionTree.
        """
        X = np.asarray(X)
        probs = np.zeros((len(X), len(self.unique_labels)))
        for tree, fi in zip(self.trees, self.feat_ids_by_tree):
            # Each tree only knows the columns it was trained on.
            probs += tree.predict_proba(X[:, fi])
        return probs / len(self.trees)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        The argmax column index is mapped back through ``unique_labels`` so
        the output holds actual labels rather than positions.
        NOTE(review): assumes probability columns follow the sorted label
        order of ``unique_labels`` — confirm against FastDecisionTree.
        """
        best = np.argmax(self.predict_proba(X), axis=1)
        return self.unique_labels[best]

    def __repr__(self):
        return f'{self.__class__.__name__}: {self.get_params()}'

    def get_params(self):
        """Return the main hyper-parameters as a dict (used by ``__repr__``)."""
        return {'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'max_features': self.max_features}

In [95]:
# Small forest (3 trees) with progress output; max_depth and max_features keep
# the class defaults (4 and None, i.e. every tree sees all columns).
rf = RandomForestClassifier(n_estimators=3, verbose=True, random_state=42)

In [96]:
# fit returns the estimator itself, so the cell displays its repr after the progress log.
rf.fit(Xt, Yt)

[1/3]
[2/3]
[3/3]
Training is finished: 3 trees have beed built | 2 labels in target


RandomForestClassifier: {'n_estimators': 3, 'max_depth': 4, 'max_features': None}

In [93]:
# Displayed value: (accuracy, ROC AUC) of the forest on its own training data.
# NOTE(review): this cell's execution count (93) is lower than that of the cells
# above which define and fit `rf` (94-96), so the saved output may come from an
# earlier state of `rf` — re-check after Restart Kernel -> Run All.
accuracy_score(Yt, rf.predict(Xt)), roc_auc_score(Yt, rf.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8623083485750223)