In [2]:
import numpy as np
from preproc import *
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score
import pyximport
pyximport.install(language_level=3)
from FastDecisionTree import *

In [3]:
df = pd.read_csv('resources/train.csv', index_col=0)
test_df = pd.read_csv('resources/test.csv', index_col=0)
sub_df = pd.read_csv('resources/sampleSubmission.csv', index_col=0)

delta = df.iloc[:, 0]
Y = df.iloc[:, 1:401].values
X = df.iloc[:, 401:].values.reshape(-1, 20, 20)

kernel_size = 3
X_i = X[delta == 1]
Y_i = Y[delta == 1]

X_train, Y_train = prepare_data(X_i, Y_i, kernel_size=kernel_size)

In [4]:
X_train = X_train.astype(np.float64)

In [5]:
Y_train = Y_train.astype(np.float64)

In [6]:
Xt, Yt = X_train, Y_train

In [7]:
fdt = FastDecisionTree(max_depth=3, random_state=42)
fdt.fit(Xt, Yt)
accuracy_score(Yt, fdt.predict(Xt)), roc_auc_score(Yt, fdt.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8257417033215595)

In [7]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(Xt, Yt)
accuracy_score(Yt, dt.predict(Xt)), roc_auc_score(Yt, dt.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8257417033215595)

In [8]:
%%timeit
dt.fit(Xt, Yt)

2.53 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
fdt.fit(Xt, Yt)

8.28 s ± 302 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
import multiprocessing as mp

In [9]:
import scipy as sp

def job(model, X, y, seed, max_features):
    sp.random.seed(seed=seed)
    
    n, m = X.shape
    nrange = np.arange(n)
    mrange = np.arange(m)
    fi = np.random.choice(mrange, replace=False, size=max_features)
    si = np.random.choice(nrange, replace=True, size=n)
    X_train = X[si, :][:, fi]
    y_train = y[si]
    
    model.fit(X_train, y_train)
    return model, fi

class RandomForestClassifier:
    def __init__(self, n_estimators=12, max_depth=4, max_features=None, 
                 random_state=42, verbose=False, n_jobs=-1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = mp.cpu_count()
        if n_jobs > 0:
            self.n_jobs = min(mp.cpu_count(), n_jobs)
        self.pool = mp.Pool(self.n_jobs)

        self.unique_labels = None
        self.trees = []
        self.feat_ids_by_tree = []
        
    def fit(self, X, y):
        if self.n_jobs > 1:
            return self.fit_parallel(X, y)
        self.trees = []
        self.feat_ids_by_tree = []
        n, m = X.shape
        nrange = np.arange(n)
        mrange = np.arange(m)
        max_features = m if self.max_features is None else self.max_features
        
        self.unique_labels = np.unique(y)
        for i in range(self.n_estimators):
            seed = self.random_state + i
            np.random.seed(seed)

            if self.verbose:
                print(f'[{i+1}/{self.n_estimators}]')

            fi = np.random.choice(mrange, replace=False, size=max_features)
            si = np.random.choice(nrange, replace=True, size=n)
            X_train = X[si, :][:, fi]
            y_train = y[si]

            self.feat_ids_by_tree.append(fi)
            
            dt = FastDecisionTree(max_depth=self.max_depth, random_state=seed)
            self.trees.append(dt)
            dt.fit(X_train, y_train)

        if self.verbose:
            print(f"Training is finished: {len(self.trees)} trees have beed built | {len(self.unique_labels)} labels in target")
        return self
    
    def fit_parallel(self, X, y):
        self.trees = []
        self.feat_ids_by_tree = []
        n, m = X.shape
        max_features = m if self.max_features is None else self.max_features
        self.unique_labels = np.unique(y)
        
        tasks = [(FastDecisionTree(max_depth=self.max_depth, random_state=seed), X, y, seed, max_features) for seed in range(self.random_state, self.random_state + self.n_estimators)]
        results = self.pool.starmap(job, tasks)
        for tree, fi in results:
            self.trees.append(tree)
            self.feat_ids_by_tree.append(fi)
        if self.verbose:
            print(f"Training is finished: {len(self.trees)} trees have beed built | {len(self.unique_labels)} labels in target")
        return self
    
    def predict_proba(self, X):
        probs = np.zeros((len(X), len(self.unique_labels)))
        for tree, fi in zip(self.trees, self.feat_ids_by_tree):
            X_sub = X[:, fi]
            probs += tree.predict_proba(X_sub)
        return probs / len(self.trees)
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

    def __repr__(self):
        return f'{self.__class__.__name__}: {self.get_params()}'

    def get_params(self):
        return {'n_estimators': self.n_estimators, 'max_depth': self.max_depth, 'max_features': self.max_features}


In [29]:
rf = RandomForestClassifier(n_estimators=5, verbose=True, random_state=42, n_jobs=1)

Process ForkPoolWorker-59:
Traceback (most recent call last):
  File "/Users/ptyshevs/.brew/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/Users/ptyshevs/.brew/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/ptyshevs/.brew/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/Users/ptyshevs/.brew/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/Users/ptyshevs/.brew/Cellar/python/3.7.4_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File

In [30]:
%%timeit
rf.fit(Xt, Yt)

[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
[1/5]
[2/5]
[3/5]
[4/5]
[5/5]
Training is finished: 5 trees have beed built | 2 labels in target
55.4 s ± 1.89 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
rf_mp = RandomForestClassifier(n_estimators=5, verbose=True, random_state=42, n_jobs=-1)
rf_mp.fit(Xt, Yt)

In [32]:
accuracy_score(Yt, rf.predict(Xt)), roc_auc_score(Yt, rf.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8626280246671785)

In [10]:
rf_mp = RandomForestClassifier(n_estimators=12, verbose=True, random_state=42, n_jobs=-1)
rf_mp.fit(Xt, Yt)

Training is finished: 12 trees have beed built | 2 labels in target


RandomForestClassifier: {'n_estimators': 12, 'max_depth': 4, 'max_features': None}

In [11]:
accuracy_score(Yt, rf_mp.predict(Xt)), roc_auc_score(Yt, rf_mp.predict_proba(Xt)[:, 1])

(0.883042004048583, 0.8629577505862798)

In [12]:
import pickle

In [14]:
with open('test.pcl', 'wb') as f:
    pickle.dump(rf_mp.trees, f)