In [1]:
import pandas as pd
import numpy as np
import random, math
import multiprocessing
from tqdm import tqdm
import numba
tqdm.pandas()

# Training readings, left-joined on series_id with the contents of y_train.csv.
xtrain = pd.read_csv('../input/X_train.csv')
ytrain = pd.read_csv('../input/y_train.csv')
train = xtrain.merge(ytrain, how='left', on='series_id')

# Test readings, left-joined on series_id with the sample submission file.
xtest = pd.read_csv('../input/X_test.csv')
ytest = pd.read_csv('../input/sample_submission.csv')
test = xtest.merge(ytest, how='left', on='series_id')
print(train.shape, test.shape)

(487680, 15) (488448, 14)


In [2]:
def features(df):
    """Add row-wise aggregate columns over the X/Y/Z axes of each sensor group.

    For each of the prefixes ``angular_velocity_`` and ``linear_acceleration_``
    the three columns ``<prefix>X``, ``<prefix>Y`` and ``<prefix>Z`` are
    aggregated across axis 1, once on the raw values and once on their
    absolute values. Two columns are written per aggregation:
    ``<prefix><suffix>`` (raw) and ``<prefix>a<suffix>`` (absolute), where the
    suffix is the historical label such as ``'min('`` or ``'quantile(.25,'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six sensor columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    # (column suffix, DataFrame method name, extra keyword arguments).
    # The suffixes are kept exactly as before because they are part of the
    # resulting column names.
    aggregations = [
        ('min(', 'min', {}),
        ('max(', 'max', {}),
        ('sum(', 'sum', {}),
        ('mean(', 'mean', {}),
        ('std(', 'std', {}),
        ('skew(', 'skew', {}),
        ('kurtosis(', 'kurtosis', {}),
        ('quantile(.25,', 'quantile', {'q': .25}),
        ('quantile(.5,', 'quantile', {'q': .5}),
        ('quantile(.75,', 'quantile', {'q': .75}),
    ]
    for prefix in ['angular_velocity_', 'linear_acceleration_']:
        axis_columns = [prefix + axis for axis in ['X', 'Y', 'Z']]
        # The three source columns are never overwritten below, so the slice
        # and its absolute value can be taken once per sensor group.
        raw = df[axis_columns]
        absolute = raw.abs()
        for suffix, method, kwargs in aggregations:
            # Explicit method lookup instead of eval() on a built-up string.
            df[prefix + suffix] = getattr(raw, method)(axis=1, **kwargs)
            df[prefix + 'a' + suffix] = getattr(absolute, method)(axis=1, **kwargs)
    return df

# Add the aggregate feature columns to both frames (train first, then test)
# and replace every remaining NaN with 0.
train, test = (features(frame).fillna(0) for frame in (train, test))
print(train.shape, test.shape)

(487680, 55) (488448, 54)


In [3]:
# Columns handed to the model: everything except the identifier columns, with
# 'surface' moved to the end because the tree helpers treat the last column as
# the label.
col = [name for name in train.columns if name not in ('row_id', 'series_id', 'measurement_number', 'group_id', 'surface')]
col.append('surface')

In [4]:
#Inspiration from http://searene.me/2017/12/23/Write-Machine-Learning-Algorithms-From-Scratch-Random-Forest/
#@numba.jitclass()
class tree_node:
    """A single node of a decision tree.

    Attributes
    ----------
    data : object
        Whatever was passed to the constructor (the rows reaching this node).
    left, right : tree_node or None
        Child nodes; ``None`` until assigned.
    category : object or None
        Predicted label; a non-None value marks the node as a leaf.
    split_point : tuple
        ``(row position, feature name)`` of the chosen split, initially
        ``(None, None)``.
    split_value : object or None
        Threshold taken from ``split_point``; rows with a value ``<=`` it go
        left during prediction.
    """

    def __init__(self, data):
        # A new node starts with no children, no label and no split.
        self.left = self.right = None
        self.category = self.split_value = None
        self.split_point = (None, None)
        self.data = data

def get_most_common_category(data):
    """Return the most frequent value found in the last column of ``data``.

    ``value_counts`` orders labels by descending frequency, so the first index
    entry is the majority label.
    """
    label_column = data.columns[-1]
    frequencies = data[label_column].value_counts()
    return frequencies.index[0]

def get_categories(data):
    """Return the distinct values of the last (label) column of ``data``."""
    label_column = data.columns[-1]
    return data[label_column].unique()

def get_gini(left, right, categories): #Try different metrics here
    """Weighted Gini impurity of a two-way split.

    Parameters
    ----------
    left, right : pandas.DataFrame
        The two sides of the split; the label is read from the last column.
    categories : iterable
        Labels to include when summing the squared class proportions.

    Returns
    -------
    number
        Sum over both sides of ``(1 - sum(p_c ** 2)) * side_size / total_size``.
        A side with no rows contributes nothing; the result is ``0`` when both
        sides are empty.
    """
    left_counts = left[left.columns[-1]].value_counts()
    right_counts = right[right.columns[-1]].value_counts()
    # These totals do not change inside the loops, so compute them once
    # instead of re-summing the counts for every category.
    left_total = left_counts.sum()
    right_total = right_counts.sum()
    total = left_total + right_total
    gini = 0
    for counts, group_total in ((left_counts, left_total), (right_counts, right_total)):
        if len(counts) == 0:
            continue
        score = 0
        for category in categories:
            if category in counts.index:
                p = counts[category] / group_total
            else:
                p = 0
            score += p * p
        gini += (1 - score) * (group_total / total)
    return gini

def split(data, x, y):
    """Partition ``data`` on the value of column ``y`` found at row position ``x``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to partition.
    x : int
        Positional row index whose value in ``y`` becomes the threshold.
    y : hashable
        Column label to split on.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(left, right)``: rows with ``data[y] <= threshold`` and rows with
        ``data[y] > threshold``. Rows where the comparison is false on both
        sides (e.g. NaN) appear in neither part.
    """
    column = data[y]
    # Read the scalar from the column directly; data.iloc[x][y] would first
    # build the whole row as a mixed-dtype Series just to pick one value.
    split_value = column.iloc[x]
    left = data[column <= split_value]
    right = data[column > split_value]
    return left, right

def get_split_point(data, split_search, split_min_gini): #Add histogram option
    """Randomly search for a split of ``data`` with low Gini impurity.

    Up to ``split_search`` candidates are tried. Each candidate picks a random
    feature (any column but the last) and a random row, and uses that row's
    value as the threshold. The search stops early once the best impurity seen
    so far is at most ``split_min_gini``.

    Returns
    -------
    tuple
        ``(row position, feature name)`` of the best candidate, or
        ``(None, None)`` if no candidate was evaluated.
    """
    candidate_columns = list(data.columns[:-1])
    labels = get_categories(data)
    n_rows = len(data)
    best_row = best_feature = best_gini = None
    for _ in range(split_search):
        feature = random.choice(candidate_columns)
        row = random.choice(range(n_rows))
        left_part, right_part = split(data, row, feature)
        candidate_gini = get_gini(left_part, right_part, labels)
        improved = best_gini is None or candidate_gini < best_gini
        if improved:
            best_row, best_feature, best_gini = row, feature, candidate_gini
        if best_gini <= split_min_gini:
            break
    return best_row, best_feature

def build_tree(data, depth, max_depth, min_size, n_sample_rate, split_search, split_min_gini):
    """Recursively grow one decision tree and return its root node.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows reaching this node; the last column is the label.
    depth : int
        Current depth. At depth 1 the rows are first subsampled.
    max_depth : int
        Nodes at this depth or deeper become leaves.
    min_size : int
        A branch with fewer rows than this becomes a leaf without further
        splitting.
    n_sample_rate : float
        Fraction of rows drawn (without replacement) for the tree at depth 1.
    split_search, split_min_gini
        Passed through to ``get_split_point``.

    Returns
    -------
    tree_node
        Root of the (sub)tree. Nodes do not keep a reference to their training
        rows: ``data`` is ``None`` on every returned node, which keeps the
        trees small when they are sent back from worker processes.
    """
    if depth == 1:
        # sample() already returns a new frame; no additional copy is needed.
        data = data.sample(frac=n_sample_rate, random_state=None)
    # Nothing below mutates `data`, so the node never needs its own copy.
    root = tree_node(None)
    x, y = get_split_point(data, split_search, split_min_gini)
    left_branch, right_branch = split(data, x, y)
    root.split_point = (x, y)
    root.split_value = data[y].iloc[x]
    if len(left_branch) == 0 or len(right_branch) == 0 or depth >= max_depth:
        # Degenerate split or depth limit reached: this node is a leaf. The
        # split point/value stay recorded because print_tree reports them.
        root.category = get_most_common_category(pd.concat((left_branch, right_branch)))
        #if depth < max_depth:print(depth, root.category)
    else:
        children = []
        # Left is grown before right, matching the original recursion order.
        for branch in (left_branch, right_branch):
            if len(branch) < min_size:
                child = tree_node(None)
                child.category = get_most_common_category(branch)
            else:
                child = build_tree(branch, depth + 1, max_depth, min_size, n_sample_rate, split_search, split_min_gini)
            children.append(child)
        root.left, root.right = children
    if depth == 1: print('tree created...')
    return root

def _reseed_worker():
    """Give a freshly started pool worker its own random streams.

    Worker processes created by fork start with a copy of the parent's NumPy
    global random state, so the subsampling in ``build_tree``
    (``random_state=None``) could draw identical rows in different workers.
    Reseeding from OS entropy in each worker avoids that.
    """
    random.seed()
    np.random.seed()


def RandomForest(df, n_trees, max_depth, min_size, n_sample_rate, split_search, split_min_gini):
    """Train ``n_trees`` decision trees in parallel and return them as a list.

    Each tree is built by ``build_tree`` starting at depth 1 on the full ``df``
    (which subsamples it by ``n_sample_rate``). The remaining parameters are
    forwarded unchanged.

    Returns
    -------
    list
        One root node per tree, in task order.
    """
    #TO DO: GPU enable and add numba if possible
    jobs = [[df, 1, max_depth, min_size, n_sample_rate, split_search, split_min_gini] for _ in range(n_trees)]
    pool = multiprocessing.Pool(multiprocessing.cpu_count(), initializer=_reseed_worker)
    try:
        trees = pool.starmap(build_tree, jobs)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()
    return trees

# Train the forest on the feature columns plus the trailing label column.
model = RandomForest(
    df=train[col],
    n_trees=14,
    max_depth=10,
    min_size=2,
    n_sample_rate=0.4,
    split_search=10,
    split_min_gini=0.0001,
)

tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...
tree created...


In [5]:
def print_tree(tree, l, feature='', comment=''):
    """Print every leaf of ``tree``, indented by its level.

    Parameters
    ----------
    tree : tree_node
        Root of the (sub)tree to print.
    l : int
        Level of ``tree``; used for the tab indentation and printed as-is.
    feature : str
        Unused; kept for backward compatibility (it is overwritten from
        ``tree.split_point``).
    comment : str
        Tag printed with the node, e.g. ``'root'``, ``'left'`` or ``'right'``.

    Only nodes whose ``category`` is set produce a line:
    ``<tabs> <level> <comment> <category> <feature> <split_value>``.
    """
    _, feature = tree.split_point
    if tree.category is not None:
        print('\t'*l, l, comment, tree.category, feature, tree.split_value)
    l += 1
    # Stop at missing children explicitly rather than relying on a bare
    # `except` to swallow the AttributeError (and any genuine error with it).
    if tree.left is not None:
        print_tree(tree.left, l, comment='left')
    if tree.right is not None:
        print_tree(tree.right, l, comment='right')

# Show the leaves of the first tree only.
for tree in model[:1]:
    print_tree(tree, 0, comment='root')

									 9 left concrete orientation_W -0.065042
									 9 right concrete orientation_Y 0.5355
									 9 left tiled orientation_Z 0.14631
									 9 right wood linear_acceleration_aquantile(.5, 2.6488
									 9 left concrete linear_acceleration_skew( -0.9686729227820298
									 9 right soft_pvc orientation_X -0.80377
									 9 left concrete orientation_Z 0.0023171999999999997
									 9 right fine_concrete linear_acceleration_astd( 7.303434557338987
									 9 left fine_concrete linear_acceleration_aquantile(.5, 0.2894
									 9 right concrete orientation_Y -0.86693
									 9 left concrete orientation_Y 0.42603
									 9 right soft_pvc orientation_Y -0.9158700000000001
									 9 left soft_pvc orientation_X -0.47522
									 9 right tiled linear_acceleration_aquantile(.25, 1.1485400000000001
									 9 left soft_pvc linear_acceleration_skew( -0.439550756608559
									 9 right soft_pvc linear_acceleration_min( -9.1761
									 9 left soft_pvc angular_velocity_Y

									 9 left wood angular_velocity_astd( 0.20410455823425402
									 9 right wood angular_velocity_amax( 0.47523999999999994
									 9 left concrete orientation_Y 0.33055999999999996
									 9 right concrete angular_velocity_skew( 1.1103661402565288
									 9 left concrete angular_velocity_quantile(.5, -0.03781
									 9 right tiled angular_velocity_amean( 0.3060533333333333
									 9 left hard_tiles_large_space angular_velocity_amax( 0.29863
									 9 right tiled angular_velocity_Y -0.1344
									 9 left hard_tiles_large_space orientation_X 0.48493
									 9 right hard_tiles_large_space orientation_Y 0.6109
									 9 left wood orientation_X -0.9677899999999999
									 9 right wood linear_acceleration_max( 3.6049
									 9 left wood angular_velocity_sum( 0.508861
									 9 right wood angular_velocity_Z 0.17565
									 9 left hard_tiles_large_space linear_acceleration_min( -6.7566
									 9 right hard_tiles_large_space linear_acceleration_amax( 7.5048
	

In [6]:
#@numba.jit()
def predict_tree(tree, r):
    """Return the label a single tree assigns to the row ``r``.

    Starting at ``tree``, descend until a node with a non-None ``category`` is
    reached: go left when ``r[feature] <= split_value`` for the node's split
    feature, otherwise go right. ``r`` only needs to support lookup by
    feature name.
    """
    node = tree
    while node.category is None:
        _, feature = node.split_point
        threshold = node.split_value
        node = node.left if r[feature] <= threshold else node.right
    return node.category

#@numba.jit()
def predict(trees, r):
    """Return the majority label predicted for row ``r`` by ``trees``.

    Every tree votes through ``predict_tree``. When several labels share the
    highest vote count, the one that received its first vote earliest (in
    tree order) wins, so the result does not depend on set/hash ordering.

    Raises
    ------
    ValueError
        If ``trees`` is empty.
    """
    votes = {}
    for tree in trees:
        label = predict_tree(tree, r)
        votes[label] = votes.get(label, 0) + 1
    # dicts keep insertion order and max() returns the first maximal key,
    # which gives the deterministic tie-break described above.
    return max(votes, key=votes.get)

def mpredict_apply(df, model, col, target_name):
    """Predict a label for every row of ``df`` and store it in ``target_name``.

    Each row, restricted to the columns in ``col``, is passed to ``predict``
    together with ``model``; progress is reported through tqdm's
    ``progress_apply``. Returns the frame with the new column added.
    """
    frame = pd.DataFrame(df)
    row_predictions = frame[col].progress_apply(lambda row: predict(model, row), axis=1)
    frame[target_name] = row_predictions.values
    return frame

def mpredict(df, model, col, target_name, split):
    """Predict labels for ``df`` in parallel and return the combined frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to predict.
    model : list
        Trained trees, forwarded to ``mpredict_apply``.
    col : list
        Columns handed to the model for each row.
    target_name : str
        Name of the column that receives the predictions.
    split : int
        Number of row chunks (one task per chunk).

    Returns
    -------
    pandas.DataFrame
        The chunks concatenated in their original order, with a fresh
        0..n-1 index and the ``target_name`` column added.
    """
    # Split row positions rather than the frame itself: the chunk sizes are
    # the same as np.array_split(df, split) would produce, without passing a
    # DataFrame through NumPy (NOTE(review): newer pandas reportedly emits a
    # deprecation warning on that path).
    row_chunks = np.array_split(np.arange(len(df)), split)
    jobs = [[df.iloc[rows], model, col, target_name] for rows in row_chunks]
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    try:
        parts = pool.starmap(mpredict_apply, jobs)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()
    # ignore_index=True already yields a clean RangeIndex.
    return pd.concat(parts, axis=0, ignore_index=True)

# Score the forest on the last 87680 rows of the training frame.
# NOTE(review): these rows are part of the train[col] frame the forest was
# fitted on (each tree subsamples it), so this is not a true hold-out score.
val = mpredict(train.tail(87680), model, col, 'surface2', split=4)
print('Accuracy Score: ', np.mean(val['surface'] == val['surface2']))

100%|██████████| 21920/21920 [00:41<00:00, 523.58it/s]
100%|██████████| 21920/21920 [00:42<00:00, 521.20it/s]
100%|██████████| 21920/21920 [00:42<00:00, 521.25it/s]
100%|██████████| 21920/21920 [00:42<00:00, 514.57it/s]


Accuracy Score:  0.5954037408759124


In [None]:
# Predict a surface for every test row.
test = mpredict(test, model, col, 'surface', split=4)
# Count how many rows of each series were assigned to each surface.
sub = test.groupby(by=['series_id', 'surface'], as_index=False)['row_id'].count()
# Within a series, put the surface with the highest row count first. The count
# must come before 'surface' in the sort keys; with 'surface' first, every
# (series_id, surface) pair is already unique and the count is never used, so
# the alphabetically first surface would be kept. The surface name now only
# breaks ties.
sub = sub.sort_values(by=['series_id', 'row_id', 'surface'], ascending=[True, False, True]).reset_index(drop=True)
# Keep that top-voted surface as the single prediction for the series.
sub = sub.drop_duplicates(subset=['series_id'], keep='first')
sub[['series_id', 'surface']].to_csv('submission.csv', index=False)

 30%|██▉       | 36511/122112 [01:13<02:41, 529.94it/s]