# 15 Solution (Private: [0.77])
** *If I would've submitted according to my local CV but I got mesmerized by the Leaderboard Scores and so I fall.**

In [182]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
import random
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import confusion_matrix

import time
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [143]:
train_X = pd.read_csv('../input/X_train.csv').iloc[:,3:].values.reshape(-1,128,10)
test_X  = pd.read_csv('../input/X_test.csv' ).iloc[:,3:].values.reshape(-1,128,10)
print('train_X shape:', train_X.shape, ', test_X shape:', test_X.shape)

train_X shape: (3810, 128, 10) , test_X shape: (3816, 128, 10)


In [144]:
df_train_y = pd.read_csv('../input/y_train.csv')

# build a dict to convert surface names into numbers
surface_names = df_train_y['surface'].unique()
num_surfaces = len(surface_names)
surface_to_numeric = dict(zip(surface_names, range(num_surfaces)))
print('Convert to numbers: ', surface_to_numeric)

# y and group data as numeric values:
train_y = df_train_y['surface'].replace(surface_to_numeric).values
train_group = df_train_y['group_id'].values

Convert to numbers:  {'fine_concrete': 0, 'concrete': 1, 'soft_tiles': 2, 'tiled': 3, 'soft_pvc': 4, 'hard_tiles_large_space': 5, 'carpet': 6, 'hard_tiles': 7, 'wood': 8}


In [145]:
def sq_dist(a,b):
    ''' the squared euclidean distance between two samples '''
    
    return np.sum((a-b)**2, axis=1)


def find_run_edges(data, edge):
    ''' examine links between samples. left/right run edges are those samples which do not have a link on that side. '''

    if edge == 'left':
        border1 = 0
        border2 = -1
    elif edge == 'right':
        border1 = -1
        border2 = 0
    else:
        return False
    
    edge_list = []
    linked_list = []
    
    for i in range(len(data)):
        dist_list = sq_dist(data[i, border1, :4], data[:, border2, :4]) # distances to rest of samples
        min_dist = np.min(dist_list)
        closest_i   = np.argmin(dist_list) # this is i's closest neighbor
        if closest_i == i: # this might happen and it's definitely wrong
            print('Sample', i, 'linked with itself. Next closest sample used instead.')
            closest_i = np.argsort(dist_list)[1]
        dist_list = sq_dist(data[closest_i, border2, :4], data[:, border1, :4]) # now find closest_i's closest neighbor
        rev_dist = np.min(dist_list)
        closest_rev = np.argmin(dist_list) # here it is
        if closest_rev == closest_i: # again a check
            print('Sample', i, '(back-)linked with itself. Next closest sample used instead.')
            closest_rev = np.argsort(dist_list)[1]
        if (i != closest_rev): # we found an edge
            edge_list.append(i)
        else:
            linked_list.append([i, closest_i, min_dist])
            
    return edge_list, linked_list


def find_runs(data, left_edges, right_edges):
    ''' go through the list of samples & link the closest neighbors into a single run '''
    
    data_runs = []

    for start_point in left_edges:
        i = start_point
        run_list = [i]
        while i not in right_edges:
            tmp = np.argmin(sq_dist(data[i, -1, :4], data[:, 0, :4]))
            if tmp == i: # self-linked sample
                tmp = np.argsort(sq_dist(data[i, -1, :4], data[:, 0, :4]))[1]
            i = tmp
            run_list.append(i)
        data_runs.append(np.array(run_list))
    
    return data_runs

In [146]:
train_left_edges, train_left_linked  = find_run_edges(train_X, edge='left')
train_right_edges, train_right_linked = find_run_edges(train_X, edge='right')
print('Found', len(train_left_edges), 'left edges and', len(train_right_edges), 'right edges.')

Sample 1 (back-)linked with itself. Next closest sample used instead.
Sample 216 linked with itself. Next closest sample used instead.
Sample 335 linked with itself. Next closest sample used instead.
Sample 748 (back-)linked with itself. Next closest sample used instead.
Sample 799 linked with itself. Next closest sample used instead.
Sample 1205 linked with itself. Next closest sample used instead.
Sample 1913 linked with itself. Next closest sample used instead.
Sample 1986 linked with itself. Next closest sample used instead.
Sample 2555 linked with itself. Next closest sample used instead.
Sample 2612 linked with itself. Next closest sample used instead.
Sample 2761 linked with itself. Next closest sample used instead.
Sample 2917 linked with itself. Next closest sample used instead.
Sample 3312 linked with itself. Next closest sample used instead.
Sample 181 linked with itself. Next closest sample used instead.
Sample 272 linked with itself. Next closest sample used instead.
Sampl

In [147]:
train_runs = find_runs(train_X, train_left_edges, train_right_edges)

In [148]:
flat_list = [series_id for run in train_runs for series_id in run]
print(len(flat_list), len(np.unique(flat_list)))

3810 3810


In [149]:
df_train_y['run_id'] = 0
df_train_y['run_pos'] = 0

for run_id in range(len(train_runs)):
    for run_pos in range(len(train_runs[run_id])):
        series_id = train_runs[run_id][run_pos]
        df_train_y.at[ series_id, 'run_id'  ] = run_id
        df_train_y.at[ series_id, 'run_pos' ] = run_pos

df_train_y.to_csv('y_train_with_runs.csv', index=False)
df_train_y.tail()

Unnamed: 0,series_id,group_id,surface,run_id,run_pos
3805,3805,55,tiled,46,4
3806,3806,67,wood,74,2
3807,3807,48,fine_concrete,65,57
3808,3808,54,tiled,54,97
3809,3809,56,soft_pvc,69,123


In [150]:
test_left_edges, test_left_linked  = find_run_edges(test_X, edge='left')
test_right_edges, test_right_linked = find_run_edges(test_X, edge='right')
print('Found', len(test_left_edges), 'left edges and', len(test_right_edges), 'right edges.')

Sample 355 linked with itself. Next closest sample used instead.
Sample 580 linked with itself. Next closest sample used instead.
Sample 1402 linked with itself. Next closest sample used instead.
Sample 1547 linked with itself. Next closest sample used instead.
Sample 1716 linked with itself. Next closest sample used instead.
Sample 2136 (back-)linked with itself. Next closest sample used instead.
Sample 2474 linked with itself. Next closest sample used instead.
Sample 2495 (back-)linked with itself. Next closest sample used instead.
Sample 2600 linked with itself. Next closest sample used instead.
Sample 2922 linked with itself. Next closest sample used instead.
Sample 580 linked with itself. Next closest sample used instead.
Sample 1216 linked with itself. Next closest sample used instead.
Sample 1409 linked with itself. Next closest sample used instead.
Sample 1410 linked with itself. Next closest sample used instead.
Sample 1670 linked with itself. Next closest sample used instead.

In [151]:
test_runs = find_runs(test_X, test_left_edges, test_right_edges)

In [152]:
lost_samples = np.array([ i for i in range(len(test_X)) if i not in np.concatenate(test_runs) ])
print(lost_samples)
print(len(lost_samples))

[ 264  361  529  620  733  954 1148 1248 1432 1534 1570 1738 1739 2090
 2205 2714 2847 2978 2991 3115 3173 3183 3195 3359 3517 3655]
26


In [153]:
find_run_edges(test_X[lost_samples], edge='left')[1][0]

[0, 5, 2.0548399999999954e-07]

In [154]:
lost_run = np.array(lost_samples[find_runs(test_X[lost_samples], [0], [5])[0]])
test_runs.append(lost_run)

In [155]:
df_test_y = pd.read_csv("../input/sample_submission.csv")
df_test_y['run_id'] = 0
df_test_y['run_pos'] = 0

for run_id in range(len(test_runs)):
    for run_pos in range(len(test_runs[run_id])):
        series_id = test_runs[run_id][run_pos]
        df_test_y.at[ series_id, 'run_id'  ] = run_id
        df_test_y.at[ series_id, 'run_pos' ] = run_pos

df_test_y.to_csv('y_test_with_runs.csv', index=False)

df_test_y.drop("surface", axis=1, inplace=True)

cheat_json = df_train_y.groupby(['run_id'])['surface'].unique().reset_index().to_dict()
df_test_y['surface'] = df_test_y['run_id'].apply(lambda x: cheat_json['surface'][x][0])
df_test_y.head()

Unnamed: 0,series_id,run_id,run_pos,surface
0,0,14,19,soft_tiles
1,1,57,8,wood
2,2,42,13,fine_concrete
3,3,65,11,fine_concrete
4,4,68,3,wood


# Now adding samples of 256 size

We can get 256 sized samples by just adding two parts of same run_id i.e groups which have same surface.
I've done this by taking randomly selecting 50 combinations for every run_id. We could have done for more nbut the kernel space would exceed so I remained for 50.

In [161]:
%%time
counter = 0
agg_dict = df_train_y.groupby(['run_id'])['series_id'].unique().reset_index()['series_id'].to_dict()
two_sampled_dict = {}
for key, value in agg_dict.items():
    two_sampled_dict[key] = []
#     for item in list(combinations(agg_dict[key].tolist(), 2)):
    llist = list(combinations(agg_dict[key].tolist(), 2))
    if len(llist) > 50:
        two_sampled_dict[key] = random.sample(llist, 50)
        counter += 50
    else:
        two_sampled_dict[key] = random.sample(llist, len(llist))
#         two_sampled_dict[key].append(item)
        counter += len(llist)
print(counter)
del llist
del counter

3505
CPU times: user 72 ms, sys: 12 ms, total: 84 ms
Wall time: 73.1 ms


In [162]:
train = pd.read_csv("../input/X_train.csv")
test = pd.read_csv("../input/X_test.csv")
label = pd.read_csv("../input/y_train.csv")

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
label = reduce_mem_usage(label)

Memory usage of dataframe is 48.37 MB
Memory usage after optimization is: 36.74 MB
Decreased by 24.0%
Memory usage of dataframe is 48.45 MB
Memory usage after optimization is: 36.77 MB
Decreased by 24.1%
Memory usage of dataframe is 0.09 MB
Memory usage after optimization is: 0.01 MB
Decreased by 82.8%


In [163]:
le = LabelEncoder()
label['surface'] = le.fit_transform(label['surface'])
print(le.classes_)

['carpet' 'concrete' 'fine_concrete' 'hard_tiles' 'hard_tiles_large_space'
 'soft_pvc' 'soft_tiles' 'tiled' 'wood']


In [164]:
train.drop(['row_id', 'measurement_number'], axis=1, inplace=True)
test.drop(['row_id', 'measurement_number'], axis=1, inplace=True)

# Transforming Train with new series i.e 256 sized series.
The newer series having 256 size are given newer series_id starting from 3809.

In [165]:
start_time = time.time()

new_train = train.copy()
print("Initial Train Size :: ", new_train.shape)

new_label = {}
last_series_id = 3810
# for item in range(149205):

for key, value in tqdm(two_sampled_dict.items(), total=len(two_sampled_dict)):
# for key, value in two_sampled_dict.items():
    
    for item in value:
        
        idx1 = item[0]
        idx2 = item[1]
        
        df = pd.DataFrame(columns=train.columns)

        # Creating Train
        for col in df.columns[1: ]:
            df[col] = new_train[col][(new_train['series_id'] == idx1) | (new_train['series_id'] == idx2)]
        df['series_id'] = last_series_id
        
        df.reset_index(inplace=True)
        df.drop(['index'], axis=1, inplace=True)

        # Creating in Label
        new_label[last_series_id] = df_train_y['surface'][(df_train_y['series_id'] == idx1) | (df_train_y['series_id'] == idx2)].value_counts(ascending=False).index[0]
        last_series_id += 1
        
        new_train = pd.concat([new_train, df], ignore_index=True)
        
print("Final Train Size :: ", new_train.shape)
print("Time Taken :: ", time.time() - start_time)

  0%|          | 0/76 [00:00<?, ?it/s]

Initial Train Size ::  (487680, 11)


100%|██████████| 76/76 [03:55<00:00,  3.24s/it]

Final Train Size ::  (1384960, 11)
Time Taken ::  235.0648832321167





# Modelling

In [166]:
def FE(data):
    
    df = pd.DataFrame()
    data['totl_anglr_vel'] = (data['angular_velocity_X']**2 + data['angular_velocity_Y']**2 +
                             data['angular_velocity_Z']**2)** 0.5
    data['totl_linr_acc'] = (data['linear_acceleration_X']**2 + data['linear_acceleration_Y']**2 +
                             data['linear_acceleration_Z']**2)**0.5
#     data['totl_xyz'] = (data['orientation_X']**2 + data['orientation_Y']**2 +
#                              data['orientation_Z'])**0.5
   
    data['acc_vs_vel'] = data['totl_linr_acc'] / data['totl_anglr_vel']
    
    for col in data.columns:
        if col in ['row_id','series_id','measurement_number', 'orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W']:
            continue
        df[col + '_mean'] = data.groupby(['series_id'])[col].mean()
        df[col + '_median'] = data.groupby(['series_id'])[col].median()
        df[col + '_max'] = data.groupby(['series_id'])[col].max()
        df[col + '_min'] = data.groupby(['series_id'])[col].min()
        df[col + '_std'] = data.groupby(['series_id'])[col].std()
        df[col + '_range'] = df[col + '_max'] - df[col + '_min']
        df[col + '_maxtoMin'] = df[col + '_max'] / df[col + '_min']
        df[col + '_mean_abs_chg'] = data.groupby(['series_id'])[col].apply(lambda x: np.mean(np.abs(np.diff(x))))
        df[col + '_abs_max'] = data.groupby(['series_id'])[col].apply(lambda x: np.max(np.abs(x)))
        df[col + '_abs_min'] = data.groupby(['series_id'])[col].apply(lambda x: np.min(np.abs(x)))
        df[col + '_abs_avg'] = (df[col + '_abs_min'] + df[col + '_abs_max'])/2
    return df

In [167]:
%%time
new_train = FE(new_train)
test = FE(test)
print(new_train.shape, test.shape)

(7315, 99) (3816, 99)
CPU times: user 1min 22s, sys: 120 ms, total: 1min 22s
Wall time: 1min 22s


In [168]:
new_train.fillna(0, inplace = True)
test.fillna(0, inplace = True)
new_train.replace(-np.inf, 0, inplace = True)
new_train.replace(np.inf, 0, inplace = True)
test.replace(-np.inf, 0, inplace = True)
test.replace(np.inf, 0, inplace = True)

In [169]:
def k_folds(clf, X, y, X_test, k):
    folds = StratifiedKFold(n_splits = k, shuffle=True, random_state=13)
    y_test = np.zeros((X_test.shape[0], 9))
    y_oof = np.zeros((X.shape[0]))
    score = 0
    for i, (train_idx, val_idx) in  enumerate(folds.split(X, y)):
#         clf =  RandomForestClassifier(n_estimators = 500, n_jobs = -1)
        clf.fit(X.iloc[train_idx], y[train_idx])
        y_oof[val_idx] = clf.predict(X.iloc[val_idx])
        y_test += clf.predict_proba(X_test) / folds.n_splits
        score += clf.score(X.iloc[val_idx], y[val_idx])
        print('Fold: {} score: {}'.format(i,clf.score(X.iloc[val_idx], y[val_idx])))
    print('Avg Accuracy', score / folds.n_splits) 
        
    return y_oof, y_test

In [170]:
new_label = pd.DataFrame.from_dict(new_label, orient='index').reset_index()
new_label.columns = ['series_id', 'surface']
new_label.head()

Unnamed: 0,series_id,surface
0,3810,soft_tiles
1,3811,soft_tiles
2,3812,soft_tiles
3,3813,soft_tiles
4,3814,soft_tiles


In [171]:
label = pd.read_csv("../input/y_train.csv")
label = pd.concat([label, new_label], ignore_index=True)
label.head()

Unnamed: 0,group_id,series_id,surface
0,13.0,0,fine_concrete
1,31.0,1,concrete
2,20.0,2,concrete
3,31.0,3,concrete
4,22.0,4,soft_tiles


In [172]:
label['surface'] = le.transform(label['surface'])
label.surface.head()

0    2
1    1
2    1
3    1
4    6
Name: surface, dtype: int64

In [173]:
rand = RandomForestClassifier(n_estimators=500, random_state=13)
y_oof, y_test_rand = k_folds(rand, new_train, label['surface'], test, k=5)

Fold: 0 score: 0.7205180640763463
Fold: 1 score: 0.769808743169399
Fold: 2 score: 0.7551299589603283
Fold: 3 score: 0.7371663244353183
Fold: 4 score: 0.7713894592744696
Avg Accuracy 0.7508025099831723


# Without Orientations : (50 * 76 samples(256) added)

Avg Accuracy 0.7465495699892621

In [174]:
ext = ExtraTreesClassifier(n_estimators=500, random_state=13)
y_oof, y_test_ext = k_folds(ext, new_train, label['surface'], test, k=5)

Fold: 0 score: 0.7430129516019086
Fold: 1 score: 0.7841530054644809
Fold: 2 score: 0.7886456908344733
Fold: 3 score: 0.7618069815195072
Fold: 4 score: 0.7919233401779603
Avg Accuracy 0.773908393919666


# ExtraTrees - 0.7617245030663025 (#50)#without orientations

In [175]:
confusion_matrix(y_oof,label['surface'])

array([[ 312,   10,    6,    2,    2,   14,    5,    1,    5],
       [  52, 1270,   95,    0,   63,   33,    3,  206,   73],
       [   2,   25,  275,    0,    8,   12,    0,   28,    8],
       [   0,    0,    0,   47,    0,    1,    0,    0,    0],
       [   3,   16,    5,    0,  450,    9,    6,   30,   19],
       [  55,   35,  159,   11,   24,  991,   11,   31,   73],
       [  15,    8,   13,    1,   21,   20,  511,    6,    9],
       [   1,   86,   27,    0,   42,    1,    0,  605,   44],
       [  20,   20,   60,   10,   12,   65,   11,   21, 1200]])

In [176]:
# Submitting averaging

y_test = y_test_ext + y_test_rand
y_test = np.argmax(y_test, axis=1)
submission = pd.read_csv(os.path.join("../input/", 'sample_submission.csv'))
submission['surface'] = le.inverse_transform(y_test)
submission.to_csv('submission.csv', index=False)
submission.surface.value_counts()

soft_pvc                  988
concrete                  930
wood                      592
tiled                     437
soft_tiles                409
hard_tiles_large_space    173
fine_concrete             168
carpet                    113
hard_tiles                  6
Name: surface, dtype: int64

# Trick (Every run has only one surface)
# So taking the max of every run as the surface for each element

In [177]:
df_test_y['sub'] = submission['surface']
df_test_y.head()

Unnamed: 0,series_id,run_id,run_pos,surface,sub
0,0,14,19,soft_tiles,fine_concrete
1,1,57,8,wood,soft_pvc
2,2,42,13,fine_concrete,concrete
3,3,65,11,fine_concrete,soft_tiles
4,4,68,3,wood,soft_tiles


In [178]:
agg = df_test_y.groupby(['run_id', 'sub'])['sub'].count()
agg = pd.DataFrame(agg)
agg.columns = ['count']
agg.reset_index(inplace=True)
agg = df_test_y.groupby(['run_id']).agg(lambda x: x.value_counts().index[0]).reset_index()[['run_id', 'sub']]
agg_dict = agg.to_dict()
submission['surface'] = df_test_y['run_id'].apply(lambda x: agg_dict['sub'][x])
submission['surface'][df_test_y['run_id'] == 39] = 'hard_tiles'
submission.surface.value_counts()

concrete                  1211
soft_pvc                  1062
wood                       500
soft_tiles                 461
tiled                      189
hard_tiles_large_space     151
hard_tiles                 108
carpet                      72
fine_concrete               62
Name: surface, dtype: int64

In [179]:
submission.surface.value_counts() / submission.shape[0]

concrete                  0.317348
soft_pvc                  0.278302
wood                      0.131027
soft_tiles                0.120807
tiled                     0.049528
hard_tiles_large_space    0.039570
hard_tiles                0.028302
carpet                    0.018868
fine_concrete             0.016247
Name: surface, dtype: float64

In [180]:
submission.shape

(3816, 2)

In [181]:
submission.to_csv("with_model.csv", index=False)