## Configuration

In [None]:
# Configuration
features = ['accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z']  # sensor columns kept from every CSV
num_feat = len(features)
random_seed = 10  # shared seed for estimators and CV splitters

freq = 12.5  # Hz
frame_length = 4  # sec
overlap_percentage = 0  # fraction of a frame shared with the next one, in [0, 1); recommend .5
cutmix = False  # True: frame one concatenated stream; False: frame each file on its own

opt = 'adam'  # optimizer
ema = False

epochs = 10

# Calculation
if not 0 <= overlap_percentage < 1:
    # An overlap of 1 (or more) would give a zero/negative stride between frames.
    raise ValueError(
        'overlap_percentage must be in [0, 1), got {}'.format(overlap_percentage))

frame_size = int(freq * frame_length)  # samples per frame
overlap_size = int(frame_size * overlap_percentage)  # samples shared by consecutive frames
# Derive the stride from the two integers above instead of int(frame_size * (1 - overlap)):
# the float product truncates wrongly (e.g. 50 * (1 - 0.8) == 9.999... -> 9, not 10).
increment_size = frame_size - overlap_size

## Import lib

In [None]:
import time
import glob
import statistics
import os
import json
import joblib
import json

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt

from numpy import mean, std
from pathlib import Path

from scipy import stats
from scipy.io import loadmat
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, SparsePCA, FastICA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

from sklearn.model_selection import (cross_val_score, cross_validate, StratifiedKFold, 
                                      LeaveOneGroupOut, ShuffleSplit, GroupKFold,
                                      GroupShuffleSplit, PredefinedSplit, RepeatedKFold,
                                      KFold)

# from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import Pipeline


## Load dataset

In [None]:
# Mount Google Drive at /content/drive (Colab only); force_remount=True re-mounts even if already mounted
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Extract the training archive from the shared drive: -q quiet, -o overwrite existing files without prompting
!unzip -qo '/content/drive/Shareddrives/Ink/PDIoT/training_data.zip'

In [None]:
# Load every recording as an object array whose first column is the activity
# label and whose remaining columns are the configured `features`.
data = []

dataset_folder = '/content/training_data/thingy'

# Sort both listings: glob order depends on the filesystem, and the order of
# `data` fixes the order of the frames that the seeded CV splitters shuffle.
for subfolder in sorted(glob.glob(os.path.join(dataset_folder, '*'))):  # one subfolder per activity
    activity = os.path.basename(subfolder)  # the folder name is the label

    for file in sorted(glob.glob(os.path.join(subfolder, '*.csv'))):
        d = pd.read_csv(file)

        # Disabled experiment, kept for reference:
        # Some datasets have 25hz. Those contain gyro_x in column name
        # skip every 1 row to reduce halve freq from 25hz -> 12hz
        # if 'gyro_x' in d:
        #     original_len = len(d)
        #     d = d.take(list(range(0, len(d), 2)))
        #     new_len = len(d)
        #
        #     assert original_len == new_len * 2 or original_len + 1 == new_len * 2, (
        #             f'{file} not halved. {original_len} vs {new_len}')

        # some files have x instead of accel_x
        if 'x' in d:
            d = d.rename(columns={'x': 'accel_x', 'y': 'accel_y', 'z': 'accel_z'})

        # Keep only the feature columns (copy so the insert below does not
        # write into a slice of the frame read from disk) and put the label first.
        d = d.loc[:, features].copy()
        d.insert(0, 'activity', activity)

        # Append to data as numpy arr
        data.append(d.to_numpy())


data = np.array(data, dtype=object)
data.shape


(1021,)

## Pre-process

### Generator class

Class for generating frames from rows of data

In [None]:
class Generator:
    """Cut labelled sensor recordings into fixed-length frames.

    Every recording is a 2-D array whose column 0 holds the activity label and
    whose remaining columns hold the feature values.

    Args:
        cutmix: if True, concatenate all recordings into one stream and frame
            that stream (a frame may straddle two recordings and is labelled
            with its most frequent label). If False, frame each recording on
            its own.
        scale: if True, standardize the feature columns before framing. Only
            the cutmix path applies it; the per-file segment path never scales.
        verbose: if True, print a message when scaling is applied.
        frame_size: rows per frame. None (default) reads the notebook-level
            `frame_size` at call time.
        increment_size: rows between the starts of consecutive frames. None
            (default) reads the notebook-level `increment_size` at call time.
        num_feat: feature columns per row. None (default) reads the
            notebook-level `num_feat` at call time.
    """

    def __init__(self, cutmix, scale, verbose,
                 frame_size=None, increment_size=None, num_feat=None):
        self.cutmix = cutmix
        self.scale = scale
        self.verbose = verbose
        self.frame_size = frame_size
        self.increment_size = increment_size
        self.num_feat = num_feat

    def _geometry(self):
        """Return (rows per frame, stride, feature count), falling back to the notebook globals."""
        size = self.frame_size if self.frame_size is not None else frame_size
        step = self.increment_size if self.increment_size is not None else increment_size
        width = self.num_feat if self.num_feat is not None else num_feat
        if step < 1:
            raise ValueError('increment_size must be >= 1, got {}'.format(step))
        return size, step, width

    @staticmethod
    def _window_starts(n_rows, size, step):
        """Start offsets of every full window that fits in `n_rows` rows."""
        # `+ 1` keeps the window that ends exactly on the last row; without it
        # a recording of exactly `size` rows would yield no frame at all.
        return range(0, n_rows - size + 1, step)

    @staticmethod
    def _most_common(values):
        """Most frequent entry of a 1-D array; ties go to the smallest value."""
        # np.unique works for string labels and does not depend on the SciPy
        # version (stats.mode changed its return shape and rejects non-numeric
        # input in recent releases).
        uniques, counts = np.unique(values, return_counts=True)
        return uniques[np.argmax(counts)]

    def get_frames(self, data):
        """Frame `data` with the strategy selected by `self.cutmix`.

        Returns:
            (frames, labels): frames is a float array of shape
            (n_frames, frame_size, num_feat); labels has one entry per frame.
        """
        if self.cutmix:
            return self.get_frames_cutmix(data)
        else:
            return self.get_frames_segment(data)

    def flatten(self, df):
        """Concatenate the rows of all recordings into one 2-D array."""
        return np.asarray([record for li in df for record in li])

    def scale_df(self, df):
        """Standardize the feature columns of a 2-D array, leaving column 0 (labels) untouched."""
        if self.verbose:
            print('I am scaling the df')
        X = df[:, 1:]
        y = df[:, 0].reshape(-1, 1)
        df = np.concatenate([y, StandardScaler().fit_transform(X)], axis=1)
        return df

    # Get all frames via a continuous stream cutmix method
    def get_frames_cutmix(self, df):
        """Frame the concatenation of all recordings.

        Each frame is labelled with the most frequent label among its rows.
        """
        size, step, width = self._geometry()

        # Anything that is not already a single 2-D table is treated as a
        # collection of recordings and flattened into one stream of rows.
        if df.ndim != 2:
            df = self.flatten(df)

        if self.scale:
            df = self.scale_df(df)

        frames = []
        labels = []
        for start in self._window_starts(len(df), size, step):
            window = df[start:start + size]
            labels.append(self._most_common(window[:, 0]))
            # Take all features in frame except label
            frames.append(window[:, 1:])

        frames = np.asarray(frames, dtype=float).reshape(-1, size, width)
        labels = np.asarray(labels)

        return frames, labels

    # Get all frames via single file segment method
    def get_frames_segment(self, data):
        """Frame every recording separately, so no frame spans two recordings.

        Each frame is labelled with the label of its first row. `self.scale`
        is not applied on this path.
        """
        size, step, width = self._geometry()

        frames, labels = [], []
        for recording in data:
            recording = np.asarray(recording)
            for start in self._window_starts(len(recording), size, step):
                frames.append(recording[start:start + size, 1:])
                labels.append(recording[start, 0])

        frames = np.asarray(frames, dtype=float).reshape(-1, size, width)
        labels = np.asarray(labels)
        return frames, labels

### Get frames

In [None]:
# Frame the raw recordings. No scaling here (scale=False); the evaluation
# pipelines further down prepend their own StandardScaler.
# Note: despite its name, `df` is the numpy array of frames returned by get_frames, not a DataFrame.
generator = Generator(cutmix=cutmix, scale=False, verbose=True)
df, labels = generator.get_frames(data=data)

In [None]:
df.shape

(15015, 50, 6)

In [None]:
np.unique(labels,return_counts=True)

(array(['ascending_stairs', 'descending_stairs', 'desk_work',
        'lying_on_back', 'lying_on_left_side', 'lying_on_right_side',
        'lying_on_stomach', 'movement', 'running', 'sitting',
        'sitting_bent_backward', 'sitting_bent_forward', 'standing',
        'walking'], dtype='<U21'),
 array([1069, 1068, 1075, 1076, 1073, 1073, 1077, 1058, 1074, 1076, 1074,
        1072, 1074, 1076]))

In [None]:
pd.DataFrame(labels)

Unnamed: 0,0
0,ascending_stairs
1,ascending_stairs
2,ascending_stairs
3,ascending_stairs
4,ascending_stairs
...,...
15010,standing
15011,standing
15012,standing
15013,standing


### Labels Encoding

Fit a `LabelEncoder` on the frame labels to obtain an integer encoding of the activities.

In [None]:
# Encode the activity names as integer class ids; `classes_` lists the names in id order
encoder = LabelEncoder()
labels_encoded = encoder.fit_transform(labels)
encoder.classes_

array(['ascending_stairs', 'descending_stairs', 'desk_work',
       'lying_on_back', 'lying_on_left_side', 'lying_on_right_side',
       'lying_on_stomach', 'movement', 'running', 'sitting',
       'sitting_bent_backward', 'sitting_bent_forward', 'standing',
       'walking'], dtype='<U21')

In [None]:
# Lookup table from encoded class id to activity name, so that predicted
# ids can be translated back to readable text.
temp = dict(enumerate(encoder.classes_))

print(temp)

{0: 'ascending_stairs', 1: 'descending_stairs', 2: 'desk_work', 3: 'lying_on_back', 4: 'lying_on_left_side', 5: 'lying_on_right_side', 6: 'lying_on_stomach', 7: 'movement', 8: 'running', 9: 'sitting', 10: 'sitting_bent_backward', 11: 'sitting_bent_forward', 12: 'standing', 13: 'walking'}


## Feature Extraction

Extract each frame's features

In [None]:
def feat_extr(signal):
    """Summarize one frame as a flat vector of per-channel statistics.

    Args:
        signal: 2-D array of shape (n_samples, n_channels).

    Returns:
        1-D array with 14 values per channel, channel after channel, in the
        order: mean, median, mode, sample standard deviation (ddof=1), max,
        min, range, skewness, kurtosis, and the 10th/25th/50th/75th/90th
        percentiles.
    """
    signal = np.asarray(signal)
    list_feat = []

    # Iterate over the columns actually present instead of a notebook global.
    for i in range(signal.shape[1]):
        channel = signal[:, i]

        mean_s = stats.tmean(channel)
        median_s = np.median(channel)
        # stats.mode returns a length-1 array on older SciPy and a scalar on
        # SciPy >= 1.11; atleast_1d makes the indexing valid for both.
        mode_s = np.atleast_1d(stats.mode(channel)[0])[0]
        stdev_s = stats.tstd(channel)
        max_s = stats.tmax(channel)
        min_s = stats.tmin(channel)
        range_s = max_s - min_s
        skew_s = stats.skew(channel)
        kurt_s = stats.kurtosis(channel)
        # p50 duplicates the median; it is kept so the feature layout is unchanged.
        p10_s, p25_s, p50_s, p75_s, p90_s = np.percentile(channel, [10, 25, 50, 75, 90])

        list_feat.extend([mean_s, median_s, mode_s, stdev_s, max_s, min_s, range_s, skew_s, kurt_s,
                          p10_s, p25_s, p50_s, p75_s, p90_s])

    return np.array(list_feat)

In [None]:
# One feat_extr() row per frame -> 2-D feature matrix.
# NOTE: this rebinds `data`, which held the raw recordings until now; re-run
# the loading cell before re-running the frame-generation cell.
data = np.array([feat_extr(frame) for frame in df])
data.shape

(15015, 84)

Add column names

In [None]:
# Column names for the feature matrix: every sensor channel crossed with the
# statistics computed by feat_extr, in the same order as feat_extr emits them.
stat_suffixes = ['_mean', '_median', '_mode', '_stdev', '_max',
                 '_min', '_range', '_skew', '_kurt', '_p10',
                 '_p25', '_p50', '_p75', '_p90']

feature_labels = [feat + sub_feat for feat in features for sub_feat in stat_suffixes]

pd.DataFrame(data, columns = feature_labels)

Unnamed: 0,accel_x_mean,accel_x_median,accel_x_mode,accel_x_stdev,accel_x_max,accel_x_min,accel_x_range,accel_x_skew,accel_x_kurt,accel_x_p10,...,gyro_z_max,gyro_z_min,gyro_z_range,gyro_z_skew,gyro_z_kurt,gyro_z_p10,gyro_z_p25,gyro_z_p50,gyro_z_p75,gyro_z_p90
0,-0.887891,-0.925293,-0.579102,0.448219,-0.037109,-2.000000,1.962891,0.011039,-0.348469,-1.395215,...,63.21875,-82.34375,145.56250,-0.101162,-0.486114,-45.675000,-25.257812,-4.312500,18.164062,49.950000
1,-0.949336,-0.958008,-1.144531,0.397264,-0.129883,-1.756836,1.626953,0.172735,-0.595560,-1.462988,...,56.81250,-77.40625,134.21875,-0.492980,-0.433414,-61.753125,-27.367188,1.031250,11.945312,24.721875
2,-1.011953,-1.020508,-1.291992,0.357246,0.036133,-1.704102,1.740234,0.773741,0.747849,-1.435547,...,62.59375,-83.37500,145.96875,-0.506405,-0.542617,-66.928125,-24.703125,6.984375,22.953125,43.184375
3,-0.905059,-0.931152,-0.894531,0.398221,-0.153320,-2.000000,1.846680,-0.087719,-0.012902,-1.338086,...,50.03125,-81.31250,131.34375,-0.838105,0.722746,-27.934375,-10.546875,8.437500,25.398438,42.115625
4,-0.931484,-0.923828,-0.907227,0.424610,-0.047852,-2.000000,1.952148,0.099267,-0.282570,-1.386816,...,50.81250,-82.71875,133.53125,-0.348371,-0.372128,-56.143750,-23.070312,-4.406250,10.960938,32.725000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,-1.004082,-1.003906,-1.003906,0.005289,-0.992188,-1.014648,0.022461,0.104253,-0.356464,-1.010742,...,0.68750,-2.28125,2.96875,-0.571458,1.522129,-1.187500,-0.781250,-0.468750,-0.343750,-0.062500
15011,-1.004336,-1.003906,-1.003906,0.005119,-0.994141,-1.014648,0.020508,0.153073,-0.636391,-1.010742,...,1.12500,-1.18750,2.31250,0.800126,-0.079424,-0.881250,-0.734375,-0.500000,-0.062500,0.481250
15012,-1.005801,-1.006348,-1.006836,0.006836,-0.990234,-1.023438,0.033203,-0.107668,-0.056952,-1.013672,...,0.81250,-1.81250,2.62500,-0.081798,-0.050535,-1.187500,-0.750000,-0.500000,-0.046875,0.218750
15013,-1.003848,-1.003906,-1.004883,0.004965,-0.993164,-1.013672,0.020508,0.259218,-0.425809,-1.009766,...,-0.03125,-1.62500,1.59375,-0.869127,0.507550,-1.062500,-0.687500,-0.468750,-0.281250,-0.093750


## Classification models

In [None]:
from sklearn.linear_model import RidgeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## List of Classifiers
# (label, estimator) pairs. Every estimator that accepts a seed uses the shared
# `random_seed` from the configuration cell so runs are reproducible from one knob.
model_list = [
    ('KNN', KNeighborsClassifier(n_jobs=-1)),
    ('DT', DecisionTreeClassifier(random_state=random_seed)),
    ('SVM', SVC(random_state=random_seed)),
    ('RF', RandomForestClassifier(random_state=random_seed, n_jobs=-1)),
    ('XGB', XGBClassifier(random_state=random_seed)),
    ('LightGBM', LGBMClassifier(random_state=random_seed, n_jobs=-1)),
    ('RidgeClassifier', RidgeClassifier()),
    ('ExtraTree', ExtraTreeClassifier(random_state=random_seed)),
    # ('RadiusNeighbours', RadiusNeighborsClassifier(radius=1.0)),
    # max_iter raised above the default: the final fit reports
    # "TOTAL NO. of ITERATIONS REACHED LIMIT" with the default setting.
    ('LR', LogisticRegression(random_state=random_seed, max_iter=1000)),
    ('LabelPropagation', LabelPropagation()),
    ('LabelSpreading', LabelSpreading()),
    ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
    ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
]

## Evaluation

### Pipeline

Create one pipeline per entry in `model_list`, each with a `StandardScaler` prepended.

In [None]:
# One [display name, Pipeline] entry per classifier; each pipeline standardizes
# the features before handing them to the estimator.
pipes = [
    [
        '{} -> {}'.format('SScaler', label),
        Pipeline(steps=[('StandardScaler', StandardScaler()), (label, estimator)]),
    ]
    for label, estimator in model_list
]

pipes

[['SScaler -> KNN', Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('KNN', KNeighborsClassifier(n_jobs=-1))])],
 ['SScaler -> DT', Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('DT', DecisionTreeClassifier(random_state=10))])],
 ['SScaler -> SVM', Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('SVM', SVC(random_state=10))])],
 ['SScaler -> RF', Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('RF', RandomForestClassifier(n_jobs=-1, random_state=10))])],
 ['SScaler -> XGB', Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('XGB', XGBClassifier(random_state=10))])],
 ['SScaler -> LightGBM', Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('LightGBM', LGBMClassifier(random_state=10))])],
 ['SScaler -> RidgeClassifier',
  Pipeline(steps=[('StandardScaler', StandardScaler()),
                  ('RidgeClassifier', RidgeClassifier())])],
 ['SScal

### Cross validators

In [None]:
# (label, splitter) pairs to evaluate with; the disabled alternatives are kept for reference.
cvs = [
    # ('KFold', KFold(n_splits=10, shuffle=True, random_state=10)),
    # ('Repeat-KFold', RepeatedKFold(n_splits=10, n_repeats=10, random_state=10)),
    ('Strat-KFold', StratifiedKFold(n_splits=10, shuffle=True, random_state=random_seed)),
]

### Train/Eval

In [None]:
scorings = ['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']

# Replace NaN/inf feature values once, up front: the call is loop-invariant and
# does not belong inside the timed region below.
data = np.nan_to_num(data)

# scores[pipeline name][cv name] -> {'duration': seconds, <metric>: {'mean', 'std'}}
scores = {}

for model_name, pipe in pipes:
    scores[model_name] = {}
    for cv_name, cv in cvs:
        start_time = time.time()
        n_scores = cross_validate(pipe, data, labels_encoded, cv=cv, scoring=scorings, n_jobs=-1, error_score='raise')
        duration = time.time() - start_time

        result = {'duration': duration}
        scores[model_name][cv_name] = result

        print('{} | {} | Time taken : {} sec '.format(model_name, cv_name, duration))

        for scoring in scorings:
            fold_scores = n_scores['test_' + scoring]
            # Plain floats keep the dict repr and the JSON dump free of numpy scalar types.
            avg, standard_dev = float(mean(fold_scores)), float(std(fold_scores))

            result[scoring] = {
                'mean': avg,
                'std' : standard_dev
            }
            print('{} : {} ({})'.format(scoring, avg, standard_dev))

        print('='*100)

SScaler -> KNN | Strat-KFold | Time taken : 7.139177560806274 sec 
accuracy : 0.7049611843325045 (0.006142402178067227)
f1_macro : 0.6970639845658068 (0.005888961697017527)
precision_macro : 0.7048655356854217 (0.0073231243205061905)
recall_macro : 0.7048659658420472 (0.006152169823198118)
SScaler -> DT | Strat-KFold | Time taken : 12.286929607391357 sec 
accuracy : 0.8360317710962331 (0.010000169382368045)
f1_macro : 0.8357235662961857 (0.009488201571688153)
precision_macro : 0.8370375736235784 (0.008893535530709281)
recall_macro : 0.8357810747097092 (0.009907978380146888)
SScaler -> SVM | Strat-KFold | Time taken : 65.27836561203003 sec 
accuracy : 0.6960376615323473 (0.009153710108022923)
f1_macro : 0.6848705253408849 (0.009871355498291115)
precision_macro : 0.6873624147810926 (0.010495678551191113)
recall_macro : 0.6961911386157551 (0.009324659538025384)
SScaler -> RF | Strat-KFold | Time taken : 80.89694023132324 sec 
accuracy : 0.9562442614821366 (0.004532938352512571)
f1_macro :



SScaler -> LabelPropagation | Strat-KFold | Time taken : 68.26070141792297 sec 
accuracy : 0.645552898156666 (0.011671875011469497)
f1_macro : 0.6423987809315169 (0.01159540937119643)
precision_macro : 0.668712952892795 (0.00955384672994356)
recall_macro : 0.645393942780584 (0.01174315997250118)




SScaler -> LabelSpreading | Strat-KFold | Time taken : 125.56092286109924 sec 
accuracy : 0.6460855656814676 (0.012084487642191534)
f1_macro : 0.6428350177066653 (0.011960753408480568)
precision_macro : 0.6690335616493555 (0.00974466706605079)
recall_macro : 0.6459261338501582 (0.012136926198959534)
SScaler -> QuadraticDiscriminantAnalysis | Strat-KFold | Time taken : 1.8924553394317627 sec 
accuracy : 0.45174273520271885 (0.026174820961846555)
f1_macro : 0.4078066136805738 (0.027996415721329077)
precision_macro : 0.5192551842310584 (0.032482573149973014)
recall_macro : 0.45185238965587604 (0.025877362897706653)
SScaler -> LinearDiscriminantAnalysis | Strat-KFold | Time taken : 2.9238297939300537 sec 
accuracy : 0.6110535275639586 (0.012376547183709851)
f1_macro : 0.5922771128626686 (0.012118142546681167)
precision_macro : 0.5915948678301716 (0.013944481022829745)
recall_macro : 0.6110292146208588 (0.012445991852602725)


## Visualization

In [None]:
scores

{'SScaler -> KNN': {'Strat-KFold': {'duration': 7.139177560806274,
   'accuracy': {'mean': 0.7049611843325045, 'std': 0.006142402178067227},
   'f1_macro': {'mean': 0.6970639845658068, 'std': 0.005888961697017527},
   'precision_macro': {'mean': 0.7048655356854217,
    'std': 0.0073231243205061905},
   'recall_macro': {'mean': 0.7048659658420472, 'std': 0.006152169823198118}}},
 'SScaler -> DT': {'Strat-KFold': {'duration': 12.286929607391357,
   'accuracy': {'mean': 0.8360317710962331, 'std': 0.010000169382368045},
   'f1_macro': {'mean': 0.8357235662961857, 'std': 0.009488201571688153},
   'precision_macro': {'mean': 0.8370375736235784,
    'std': 0.008893535530709281},
   'recall_macro': {'mean': 0.8357810747097092, 'std': 0.009907978380146888}}},
 'SScaler -> SVM': {'Strat-KFold': {'duration': 65.27836561203003,
   'accuracy': {'mean': 0.6960376615323473, 'std': 0.009153710108022923},
   'f1_macro': {'mean': 0.6848705253408849, 'std': 0.009871355498291115},
   'precision_macro': {'

In [None]:
# Accuracy table: one row per (pipeline, cross-validator) pair, with the mean
# and standard deviation over the folds as columns.
accuracy_by_run = {
    (pipeline_name, cv_name): cv_result['accuracy']
    for pipeline_name, per_cv in scores.items()
    for cv_name, cv_result in per_cv.items()
}
scores_dataframe = pd.DataFrame.from_dict(accuracy_by_run, orient='index')

# scores_dataframe.insert(2, 'duration', [scores[i][j][k]['duration'] for i in scores.keys() for j in scores[i].keys() for k in scores[i][j].keys()])

scores_dataframe

Unnamed: 0,Unnamed: 1,mean,std
SScaler -> KNN,Strat-KFold,0.704961,0.006142
SScaler -> DT,Strat-KFold,0.836032,0.01
SScaler -> SVM,Strat-KFold,0.696038,0.009154
SScaler -> RF,Strat-KFold,0.956244,0.004533
SScaler -> XGB,Strat-KFold,0.832502,0.008053
SScaler -> LightGBM,Strat-KFold,0.959175,0.002942
SScaler -> RidgeClassifier,Strat-KFold,0.575357,0.008306
SScaler -> ExtraTree,Strat-KFold,0.869262,0.008269
SScaler -> LR,Strat-KFold,0.642825,0.009146
SScaler -> LabelPropagation,Strat-KFold,0.645553,0.011672


# Fit

In [None]:
# Final fit of every pipeline on the complete feature matrix.
# Clean NaN/inf here as well (idempotent) so this cell does not silently
# depend on the train/eval cell having already rewritten `data`.
X_full = np.nan_to_num(data)
for _, pipe in pipes:
    pipe.fit(X_full, labels_encoded)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Save

In [None]:
# Persist the nested scores dict (pipeline -> cross-validator -> metrics) as indented JSON on the shared drive
with open("/content/drive/Shareddrives/Ink/PDIoT/model_development/results_thingy_accel_gyro_ml_only_no_dimen.json", "w") as outfile:
    json.dump(scores, outfile, indent = 4)

In [None]:
# Create the output folder for the serialized pipelines; -p makes this a no-op if it already exists
!mkdir -p "/content/drive/Shareddrives/Ink/PDIoT/model_development/models_thingy_accel_gyro_ml_only_no_dimen_reduc/"

In [None]:
# Serialize each pipeline with joblib; the file name is the pipeline's display
# name (e.g. "SScaler -> KNN.joblib"), so it contains spaces and "->".
for name, pipe in pipes:
    joblib.dump(pipe, f"/content/drive/Shareddrives/Ink/PDIoT/model_development/models_thingy_accel_gyro_ml_only_no_dimen_reduc/{name}.joblib")