In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV

In [2]:

# Location of the dataset directory. Set the PAMAP2_DATA_DIR environment variable to
# point the notebook at another copy; the fallback is the hard-coded local path.
data_dir = os.environ.get(
    'PAMAP2_DATA_DIR',
    '/media/arrteom/3923b309-fef1-47f0-a74c-cd259de5b45b/PAMAP2_Dataset/PAMAP2_Dataset/',
)
# CSV file that the loading cell reads.
data_path = os.path.join(data_dir, 'pamap_small_with_id.csv')

In [4]:
# Raw activityID values kept for this experiment; every other row is dropped.
KEPT_ACTIVITY_IDS = [2, 4, 5, 6, 17]
# Raw activityID -> class label. With the filter above only the keys 2, 4, 5, 6 and 17
# can occur, which yields the contiguous labels 0..4.
ACTIVITY_ID_TO_CLASS = {1: 0, 2: 0, 3: 0, 4: 1, 5: 2, 6: 3, 16: 4, 17: 4}

# Load, filter and relabel in a single chain. `.assign` returns a new frame, so the
# label column is never written onto a filtered view of the raw data (the pattern
# that triggers pandas' SettingWithCopyWarning).
df_init = (
    pd.read_csv(data_path)
    .loc[lambda frame: frame.activityID.isin(KEPT_ACTIVITY_IDS)]
    .assign(activityID=lambda frame: frame.activityID.replace(ACTIVITY_ID_TO_CLASS))
)
df_init.shape

(84416, 42)

In [5]:
# Subject-wise split: whole subjects go to either train or test. Rows belonging to a
# subjectID that appears in neither list are left out of both frames.
TRAIN_SUBJECTS = [101, 102, 104, 105]
TEST_SUBJECTS = [106, 107, 108]

print(f'full df shape {df_init.shape}')
df_train = df_init[df_init['subjectID'].isin(TRAIN_SUBJECTS)]
df_test = df_init[df_init['subjectID'].isin(TEST_SUBJECTS)]
# The full frame is not used again below; drop the name to release memory.
del df_init
print(f'{df_train.shape = }, {df_test.shape = }')

full df shape (84416, 42)
df_train.shape = (43699, 42), df_test.shape = (32876, 42)


In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Preprocessing: standardise every feature column. The scaler is fitted on the
# training subjects only and later reused unchanged on the test subjects.
# Disabled alternative with mean imputation:
#   make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
train_features = df_train.drop(columns=['activityID', 'subjectID'])
pipe = make_pipeline(StandardScaler())
X_train = pipe.fit_transform(train_features)
y_train = df_train.activityID

In [7]:
from lightgbm import LGBMClassifier

In [16]:
# Hyper-parameter search over L2 regularisation strength and tree depth,
# scored with one-vs-one ROC-AUC.
# NOTE(review): cv=5 splits rows without looking at subjectID, whereas the final
# evaluation is on subjects never seen in training -- consider subject-grouped CV; confirm.
param_grid = dict(
    reg_lambda=[10, 50, 100, 300],
    max_depth=[3, 5, 10],
)
grid_search = GridSearchCV(
    estimator=LGBMClassifier(),
    param_grid=param_grid,
    scoring='roc_auc_ovo',
    cv=5,
)
grid_search.fit(X_train, y_train)
grid_search.best_score_, grid_search.best_params_

In [19]:
# GridSearchCV was built without `refit=...`, so it uses the default refit=True and has
# already refit the best parameter combination on the whole of (X_train, y_train).
# Reuse that fitted estimator instead of training an identical model a second time.
model = grid_search.best_estimator_

In [20]:
# Held-out subjects: apply the scaler fitted on the training data (transform only, no refit).
test_features = df_test.drop(columns=['activityID', 'subjectID'])
X_test = pipe.transform(test_features)
y_test = df_test.activityID

In [21]:
from sklearn.metrics import roc_auc_score

# One-vs-one ROC-AUC from class probabilities; prints the test score first, then the train score.
preds = model.predict_proba(X_test)
train_preds = model.predict_proba(X_train)
test_auc = roc_auc_score(y_test, preds, multi_class='ovo')
train_auc = roc_auc_score(y_train, train_preds, multi_class='ovo')
print(test_auc, train_auc)

0.9567555286405565 0.9999999802981714


In [22]:
from sklearn.metrics import accuracy_score

# Accuracy of the most probable class; prints the test score first, then the train score.
test_proba = model.predict_proba(X_test)
train_proba = model.predict_proba(X_train)
# The predicted label is the index of the largest probability in each row.
preds = np.argmax(test_proba, axis=1)
train_preds = np.argmax(train_proba, axis=1)

print(accuracy_score(y_test, preds), accuracy_score(y_train, train_preds))

0.7514600316340188 0.9999084647245933


In [23]:
# Confusion matrix for the data currently bound to X_test / y_test:
# rows are true labels, columns are predicted labels.
proba = model.predict_proba(X_test)
preds = proba.argmax(axis=1)

# Use one row and one column per probability column of the model, so a class that is
# never predicted (or is absent from y_test) shows up as explicit zeros instead of
# shrinking the matrix and shifting or overflowing the indices.
# Like the indexing it replaces, this treats label k as probability column k.
class_ids = list(range(proba.shape[1]))

df_conf = pd.DataFrame({'true': y_test, 'pred': preds})
confusion = (
    pd.crosstab(df_conf.true, df_conf.pred)
    .reindex(index=class_ids, columns=class_ids, fill_value=0)
    .to_numpy(dtype=float)
)
# Print plain counts rather than scientific notation.
np.set_printoptions(suppress=True)
print(confusion)

[[4917.    8.    0.    0.  400.]
 [ 936. 7345.    0.    7.    9.]
 [ 166.  163. 1762. 1758.    0.]
 [ 148.  429.    4. 5688.    4.]
 [3643.  496.    0.    0. 4993.]]


In [321]:
# Per-subject evaluation: OvO ROC-AUC and accuracy for each held-out subject,
# each printed next to the corresponding training-set score.

# The training-set scores do not depend on the test subject, so compute them once
# instead of re-predicting the whole training set on every iteration.
train_proba = model.predict_proba(X_train)
train_auc = roc_auc_score(y_train, train_proba, multi_class='ovo')
train_preds = train_proba.argmax(axis=1)
train_acc = accuracy_score(y_train, train_preds)

for test_subj in [106, 107, 108]:
    print(test_subj)
    df_subj = df_test[df_test.subjectID == test_subj]
    # NOTE(review): X_test / y_test are rebound on every iteration, so after the loop
    # they hold only the last subject's rows; the cells below read these names.
    X_test = pipe.transform(df_subj.drop(columns=['activityID', 'subjectID']))
    y_test = df_subj['activityID']

    # Predict once per subject and reuse the probabilities for both metrics.
    test_proba = model.predict_proba(X_test)
    print('ovo auc: ', roc_auc_score(y_test, test_proba, multi_class='ovo'), train_auc)

    preds = test_proba.argmax(axis=1)
    print('acc: ', accuracy_score(y_test, preds), train_acc)

106
ovo auc:  0.9237865188738986 0.9999990263854798
acc:  0.6115155526141628 0.9991046672828097
107
ovo auc:  0.8824833191073369 0.9999990263854798
acc:  0.6654554567096855 0.9991046672828097
108
ovo auc:  0.9922518109956883 0.9999990263854798
acc:  0.8860780065005417 0.9991046672828097


In [322]:
# Class probabilities for the first two rows of X_test, shown next to their true labels.
model.predict_proba(X_test[:2]), y_test.iloc[:2]

(array([[0.98832832, 0.00254618, 0.00268733, 0.0012962 , 0.00514196],
        [0.98770709, 0.00268171, 0.00283037, 0.00136519, 0.00541565]]),
 153553    0
 153554    0
 Name: activityID, dtype: int64)

In [323]:
# Class probabilities for the last two rows of X_test, shown next to their true labels.
model.predict_proba(X_test[-2:]), y_test.iloc[-2:]

(array([[0.01062628, 0.00448248, 0.94902743, 0.03141608, 0.00444773],
        [0.01060995, 0.00447559, 0.94756912, 0.03290445, 0.00444089]]),
 176601    2
 176602    2
 Name: activityID, dtype: int64)

In [309]:
# Number of rows per class label in whatever y_test currently holds, most frequent first.
y_test.value_counts(sort=True, ascending=False)

activityID
0    6607
4    5226
1    2873
3    2329
2    1425
Name: count, dtype: int64