In [108]:
# Работа с табличными данными
import pandas as pd
import numpy as np

# Пайплайн
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Преобразование признаков
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Модели
from sklearn.linear_model import LogisticRegression

# Валидация
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Визуализация
import plotly.express as px
import plotly.io as pio
pio.templates.default = 'plotly_dark'

from collections import deque

from motorica.utils import *

from typing import Any

## Базовое решение с использованием *Logistic Regression*

In [109]:
# Location of the CSV file with per-montage meta information
METAINFO_PATH = 'marked/selected_montages.csv'
# Bare last expression: the returned table is rendered as the cell output.
# NOTE(review): read_meta_info is not defined in this notebook; presumably it
# is provided by the star import from motorica.utils — confirm.
read_meta_info(METAINFO_PATH)

Unnamed: 0_level_0,pilote_id,last_train_idx,len(train),len(test),ts_delta,ticks_per_gest,n_gestures,ACC,GYR,hi_val_sensors,mark_sensors
montage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-05-15_16-16-08.palm,1,23337,23337,5810,33.0,46.0,271.0,True,True,"[3, 4, 6, 12, 13, 16, 17, 21, 22, 27, 28, 30, ...","[3, 4, 6, 12, 13, 16, 17, 21, 22, 27, 28, 30, ..."
2023-05-15_17-12-24.palm,1,23336,23336,5803,33.0,46.0,271.0,True,True,"[3, 4, 6, 12, 13, 16, 17, 21, 22, 27, 28, 30, ...","[3, 4, 6, 12, 13, 16, 17, 21, 22, 27, 28, 30, ..."
2023-06-05_16-12-38.palm,1,17939,17939,4431,33.0,30.0,361.0,True,True,"[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3...","[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3..."
2023-06-05_17-53-01.palm,1,17771,17771,4435,33.0,31.0,361.0,True,True,"[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3...","[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3..."
2023-06-20_14-43-11.palm,1,17936,17936,4441,33.0,31.0,361.0,True,True,"[3, 4, 5, 6, 12, 13, 16, 17, 21, 27, 28, 30, 3...","[3, 5, 6, 12, 13, 16, 17, 21, 27, 28, 30, 31, ..."
2023-06-20_13-30-15.palm,1,17928,17928,4435,33.0,31.0,361.0,True,True,"[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3...","[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3..."
2023-06-20_12-34-17.palm,1,17758,17758,4444,33.0,31.0,361.0,True,True,"[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3...","[3, 4, 5, 6, 12, 13, 16, 17, 21, 22, 27, 28, 3..."
2023-09-30_08-06-44.palm,2,5693,5693,5509,33.0,31.0,181.0,True,True,"[7, 9, 10, 18, 20, 23, 26, 28, 31, 34, 37, 39]","[7, 9, 10, 18, 20, 23, 26, 28, 31, 34, 37, 39]"
2023-09-29_11-03-50.palm,2,5694,5694,5511,33.0,31.0,181.0,True,True,"[7, 9, 10, 18, 20, 23, 26, 28, 31, 34, 37, 39]","[7, 9, 10, 18, 20, 23, 26, 28, 34, 37, 39]"
2023-09-29_09-20-47.palm,2,5690,5690,5507,33.0,31.0,181.0,True,True,"[7, 9, 10, 18, 20, 23, 26, 28, 31, 34, 37, 39]","[7, 9, 10, 18, 20, 23, 26, 28, 31, 34, 37, 39]"


In [110]:
def read_train_and_test(
        montage: str,
        features: list[str],
        target_col: str = 'act_label',
        subdir: str = 'marked/'
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.DataFrame]:
    """Load the train / test split and the full recording of one montage.

    Three CSV files are read from ``subdir``: ``<montage>.train``,
    ``<montage>.test`` (both with the first column used as the index) and
    ``<montage>.marked`` (read without an index column).

    Parameters
    ----------
    montage : str
        Base file name of the montage (without the ``.train`` / ``.test`` /
        ``.marked`` suffix).
    features : list[str]
        Columns to keep in the feature matrices, in this order.
    target_col : str, default='act_label'
        Name of the target column.
    subdir : str, default='marked/'
        Directory that holds the files; a trailing separator is optional.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, data_full)``.

    Raises
    ------
    FileNotFoundError
        If one of the three files does not exist.
    KeyError
        If ``target_col`` or one of ``features`` is missing from the data, or
        if ``target_col`` itself is listed in ``features``.
    """
    import os

    def _read(suffix: str, index_col):
        # os.path.join inserts a separator only when ``subdir`` lacks one,
        # so both 'marked/' and 'marked' resolve to the same file.
        return pd.read_csv(os.path.join(subdir, montage + suffix), index_col=index_col)

    data_train = _read(".train", 0)
    data_test = _read(".test", 0)
    data_full = _read(".marked", None)

    def _split(data: pd.DataFrame):
        # The target is dropped before selecting the features so that it can
        # never end up in the feature matrix by accident.
        return data.drop(target_col, axis=1)[features], data[target_col]

    X_train, y_train = _split(data_train)
    X_test, y_test = _split(data_test)
    return X_train, X_test, y_train, y_test, data_full

In [111]:
# Montage (recording) used in the rest of the notebook.
# Another montage that was tried here earlier: "2023-05-22_17-04-29.palm"
montage = "2023-05-22_20-22-01.palm"
montage_info = read_meta_info(METAINFO_PATH).loc[montage]

print(montage)
display(montage_info)

# Feature set: the sensors listed in 'hi_val_sensors' for this montage plus
# the columns named in cols_gyr.
# NOTE(review): cols_gyr is not defined in this notebook; presumably it comes
# from the star import of motorica.utils — confirm.
features = montage_info['hi_val_sensors'] + cols_gyr

X_train, X_test, y_train, y_test, _ = read_train_and_test(montage, features)

# The scaler is fitted on the train part only and then applied to the test
# part, so no test statistics are used for scaling.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns
)

2023-05-22_20-22-01.palm


pilote_id                                                         2
last_train_idx                                                15602
len(train)                                                    15602
len(test)                                                      3872
ts_delta                                                       33.0
ticks_per_gest                                                 46.0
n_gestures                                                    181.0
ACC                                                            True
GYR                                                            True
hi_val_sensors    [5, 7, 9, 10, 15, 18, 20, 23, 26, 28, 34, 37, 39]
mark_sensors             [7, 9, 10, 18, 20, 23, 26, 28, 34, 37, 39]
Name: 2023-05-22_20-22-01.palm, dtype: object

In [112]:
# Baseline model: a plain logistic regression trained on the scaled features
# (fit returns the estimator itself, so construction and training are chained)
lr = LogisticRegression(C=500, max_iter=5000).fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# Disabled: probability of the predicted class for every test sample
#proba = lr.predict_proba(X_test_scaled)
#y_proba = np.array([p[y_pred[i]] for i, p in enumerate(proba)])

In [113]:
# Unscaled test features together with the true and the predicted labels;
# labels are multiplied by 100, presumably to make them visible on the same
# axis as the sensor values.
fig_data = X_test.assign(true=y_test * 100, pred=y_pred * 100)
#fig_data['proba'] = y_proba * 100

fig = px.line(fig_data, title=montage, width=1000, height=700)
# Thin lines; the call returns the figure, which becomes the cell output
fig.update_traces(line={'width': 1})

In [114]:
# Per-class precision / recall / F1 of the current y_pred on the test set.
# zero_division=0 makes undefined metrics (zero denominator) count as 0.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      2666
           1       0.95      0.94      0.95       249
           2       0.95      0.96      0.96       257
           3       1.00      0.88      0.93       234
           4       0.85      0.99      0.92       201
           5       0.93      0.84      0.89       262

    accuracy                           0.96      3869
   macro avg       0.94      0.93      0.94      3869
weighted avg       0.96      0.96      0.96      3869



## Использование лаговых признаков

In [115]:
class LagWrapper(BaseEstimator, TransformerMixin):
    """Train and query an estimator on lag (sliding-window) features.

    Every sample handed to the wrapped estimator is the concatenation of ``w``
    consecutive input rows (rows ``i .. i + w - 1``) and is paired with the
    target of the last row of that window. The first ``w - 1`` rows have no
    complete window: they are skipped in ``fit`` and receive placeholder
    outputs in ``predict`` / ``predict_proba``, so outputs always have one
    entry per input row.

    Parameters
    ----------
    estimator : object
        Unfitted model. It is fitted in place by ``fit`` and must expose
        ``classes_`` after fitting.
    w : int, default=3
        Window width (number of consecutive rows per sample), at least 1.
    fill_val : Any, default=0
        Class label reported for the leading rows without a complete window.
    """
    # NOTE(review): TransformerMixin is inherited although no transform() is
    # defined; a classifier mixin may be the better fit — confirm before
    # changing the bases.

    def __init__(
        self,
        estimator,         # model object (unfitted)
        w: int = 3,        # window width
        fill_val: Any = 0  # class label for the first w - 1 predictions
    ):
        self.estimator = estimator
        self.w = w
        self.fill_val = fill_val

    def _make_lag(self, X: pd.DataFrame | np.ndarray):
        """Return the lag-feature matrix of shape ``(max(n - w + 1, 0), m * w)``.

        Row ``i`` of the result is rows ``i .. i + w - 1`` of ``X`` flattened
        in row-major order. An input with fewer than ``w`` rows yields an
        empty matrix with zero rows.

        Raises
        ------
        ValueError
            If ``w`` is smaller than 1 or ``X`` is not two-dimensional.
        """
        w = self.w
        if w < 1:
            raise ValueError(f"w must be a positive integer, got {w!r}")
        X_arr = np.asarray(X)
        n, m = X_arr.shape  # n - number of rows, m - number of columns
        n_windows = max(n - w + 1, 0)
        # rows[i, k] == i + k: index of the k-th row inside the i-th window
        rows = np.arange(n_windows)[:, None] + np.arange(w)[None, :]
        # Integer-array indexing builds a fresh (n_windows, w, m) array, so
        # the result never aliases the caller's data.
        return X_arr[rows].reshape(n_windows, w * m)

    def fit(self, X, y):
        """Fit the wrapped estimator on lag features; returns ``self``.

        The first ``w - 1`` targets are skipped because their windows are
        incomplete.

        Raises
        ------
        ValueError
            If ``X`` has fewer than ``w`` rows.
        """
        lagged = self._make_lag(X)
        if len(lagged) == 0:
            raise ValueError(
                f"At least w={self.w} rows are required to build a lag window, "
                f"got {len(X)}"
            )
        self.estimator.fit(lagged, np.array(y)[self.w - 1:])
        # Mirror the fitted model's classes_ on the wrapper
        self.classes_ = self.estimator.classes_
        return self

    def predict(self, X):
        """Predict one label per row of ``X``.

        The leading ``min(n, w - 1)`` entries are ``fill_val``.
        """
        lagged = self._make_lag(X)
        # np.full derives its dtype from fill_val, so an empty prefix (w == 1)
        # does not promote integer labels to float.
        prefix = np.full(len(X) - len(lagged), self.fill_val)
        if len(lagged) == 0:
            # Not a single complete window: nothing to ask the model about
            return prefix
        return np.concatenate([prefix, self.estimator.predict(lagged)])

    def predict_proba(self, X):
        """Predict class probabilities, one row per row of ``X``.

        The leading ``min(n, w - 1)`` rows hold the uniform distribution
        ``1 / n_classes``.
        """
        n_classes = len(self.estimator.classes_)
        lagged = self._make_lag(X)
        prefix = np.full((len(X) - len(lagged), n_classes), 1 / n_classes)
        if len(lagged) == 0:
            return prefix
        return np.vstack([prefix, self.estimator.predict_proba(lagged)])

    def set_params(self, **params):
        """Set wrapper and/or wrapped-estimator parameters; returns ``self``.

        ``estimator``, ``w`` and ``fill_val`` are set on the wrapper. Every
        other keyword is forwarded to the wrapped estimator, either under its
        plain name (``C=10``) or with the ``estimator__`` prefix
        (``estimator__C=10``).
        """
        # 'estimator' goes first so that forwarded parameters reach a newly
        # supplied model rather than the one being replaced.
        for name in ('estimator', 'w', 'fill_val'):
            if name in params:
                setattr(self, name, params.pop(name))
        prefix = 'estimator__'
        forwarded = {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in params.items()
        }
        if forwarded:
            self.estimator.set_params(**forwarded)
        return self

# Wrap a logistic regression; fit and predict exactly like a regular model
# (fit returns the wrapper itself, so the two calls are chained)
lr_lag = LagWrapper(LogisticRegression(C=200, max_iter=5000), w=5)
y_pred = lr_lag.fit(X_train_scaled, y_train).predict(X_test_scaled)
print(classification_report(y_test, y_pred, zero_division=0))


# Unscaled test features with the true and the predicted labels
fig_data = X_test.assign(true=y_test * 100, pred=y_pred * 100)

# Add the predicted class probabilities to the visualisation
y_pred_proba = pd.DataFrame(
    lr_lag.predict_proba(X_test_scaled),
    index=fig_data.index,
    columns=[f'proba_{label}' for label in lr_lag.classes_],
)
fig_data = pd.concat([fig_data, y_pred_proba * 100], axis=1)

fig = px.line(fig_data, title=montage, width=1000, height=700)
fig.update_traces(line={'width': 1})

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2666
           1       0.98      0.97      0.98       249
           2       0.97      0.97      0.97       257
           3       0.99      0.93      0.96       234
           4       0.87      1.00      0.93       201
           5       0.95      0.82      0.88       262

    accuracy                           0.97      3869
   macro avg       0.95      0.94      0.95      3869
weighted avg       0.97      0.97      0.97      3869

