# Project Fatigue
- GitHub [link](https://github.com/romainmartinez/fatigue)

## todos
- onehot encoding
- feature importance in pipeline

In [6]:
# Common imports
import scipy.io as sio
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn theme first: `sns.set_context` rewrites the font-size
# rcParams (the axes/xtick/ytick label sizes among them), so explicit
# overrides must come afterwards or they are silently discarded.
sns.set_style("ticks")
sns.set_context("notebook", font_scale=1.1)
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # font type 42 embeds TrueType fonts in exported PDF/PS figures
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# to make this notebook's output stable across runs
# (seeds NumPy's global generator; the same constant is also handed to
# train_test_split as `random_state`)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 0. load data

In [7]:
# Location of the MATLAB database, relative to the notebook
DATA_PATH = os.path.join('.', 'data/')
DATA_FILE = 'DatabaseRPT.mat'

# The variable `DataBaseRPT` comes back wrapped in a 2-D array; element [0, 0]
# is the record whose fields (`AllX`, `Y`, ...) are read below.
mat_contents = sio.loadmat(os.path.join(DATA_PATH, DATA_FILE))
mat = mat_contents['DataBaseRPT'][0, 0]

- label
    - `Y` (1, 162): 1 = prefatigue, 2 = fatigue

- used features
    - `AllX` (24, 162): 24 (6 DoF x 4 variables) x 162 (81 participants x 2 time points).
    - `CAssignAll` (1, 24): AllX column assignment

    - `Sex` (1, 162)
    - `Endurance` (1, 162)

- not used
    - `BestX` (7, 162): 7 (variables with SRM>0.8) x 162 (81 participants x 2 time points) matrix. Contains data only for the most responsive variables (SRM>0.8).
    - `CAssignBest` (1, 7): BestX column assignment.
    - `Age` (1, 162): too many NaNs.
    - `Height` (1, 162): too many NaNs.
    - `Weight` (1, 162): too many NaNs.
    - `SubjectKey` (1, 162): useless.
    - `SID` (1, 162): useless.

### create feature matrix

In [8]:
# Names of the `AllX` variables, in column order
feature_names = [entry[0] for entry in mat['CAssignAll'].ravel()]

# Observations as rows: the `AllX` variables first, then `Sex` and
# `Endurance` appended as the two right-most columns.
X = np.hstack([mat['AllX'].T, mat['Sex'].T, mat['Endurance'].T])

# Column labels, in the same order as the columns of X
col_names = np.array(feature_names + ['Sex', 'Endurance'])

In [17]:
# Names of the variables with SRM > .8, as listed in `CAssignBest`
best_entries = mat['CAssignBest'].ravel()
SRM_names = np.array([entry[0] for entry in best_entries])

In [21]:
# Sanity check: one label per column of X (the AllX variables + Sex + Endurance)
col_names.shape

(26,)

In [23]:
# Column names that are not among the SRM > .8 variables (sorted, unique)
np.setdiff1d(col_names, SRM_names)

array(['AveElbFlexMean', 'AveElbFlexSD', 'AveShEleSD', 'AveShPlaneMean',
       'AveShPlaneSD', 'AveTrYMean', 'AveTrYSD', 'AveTrZSD', 'Endurance',
       'ROMElbFlexMean', 'ROMElbFlexSD', 'ROMShEleMean', 'ROMShEleSD',
       'ROMShPlaneMean', 'ROMShPlaneSD', 'ROMTrYSD', 'ROMTrZMean',
       'ROMTrZSD', 'Sex'], dtype='<U14')

### create label vector

In [10]:
# Labels transposed so that rows line up with the rows of X; flattened with
# `.ravel()` further down where a 1-D target is passed to `fit`.
y = mat['Y'].T

### split data & shuffle

In [11]:
# split data & shuffle
from sklearn.model_selection import train_test_split

# Hold out 20 % of the observations; RANDOM_SEED fixes which rows are held out.
# NOTE(review): the data description lists two time points per participant, so
# a purely random split presumably puts observations of the same participant
# in both sets — consider a group-aware split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

## 1. Pipeline

In [12]:
import pandas as pd
# Preview the first rows of the feature matrix together with their column labels
pd.DataFrame(X, columns=col_names).head()

Unnamed: 0,AveElbFlexMean,ROMElbFlexMean,AveElbFlexSD,ROMElbFlexSD,AveShPlaneMean,ROMShPlaneMean,AveShPlaneSD,ROMShPlaneSD,AveShEleMean,ROMShEleMean,...,AveTrYMean,ROMTrYMean,AveTrYSD,ROMTrYSD,AveTrZMean,ROMTrZMean,AveTrZSD,ROMTrZSD,Sex,Endurance
0,105.983144,63.339618,2.219401,1.794134,23.518597,13.624799,1.338393,2.357813,80.274017,2.942674,...,16.382423,13.629558,1.112671,1.977798,-0.550047,2.671822,0.228494,0.711787,2.0,2.0
1,106.166216,61.875226,2.23931,2.548458,22.851141,11.726534,1.18249,1.401127,77.897777,4.062635,...,16.04163,15.078508,0.910697,1.003788,-0.208865,1.242553,0.541,0.496108,2.0,2.0
2,100.914284,60.930029,1.482458,1.167593,27.289316,12.291697,1.223189,2.452705,83.932823,2.886935,...,15.039772,14.311074,0.97972,2.070705,0.096174,0.934767,0.48854,0.317386,2.0,8.0
3,105.066549,59.524098,2.364182,1.849991,23.083816,7.519415,1.51,3.503819,78.701025,2.99799,...,16.07012,16.178023,1.418294,3.521951,-0.544067,1.88548,0.610577,0.851287,2.0,8.0
4,93.259333,81.531138,2.610755,3.632922,42.387152,27.789377,1.60553,2.027628,71.280986,8.000008,...,8.908385,8.815273,0.709582,1.250228,-0.123442,1.06591,0.415764,0.404298,2.0,4.0


In [13]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin

# decomposition
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA, NMF

# Scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

# models
from xgboost import XGBClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# other
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

class FeatureSelector(BaseEstimator, TransformerMixin):
    """Pass every column through, or keep only a named subset of columns.

    Parameters
    ----------
    selection : {'raw', 'srm'}, default='raw'
        'raw' returns the input unchanged; 'srm' keeps only the columns whose
        name appears in `selected_names`.
    feature_names : sequence of str, optional
        Name of each column of the array given to `transform`, in column
        order. Required when `selection='srm'`.
    selected_names : sequence of str, optional
        Names of the columns to keep. Required when `selection='srm'`.
    """

    def __init__(self, selection='raw', feature_names=None, selected_names=None):
        self.selection = selection
        self.feature_names = feature_names
        self.selected_names = selected_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X):
        """Return X restricted to the configured columns.

        Raises
        ------
        ValueError
            If `selection` is unknown, if the names needed by 'srm' were not
            provided, or if `feature_names` does not have one entry per
            column of X.
        """
        if self.selection == 'raw':
            return X
        if self.selection != 'srm':
            raise ValueError(
                f"unknown selection {self.selection!r}; expected 'raw' or 'srm'")
        if self.feature_names is None or self.selected_names is None:
            raise ValueError(
                "selection='srm' requires both `feature_names` and `selected_names`")

        names = np.asarray(self.feature_names)
        if names.shape[0] != X.shape[1]:
            raise ValueError(
                f"`feature_names` has {names.shape[0]} entries but X has "
                f"{X.shape[1]} columns")
        # boolean mask over the columns, True where the name is selected
        keep = np.isin(names, self.selected_names)
        return X[:, keep]

class Scale(BaseEstimator, TransformerMixin):
    """Feature scaling selected by name, so the choice can be grid-searched.

    The scaler is fitted in `fit` and only applied in `transform`, so the
    statistics learned on the training data are reused for validation/test
    data instead of being re-estimated on them.

    Parameters
    ----------
    strategy : {'raw', 'minmaxscaler', 'maxabsscaler', 'standardscaler',
                'robustscaler', 'normalizer'}, default='raw'
        'raw' leaves the data untouched; the other values pick the
        scikit-learn transformer of the same name.

    Attributes
    ----------
    scaler_ : fitted transformer or None
        The transformer fitted by `fit` (None for the 'raw' strategy).
    """

    def __init__(self, strategy='raw'):
        self.strategy = strategy

    def _make_scaler(self):
        """Build the unfitted transformer for `self.strategy` (None for 'raw')."""
        if self.strategy == 'raw':
            return None
        if self.strategy == 'minmaxscaler':
            return MinMaxScaler()
        if self.strategy == 'maxabsscaler':
            return MaxAbsScaler()
        if self.strategy == 'standardscaler':
            return StandardScaler()
        if self.strategy == 'robustscaler':
            return RobustScaler(quantile_range=(25, 75))
        if self.strategy == 'normalizer':
            return Normalizer()
        raise ValueError(f'unknown scaling strategy {self.strategy!r}')

    def fit(self, X, y=None):
        """Learn the scaling statistics from X; returns self."""
        scaler = self._make_scaler()
        if scaler is not None:
            scaler.fit(X)
        self.scaler_ = scaler
        return self

    def transform(self, X):
        """Return a scaled copy of X using the statistics learned in `fit`."""
        if not hasattr(self, 'scaler_'):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('Scale must be fitted before calling transform')
        if self.scaler_ is None:
            # 'raw': hand back a copy so callers never alias the input
            return X.copy()
        return self.scaler_.transform(X)

class Decomposition(BaseEstimator, TransformerMixin):
    """Optional dimensionality reduction selected by name.

    The decomposition is fitted in `fit` and only applied in `transform`, so
    the components learned on the training data are reused for
    validation/test data instead of being re-estimated on them.

    Parameters
    ----------
    strategy : {'raw', 'pca'}, default='raw'
        'raw' returns the input unchanged; 'pca' projects it with PCA.
    N : int or None, default=None
        Passed to PCA as `n_components`. Ignored when `strategy='raw'`.

    Attributes
    ----------
    decomposer_ : fitted PCA or None
        The model fitted by `fit` (None for the 'raw' strategy).
    """

    def __init__(self, strategy='raw', N=None):
        self.strategy = strategy
        self.N = N

    def fit(self, X, y=None):
        """Fit the decomposition on X; returns self."""
        if self.strategy == 'raw':
            self.decomposer_ = None
        elif self.strategy == 'pca':
            self.decomposer_ = PCA(iterated_power=7, n_components=self.N).fit(X)
        else:
            raise ValueError(f'unknown decomposition strategy {self.strategy!r}')
        return self

    def transform(self, X):
        """Project X with the decomposition learned in `fit` (identity for 'raw')."""
        if not hasattr(self, 'decomposer_'):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('Decomposition must be fitted before calling transform')
        if self.decomposer_ is None:
            return X
        return self.decomposer_.transform(X)
    
def get_categorical_cols(X):
    """Return the `Sex` and `Endurance` columns of X, located via `col_names`."""
    is_categorical = (col_names == 'Sex') | (col_names == 'Endurance')
    return X[:, is_categorical]

def get_numerical_cols(X):
    """Return every column of X except `Sex` and `Endurance` (located via `col_names`)."""
    is_categorical = (col_names == 'Sex') | (col_names == 'Endurance')
    return X[:, ~is_categorical]

In [8]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

# Categorical branch: pick `Sex` and `Endurance`, then one-hot encode them.
# `handle_unknown='ignore'` keeps a cross-validation fold from failing when a
# level is missing from its training part (the encoder produces far more
# columns than it has input features, so some levels are rare).
pipeline_categorical = Pipeline([
    ('selector', FunctionTransformer(get_categorical_cols, validate=False)),
    ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# Numerical branch: every other column, optionally sub-selected, scaled and
# expanded with polynomial terms.
pipeline_numerical = Pipeline([
    ('selector', FunctionTransformer(get_numerical_cols, validate=False)),
    ('featureselector', FeatureSelector(selection='raw')),
    ('scale', Scale(strategy='raw')),
    ('polyfeatures', PolynomialFeatures(degree=1))
])

# Both branches side by side
pipeline_preprocessing = FeatureUnion([
    ('categorical', pipeline_categorical),
    ('numerical', pipeline_numerical)
])

# The first step wraps the whole FeatureUnion; its name is kept as
# 'categorical' because the grid-search parameter keys are built from it.
pipeline_preprocessing_with_decomposition = Pipeline([
    ('categorical', pipeline_preprocessing),
    ('decomposition', Decomposition(strategy='raw', N=10))
])

In [10]:
from sklearn.model_selection import GridSearchCV
import pprint

# Candidate classifiers; commented-out entries are skipped.
models = {
#     'XGBClassifier': XGBClassifier(),
    'ExtraTreeClassifier': ExtraTreeClassifier(),
    'LogisticRegression': LogisticRegression(),
#     'GaussianNB': GaussianNB(),
#     'SVC': SVC(), 
#     'KNeighborsClassifier': KNeighborsClassifier()
}

# Hyper-parameters shared by every model; the scaling strategies are added
# per model inside the loop.
param_grid = {
    'preprocessing__categorical__numerical__polyfeatures__degree': [1, 2, 3],
    'preprocessing__decomposition__strategy': ['raw', 'pca'],
    'preprocessing__decomposition__N': [1, 3, 9, 25]
}

# NOTE: not passed to GridSearchCV below, which selects on accuracy only.
scoring = ['accuracy', 'precision', 'recall', 'f1']

TREE_MODELS = ('XGBClassifier', 'ExtraTreeClassifier')
SCALE_KEY = 'preprocessing__categorical__numerical__scale__strategy'

for model_name, model in models.items():
    print(f'{model_name}'.upper())
    
    # trees do not need scaling
    # (membership test rather than `is`: identity of string objects is not
    # a reliable way to compare their values)
    if model_name in TREE_MODELS:
        param_grid[SCALE_KEY] = ['raw']
    else:
        param_grid[SCALE_KEY] = ['raw', 'minmaxscaler', 'standardscaler', 'robustscaler', 'normalizer']
    
    pipeline_full = Pipeline([
        ('preprocessing', pipeline_preprocessing_with_decomposition),
        ('classifier', model)
    ])
    
    grid_search = GridSearchCV(pipeline_full, param_grid, cv=5, scoring='accuracy',
                                verbose=1)
    grid_search.fit(X_train, y_train.ravel())
    
    print(f'\tbest score: {grid_search.best_score_:.3f}')
    print('\tbest params')
    pprint.pprint(grid_search.best_params_)
    
    print('-' * 10)

EXTRATREECLASSIFIER
Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    8.4s finished


	best score: 0.713
	best params
{'preprocessing__categorical__numerical__polyfeatures__degree': 3,
 'preprocessing__categorical__numerical__scale__strategy': 'raw',
 'preprocessing__decomposition__N': 25,
 'preprocessing__decomposition__strategy': 'raw'}
----------
LOGISTICREGRESSION
Fitting 5 folds for each of 120 candidates, totalling 600 fits


KeyboardInterrupt: 

In [10]:
# polyfeature degree = [1, 2, 3]
# decomposition strategy = ['raw', 'pca', 'selectkbest']
# decomposition n = [1, 3, 9, 26]
# model = ['XGBClassifier', 'ExtraTreeClassifier', 'LogisticRegression', 'GaussianNB', 'SVC', 'KNeighborsClassifier']

In [71]:
# Sanity check: width of the numerical branch (the numerical variables plus,
# presumably, the constant column PolynomialFeatures adds at degree 1)
pipeline_numerical.fit_transform(X).shape

(162, 25)

In [72]:
# Sanity check: width of the one-hot encoded `Sex` / `Endurance` block
pipeline_categorical.fit_transform(X).shape

(162, 21)

In [73]:
# Sanity check: the union places the categorical and numerical blocks side by side
pipeline_preprocessing.fit_transform(X).shape

(162, 46)

In [74]:
# Sanity check: with the 'raw' strategy the decomposition step returns its input as is
pipeline_preprocessing_with_decomposition.fit_transform(X).shape

(162, 46)

In [27]:
# Fit on the whole training set. NOTE: `pipeline_full` is whatever the
# grid-search loop built last, i.e. the pipeline for the last entry of
# `models` with the default (not the tuned) hyper-parameters.
pipeline_full.fit(X_train, y_train.ravel())

Pipeline(memory=None,
     steps=[('preprocessing', Pipeline(memory=None,
     steps=[('categorical', FeatureUnion(n_jobs=1,
       transformer_list=[('categorical', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function get_categorical_cols at 0x7f4a026f50d0>,
       ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [21]:
# Predicted labels for the held-out observations
predictions = pipeline_full.predict(X_test)

  if diff:


In [22]:
# Share of held-out observations predicted correctly, in percent
np.sum(y_test.ravel() == predictions) / y_test.shape[0] * 100

69.6969696969697