# Project Fatigue
- GitHub [link](https://github.com/romainmartinez/fatigue)

## todos
- one-hot encoding
- feature importance in pipeline

In [1]:
# Common imports
import scipy.io as sio
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn theme first: `sns.set_context` writes its own (scaled)
# font sizes into matplotlib's rcParams, so label sizes assigned before this
# call would be silently overwritten.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")

# Explicit overrides, applied after seaborn so that they actually take effect
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # font type 42 = TrueType fonts in exported PDF / PostScript figures
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 0. load data

In [2]:
# Where the MATLAB database lives, relative to the notebook
DATA_PATH = os.path.join('.', 'data/')
DATA_FILE = 'DatabaseRPT.mat'

# Read the .mat file and keep the single `DataBaseRPT` record; the [0, 0]
# index unwraps it from the 2-D array returned by `loadmat`
# (presumably a 1x1 MATLAB struct -- TODO confirm).
mat_path = os.path.join(DATA_PATH, DATA_FILE)
mat = sio.loadmat(mat_path)['DataBaseRPT'][0, 0]

- label
    - `Y` (1, 162): 1 = prefatigue, 2 = fatigue

- used features
    - `AllX` (24, 162): 24 (6 DoF x 4 variables) x 162 (81 participants x 2 time points).
    - `CAssignAll` (1, 24): AllX column assignment

    - `Sex` (1, 162)
    - `Endurance` (1, 162)

- not used
    - `BestX` (7, 162): 7 (variables with SRM>0.8) x 162 (81 participants x 2 time points) matrix. Contains data only for the most responsive variables (SRM>0.8).
    - `CAssignBest` (1, 7): BestX column assignment.
    - `Age` (1, 162): too many NaNs.
    - `Height` (1, 162): too many NaNs.
    - `Weight` (1, 162): too many NaNs.
    - `SubjectKey` (1, 162): useless.
    - `SID` (1, 162): useless.

### create feature matrix

In [3]:
# Column labels: every cell of `CAssignAll` holds an array of names, so
# flatten them all into a single list.
col_names = []
for name_cell in mat['CAssignAll'][0]:
    col_names.extend(name_cell.tolist())

# Feature matrix: `AllX` transposed, followed by `Sex` and `Endurance`
# appended as two extra columns.
X = np.column_stack([
    mat['AllX'].T,
    mat['Sex'].T,
    mat['Endurance'].T,
])
col_names += ['Sex', 'Endurance']

### create label vector

In [4]:
# Labels, transposed so that they line up row-for-row with `X`
y = mat['Y'].transpose()

### split data & shuffle

In [5]:
# Shuffle the samples and hold out 20 % of them as a test set
from sklearn.model_selection import train_test_split

# NOTE(review): per the data description each participant seems to contribute
# two rows (two time points); a plain random split may put the same participant
# in both sets -- consider a grouped split. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

## 1. Pipeline

In [12]:
import pandas as pd

# Preview only the first rows instead of rendering the whole feature table
pd.DataFrame(X, columns=col_names).head(10)

Unnamed: 0,AveElbFlexMean,ROMElbFlexMean,AveElbFlexSD,ROMElbFlexSD,AveShPlaneMean,ROMShPlaneMean,AveShPlaneSD,ROMShPlaneSD,AveShEleMean,ROMShEleMean,...,AveTrYMean,ROMTrYMean,AveTrYSD,ROMTrYSD,AveTrZMean,ROMTrZMean,AveTrZSD,ROMTrZSD,Sex,Endurance
0,105.983144,63.339618,2.219401,1.794134,23.518597,13.624799,1.338393,2.357813,80.274017,2.942674,...,16.382423,13.629558,1.112671,1.977798,-0.550047,2.671822,0.228494,0.711787,2.0,2.0
1,106.166216,61.875226,2.239310,2.548458,22.851141,11.726534,1.182490,1.401127,77.897777,4.062635,...,16.041630,15.078508,0.910697,1.003788,-0.208865,1.242553,0.541000,0.496108,2.0,2.0
2,100.914284,60.930029,1.482458,1.167593,27.289316,12.291697,1.223189,2.452705,83.932823,2.886935,...,15.039772,14.311074,0.979720,2.070705,0.096174,0.934767,0.488540,0.317386,2.0,8.0
3,105.066549,59.524098,2.364182,1.849991,23.083816,7.519415,1.510000,3.503819,78.701025,2.997990,...,16.070120,16.178023,1.418294,3.521951,-0.544067,1.885480,0.610577,0.851287,2.0,8.0
4,93.259333,81.531138,2.610755,3.632922,42.387152,27.789377,1.605530,2.027628,71.280986,8.000008,...,8.908385,8.815273,0.709582,1.250228,-0.123442,1.065910,0.415764,0.404298,2.0,4.0
5,93.032476,93.229131,5.744352,4.339535,44.517391,37.048092,2.861734,4.352739,62.093725,16.668762,...,7.189821,10.006955,1.162508,3.156199,-2.333114,2.517017,0.769059,0.798565,2.0,4.0
6,108.167450,84.544943,2.507849,1.686678,36.760291,28.755959,1.059043,2.575782,89.055916,8.222719,...,3.395774,3.413249,0.286429,0.780095,0.257424,0.820104,0.475103,0.262870,2.0,5.0
7,105.238224,90.801875,4.350865,5.275744,38.314538,39.578175,2.978478,3.553977,80.482334,8.186150,...,-0.459897,5.665457,1.480955,1.183729,1.681460,1.265310,0.559305,0.442786,2.0,5.0
8,109.352227,65.911194,1.712302,2.163251,39.651505,17.411303,1.196658,2.037376,83.610290,3.356398,...,5.660714,2.796026,0.532010,0.656012,0.057561,0.632462,0.350963,0.239863,1.0,6.0
9,106.415586,63.248066,2.154349,1.674603,39.215862,14.853711,1.327845,1.804076,78.755419,3.890835,...,7.887792,4.198460,0.846471,0.916475,0.546096,0.981348,0.681981,0.417417,1.0,6.0


- one-hot encoding
- scaler
- feature selection (ANOVA, model based, etc.)
- PolynomialFeatures
- different models (XGBoost, scikit, keras)

In [None]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin


class Normalize(BaseEstimator, TransformerMixin):
    """Normalize a given array with weight, height or BMI.

    Placeholder transformer: the normalization itself is not implemented yet,
    so `transform` currently returns an unmodified copy of its input.
    """

    def __init__(self):
        # No hyper-parameters yet; an explicit body keeps the class valid Python.
        pass

    def fit(self, X, y=None):
        """Learn nothing from `X` / `y` and return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` so that the caller's data is left untouched."""
        X_copy = X.copy()
        return X_copy