# Project Fatigue
- GitHub [link](https://github.com/romainmartinez/fatigue)

## todos
- onehot encoding
- feature importance in pipeline

In [1]:
# Common imports
import scipy.io as sio
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 0. load data

In [2]:
DATA_PATH = os.path.join('.', 'data/')
DATA_FILE = 'DatabaseRPT.mat'
mat = sio.loadmat(os.path.join(DATA_PATH, DATA_FILE))['DataBaseRPT'][0, 0]

- label
    - `Y` (1, 162): 1 = prefatigue, 2 = fatigue

- used features
    - `AllX` (24, 162): 24 (6 DoF x 4 variables) x 162 (81 participants x 2 time points).
    - `CAssignAll` (1, 24): AllX column assignment

    - `Sex` (1, 162)
    - `Endurance` (1, 162)

- not used
    - `BestX` (7, 162): 7 (variables with SRM>0.8) x 162 (81 participants x 2 time points) matrix. Contains data only for the most responsive variables (SRM>0.8).
    - `CAssignBest` (1, 7): BestX column assignment.
    - `Age` (1, 162): too much NaN.
    - `Height` (1, 162): too much NaN.
    - `Weight` (1, 162) Too much NaN.
    - `SubjectKey` (1, 162): useless.
    - `SID` (1, 162): useless.

### create feature matrix

In [3]:
col_names = [i[0] for i in mat['CAssignAll'].flatten()]

# add `AllX`
X = mat['AllX'].T

# add `Sex`
X = np.c_[X, mat['Sex'].T]
col_names.append('Sex')

# add `Endurance`
X = np.c_[X, mat['Endurance'].T]
col_names.append('Endurance')

col_names = np.array(col_names)

### create label vector

In [4]:
y = mat['Y'].T

### split data & shuffle

In [5]:
# split data & shuffle
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

## 1. Pipeline

In [6]:
import pandas as pd
pd.DataFrame(X, columns=col_names).head()

Unnamed: 0,AveElbFlexMean,ROMElbFlexMean,AveElbFlexSD,ROMElbFlexSD,AveShPlaneMean,ROMShPlaneMean,AveShPlaneSD,ROMShPlaneSD,AveShEleMean,ROMShEleMean,...,AveTrYMean,ROMTrYMean,AveTrYSD,ROMTrYSD,AveTrZMean,ROMTrZMean,AveTrZSD,ROMTrZSD,Sex,Endurance
0,105.983144,63.339618,2.219401,1.794134,23.518597,13.624799,1.338393,2.357813,80.274017,2.942674,...,16.382423,13.629558,1.112671,1.977798,-0.550047,2.671822,0.228494,0.711787,2.0,2.0
1,106.166216,61.875226,2.23931,2.548458,22.851141,11.726534,1.18249,1.401127,77.897777,4.062635,...,16.04163,15.078508,0.910697,1.003788,-0.208865,1.242553,0.541,0.496108,2.0,2.0
2,100.914284,60.930029,1.482458,1.167593,27.289316,12.291697,1.223189,2.452705,83.932823,2.886935,...,15.039772,14.311074,0.97972,2.070705,0.096174,0.934767,0.48854,0.317386,2.0,8.0
3,105.066549,59.524098,2.364182,1.849991,23.083816,7.519415,1.51,3.503819,78.701025,2.99799,...,16.07012,16.178023,1.418294,3.521951,-0.544067,1.88548,0.610577,0.851287,2.0,8.0
4,93.259333,81.531138,2.610755,3.632922,42.387152,27.789377,1.60553,2.027628,71.280986,8.000008,...,8.908385,8.815273,0.709582,1.250228,-0.123442,1.06591,0.415764,0.404298,2.0,4.0


- one-hot encoding: done
- scaler: done
- feature selection (ANOVA, model based, etc.)
- PolynomialFeatures
- different models (XGBoost, scikit, keras)

In [44]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# Scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel

# models
from xgboost import XGBClassifier

# other
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

class Scale(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='Raw'):
        self.strategy = strategy.lower()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        if self.strategy == 'raw':
            scaler = None
        elif self.strategy == 'minmaxscaler':
            scaler = MinMaxScaler()
        elif self.strategy == 'maxabsscaler':
            scaler = MaxAbsScaler()
        elif self.strategy == 'standardscaler':
            scaler = StandardScaler()
        elif self.strategy == 'robustscaler':
            scaler = RobustScaler(quantile_range=(25, 75))
        elif self.strategy == 'normalizer':
            scaler = Normalizer()
        elif self.strategy == 'quantiletransformer':
            scaler = QuantileTransformer(output_distribution='normal')
        
        if scaler is None:
            output = X_copy
        else:
            output = scaler.fit_transform(X_copy)
        return output

class Decomposition(BaseEstimator, TransformerMixin):
    def __init__(self, strategy):
        self.strategy = strategy.lower()
        
    def fit(self, X, y=None):
        return
    
    def transform(self, X):
        if self.strategy == 'raw':
            pass
        elif self.strategy == 'pca':
            pass
        elif self.strategy == 'nmf':
            pass
        elif self.strategy == 'selectkbest':
            pass
        return
    
def get_categorical_cols(X):
    return X[:, np.logical_or(col_names == 'Sex', col_names == 'Endurance')]

def get_numerical_cols(X):
    return X[:, np.logical_and(col_names != 'Sex', col_names != 'Endurance')]

In [52]:
http://scikit-learn.org/stable/auto_examples/plot_compare_reduction.html#sphx-glr-auto-examples-plot-compare-reduction-py

SyntaxError: invalid syntax (<ipython-input-52-955a81e62539>, line 1)

In [45]:
pipeline_categorial = make_pipeline(
    FunctionTransformer(get_categorical_cols, validate=False),
    OneHotEncoder(sparse=False)
)

pipeline_numerical = make_pipeline(
    FunctionTransformer(get_numerical_cols, validate=False),
    Scale(strategy='Raw'),
    PolynomialFeatures(degree=1)
)

full_pipeline = make_union(
    pipeline_categorial,
    pipeline_numerical,
    FeatureSelection
)

In [46]:
pipeline_numerical.fit_transform(X).shape

(162, 25)

In [47]:
pipeline_categorial.fit_transform(X).shape

(162, 21)

In [51]:
full_pipeline.fit(X, y)

TypeError: fit() missing 1 required positional argument: 'y'