# Project Fatigue
- GitHub [link](https://github.com/romainmartinez/fatigue)

## todos
- keras/tensorflow: [link](https://medium.com/@williamkoehrsen/deep-neural-network-classifier-32c12ff46b6c)
- model optimization
- apply: [link](https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65)
- pca with 99% variance explained: [link](https://stackoverflow.com/a/47325158/7221111)
    - SHOULD BE ONLY ON TRANING SET (test w/ print on pipeline)

In [1]:
# Common imports
import scipy.io as sio
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 0. load data

In [2]:
DATA_PATH = os.path.join('.', 'data/')
DATA_FILE = 'DatabaseRPT.mat'
mat = sio.loadmat(os.path.join(DATA_PATH, DATA_FILE))['DataBaseRPT'][0, 0]

- label
    - `Y` (1, 162): 1 = prefatigue, 2 = fatigue

- used features
    - `AllX` (24, 162): 24 (6 DoF x 4 variables) x 162 (81 participants x 2 time points).
    - `CAssignAll` (1, 24): AllX column assignment

    - `Sex` (1, 162)
    - `Endurance` (1, 162)

- not used
    - `BestX` (7, 162): 7 (variables with SRM>0.8) x 162 (81 participants x 2 time points) matrix. Contains data only for the most responsive variables (SRM>0.8).
    - `CAssignBest` (1, 7): BestX column assignment.
    - `Age` (1, 162): too much NaN.
    - `Height` (1, 162): too much NaN.
    - `Weight` (1, 162) Too much NaN.
    - `SubjectKey` (1, 162): useless.
    - `SID` (1, 162): useless.

### create feature matrix

In [3]:
col_names = [i[0] for i in mat['CAssignAll'].flatten()]
# find variable with SRM > .8
srm_names = np.array([i[0] for i in mat['CAssignBest'].flatten()])
srm_idx = np.in1d(col_names, srm_names)

# add `AllX`
X = mat['AllX'].T

# add `Sex`
X = np.c_[X, mat['Sex'].T]
col_names.append('Sex')

# add `Endurance`
X = np.c_[X, mat['Endurance'].T]
col_names.append('Endurance')

col_names = np.array(col_names)

### create label vector

In [4]:
y = mat['Y'].T

### create report

In [5]:
import os.path
import pandas as pd

# create dataframe
df = pd.DataFrame(X, columns=col_names)
df['fatigue (label)'] = y

# generate pandas report
REPORT_FILENAME = './pandas_report.html'
if not os.path.isfile(REPORT_FILENAME):
    import pandas_profiling
    report = pandas_profiling.ProfileReport(df)
    report.to_file('./pandas_report.html')

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/romain/.local/share/virtualenvs/fatigue-WgHAVgza/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/romain/.local/share/virtualenvs/fatigue-WgHAVgza/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/romain/.local/share/virtualenvs/fatigue-WgHAVgza/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File

### split data & shuffle

In [8]:
# split data & shuffle
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

## 1. Pipeline

In [10]:
df.head()

Unnamed: 0,AveElbFlexMean,ROMElbFlexMean,AveElbFlexSD,ROMElbFlexSD,AveShPlaneMean,ROMShPlaneMean,AveShPlaneSD,ROMShPlaneSD,AveShEleMean,ROMShEleMean,...,ROMTrYMean,AveTrYSD,ROMTrYSD,AveTrZMean,ROMTrZMean,AveTrZSD,ROMTrZSD,Sex,Endurance,fatigue (label)
0,105.983144,63.339618,2.219401,1.794134,23.518597,13.624799,1.338393,2.357813,80.274017,2.942674,...,13.629558,1.112671,1.977798,-0.550047,2.671822,0.228494,0.711787,2.0,2.0,1
1,106.166216,61.875226,2.23931,2.548458,22.851141,11.726534,1.18249,1.401127,77.897777,4.062635,...,15.078508,0.910697,1.003788,-0.208865,1.242553,0.541,0.496108,2.0,2.0,2
2,100.914284,60.930029,1.482458,1.167593,27.289316,12.291697,1.223189,2.452705,83.932823,2.886935,...,14.311074,0.97972,2.070705,0.096174,0.934767,0.48854,0.317386,2.0,8.0,1
3,105.066549,59.524098,2.364182,1.849991,23.083816,7.519415,1.51,3.503819,78.701025,2.99799,...,16.178023,1.418294,3.521951,-0.544067,1.88548,0.610577,0.851287,2.0,8.0,2
4,93.259333,81.531138,2.610755,3.632922,42.387152,27.789377,1.60553,2.027628,71.280986,8.000008,...,8.815273,0.709582,1.250228,-0.123442,1.06591,0.415764,0.404298,2.0,4.0,1


In [11]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin

# decomposition
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA, NMF

# scale
from sklearn.preprocessing import StandardScaler

# models
from xgboost import XGBClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# other
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, selection='raw', idx=srm_idx):
        self.selection = selection
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.selection == 'raw':
            output = X
        elif self.selection == 'idx':
            output = X[:, srm_idx]
        return output

class Scale(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='raw'):
        self.strategy = strategy
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        if self.strategy == 'raw':
            scaler = None
        elif self.strategy == 'standardscaler':
            scaler = StandardScaler()
        
        if scaler is None:
            output = X_copy
        else:
            output = scaler.fit_transform(X_copy)
        return output

class Decomposition(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='raw'):
        self.strategy = strategy
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.strategy == 'raw':
            decomposer = None
            output = X
        elif self.strategy == 'pca':
            decomposer = PCA(n_components=0.99, svd_solver='full')
            output = decomposer.fit_transform(X)
        return output
    
def get_categorical_cols(X):
    return X[:, np.logical_or(col_names == 'Sex', col_names == 'Endurance')]

def get_numerical_cols(X):
    return X[:, np.logical_and(col_names != 'Sex', col_names != 'Endurance')]

In [14]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

pipeline_categorical = Pipeline([
    ('selector', FunctionTransformer(get_categorical_cols, validate=False)),
    ('encoder', OneHotEncoder(sparse=False))
])

pipeline_numerical = Pipeline([
    ('selector', FunctionTransformer(get_numerical_cols, validate=False)),
    ('featureselector', FeatureSelector(selection='idx')),
    ('scale', Scale(strategy='raw')),
    ('polyfeatures', PolynomialFeatures(degree=1))
])

pipeline_preprocessing = FeatureUnion([
    ('categorical', pipeline_categorical),
    ('numerical', pipeline_numerical)
])

pipeline_preprocessing_with_decomposition = Pipeline([
    ('categorical', pipeline_preprocessing),
#     ('decomposition', Decomposition(strategy='raw'))
#     ('pca', PCA(n_components=0.99, svd_solver='full'))
])

In [15]:
from sklearn.model_selection import GridSearchCV
import pprint
import warnings

models = {
    'XGBClassifier': XGBClassifier(),
    'ExtraTreeClassifier': ExtraTreeClassifier(),
    'LogisticRegression': LogisticRegression(),
    'GaussianNB': GaussianNB(),
    'SVC': SVC(), 
    'KNeighborsClassifier': KNeighborsClassifier()
}

param_grid = {
    'preprocessing__categorical__numerical__featureselector__selection': ['raw', 'idx'],
    'preprocessing__categorical__numerical__polyfeatures__degree': [1, 2, 3],
    'preprocessing__decomposition__strategy': ['raw', 'pca']
}

scoring = ['accuracy', 'precision', 'recall', 'f1']

for model_name, model in models.items():
    print(f'{model_name}'.upper())
    
    # trees do no need scaling
    if model_name is 'XGBClassifier' or model_name is 'ExtraTreeClassifier':
        param_grid['preprocessing__categorical__numerical__scale__strategy'] = ['raw']
        warnings.simplefilter('ignore')  # ignore xgboost warnings
    else:
        param_grid['preprocessing__categorical__numerical__scale__strategy'] = ['raw', 'standardscaler']
        warnings.simplefilter('default')
    
    pipeline_full = Pipeline([
        ('preprocessing', pipeline_preprocessing_with_decomposition),
        ('classifier', model)
    ])
    
    grid_search = GridSearchCV(pipeline_full, param_grid, cv=5, scoring='accuracy',
                                verbose=1)
    grid_search.fit(X_train, y_train.ravel())
    
    print(f'\tbest score: {grid_search.best_score_:.3f}')
    print('\tbest params')
    pprint.pprint(grid_search.best_params_)
    
    print('-' * 10)

XGBCLASSIFIER
Fitting 5 folds for each of 12 candidates, totalling 60 fits


ValueError: Invalid parameter decomposition for estimator Pipeline(memory=None,
     steps=[('categorical', FeatureUnion(n_jobs=1,
       transformer_list=[('categorical', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function get_categorical_cols at 0x7f2d9d8fdbf8>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('...aw')), ('polyfeatures', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False))]))],
       transformer_weights=None))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [12]:
# featureselection selection = ['raw', 'idx']
# polyfeature degree = [1, 2, 3]
# decomposition strategy = ['raw', 'pca']
# decomposition n = [1, 3, 9, 25]
# model model = ['XGBClassifier', 'ExtraTreeClassifier', 'LogisticRegression', 'GaussianNB', 'SVC', 'KNeighborsClassifier']

In [16]:
grid_search.best_params_

{'preprocessing__categorical__numerical__featureselector__selection': 'raw',
 'preprocessing__categorical__numerical__polyfeatures__degree': 1,
 'preprocessing__categorical__numerical__scale__strategy': 'raw',
 'preprocessing__decomposition__N': 7,
 'preprocessing__decomposition__strategy': 'raw'}

In [27]:
pipeline_full.fit(X_train, y_train.ravel())

Pipeline(memory=None,
     steps=[('preprocessing', Pipeline(memory=None,
     steps=[('categorical', FeatureUnion(n_jobs=1,
       transformer_list=[('categorical', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function get_categorical_cols at 0x7f4a026f50d0>,
       ...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [14]:
pipeline_full = Pipeline([
        ('preprocessing', pipeline_preprocessing_with_decomposition),
        ('classifier', XGBClassifier())
    ])
pipeline_full.fit(X_train, y_train.ravel())
predictions = pipeline_full.predict(X_test)

  if diff:


In [16]:
predictions = grid_search.best_estimator_.predict(X_test)

In [17]:
np.sum(y_test.ravel() == predictions) / y_test.shape[0] * 100

66.66666666666666

---
KERAS

In [6]:
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [13]:
X_train.shape

(129, 26)

In [19]:
from keras.models import Sequential
from keras.layers import Dense

def create_model():
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=26, activation='relu'))
    model.add(Dense(26, activation='relu'))
    model.add(Dense(26, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [20]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

NN = KerasClassifier(build_fn=create_model,
                     epochs=10,
                     batch_size=10,
                     verbose=1)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
results = cross_val_score(NN, X, y, cv=kfold)

ValueError: Error when checking target: expected dense_9 to have shape (26,) but got array with shape (1,)

In [11]:
NN.fit(X_train, y_train)

TypeError: __call__() missing 1 required positional argument: 'inputs'