# Synchro Project
- [github link](https://github.com/romainmartinez/envergo)

## 0. Setup

In [1]:
# Common imports
import scipy.io as sio
import pandas as pd
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Load data

In [2]:
# load from mat
DATA_PATH = './data/'
X_import = sio.loadmat(os.path.join(DATA_PATH, 'X.mat'))['TableauForces']
y_import = sio.loadmat(os.path.join(DATA_PATH, 'y.mat'))['TestData']

X_description = np.array(['AddL', 'AddR', 'AbdL', 'AbdR', 'ErL', 'ErR', 'IrL', 'IrR', 'ExtL', 'ExtR', 'FlexL', 'FlexR'])
y_description = np.array(['Dyn', 'BodyBoost', 'MeanEggBeater', 'MaxEggBeater'])

In [3]:
X_import.shape

(53, 12)

In [4]:
y_import.shape

(53, 4)

### nan remover

In [5]:
nan_id = np.argwhere(np.isnan(X_import))
n_nans = np.sum(np.isnan(X_import).sum(axis=1))
for i in nan_id:
    print(f'\tparticipant n: {i[0]}')
    print(f'\ttest: {X_description[i[1]]}')
    # if left take right, left otherwise
    if X_description[i[1]][-1] == 'L':
        replacer = i[1] + 1
    elif X_description[i[1]][-1] == 'R':
        replacer = i[1] - 1
    print(f'\t\t"{X_import[i[0], i[1]]}" replace by "{X_import[i[0], replacer]}"')
    X_import[i[0], i[1]] = X_import[i[0], replacer]
    print('\t', '-' * 5)

	participant n: 1
	test: IrL
		"nan" replace by "118.95"
	 -----
	participant n: 51
	test: IrL
		"nan" replace by "92.25"
	 -----


### add features

In [6]:
# load height + weight
anthropo = sio.loadmat(os.path.join(DATA_PATH, 'heightweight.mat'))['HeightWeight']
# replace nan
from sklearn.preprocessing import Imputer
anthropo = Imputer(strategy='median').fit_transform(anthropo)
# add IMC
anthropo = np.c_[anthropo, anthropo[:, 1] / (anthropo[:, 0])**2]

In [7]:
# compute imbalance
imbalance = None
for i in range(0, X_import.shape[1], 2):
    if imbalance is None:
        imbalance = np.abs((X_import[:, i] - X_import[:, i + 1]) / X_import[:, i]) * 100
    else:
        imbalance = np.c_[imbalance, np.abs((X_import[:, i] - X_import[:, i + 1]) / X_import[:, i]) * 100]
imbalance = np.mean(imbalance, axis=1)

In [8]:
X_mat = np.c_[X_import, anthropo, imbalance]
X_description = np.append(X_description, ['height', 'weight', 'IMC', 'imbalance'])

In [9]:
X_cols = {
    'test': np.arange(12),
    'height': np.array([12]),
    'weight': np.array([13]),
    'IMC': np.array([14]),
    'imbalance': np.array([15])
}

### split data

In [10]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_mat, y_import, test_size=0.2, random_state=RANDOM_SEED)

## 3. Pipeline

In [17]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin

class Normalize(BaseEstimator, TransformerMixin):
    """Normalize a given array with weight, height or IMC"""
    def __init__(self, X_cols=X_cols, strategy='IMC'):
        self.strategy = strategy
        self.X_cols = X_cols
        
    def fit(self, X, y=None):
        X_copy = X.copy()
        if self.strategy is 'height':
            normalizer = X_copy[:, X_cols['height']].reshape(-1, 1)
        elif self.strategy is 'weight':
            normalizer = X_copy[:, X_cols['weight']].reshape(-1, 1)
        elif self.strategy is 'IMC':
            normalizer = X_copy[:, X_cols['IMC']].reshape(-1, 1)
        else:
            normalizer = 1
        X_copy[:, X_cols['test']] = X_copy[:, X_cols['test']] / normalizer
        self.output = X_copy
        return self
    
    def transform(self, X):
        return self.output
    
class TestSide(BaseEstimator, TransformerMixin):
    """Return the mean between left & right or both"""
    def __init__(self, X_cols=X_cols, strategy='mean'):
        self.strategy = strategy
        self.X_cols = X_cols
        
    def fit(self, X, y=None):
        if self.strategy is 'mean' or 'Fscore':
            output = X.copy()
            for i in range(X_cols['test'][-1] + 1):
                if i % 2 == 0:
                    if self.strategy is 'mean':
                        output[:, i] = np.mean([X[:, i], X[:, i + 1]], axis=0)
                    else:
                        output[:, i] = 2 * (X[:, i] * X[:, i + 1]) / (X[:, i] + X[:, i + 1])
                else:
                    output[:, i] = np.nan
        else:
            output = X
        self.output = output
        return self
    
    def transform(self, X):
        return self.output
    
class FeaturesAdder(BaseEstimator, TransformerMixin):
    """Add features based on the list `new_features`
    Possible `new_features` are: IMC, imbalance
    """
    def __init__(self, X_cols=X_cols, new_features='None'):
        self.new_features = new_features
        self.X_cols = X_cols
        
    def fit(self, X, y=None):
        X_copy = X.copy()
        for key, cols in X_cols.items():
            if not key in self.new_features:
                X_copy[:, cols] = np.nan
        # remove nan columns
        self.output = X_copy[:, ~np.all(np.isnan(X_copy), axis=0)]
        return self
    
    def transform(self, X):
        return self.output

In [18]:
from sklearn.pipeline import make_pipeline
preprocessing = make_pipeline(
    Normalize(strategy='IMC'),
    TestSide(strategy='mean'),
    FeaturesAdder(new_features=['IMC', 'imbalance'])
)

In [19]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

full_pipeline = make_pipeline(
    preprocessing,
    MultiOutputRegressor(XGBRegressor())
)

## 4. Optimization

In [20]:
full_pipeline.fit(X_train[0:20, :], y_train[0:20, :])



> [0;32m<ipython-input-17-0bc61bfc2e51>[0m(68)[0;36mfit[0;34m()[0m
[0;32m     66 [0;31m        [0mself[0m[0;34m.[0m[0moutput[0m [0;34m=[0m [0mX_copy[0m[0;34m[[0m[0;34m:[0m[0;34m,[0m [0;34m~[0m[0mnp[0m[0;34m.[0m[0mall[0m[0;34m([0m[0mnp[0m[0;34m.[0m[0misnan[0m[0;34m([0m[0mX_copy[0m[0;34m)[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m][0m[0;34m[0m[0m
[0m[0;32m     67 [0;31m        [0;32mfrom[0m [0mIPython[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mdebugger[0m [0;32mimport[0m [0mTracer[0m[0;34m;[0m [0mTracer[0m[0;34m([0m[0;34m)[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m---> 68 [0;31m        [0;32mreturn[0m [0mself[0m[0;34m[0m[0m
[0m[0;32m     69 [0;31m[0;34m[0m[0m
[0m[0;32m     70 [0;31m    [0;32mdef[0m [0mtransform[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m


Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('normalize', Normalize(X_cols={'test': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), 'height': array([12]), 'weight': array([13]), 'IMC': array([14]), 'imbalance': array([15])},
     strategy='IMC')), ('testside', TestSide(X_cols={'te..., reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
           n_jobs=1))])

In [21]:
y_pred = full_pipeline.predict(X_test)

In [22]:
y_pred.shape

(20, 4)

In [45]:
pd.DataFrame(y_pred)

Unnamed: 0,0,1,2,3
0,8.140076,8.49925,7.870894,7.977165
1,8.763251,8.981225,6.990903,7.016715
2,8.854383,9.426169,7.008636,6.980362
3,7.770964,8.322161,6.479364,6.57678
4,9.475232,9.479385,7.243678,7.56338
5,8.329639,8.737689,7.75033,7.983349
6,7.183036,9.711098,7.739953,7.954637
7,10.147043,9.178121,6.569934,6.991956
8,7.118963,9.061635,7.989175,8.910803
9,10.360196,8.983489,8.057585,8.36688


In [None]:
# Normalize: m, weight, height, IMC
# TestSide: m, mean, Fscore
# FeatureAdder: ['test', 'height', 'weight', 'IMC', 'imbalance']

## 5. Evaluation

---

In [9]:
# predict
y_pred_reg = xgboost_multi.predict(X_test)

In [13]:
# evaluate
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

for i, output in enumerate(range(y_pred_reg.shape[1])):
    print(y_description[i])
    mse = mean_squared_error(y_test[:, i], y_pred_reg[:, i])
    rmse = np.sqrt(mse)
    print(f'rmse = {rmse:.3f}')
    mape = mean_absolute_percentage_error(y_test[:, i], y_pred_reg[:, i])
    print(f'mape = {mape:.3f}')
    print('-' * 10)

Dyn
rmse = 1.969
mape = 12.985
----------
BodyBoost
rmse = 0.834
mape = 8.167
----------
MeanEggBeater
rmse = 0.778
mape = 8.669
----------
MaxEggBeater
rmse = 1.110
mape = 10.755
----------
