# Synchro Project
- [github link](https://github.com/romainmartinez/envergo)

Todos
- tester Dyn en features ET label

## 0. Setup

In [1]:
# Common imports
import scipy.io as sio
import pandas as pd
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Load data

In [2]:
# load from mat
DATA_PATH = './data/'
X = sio.loadmat(os.path.join(DATA_PATH, 'X.mat'))['TableauForces']
y = sio.loadmat(os.path.join(DATA_PATH, 'y.mat'))['TestData']

X_description = ['AddL', 'AddR', 'AbdL', 'AbdR', 'ErL', 'ErR', 'IrL', 'IrR', 'ExtL', 'ExtR', 'FlexL', 'FlexR']
y_description = ['Dyn', 'BodyBoost', 'MeanEggBeater', 'MaxEggBeater']

In [3]:
X.shape

(53, 12)

In [4]:
nan_id = np.argwhere(np.isnan(X))
n_nans = np.sum(np.isnan(X).sum(axis=1))
for i in nan_id:
    print(f'\tparticipant n: {i[0]}')
    print(f'\ttest: {X_description[i[1]]}')
    # if left take right, left otherwise
    if X_description[i[1]][-1] == 'L':
        replacer = i[1] + 1
    elif X_description[i[1]][-1] == 'R':
        replacer = i[1] - 1
    print(f'\t\t"{X[i[0], i[1]]}" replace by "{X[i[0], replacer]}"')
    X[i[0], i[1]] = X[i[0], replacer]
    print('\t', '-' * 5)

	participant n: 1
	test: IrL
		"nan" replace by "118.95"
	 -----
	participant n: 51
	test: IrL
		"nan" replace by "92.25"
	 -----


In [5]:
y.shape

(53, 4)

In [31]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [32]:
x1, x2, y1, y2, idx1, idx2 = train_test_split(
    X, y, np.arange(X.shape[0]), test_size=0.2)

In [33]:
idx1

array([ 8, 26,  6, 34,  4, 37, 24, 44, 33, 49, 15,  9, 16, 30, 36, 25, 11,
        0, 46, 27, 31, 39, 29, 45,  1, 21,  2, 40, 35, 23, 48, 10, 22, 18,
       52, 20,  7, 42, 14, 28, 51, 38])

In [39]:
idx2

array([19, 41, 47, 12, 43,  5, 17, 50,  3, 32, 13])

## 3. Pipeline

In [7]:
# preprocessing
    # normalize data with weight, IMC, nothing

# add features
    # mean, both L&R
    # add IMC
    
# prediction
    # xgboost
    # HP optimization

In [8]:
# load height + weight
anthropo = sio.loadmat(os.path.join(DATA_PATH, 'heightweight.mat'))['HeightWeight']
# replace nan
from sklearn.preprocessing import Imputer
anthropo = Imputer(strategy='median').fit_transform(anthropo)
# add IMC
anthropo = np.c_[anthropo, anthropo[:, 1] / (anthropo[:, 0])**2]

In [9]:
# compute imbalance
imbalance = None
for i in range(0, X.shape[1], 2):
    if imbalance is None:
        imbalance = np.abs((X[:, i] - X[:, i + 1]) / X[:, i]) * 100
    else:
        imbalance = np.c_[imbalance, np.abs((X[:, i] - X[:, i + 1]) / X[:, i]) * 100]
imbalance = np.mean(imbalance, axis=1)

In [26]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin

class Normalize(BaseEstimator, TransformerMixin):
    """Normalize a given array with weight, height or IMC"""
    def __init__(self, anthropo=anthropo, strategy='IMC'):
        self.anthropo = anthropo
        self.strategy = strategy
        
    def fit(self, X, y=None):
        if self.strategy == 'height':
            normalizer = anthropo[:, 0][:, np.newaxis]
        elif self.strategy == 'weight':
            normalizer = anthropo[:, 1][:, np.newaxis]
        elif self.strategy == 'IMC':
            normalizer = anthropo[:, 2][:, np.newaxis]
        else:
            normalizer = 1
        print(normalizer.shape)
        print(X.shape)
        self.normalized = X / normalizer
        return self
    
    def transform(self, X):
        return self.normalized
    
class TestSide(BaseEstimator, TransformerMixin):
    """Return the mean between left & right or both"""
    def __init__(self, strategy='mean'):
        self.strategy = strategy
        
    def fit(self, X, y=None):
        if self.strategy == 'mean':
            output = None
            for i in range(0, X.shape[1], 2):
                if output is None:
                    output = np.mean([X[:, i], X[:, i + 1]], axis=0)
                else:
                    output = np.c_[output, np.mean([X[:, i], X[:, i + 1]], axis=0)]
        else:
            output = X
        self.output = output
        return self
    
    def transform(self, X):
        return self.output
    
class FeaturesAdder(BaseEstimator, TransformerMixin):
    """Add features based on the list `new_features`
    Possible `new_features` are: IMC, imbalance
    """
    def __init__(self, new_features='None'):
        self.new_features = new_features
        
    def fit(self, X, y=None):
        if 'IMC' in self.new_features:
            X = np.c_[X, anthropo[:, 2]]
        if 'imbalance' in self.new_features:
            X = np.c_[X, imbalance]
        if 'weight-height' in self.new_features:
            X = np.c_[X, anthropo[:, 0:2]]
        self.X = X
        return self
    
    def transform(self, X):
        return self.X

In [27]:
from sklearn.pipeline import make_pipeline
preprocessing = make_pipeline(
    Normalize(strategy='IMC'),
    TestSide(strategy='mean'),
    FeaturesAdder(new_features=['IMC', 'imbalance'])
)

## 4. Optimization

In [28]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

full_pipeline = make_pipeline(
    preprocessing,
    XGBRegressor()
)

In [29]:
full_pipeline.fit(X_train)

(53, 1)
(42, 12)


ValueError: operands could not be broadcast together with shapes (42,12) (53,1) 

In [7]:
# train model
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

xgboost_multi = MultiOutputRegressor(XGBRegressor())

xgboost_multi.fit(X_train, y_train)



MultiOutputRegressor(estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
           n_jobs=1)

In [None]:
# Normalize: height, weight, IMC, nothing
# TestSide: mean, both
# FeatureAdder: IMC, imbalance, height-weight

## 5. Evaluation

---

In [9]:
# predict
y_pred_reg = xgboost_multi.predict(X_test)

In [13]:
# evaluate
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

for i, output in enumerate(range(y_pred_reg.shape[1])):
    print(y_description[i])
    mse = mean_squared_error(y_test[:, i], y_pred_reg[:, i])
    rmse = np.sqrt(mse)
    print(f'rmse = {rmse:.3f}')
    mape = mean_absolute_percentage_error(y_test[:, i], y_pred_reg[:, i])
    print(f'mape = {mape:.3f}')
    print('-' * 10)

Dyn
rmse = 1.969
mape = 12.985
----------
BodyBoost
rmse = 0.834
mape = 8.167
----------
MeanEggBeater
rmse = 0.778
mape = 8.669
----------
MaxEggBeater
rmse = 1.110
mape = 10.755
----------
