# Synchro Project
- [github link](https://github.com/romainmartinez/envergo)

## 0. Setup

In [1]:
# Common imports
import scipy.io as sio
import pandas as pd
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Load data

In [2]:
# load from mat
DATA_PATH = './data/'
X = sio.loadmat(os.path.join(DATA_PATH, 'X.mat'))['TableauForces']
y = sio.loadmat(os.path.join(DATA_PATH, 'y.mat'))['TestData']

X_description = np.array(['AddL', 'AddR', 'AbdL', 'AbdR', 'ErL', 'ErR', 'IrL', 'IrR', 'ExtL', 'ExtR', 'FlexL', 'FlexR'])
y_description = np.array(['Dyn', 'BodyBoost', 'MeanEggBeater', 'MaxEggBeater'])

In [3]:
X.shape

(53, 12)

In [4]:
y.shape

(53, 4)

### nan remover

In [5]:
nan_id = np.argwhere(np.isnan(X))
n_nans = np.sum(np.isnan(X).sum(axis=1))
for i in nan_id:
    print(f'\tparticipant n: {i[0]}')
    print(f'\ttest: {X_description[i[1]]}')
    # if left take right, left otherwise
    if X_description[i[1]][-1] == 'L':
        replacer = i[1] + 1
    elif X_description[i[1]][-1] == 'R':
        replacer = i[1] - 1
    print(f'\t\t"{X[i[0], i[1]]}" replace by "{X[i[0], replacer]}"')
    X[i[0], i[1]] = X[i[0], replacer]
    print('\t', '-' * 5)

	participant n: 1
	test: IrL
		"nan" replace by "118.95"
	 -----
	participant n: 51
	test: IrL
		"nan" replace by "92.25"
	 -----


### add features

In [6]:
# load height + weight
anthropo = sio.loadmat(os.path.join(DATA_PATH, 'heightweight.mat'))['HeightWeight']
# replace nan
from sklearn.preprocessing import Imputer
anthropo = Imputer(strategy='median').fit_transform(anthropo)
# add IMC
anthropo = np.c_[anthropo, anthropo[:, 1] / (anthropo[:, 0])**2]

In [7]:
# compute imbalance
imbalance = None
for i in range(0, X.shape[1], 2):
    if imbalance is None:
        imbalance = np.abs((X[:, i] - X[:, i + 1]) / X[:, i]) * 100
    else:
        imbalance = np.c_[imbalance, np.abs((X[:, i] - X[:, i + 1]) / X[:, i]) * 100]
imbalance = np.mean(imbalance, axis=1)

In [8]:
X = np.c_[X, anthropo, imbalance]
X_description = np.append(X_description, ['height', 'weight', 'IMC', 'imbalance'])

In [9]:
X_cols = {
    'test': np.arange(12),
    'anthropo': np.arange(12, 15),
    'imbalance': np.array([15])
}

### split data

In [10]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

## 3. Pipeline

In [11]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin

class Normalize(BaseEstimator, TransformerMixin):
    """Normalize a given array with weight, height or IMC"""
    def __init__(self, X_cols=X_cols, strategy='IMC'):
        self.strategy = strategy
        self.X_cols = X_cols
        
    def fit(self, X, y=None):
        X_copy = X.copy()
        if self.strategy == 'height':
            normalizer = X_copy[:, X_cols['anthropo']][:, 0].reshape(-1, 1)
        elif self.strategy == 'weight':
            normalizer = X_copy[:, X_cols['anthropo']][:, 1].reshape(-1, 1)
        elif self.strategy == 'IMC':
            normalizer = X_copy[:, X_cols['anthropo']][:, 2].reshape(-1, 1)
        else:
            normalizer = 1
        X_copy[:, X_cols['test']] = X_copy[:, X_cols['test']] / normalizer
        self.output = X_copy
        return self
    
    def transform(self, X):
        return self.output
    
class TestSide(BaseEstimator, TransformerMixin):
    """Return the mean between left & right or both"""
    def __init__(self, X_cols=X_cols, strategy='mean'):
        self.strategy = strategy
        self.X_cols = X_cols
        
    def fit(self, X, y=None):
        if self.strategy == 'mean':
            output = X.copy()
            for i in range(X_cols['test'][-1] + 1):
                if i % 2 == 0:
                    output[:, i] = np.mean([X[:, i], X[:, i + 1]], axis=0)
                else:
                    output[:, i] = np.nan
        else:
            output = X
        self.output = output
        return self
    
    def transform(self, X):
        return self.output
    
class FeaturesAdder(BaseEstimator, TransformerMixin):
    """Add features based on the list `new_features`
    Possible `new_features` are: IMC, imbalance
    """
    def __init__(self, X_cols=X_cols, new_features='None'):
        self.new_features = new_features
        self.X_cols = X_cols
        
    def fit(self, X, y=None):
        X_copy = X[:, X_cols['test']].copy()
        if 'IMC' in self.new_features:
            X_copy = np.c_[X_copy, X[:, X_cols['anthropo']][:, 2]]
        if 'imbalance' in self.new_features:
            X_copy = np.c_[X_copy, X[:, X_cols['imbalance']]]
        if 'height-weight' in self.new_features:
            X_copy = np.c_[X_copy, X[:, X_cols['anthropo']][:, 0:2]]
        self.output = X_copy
        return self
    
    def transform(self, X):
        return self.output

In [19]:
xi = Normalize(strategy='IMC').fit_transform(X_train)  # m, weight, height, IMC
xi = TestSide(strategy='mean').fit_transform(xi)  # mean, m
xi = FeaturesAdder(new_features=['IMC']).fit_transform(xi)  # m, IMC, imbalance, height-weight
print(xi.shape)

pd.DataFrame(xi)

(42, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,10.799532,,12.021331,,4.524388,,5.580041,,8.79673,,8.880368,,22.119015
1,12.484163,,11.222136,,3.831816,,4.129084,,10.879134,,6.954762,,22.959088
2,12.656776,,10.302738,,3.825159,,4.530759,,5.29628,,8.151698,,20.443594
3,14.195088,,12.673231,,3.598209,,5.169503,,9.727056,,8.147745,,18.710698
4,10.53948,,9.251875,,4.354518,,4.517107,,5.965991,,6.156116,,19.066406
5,11.421987,,12.16826,,3.62391,,4.974758,,11.992319,,8.977708,,21.171885
6,15.033453,,10.736623,,5.381943,,7.409033,,7.257717,,9.64333,,18.3391
7,13.597923,,9.626592,,4.658409,,5.778528,,7.375635,,9.791662,,23.323416
8,13.724905,,10.136963,,5.57566,,8.74437,,7.02512,,7.0886,,18.903592
9,12.429429,,11.706929,,4.263357,,5.583286,,7.408357,,8.787786,,20.588235


In [13]:
from sklearn.pipeline import make_pipeline
preprocessing = make_pipeline(
    Normalize(strategy='IMC'),
    TestSide(strategy='mean'),
    FeaturesAdder(new_features=['IMC', 'imbalance'])
)

In [16]:
preprocessing.fit_transform(X_train)

IndexError: index 6 is out of bounds for axis 1 with size 6

## 4. Optimization

In [15]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

full_pipeline = make_pipeline(
    preprocessing,
    XGBRegressor()
)

In [74]:
full_pipeline.fit(X_train)

XGBoostError: b'[09:53:27] src/objective/regression_obj.cc:89: Check failed: (info.labels.size()) != (0) label set cannot be empty'

In [7]:
# train model
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

xgboost_multi = MultiOutputRegressor(XGBRegressor())

xgboost_multi.fit(X_train, y_train)



MultiOutputRegressor(estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
           n_jobs=1)

In [None]:
# Normalize: height, weight, IMC, nothing
# TestSide: mean, both, Fscore
# FeatureAdder: IMC, imbalance, height-weight

## 5. Evaluation

---

In [9]:
# predict
y_pred_reg = xgboost_multi.predict(X_test)

In [13]:
# evaluate
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

for i, output in enumerate(range(y_pred_reg.shape[1])):
    print(y_description[i])
    mse = mean_squared_error(y_test[:, i], y_pred_reg[:, i])
    rmse = np.sqrt(mse)
    print(f'rmse = {rmse:.3f}')
    mape = mean_absolute_percentage_error(y_test[:, i], y_pred_reg[:, i])
    print(f'mape = {mape:.3f}')
    print('-' * 10)

Dyn
rmse = 1.969
mape = 12.985
----------
BodyBoost
rmse = 0.834
mape = 8.167
----------
MeanEggBeater
rmse = 0.778
mape = 8.669
----------
MaxEggBeater
rmse = 1.110
mape = 10.755
----------
