# Synchro Project
- [GitHub repository](https://github.com/romainmartinez/envergo)

## 0. Setup

In [1]:
# Common imports
import scipy.io as sio
import pandas as pd
import numpy as np
import os

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Load data

In [2]:
# Read the MATLAB exports: one variable is extracted from each .mat file
DATA_PATH = './data/'
x_file = os.path.join(DATA_PATH, 'X.mat')
y_file = os.path.join(DATA_PATH, 'y.mat')
X_import = sio.loadmat(x_file)['TableauForces']
y_import = sio.loadmat(y_file)['TestData']

# Column names: every test comes as a (left, right) pair, in this order
X_description = np.array([f'{test}{side}'
                          for test in ('Add', 'Abd', 'Er', 'Ir', 'Ext', 'Flex')
                          for side in ('L', 'R')])
y_description = np.array(['Dyn', 'BodyBoost', 'MeanEggBeater', 'MaxEggBeater'])

In [3]:
X_import.shape

(53, 12)

In [4]:
y_import.shape

(53, 4)

### Replace missing values (NaN)

In [5]:
# Locate every missing measurement as a (participant row, test column) pair
nan_id = np.argwhere(np.isnan(X_import))
n_nans = np.isnan(X_import).sum()
for participant, test in nan_id:
    print(f'\tparticipant n: {participant}')
    print(f'\ttest: {X_description[test]}')
    # A missing value is replaced by the same test measured on the other side:
    # a left column ('...L') takes the next column, a right column ('...R') the previous one.
    side = X_description[test][-1]
    if side == 'L':
        replacer = test + 1
    elif side == 'R':
        replacer = test - 1
    else:
        # Without this guard `replacer` would be unbound, or would silently keep
        # the column chosen during the previous iteration.
        raise ValueError(f'cannot find the opposite side of column "{X_description[test]}"')
    print(f'\t\t"{X_import[participant, test]}" replace by "{X_import[participant, replacer]}"')
    # NOTE: X_import is modified in place
    X_import[participant, test] = X_import[participant, replacer]
    print('\t', '-' * 5)

	participant n: 1
	test: IrL
		"nan" replace by "118.95"
	 -----
	participant n: 51
	test: IrL
		"nan" replace by "92.25"
	 -----


### add features

In [6]:
# load height + weight
anthropo = sio.loadmat(os.path.join(DATA_PATH, 'heightweight.mat'))['HeightWeight']
# replace nan by the median of the column
# `Imputer` no longer exists in recent scikit-learn releases, where `SimpleImputer`
# replaces it; fall back to the old class when the new module is not available.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer
anthropo = SimpleImputer(strategy='median').fit_transform(anthropo)
# add IMC ("indice de masse corporelle", i.e. body mass index): column 1 / column 0 squared
# presumably column 0 is height and column 1 is weight (variable 'HeightWeight') — TODO confirm order and units
anthropo = np.c_[anthropo, anthropo[:, 1] / (anthropo[:, 0])**2]

In [7]:
# compute imbalance: for each participant, the mean over all tests of
# |left - right| / left, expressed in percent.
# Columns are stored as (left, right) pairs, so even columns are the left side
# and odd columns the matching right side.
left, right = X_import[:, 0::2], X_import[:, 1::2]
# One vectorised expression instead of growing an array with np.c_ in a loop;
# the result stays 2-D, so the row-wise mean also works with a single pair.
imbalance = np.mean(np.abs((left - right) / left) * 100, axis=1)

In [8]:
X_mat = np.c_[X_import, anthropo, imbalance]
X_description = np.append(X_description, ['height', 'weight', 'IMC', 'imbalance'])

In [9]:
# Column indices of each feature group: the 12 strength tests come first,
# followed by one column per additional feature.
X_cols = {'test': np.arange(12)}
for offset, name in enumerate(('height', 'weight', 'IMC', 'imbalance'), start=12):
    X_cols[name] = np.array([offset])

### split data

In [10]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_mat, y_import, test_size=0.2, random_state=RANDOM_SEED)

## 3. Pipeline

In [11]:
# custom class
from sklearn.base import BaseEstimator, TransformerMixin

class Normalize(BaseEstimator, TransformerMixin):
    """Divide the test columns of an array by the height, weight or IMC column.

    Parameters
    ----------
    X_cols : dict
        Maps a feature-group name ('test', 'height', 'weight', 'IMC', ...) to
        the array of column indices of that group.
    strategy : str
        'height', 'weight' or 'IMC' selects the column used as divisor.
        Any other value leaves the test columns unchanged (division by 1).
    """
    def __init__(self, X_cols=X_cols, strategy='IMC'):
        self.strategy = strategy
        self.X_cols = X_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` whose test columns are divided by the selected column."""
        X_copy = X.copy()
        # Compare strings by value: `is` tests object identity and only appears
        # to work when the interpreter happens to share the string objects.
        if self.strategy in ('height', 'weight', 'IMC'):
            # Use the mapping given to the constructor rather than the global one.
            normalizer = X_copy[:, self.X_cols[self.strategy]].reshape(-1, 1)
        else:
            normalizer = 1
        X_copy[:, self.X_cols['test']] = X_copy[:, self.X_cols['test']] / normalizer
        return X_copy
    
class TestSide(BaseEstimator, TransformerMixin):
    """Merge the left and right measurement of each test, or keep both sides.

    Parameters
    ----------
    X_cols : dict
        Maps a feature-group name to its column indices; only 'test' is used.
        The test columns are expected to be ordered as (left, right) pairs.
    strategy : str
        'mean'   : arithmetic mean of left and right.
        'Fscore' : harmonic mean, 2 * left * right / (left + right).
        Any other value returns the input unchanged (both sides kept).

    For 'mean' and 'Fscore' the merged value is written in the left column and
    the right column is filled with NaN so that a later step can drop it.
    """
    def __init__(self, X_cols=X_cols, strategy='mean'):
        self.strategy = strategy
        self.X_cols = X_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the merged array, or `X` itself when no merge is requested."""
        # The previous test `strategy is 'mean' or 'Fscore'` was always true
        # ('Fscore' is a non-empty string), so this pass-through was unreachable.
        if self.strategy not in ('mean', 'Fscore'):
            return X

        output = X.copy()
        test_cols = self.X_cols['test']
        left, right = test_cols[0::2], test_cols[1::2]
        if self.strategy == 'mean':
            merged = np.mean([X[:, left], X[:, right]], axis=0)
        else:
            merged = 2 * (X[:, left] * X[:, right]) / (X[:, left] + X[:, right])
        output[:, left] = merged
        output[:, right] = np.nan
        return output
    
class FeaturesAdder(BaseEstimator, TransformerMixin):
    """Keep only the feature groups listed in `new_features`.

    Parameters
    ----------
    X_cols : dict
        Maps a feature-group name ('test', 'height', 'weight', 'IMC',
        'imbalance') to the array of column indices of that group.
    new_features : list of str, or str
        Names of the groups to keep. A single string is treated as one group
        name. With the default 'None', no group matches and every column is
        removed.

    Columns that contain only NaN in the input are removed as well.
    """
    def __init__(self, X_cols=X_cols, new_features='None'):
        self.new_features = new_features
        self.X_cols = X_cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` without the unwanted and the all-NaN columns."""
        X_copy = X.copy()
        # `key in some_string` is a substring test; wrap a bare string in a
        # list so that group names are always compared as whole words.
        if isinstance(self.new_features, str):
            wanted = [self.new_features]
        else:
            wanted = self.new_features
        # Use the mapping given to the constructor rather than the global one.
        for key, cols in self.X_cols.items():
            if key not in wanted:
                X_copy[:, cols] = np.nan
        return X_copy[:, ~np.all(np.isnan(X_copy), axis=0)]  # remove nan columns

In [12]:
from sklearn.pipeline import make_pipeline
preprocessing = make_pipeline(
    Normalize(strategy='IMC'),
    TestSide(strategy='Fscore'),
    FeaturesAdder(new_features=['test', 'height', 'weight', 'IMC', 'imbalance'])
)

In [14]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

full_pipeline = make_pipeline(
    preprocessing,
    MultiOutputRegressor(XGBRegressor()),
#     XGBRegressor()
)

## 4. Optimization

In [15]:
stuff = ['test', 'height', 'weight', 'IMC', 'imbalance']

In [16]:
import itertools

all_possible_features = ['test', 'height', 'weight', 'IMC', 'imbalance']
# Every non-empty subset of the feature groups, smallest subsets first
all_features_combinations = [
    list(combo)
    for size in range(1, len(all_possible_features) + 1)
    for combo in itertools.combinations(all_possible_features, size)
]
all_features_combinations

[['test'],
 ['height'],
 ['weight'],
 ['IMC'],
 ['imbalance'],
 ['test', 'height'],
 ['test', 'weight'],
 ['test', 'IMC'],
 ['test', 'imbalance'],
 ['height', 'weight'],
 ['height', 'IMC'],
 ['height', 'imbalance'],
 ['weight', 'IMC'],
 ['weight', 'imbalance'],
 ['IMC', 'imbalance'],
 ['test', 'height', 'weight'],
 ['test', 'height', 'IMC'],
 ['test', 'height', 'imbalance'],
 ['test', 'weight', 'IMC'],
 ['test', 'weight', 'imbalance'],
 ['test', 'IMC', 'imbalance'],
 ['height', 'weight', 'IMC'],
 ['height', 'weight', 'imbalance'],
 ['height', 'IMC', 'imbalance'],
 ['weight', 'IMC', 'imbalance'],
 ['test', 'height', 'weight', 'IMC'],
 ['test', 'height', 'weight', 'imbalance'],
 ['test', 'height', 'IMC', 'imbalance'],
 ['test', 'weight', 'IMC', 'imbalance'],
 ['height', 'weight', 'IMC', 'imbalance'],
 ['test', 'height', 'weight', 'IMC', 'imbalance']]

In [17]:
from sklearn.model_selection import GridSearchCV

# Search space over the three preprocessing steps of the pipeline.
# 'm' is not one of the strategy names the transformers test for, so it
# exercises whatever they do with an unrecognized value.
param_grid = [{
    'pipeline__normalize__strategy': ['m', 'weight', 'height', 'IMC'],
    'pipeline__testside__strategy': ['m', 'mean', 'Fscore'],
    'pipeline__featuresadder__new_features': all_features_combinations
    }]

# Exhaustive search with 5-fold cross-validation, scored by negative mean squared error
grid_search_prep = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error',
                                verbose=1)

grid_search_prep.fit(X_train, y_train)

Fitting 5 folds for each of 372 candidates, totalling 1860 fits


[Parallel(n_jobs=1)]: Done 1860 out of 1860 | elapsed:  1.2min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pipeline', Pipeline(memory=None,
     steps=[('normalize', Normalize(X_cols={'test': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]), 'height': array([12]), 'weight': array([13]), 'IMC': array([14]), 'imbalance': array([15])},
     strategy='IMC')), ('testside', TestSide(X_cols={'te..., reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
           n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'pipeline__normalize__strategy': ['m', 'weight', 'height', 'IMC'], 'pipeline__testside__strategy': ['m', 'mean', 'Fscore'], 'pipeline__featuresadder__new_features': [['test'], ['height'], ['weight'], ['IMC'], ['imbalance'], ['test', 'height'], ['test', 'weight'], ['test', 'IMC'], ['test...nce'], ['height', 'weight', 'IMC', 'imbalance'], ['test', 'height', 'weight', 'IMC', 'imbalance']]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [18]:
grid_search_prep.best_params_

{'pipeline__featuresadder__new_features': ['test', 'weight'],
 'pipeline__normalize__strategy': 'weight',
 'pipeline__testside__strategy': 'mean'}

In [20]:
grid_search_prep.best_score_

-0.97362502907769455

## 5. Feature Importance

In [289]:
# Collect the xgboost feature scores of each fitted per-label regressor
# into one table: one row per feature, one column per label.
regressors = grid_search_prep.best_estimator_.named_steps["multioutputregressor"]
importance = None
for ilabel in range(y_test.shape[1]):
    estimator = regressors.estimators_[ilabel]
    # Newer xgboost releases expose the booster through `get_booster()`;
    # fall back to the older `booster()` accessor when it is missing.
    if hasattr(estimator, 'get_booster'):
        booster = estimator.get_booster()
    else:
        booster = estimator.booster()
    fscore = booster.get_fscore()
    temp = pd.DataFrame(data={y_description[ilabel]: list(fscore.values())},
                        index=list(fscore.keys()))
    # The first label seeds the table (a sentinel replaces the former `ilabel is 0`
    # identity test); later labels are left-joined on the feature name, so only
    # the features scored for the first label are kept.
    importance = temp if importance is None else importance.join(temp)
importance = importance.sort_index()

In [328]:
# Express every score as a percentage of its row total.
# NOTE(review): the division is by the sum along axis=1, so the percentages add
# up to 100 across the columns of each row, not down each column — confirm this
# is the intended normalization.
relative_importance = (importance.div(importance.sum(axis=1), axis=0)) * 100
# keep the index as a regular column so it survives the reshape below
relative_importance['features'] = relative_importance.index
# to tidy: one row per (features, label) pair with its importance value
relative_importance = pd.melt(relative_importance, id_vars='features',
                              var_name='label', value_name='importance')

In [336]:
X_description

array(['AddL', 'AddR', 'AbdL', 'AbdR', 'ErL', 'ErR', 'IrL', 'IrR', 'ExtL',
       'ExtR', 'FlexL', 'FlexR', 'height', 'weight', 'IMC', 'imbalance'],
      dtype='<U9')

In [337]:
selected_features = ['Add', 'Abd', 'Er', 'Ir', 'Ext', 'Flex', 'weight']

In [345]:
# NOTE(review): factorplot appears to draw on a figure of its own, so this
# plt.figure call may only create an extra empty figure — confirm
plt.figure(figsize=(15, 10))
# one bar panel per label (col='label'), with the features on the x axis
g = sns.factorplot(data=relative_importance, x='features', y='importance', col='label', kind='bar',
                   saturation=.5)

# replace the tick labels with the hand-written names in `selected_features`
# NOTE(review): assumes the bars come in the same order as that list — confirm
g.set_xticklabels(selected_features)
sns.despine(offset=10, trim=True)
plt.show()

## 6. Learning Curves

## 7. Evaluation

In [22]:
y_pred = grid_search_prep.best_estimator_.predict(X_test)

In [25]:
from sklearn.metrics import mean_squared_error

def mape(y_test, y_pred):
    """Absolute percentage error between targets and predictions.

    Each error is |(y_test - y_pred) / y_test| * 100.
    Returns the (mean, standard deviation) of these errors.
    """
    relative_error = (y_test - y_pred) / y_test
    pct_error = np.abs(relative_error) * 100
    return np.mean(pct_error), np.std(pct_error)

def mse(y_test, y_pred):
    """Squared error between targets and predictions.

    Returns the (mean, standard deviation) of the element-wise squared errors.
    """
    residual = y_test - y_pred
    squared = residual ** 2
    return np.mean(squared), np.std(squared)

def rmse(y_test, y_pred):
    """Root of the squared error between targets and predictions.

    The square root is taken element by element, before averaging, so each
    error equals |y_test - y_pred| and the returned mean is the mean of these
    absolute errors rather than the square root of the mean squared error.
    Returns the (mean, standard deviation) of the element-wise errors.
    """
    residual = y_test - y_pred
    root_squared = np.sqrt(residual ** 2)
    return np.mean(root_squared), np.std(root_squared)

def evaluate(y_test, y_pred, y_description):
    """Print the mse, rmse and mape of every output column.

    For each column index of `y_pred`, the matching entry of `y_description`
    is printed, followed by one line per metric showing its mean and, between
    parentheses, its standard deviation.
    """
    n_outputs = y_pred.shape[1]
    for col in range(n_outputs):
        print(y_description[col])
        truth, guess = y_test[:, col], y_pred[:, col]

        # compute the three metrics first, then report them
        scores = [metric(truth, guess) for metric in (mse, rmse, mape)]
        (mse_mu, mse_std), (rmse_mu, rmse_std), (mape_mu, mape_std) = scores

        print(f'\tmse = {mse_mu:.3f} ({mse_std:.3f})')
        print(f'\trmse = {rmse_mu:.3f} ({rmse_std:.3f})')
        print(f'\tmape = {mape_mu:.3f}% ({mape_std:.3f})')

        print('-' * 10)

In [26]:
evaluate(y_test, y_pred, y_description)

Dyn
	mse = 4.476 (6.250)
	rmse = 1.646 (1.329)
	mape = 15.300% (10.019)
----------
BodyBoost
	mse = 0.540 (0.617)
	rmse = 0.565 (0.470)
	mape = 6.190% (5.005)
----------
MeanEggBeater
	mse = 0.446 (0.394)
	rmse = 0.582 (0.327)
	mape = 7.660% (4.348)
----------
MaxEggBeater
	mse = 1.061 (0.599)
	rmse = 0.937 (0.428)
	mape = 11.496% (5.257)
----------
