# MVC project
- [GitHub](https://github.com/romainmartinez/mvc)

In [1]:
# Common imports
import pandas as pd
pd.set_option('display.float_format', '{:.5f}'.format)
import numpy as np
import os
import time

# the 'mvc' directory contains helper functions that are used here but are not necessary to understand the story
import mvc

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn context/style first: sns.set_context() writes its own
# (font_scale-adjusted) label sizes into rcParams, so the explicit sizes
# below must be assigned afterwards or they are silently overwritten.
sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# font type 42 (TrueType) for PDF / PostScript output
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

# to make this notebook's output stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 0. Load data

In [2]:
# read the dataset from the HDF5 file next to the notebook
df = pd.read_hdf('mvc.h5')

# target: the 'max' column; features: every other column
y = df['max'].copy()
X = df.drop('max', axis=1).copy()
X.head(1)

Unnamed: 0,upper trapezius,middle trapezius,lower trapezius,anterior deltoid,middle deltoid,posterior deltoid,pectoralis major,serratus anterior,latissimus dorsi,supraspinatus,...,6,7,8,9,10,11,12,13,14,15
0,1,0,0,0,0,0,0,0,0,0,...,0.03394,0.05925,0.106,0.08778,0.0287,0.10291,0.01253,0.0297,0.07777,


## 1. Preprocessing

In [3]:
# pipeline
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# preprocessing
from sklearn.preprocessing import Imputer

# feature selection
from sklearn.feature_selection import SelectFromModel

# models
from sklearn.ensemble import ExtraTreesRegressor

In [4]:
def normalize_y(vector, norm_vector):
    """Express `vector` as a percentage of `norm_vector`.

    Every 1-D slice of `vector` taken along axis 0 is multiplied by 100 and
    divided element-wise by `norm_vector`.

    Parameters
    ----------
    vector : array-like
        Values to normalize.
    norm_vector : array-like
        Reference values used as the divisor for each slice.

    Returns
    -------
    numpy.ndarray
        The normalized values.
    """
    def as_percentage(values, reference):
        return values * 100 / reference

    return np.apply_along_axis(as_percentage, 0, vector, norm_vector)

class TopTestsSelector(BaseEstimator, TransformerMixin):
    """Keep a fixed set of columns plus the `k` most important remaining ones.

    Importance is the `feature_importances_` of an `ExtraTreesRegressor`
    (seeded with `RANDOM_SEED`) fitted on the data passed to `fit`.

    Parameters
    ----------
    k : int
        Number of columns to select by importance.
    keep : array-like of int or None, default=None
        Column indices that are always retained and placed first in the
        output. They are excluded from the importance ranking, so a column
        is never returned twice and the last `k` output columns are always
        the selected ones.

    Attributes
    ----------
    feature_indices_ : numpy.ndarray
        Column indices returned by `transform`: `keep` (if given) followed by
        the selected indices in ascending order.
    """
    def __init__(self, k, keep=None):
        self.k = k
        self.keep = keep

    def fit(self, X, y):
        """Rank the columns of `X` by importance for predicting `y`.

        Raises
        ------
        ValueError
            If `k` is not between 1 and the number of columns not in `keep`.
        """
        extra_tree = ExtraTreesRegressor(random_state=RANDOM_SEED)
        extra_tree.fit(X, y)
        importances = np.array(extra_tree.feature_importances_, dtype=float)

        # columns listed in `keep` are retained anyway: take them out of the ranking
        candidates = np.ones(importances.shape[0], dtype=bool)
        if self.keep is not None:
            candidates[np.asarray(self.keep, dtype=int)] = False
        n_candidates = int(candidates.sum())
        if not 0 < self.k <= n_candidates:
            raise ValueError(
                f'k must be between 1 and {n_candidates} '
                f'(number of columns not in keep), got {self.k}'
            )
        # -inf guarantees an excluded column can never reach the top k
        importances[~candidates] = -np.inf

        tests_indices = self.indices_of_top_k(importances, self.k)
        if self.keep is None:
            self.feature_indices_ = tests_indices
        else:
            self.feature_indices_ = np.append(self.keep, tests_indices)
        print(f'keeped test: {tests_indices} (this is index,  not test no.)')
        return self

    def transform(self, X):
        """Return the columns of `X` listed in `feature_indices_`."""
        return X[:, self.feature_indices_]

    @staticmethod
    def indices_of_top_k(arr, k):
        """Return the indices of the `k` largest values of `arr`, sorted ascending."""
        return np.sort(np.argpartition(np.array(arr), -k)[-k:])
    
class NormalizeBasedOnTest(BaseEstimator, TransformerMixin):
    """Express the last `k` columns as a percentage of a per-row reference.

    Parameters
    ----------
    norm_vector : array-like, one value per row
        Reference values: row i of the normalized columns is multiplied by
        100 and divided by `norm_vector[i]`.
    k : int or None
        Number of trailing columns to normalize; the leading columns are
        passed through unchanged. `None` normalizes every column.
    """
    def __init__(self, norm_vector, k):
        self.norm_vector = norm_vector
        self.k = k

    def fit(self, X, y=None):
        """Nothing is learned from the data; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Normalize the trailing columns of the `X` given here.

        The normalization is computed from this call's `X`, not from data
        cached during `fit`, so the transformer can be applied to any array
        whose rows line up with `norm_vector`.

        Raises
        ------
        ValueError
            If `norm_vector` does not have one value per row of `X`, or if
            `k` is not between 0 and the number of columns of `X`.
        """
        X = np.asarray(X)
        norm = np.asarray(self.norm_vector, dtype=float)
        if norm.ndim != 1 or norm.shape[0] != X.shape[0]:
            raise ValueError(
                f'norm_vector must hold one value per row of X '
                f'({X.shape[0]}), got shape {norm.shape}'
            )

        n_normalized = X.shape[1] if self.k is None else self.k
        if not 0 <= n_normalized <= X.shape[1]:
            raise ValueError(
                f'k must be between 0 and {X.shape[1]}, got {self.k}'
            )

        # explicit split index: `-k` slicing would select everything for k == 0
        split = X.shape[1] - n_normalized
        untouched = X[:, :split]
        normalized = X[:, split:] * 100 / norm[:, np.newaxis]
        return np.concatenate((untouched, normalized), axis=1)

In [5]:
# constants
KEEP = np.arange(12)    # columns to keep during feature selection (None: keep no fixed columns)
K = 5                   # number of features to keep
NORM_TEST = 4           # test used to normalize other features

# normalization vector: a standalone array built from the NORM_TEST column of X
NORM_VECTOR = np.array(X[NORM_TEST])

# locate entries that are missing or below the 10e-5 threshold
# (NORM_VECTOR is used as a divisor when normalizing) ...
is_missing = np.isnan(NORM_VECTOR)
is_too_small = NORM_VECTOR < 10e-5
low_idx = np.argwhere(np.logical_or(is_missing, is_too_small))
# ... and replace them with the NaN-ignoring median of the vector
NORM_VECTOR[low_idx] = np.nanmedian(NORM_VECTOR)



In [6]:
# target expressed as a percentage of the normalization vector
y_norm = normalize_y(y, norm_vector=NORM_VECTOR)

# preprocessing stages, listed in the order the pipeline applies them
imputer = Imputer(strategy='median')             # replace missing values
selector = TopTestsSelector(k=K, keep=KEEP)      # select best tests
normalizer = NormalizeBasedOnTest(norm_vector=NORM_VECTOR, k=K)
preprocessing = make_pipeline(imputer, selector, normalizer)

# NOTE(review): this rebinds the name `mvc`, shadowing the `mvc` module
# imported in the first cell; later cells read the array through this name.
mvc = preprocessing.fit_transform(X, y_norm)

keeped test: [13 15 16 17 18] (this is index,  not test no.)


In [7]:
# first five rows of the preprocessed matrix
pd.DataFrame(mvc).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,153.46042,130.31499,124.27695,100.0,46.84948
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84.12816,136.0572,73.97477,100.0,93.60384
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,83.06777,208.66602,142.43276,100.0,50.08998
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,104.62414,122.58334,102.34487,100.0,61.71398
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,100.96129,193.94497,124.40057,100.0,172.1474


In [8]:
from sklearn.model_selection import train_test_split

# hold out 20 % of the rows for testing, stratified on the 'muscle' column of df
split = train_test_split(
    mvc,
    y_norm,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['muscle'],
)
X_train, X_test, y_train, y_test = split

In [9]:
# hyper-parameters for the final model (how they were tuned is not shown in this notebook)
optimized_params = {
    'bootstrap': False,
    'criterion': 'mse',
    'max_features': 14,
    'n_estimators': 45,
}

# fit on the training split, then predict the held-out split
model = ExtraTreesRegressor(random_state=RANDOM_SEED, **optimized_params).fit(X_train, y_train)
predictions = model.predict(X_test)

In [10]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [11]:
# root-mean-square error on the held-out split
mse = mean_squared_error(y_test, predictions)
np.sqrt(mse)

424.04089538180722

In [12]:
# coefficient of determination (R²) on the held-out split
r2_score(y_true=y_test, y_pred=predictions)

0.18925859465815165