# MVC project: Get the Data
- [GitHub](https://github.com/romainmartinez/mvc)

## 0. Setup

In [1]:
# Common imports
import pandas as pd
import numpy as np
import os

# the 'mvc' package contains helper functions that are used here but are not needed to follow the story
import mvc

# Figures
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the seaborn context first: `set_context` rewrites the font-size rcParams
# (axes and tick label sizes included), so calling it after the explicit
# assignments below would silently discard them.
sns.set_context("paper")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Make this notebook's output stable across runs: the constant seeds NumPy's
# global random generator here and is reused as `random_state` further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Load data

In [2]:
# Read the dataset from 'mvc.h5' (path relative to the working directory).
# No `key` is passed, so the file presumably holds a single pandas object — TODO confirm.
df = pd.read_hdf('mvc.h5')
# Display the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,upper trapezius,middle trapezius,lower trapezius,anterior deltoid,middle deltoid,posterior deltoid,pectoralis major,serratus anterior,latissimus dorsi,supraspinatus,...,8,9,10,11,12,13,14,15,max,best_test
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,100,4
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,100,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,100,2
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,100,3
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,100,2


## 2. Split the data

### Cross-Validation

In [3]:
from sklearn.model_selection import cross_val_score

In [4]:
def display_scores(scores):
    """Print per-fold RMSE values together with their mean and standard deviation.

    ``scores`` is expected to hold *negated* mean squared errors, as returned by
    ``cross_val_score(..., scoring='neg_mean_squared_error')``: cross-validation
    expects a utility function (greater is better) rather than a cost function
    (lower is better), so the sign is flipped back before taking the square root.

    Parameters
    ----------
    scores : array-like of float
        Negated MSE values, one per fold. Any sequence NumPy can convert to an
        array is accepted (list, tuple, ndarray, ...).

    Returns
    -------
    None
        The results are only printed.
    """
    # np.asarray lets plain Python sequences through; unary minus on a list would raise.
    scores_rectified = np.sqrt(-np.asarray(scores))
    print(f'Scores: {scores_rectified}')
    print(f'Mean: {scores_rectified.mean()}')
    print(f'STD: {scores_rectified.std()}')

### Stratified Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Every column except the target ('max') and the two bookkeeping columns
# ('muscle', 'best_test') is used as a predictor.
X_labels = df.drop(columns=['max', 'muscle', 'best_test']).columns.tolist()
# Single regression target, kept as a list so that df[y_labels] stays a DataFrame.
y_labels = ['max']

In [7]:
# Hold out 20 % of the rows as a test set, stratifying on the 'muscle' column
# and seeding the shuffle with RANDOM_SEED.
X_train, X_test, y_train, y_test = train_test_split(
    df[X_labels],
    df[y_labels],
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['muscle'],
)

## 3. Select and Train a Model

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
# Ordinary least-squares model with default settings.
lin_reg = LinearRegression()

In [10]:
# 10-fold cross-validation of the linear model on the training set; the
# negated MSE per fold is converted to RMSE by display_scores.
scores = cross_val_score(
    estimator=lin_reg,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
display_scores(scores)

Scores: [ 0.16564868  0.20206006  0.28050008  0.19867492  0.1814264   0.27113567
  0.25450096  0.25790885  0.22418094  0.26343369]
Mean: 0.2299470252809861
STD: 0.03877581076537588


### Decision Tree Regressor

In [11]:
from sklearn.tree import DecisionTreeRegressor

In [12]:
# Decision tree with default hyperparameters; random_state is set to RANDOM_SEED.
tree_reg = DecisionTreeRegressor(random_state=RANDOM_SEED)

In [13]:
# 10-fold cross-validation of the decision tree on the training set; the
# negated MSE per fold is converted to RMSE by display_scores.
scores = cross_val_score(
    estimator=tree_reg,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
)
display_scores(scores)

Scores: [ 0.20584674  0.24356115  0.3188964   0.26037782  0.18490007  0.26148818
  0.34591635  0.30662207  0.29235267  0.2773501 ]
Mean: 0.26973115515738966
STD: 0.047191057081873704


### Random Forest Regressor

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
# Random forest with default hyperparameters; random_state is set to RANDOM_SEED.
forest_reg = RandomForestRegressor(random_state=RANDOM_SEED)

In [16]:
# 10-fold cross-validation of the random forest. y_train is a one-column
# DataFrame, so it is flattened to a 1-D array before being passed as the target.
scores = cross_val_score(
    estimator=forest_reg,
    X=X_train,
    y=np.ravel(np.array(y_train)),
    scoring='neg_mean_squared_error',
    cv=10,
)
display_scores(scores)

Scores: [ 0.179689    0.216873    0.28582322  0.17515127  0.17121986  0.24581975
  0.25064021  0.23388214  0.20106554  0.26230406]
Mean: 0.22224680359280993
STD: 0.037754767484057275


### Support Vector Regressor

In [17]:
from sklearn.svm import SVR

In [18]:
# Support vector regressor restricted to a linear kernel; other settings are defaults.
svm_reg = SVR(kernel='linear')

In [19]:
# 10-fold cross-validation of the linear SVR. y_train is a one-column
# DataFrame, so it is flattened to a 1-D array before being passed as the target.
scores = cross_val_score(
    estimator=svm_reg,
    X=X_train,
    y=np.ravel(np.array(y_train)),
    scoring='neg_mean_squared_error',
    cv=10,
)
display_scores(scores)

Scores: [ 0.17417743  0.20947764  0.27893884  0.20951207  0.19323896  0.26747476
  0.25438144  0.25435208  0.22589574  0.25435645]
Mean: 0.23218054117077247
STD: 0.0329512308389305


## 4. Fine-tune the model

### Grid search

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Two sub-grids, searched one after the other:
#   1. 3 x 4 = 12 combinations of n_estimators and max_features;
#   2. 2 x 3 = 6 combinations with bootstrap set to False.
param_grid = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

In [22]:
# Base estimator handed to the grid search; random_state is set to RANDOM_SEED.
forest_reg = RandomForestRegressor(random_state=RANDOM_SEED)

In [23]:
# Exhaustive search over param_grid, scored by negated MSE with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [24]:
# Run the grid search on the training set; the one-column target frame is
# flattened to a 1-D array first.
grid_search.fit(X_train, np.ravel(np.array(y_train)))

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [25]:
# Hyperparameter combination with the best mean cross-validated score.
grid_search.best_params_

{'bootstrap': False, 'max_features': 3, 'n_estimators': 10}

In [26]:
# Print the RMSE (square root of the sign-flipped mean test score) next to
# each hyperparameter combination that was tried.
cv_res = grid_search.cv_results_
for params, mean_score in zip(cv_res['params'], cv_res['mean_test_score']):
    print(np.sqrt(-mean_score), params)

0.25049638398 {'max_features': 2, 'n_estimators': 3}
0.231817498564 {'max_features': 2, 'n_estimators': 10}
0.223987405341 {'max_features': 2, 'n_estimators': 30}
0.258857841622 {'max_features': 4, 'n_estimators': 3}
0.232532901447 {'max_features': 4, 'n_estimators': 10}
0.223000718523 {'max_features': 4, 'n_estimators': 30}
0.27222347769 {'max_features': 6, 'n_estimators': 3}
0.237534181168 {'max_features': 6, 'n_estimators': 10}
0.22743622911 {'max_features': 6, 'n_estimators': 30}
0.267311826041 {'max_features': 8, 'n_estimators': 3}
0.235698244991 {'max_features': 8, 'n_estimators': 10}
0.225656549404 {'max_features': 8, 'n_estimators': 30}
0.250873922076 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
0.220712027261 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
0.238099294198 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
0.212953594452 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
0.236303798406 {'bootstrap': False, 'max_features

### Randomized Search

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [28]:
# Discrete uniform distributions to sample hyperparameters from.
# scipy's randint excludes `high`, so n_estimators is drawn from 1..199
# and max_features from 1..7.
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

In [29]:
# Base estimator handed to the randomized search; random_state is set to RANDOM_SEED.
forest_reg = RandomForestRegressor(random_state=RANDOM_SEED)

In [30]:
# Number of hyperparameter settings sampled from param_distribs.
n_iter_search = 100
# Randomized search scored by negated MSE with 5-fold cross-validation;
# the sampling itself is seeded with RANDOM_SEED.
random_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=n_iter_search,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=RANDOM_SEED,
)

In [31]:
# Run the randomized search on the training set; the one-column target frame
# is flattened to a 1-D array first.
random_search.fit(X_train, np.ravel(np.array(y_train)))

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc199477c88>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc1991d5c18>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0)

In [32]:
# Print the RMSE (square root of the sign-flipped mean test score) next to
# each sampled hyperparameter setting.
cv_res = random_search.cv_results_
for params, mean_score in zip(cv_res['params'], cv_res['mean_test_score']):
    print(np.sqrt(-mean_score), params)

0.219011924094 {'max_features': 7, 'n_estimators': 180}
0.230644850652 {'max_features': 5, 'n_estimators': 15}
0.221242576133 {'max_features': 3, 'n_estimators': 72}
0.228723652023 {'max_features': 5, 'n_estimators': 21}
0.219048509594 {'max_features': 7, 'n_estimators': 122}
0.221323946354 {'max_features': 3, 'n_estimators': 75}
0.221100065726 {'max_features': 3, 'n_estimators': 88}
0.22013450279 {'max_features': 5, 'n_estimators': 100}
0.220694788486 {'max_features': 3, 'n_estimators': 150}
0.279174983137 {'max_features': 5, 'n_estimators': 2}
0.220252857824 {'max_features': 4, 'n_estimators': 158}
0.220772305954 {'max_features': 6, 'n_estimators': 130}
0.224293751493 {'max_features': 4, 'n_estimators': 21}
0.22834330217 {'max_features': 1, 'n_estimators': 58}
0.22213112695 {'max_features': 6, 'n_estimators': 89}
0.228334762681 {'max_features': 1, 'n_estimators': 59}
0.218922427161 {'max_features': 7, 'n_estimators': 170}
0.220357340481 {'max_features': 4, 'n_estimators': 188}
0.2189

In [33]:
# Importance of each feature according to the best forest found by the randomized search.
features_importances = random_search.best_estimator_.feature_importances_
# Rank from most to least important. Sort on the importance value only: the
# column labels mix integers and strings, so letting tuple comparison fall
# through to the label on a tie would raise a TypeError.
sorted(zip(features_importances, X_train.columns), key=lambda pair: pair[0], reverse=True)

[(0.11043144447957945, 2),
 (0.095483946213689608, 3),
 (0.087346531003121772, 4),
 (0.07784038309092807, 5),
 (0.073500148110099986, 0),
 (0.061923741110663316, 9),
 (0.057753548408439097, 6),
 (0.056133083869201239, 1),
 (0.053111596896091344, 7),
 (0.051896820754588971, 10),
 (0.051065895749865038, 14),
 (0.048627074936609484, 8),
 (0.042310186730623769, 13),
 (0.031242109114683534, 12),
 (0.024592888685561063, 15),
 (0.022412294473624271, 11),
 (0.0087664895488006548, 'subscapularis'),
 (0.006713997635201697, 'lower trapezius'),
 (0.0059341350587012254, 'posterior deltoid'),
 (0.0057537057429140016, 'middle deltoid'),
 (0.0055735301819876014, 'supraspinatus'),
 (0.0049856843565000012, 'anterior deltoid'),
 (0.0031913971680966936, 'infraspinatus'),
 (0.0030447498108344901, 'middle trapezius'),
 (0.0029498201674166124, 'latissimus dorsi'),
 (0.0029156151243685834, 'upper trapezius'),
 (0.0026627670561323543, 'serratus anterior'),
 (0.0018364145216762751, 'pectoralis major')]

In [34]:
from sklearn.metrics import mean_squared_error

In [37]:
# Score the best estimator from the randomized search on the held-out test set.
final_model = random_search.best_estimator_
final_predictions = final_model.predict(X_test)

# RMSE is reported so the figure is comparable with the cross-validation output.
final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)
print(f'final rmse: {final_rmse}')

final rmse: 0.2376955885350079
