<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#1.-MVC-project-description" data-toc-modified-id="1.-MVC-project-description-1">1. MVC project description</a></span></li><li><span><a href="#2.-Setup" data-toc-modified-id="2.-Setup-2">2. Setup</a></span></li><li><span><a href="#3.-Load-the-data" data-toc-modified-id="3.-Load-the-data-3">3. Load the data</a></span></li><li><span><a href="#4.-Configurations" data-toc-modified-id="4.-Configurations-4">4. Configurations</a></span></li><li><span><a href="#5.-Split-the-data" data-toc-modified-id="5.-Split-the-data-5">5. Split the data</a></span></li><li><span><a href="#6.-Pipeline" data-toc-modified-id="6.-Pipeline-6">6. Pipeline</a></span><ul class="toc-item"><li><span><a href="#6.1.-Temporary-step:-normalize-data" data-toc-modified-id="6.1.-Temporary-step:-normalize-data-6.1">6.1. Temporary step: normalize data</a></span></li><li><span><a href="#6.2.-Assembling-the-pipeline" data-toc-modified-id="6.2.-Assembling-the-pipeline-6.2">6.2. Assembling the pipeline</a></span></li><li><span><a href="#6.3.-Define-our-metric" data-toc-modified-id="6.3.-Define-our-metric-6.3">6.3. Define our metric</a></span></li><li><span><a href="#6.4.-Fit-the-pipeline" data-toc-modified-id="6.4.-Fit-the-pipeline-6.4">6.4. Fit the pipeline</a></span></li><li><span><a href="#6.5.-Save-the-model" data-toc-modified-id="6.5.-Save-the-model-6.5">6.5. Save the model</a></span></li></ul></li><li><span><a href="#7.-Learning-curves" data-toc-modified-id="7.-Learning-curves-7">7. Learning curves</a></span></li><li><span><a href="#8.-Evaluation" data-toc-modified-id="8.-Evaluation-8">8. Evaluation</a></span><ul class="toc-item"><li><span><a href="#8.1.-Prediction" data-toc-modified-id="8.1.-Prediction-8.1">8.1. Prediction</a></span></li><li><span><a href="#8.2.-Denormalize-data" data-toc-modified-id="8.2.-Denormalize-data-8.2">8.2. 
Denormalize data</a></span></li><li><span><a href="#8.3.-Evaluate-on-all-muscles" data-toc-modified-id="8.3.-Evaluate-on-all-muscles-8.3">8.3. Evaluate on all muscles</a></span></li><li><span><a href="#8.4.-Evaluate-on-each-muscle" data-toc-modified-id="8.4.-Evaluate-on-each-muscle-8.4">8.4. Evaluate on each muscle</a></span></li></ul></li><li><span><a href="#9.-Summary" data-toc-modified-id="9.-Summary-9">9. Summary</a></span></li></ul></div>

# 1. MVC project description

**Links**
- [github repo](https://github.com/romainmartinez/mvc)
- [plotly figures](https://plot.ly/organize/romainmartinez:114)

**Author**: _Romain Martinez._

# 2. Setup

In [1]:
# Common imports
import pandas as pd
import numpy as np
import json

# Path
# All paths are relative to the directory the notebook is run from.
from pathlib import Path
PROJECT_PATH = Path('./')
DATA_PATH = PROJECT_PATH / 'data'

# to make this notebook's output stable across runs
# (seeds NumPy's global random generator)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Figures
# OFFLINE selects which plotly backend is bound to `py`: the offline notebook
# renderer or the online `plotly.plotly` module.
OFFLINE = True
if OFFLINE:
    import plotly.offline as py
    py.init_notebook_mode(connected=True)
else:
    import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
# Base layout that the plotting functions below copy and then extend.
BASE_LAYOUT = go.Layout(hovermode='closest', font=dict(size=14))
# Marker style (translucent fill, opaque 2px outline) used for the MAPE bars.
MARKER_LAYOUT = dict(
    color='rgba(27, 158, 119, 0.6)',
    line=dict(
        color='rgba(27, 158, 119, 1.0)',
        width=2,
    ))

# 3. Load the data

In [2]:
# Measurements in wide format (feather file).
df_wide = pd.read_feather(DATA_PATH / 'df_wide')

# Project configuration; conf['MUSCLES'] labels the per-muscle reports below.
with open(DATA_PATH / 'conf.json', mode='r') as conf_file:
    conf = json.load(conf_file)

# 4. Configurations

In [3]:
# Column roles, stored as string arrays so they can be matched against the
# DataFrame column names.
REF_COLS = dict(
    test_cols=np.array([3, 4, 5]).astype(str),  # columns used as features
    categorical_cols=np.array(['muscle']),  # categorical column(s)
    test_ref=np.array([4]).astype(str),  # column used as normalization reference
)

# 5. Split the data

In [4]:
def get_X_and_y(d, test_col_str, other_col_to_keep, remove_nans=False):
    """Build the feature matrix and the target from a wide DataFrame.

    Parameters
    ----------
    d : DataFrame holding at least the requested columns.
    test_col_str : array of column names; the target is their row-wise maximum
        (NaNs ignored) and they are also kept as features.
    other_col_to_keep : array of extra column names placed before the test columns.
    remove_nans : when True, rows with a NaN in any kept column are dropped
        from X and y; otherwise a warning is printed if such rows exist.

    Returns
    -------
    X, y, col_names, nan_row_idx : feature array, target array, list of the
        column names of X, and the boolean mask (over the rows of `d`) of the
        rows containing a NaN.
    """
    # target: largest non-NaN test value of each row
    target = np.nanmax(d[test_col_str], axis=1)

    kept_columns = [*other_col_to_keep.tolist(), *test_col_str.tolist()]
    features = d[kept_columns].values

    has_nan = np.isnan(features).any(axis=1)
    n_nan = np.sum(has_nan)

    if remove_nans:
        complete = ~has_nan
        features, target = features[complete, :], target[complete]
        print(f'Removed {n_nan} rows')
    elif np.any(has_nan):
        print(
            f'Warning: {n_nan} rows have nans. You should remove it or use Imputer'
        )

    return features, target, kept_columns, has_nan

In [5]:
# Features/target with incomplete rows removed; COL_NAMES gives the column
# order of X and is reused by the column selectors further down.
X, y, COL_NAMES, nan_row_idx = get_X_and_y(
    d=df_wide,
    test_col_str=REF_COLS['test_cols'],
    other_col_to_keep=REF_COLS['categorical_cols'],
    remove_nans=True,
)

# Keep the DataFrame aligned with X by discarding the same rows.
df_wide = df_wide.drop(df_wide.index[nan_row_idx])

Removed 3 rows



All-NaN axis encountered



In [6]:
from sklearn.model_selection import train_test_split

# Row positions are split alongside X and y so each sample can be traced back.
X_indices = np.arange(len(X))

# 80/20 split, stratified on the first column of X.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
     X, y, X_indices,
     stratify=X[:, 0],
     test_size=0.2,
     random_state=RANDOM_SEED)

# 6. Pipeline

## 6.1. Temporary step: normalize data
This step is temporarily kept outside the pipeline because the scikit-learn release used here does not yet provide [TransformedTargetRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html).

In [7]:
class Normalize:
    """Express selected columns of X, and the target y, as a percentage of a per-row reference.

    Parameters
    ----------
    to_normalize : column indexer (e.g. boolean mask or index array)
        Columns of ``X`` that are rescaled.
    ref : 'max' or column indexer, default 'max'
        ``'max'`` takes the row-wise ``np.nanmax`` of the ``to_normalize``
        columns as reference. Any other value is used to index the reference
        column of ``X`` (``X[:, ref]``).

    Notes
    -----
    ``transform`` stores the reference in ``self.ref_vector`` and
    ``inverse_transform`` reuses it. ``inverse_transform`` must therefore be
    called after ``transform``, on data with the same rows in the same order.
    Empty inputs are returned as empty lists.
    """

    def __init__(self, to_normalize=None, ref='max'):
        self.to_normalize = to_normalize
        self.ref = ref

    @staticmethod
    def _has_data(a):
        """Return True when `a` holds at least one element (zeros count as data)."""
        return a is not None and np.size(a) > 0

    @staticmethod
    def _to_percent(a, b):
        """Scale `a` to a percentage of `b`."""
        return a * 100 / b

    @staticmethod
    def _from_percent(a, b):
        """Inverse of `_to_percent`."""
        return a / 100 * b

    def transform(self, X, y):
        """Return copies of X and y scaled to percent of the reference.

        y is only scaled when X is non-empty, because the reference is taken
        from X. The input arrays are not modified.
        """
        X_out, y_out = [], []

        if self._has_data(X):
            X_out = X.copy()
            # `ref` may be an array, so test the type before comparing to 'max'.
            if isinstance(self.ref, str) and self.ref == 'max':
                self.ref_vector = np.nanmax(X_out[:, self.to_normalize], axis=1)
            else:
                # copy: the reference must not alias X_out, which is
                # overwritten just below
                self.ref_vector = X_out[:, self.ref].ravel().copy()
            X_out[:, self.to_normalize] = np.apply_along_axis(
                self._to_percent, 0, X_out[:, self.to_normalize],
                self.ref_vector)

            if self._has_data(y):
                y_out = np.apply_along_axis(self._to_percent, 0, y,
                                            self.ref_vector)

        return X_out, y_out

    def inverse_transform(self, X, y):
        """Return copies of X and y converted back from percent to original units.

        Uses the reference vector stored by the last call to ``transform``.
        """
        X_out, y_out = [], []

        if self._has_data(X):
            X_out = X.copy()
            X_out[:, self.to_normalize] = np.apply_along_axis(
                self._from_percent, 0, X_out[:, self.to_normalize],
                self.ref_vector)

        if self._has_data(y):
            y_out = np.apply_along_axis(self._from_percent, 0, y,
                                        self.ref_vector)

        return X_out, y_out

In [8]:
# Boolean masks over COL_NAMES: the columns to rescale and the column used as
# the per-row reference.
to_normalize = np.in1d(COL_NAMES, REF_COLS['test_cols'])
ref = np.in1d(COL_NAMES, REF_COLS['test_ref'])

# normalize train set
# NOTE: X_train/y_train (and X_test/y_test below) are overwritten. Re-running
# this cell feeds already-normalized data back in, so the normalizers would
# store a reference taken from normalized values and the later denormalization
# would no longer recover the original units.
normalizer_train = Normalize(to_normalize=to_normalize, ref=ref)
X_train, y_train = normalizer_train.transform(X_train, y_train)

# normalize test set
# A separate instance is used because each normalizer keeps the reference
# vector of the split it transformed.
normalizer_test = Normalize(to_normalize=to_normalize, ref=ref)
X_test, y_test = normalizer_test.transform(X_test, y_test)

## 6.2. Assembling the pipeline

In [9]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor


def get_categorical_cols(X, col_names=None):
    """Return the categorical columns of X.

    Parameters
    ----------
    X : 2-D array whose columns are described by `col_names`.
    col_names : sequence of column names of X. Defaults to the notebook-level
        COL_NAMES, looked up at call time.

    Returns
    -------
    The columns of X whose name is listed in REF_COLS['categorical_cols'].
    """
    if col_names is None:
        col_names = COL_NAMES
    return X[:, np.in1d(col_names, REF_COLS['categorical_cols'].astype(str))]


def get_numerical_cols(X, col_names=None):
    """Return the numerical (test) columns of X.

    Parameters
    ----------
    X : 2-D array whose columns are described by `col_names`.
    col_names : sequence of column names of X. Defaults to the notebook-level
        COL_NAMES, looked up at call time.

    Returns
    -------
    The columns of X whose name is listed in REF_COLS['test_cols'].
    """
    if col_names is None:
        col_names = COL_NAMES
    return X[:, np.in1d(col_names, REF_COLS['test_cols'].astype(str))]


# Categorical branch: select the categorical column(s), then one-hot encode.
pipeline_categorical = Pipeline(steps=[
    ('selector', FunctionTransformer(func=get_categorical_cols, validate=False)),
    ('encoder', OneHotEncoder(sparse=False)),
])

# Numerical branch: select the test columns and pass them through unchanged.
pipeline_numerical = Pipeline(steps=[
    ('selector', FunctionTransformer(func=get_numerical_cols, validate=False)),
])

# Both branches are concatenated column-wise.
pipeline_preprocessing = FeatureUnion(transformer_list=[
    ('categorical', pipeline_categorical),
    ('numerical', pipeline_numerical),
])

# Hyper-parameters of the gradient boosting regressor.
model_param = {
    'alpha': 0.85,
    'learning_rate': 0.1,
    'loss': "ls",
    'max_depth': 10,
    'max_features': 1.0,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100,
    'subsample': 0.55,
}

# Full model: preprocessing followed by the regressor.
pipeline_full = Pipeline(steps=[
    ('preprocessing', pipeline_preprocessing),
    ('regressor', GradientBoostingRegressor(**model_param)),
])

## 6.3. Define our metric

In [10]:
from sklearn.metrics import make_scorer


def mape(y_test, y_pred):
    """Mean absolute percentage error between targets and predictions.

    Parameters
    ----------
    y_test : array-like of true values (lists, tuples and arrays are accepted).
    y_pred : array-like of predicted values, same shape as `y_test`.

    Returns
    -------
    float : mean of ``|y_test - y_pred| / |y_test| * 100``.

    Notes
    -----
    A zero in `y_test` divides by zero, so the result is then inf or nan.
    """
    # coerce to float arrays so plain Python sequences work and values are
    # compared position by position
    y_test = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    val = np.abs((y_test - y_pred) / y_test) * 100
    return np.mean(val)


# greater_is_better=False makes scikit-learn negate the metric, so every score
# produced through this scorer (cross-validation, learning curves) is -MAPE.
mape_scorer = make_scorer(mape, greater_is_better=False)

def print_results(title, toprint):
    """Print `toprint` below `title`, separated by a rule of ten dashes."""
    rule = "-" * 10
    print(f'{title}\n{rule}\n{toprint}\n')

## 6.4. Fit the pipeline

In [11]:
# Fit preprocessing and regressor on the training split, which was already
# rescaled by normalizer_train.
pipeline_full.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessing', FeatureUnion(n_jobs=1,
       transformer_list=[('categorical', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False,
          func=<function get_categorical_cols at 0x7f3bdda1f598>,
          inv_kw_args=None, inverse_func=None, kw_args=No...=100, presort='auto', random_state=None,
             subsample=0.55, verbose=0, warm_start=False))])

In [12]:
# NOTE: y_pred is computed again in section 8.1 before it is first used; the
# cells in between (cross-validation, model saving, learning curves) do not
# read it.
y_pred = pipeline_full.predict(X_test)

In [13]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split, scored with mape_scorer
cv_score = cross_val_score(
    pipeline_full, X_train, y_train, scoring=mape_scorer, cv=5)

print_results(title='cv score', toprint=cv_score)
print_results(title='mean', toprint=cv_score.mean())

cv score
----------
[-0.46757464 -0.35904539 -1.26741222 -0.48436397 -0.75418421]

mean
----------
-0.6665160858444503



## 6.5. Save the model

In [14]:
MODEL_PATH = Path('./model/model.pkl')
# An existing file is never overwritten: delete it to persist a newly fitted
# pipeline.
if not MODEL_PATH.is_file():
    try:
        from sklearn.externals import joblib
    except ImportError:
        # newer scikit-learn releases no longer bundle joblib
        import joblib
    # joblib.dump does not create missing directories
    MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipeline_full, MODEL_PATH)

# 7. Learning curves

In [15]:
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator,
                        X,
                        y,
                        scoring=mape_scorer,
                        cv=None,
                        n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5),
                        **kwargs):
    """Build a plotly figure dict of training and cross-validation learning curves.

    Parameters
    ----------
    estimator, X, y, scoring, cv, n_jobs, train_sizes :
        forwarded to ``sklearn.model_selection.learning_curve``.
    **kwargs :
        optional ``title``, ``xtitle`` and ``ytitle`` strings for the layout.

    Returns
    -------
    dict with ``data`` (six traces: for each of the training and
    cross-validation scores, the mean followed by mean + std and mean - std)
    and ``layout``.
    """
    sizes, scores_train, scores_cv = learning_curve(
        estimator,
        X,
        y,
        train_sizes=train_sizes,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        random_state=RANDOM_SEED)

    def score_band(scores, color, name):
        """Mean trace plus the upper and lower one-std bounds of `scores`."""
        center = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        return [
            go.Scatter(
                x=sizes, y=center, marker=dict(color=color), name=name),
            go.Scatter(
                x=sizes,
                y=center + spread,
                mode='lines',
                line=dict(color=color, width=1),
                showlegend=False),
            # 'tonexty' shades the area between this bound and the previous trace
            go.Scatter(
                x=sizes,
                y=center - spread,
                mode='lines',
                line=dict(color=color, width=1),
                fill='tonexty',
                showlegend=False),
        ]

    traces = (score_band(scores_train, 'red', 'Training score') +
              score_band(scores_cv, 'green', 'Cross-validation score'))

    axis_line = dict(showline=True, linewidth=1.5)
    layout = BASE_LAYOUT.copy()
    layout.update(
        dict(
            title=kwargs.get('title'),
            xaxis=dict(title=kwargs.get('xtitle'), **axis_line),
            yaxis=dict(title=kwargs.get('ytitle'), **axis_line)))
    return dict(data=traces, layout=layout)

In [16]:
# Learning curves on the training split: 5-fold CV at ten training-set sizes.
learning_curves = plot_learning_curve(
    pipeline_full,
    X_train,
    y_train,
    scoring=mape_scorer,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(.1, 1.0, 10),
    title='Learning curves',
    xtitle='Training examples',
    ytitle='MAPE (%)',
)

py.iplot(learning_curves, filename='mvc/learning_curves')

# 8. Evaluation

## 8.1. Prediction

In [17]:
# Predictions are in normalized units because the pipeline was fitted on the
# normalized training targets.
y_pred = pipeline_full.predict(X_test)

## 8.2. Denormalize data

In [18]:
# Convert predictions and targets back to the original units with the
# reference vector that normalizer_test stored when it transformed the test
# split. The first returned value (the denormalized X) is not needed here.
_, y_pred_denorm = normalizer_test.inverse_transform(X_test, y_pred)
_, y_test_denorm = normalizer_test.inverse_transform(X_test, y_test)

## 8.3. Evaluate on all muscles

In [19]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score


def regression_report(y_test, y_pred, verbose=True):
    """Compute RMSE, MAPE, R2 and explained variance of the predictions.

    Parameters
    ----------
    y_test : true values.
    y_pred : predicted values.
    verbose : when True, each metric is also printed with `print_results`.

    Returns
    -------
    dict with the keys 'rmse', 'mape', 'r2' and 'variance'.
    """
    report = dict(
        rmse=np.sqrt(mean_squared_error(y_test, y_pred)),
        mape=mape(y_test, y_pred),
        r2=r2_score(y_test, y_pred),
        variance=explained_variance_score(y_test, y_pred))

    if not verbose:
        return report

    headings = (('RMSE', 'rmse'), ('MAPE', 'mape'), ('R2', 'r2'),
                ('variance', 'variance'))
    for heading, key in headings:
        print_results(title=heading, toprint=report[key])
    return report

In [20]:
# 'mean' holds the metrics computed over the whole test split (all muscles).
report = {'mean': regression_report(y_test_denorm, y_pred_denorm)}

RMSE
----------
0.0026836167215905224

MAPE
----------
0.36749478682948844

R2
----------
0.9997355542066814

variance
----------
0.9997355672026668



## 8.4. Evaluate on each muscle

In [21]:
col_muscle = np.in1d(COL_NAMES, REF_COLS['categorical_cols'])
muscle_codes = X_test[:, col_muscle].ravel()

# one silent report per muscle code present in the test split
for imuscle in np.unique(muscle_codes).astype(int):
    subset = muscle_codes == imuscle
    report[imuscle] = regression_report(
        y_test_denorm[subset], y_pred_denorm[subset], verbose=False)

# one row per muscle; the overall 'mean' entry is left out
report_by_muscle = pd.DataFrame(report).T.drop('mean', axis=0)
# rows are relabelled by position, which presumably relies on conf['MUSCLES']
# being listed in the same order as the muscle codes - TODO confirm
report_by_muscle.index = conf['MUSCLES']

In [22]:
def plot_bar_metrics(d, **kwargs):
    """Build a plotly figure dict with MAPE as bars and RMSE on a second y axis.

    Parameters
    ----------
    d : DataFrame with 'mape' and 'rmse' columns; its index labels the x axis.
    **kwargs : optional ``title``, ``xtitle``, ``ytitle`` and ``y2title``
        strings for the layout.

    Returns
    -------
    dict with ``data`` (the bar trace, then the scatter trace) and ``layout``.
    """
    labels = np.array(d.index)

    bars_mape = go.Bar(
        x=labels, y=np.array(d['mape']), name='mape', marker=MARKER_LAYOUT)

    rmse_marker = dict(
        color='rgba(117, 112, 179, 0.6)',
        line=dict(color='rgba(117, 112, 179, 1.0)', width=2))
    # drawn against the secondary (right-hand) y axis
    points_rmse = go.Scatter(
        x=labels,
        y=np.array(d['rmse']),
        name='rmse',
        marker=rmse_marker,
        xaxis='x1',
        yaxis='y2')

    axis_line = dict(showline=True, linewidth=1.5)
    layout = BASE_LAYOUT.copy()
    layout.update(
        dict(
            title=kwargs.get('title'),
            xaxis=dict(title=kwargs.get('xtitle'), **axis_line),
            yaxis=dict(title=kwargs.get('ytitle'), **axis_line),
            yaxis2=dict(
                title=kwargs.get('y2title'),
                overlaying='y',
                side='right',
                showgrid=False,
                zeroline=False,
                **axis_line),
            legend=dict(x=1, y=1.1)))

    return dict(data=[bars_mape, points_rmse], layout=layout)

In [23]:
# MAPE on the left axis, RMSE on the right axis, one entry per muscle.
metrics_by_muscle = plot_bar_metrics(
    d=report_by_muscle,
    title='MAPE and RMSE for each muscle',
    ytitle='MAPE (%)',
    y2title='RMSE (mV)',
)
py.iplot(metrics_by_muscle, filename='mvc/metrics_by_muscle')

# 9. Summary

In [24]:
def table_regression_report(d, **kwargs):
    """Render a DataFrame as a plotly table figure that includes its index.

    Parameters
    ----------
    d : DataFrame to display.
    **kwargs : accepted for call compatibility; currently not used.

    Returns
    -------
    The figure built by ``ff.create_table`` with the layout font size set to 14.
    """
    # `index` is a boolean flag telling create_table to show the index column
    table = ff.create_table(d, index=True)

    table['layout'].update(font=dict(size=14))
    return table

In [25]:
# One row per report entry: the overall 'mean' first, then each muscle.
summary_report = pd.DataFrame(report).T
summary_report.index = ['mean'] + conf['MUSCLES']

summary_rounded = np.round(summary_report, decimals=4)
table_report = table_regression_report(
    summary_rounded,
    title='MAPE and RMSE for each muscle',
    ytitle='MAPE (%)',
    y2title='RMSE (mV)',
)
py.iplot(table_report, filename='mvc/table_report')