In [1]:
%matplotlib inline

# Learning `g`

Once $f:X\rightarrow Y$ is learned, let's learn $g:X_t \rightarrow X_{t+1}$. We are gonna use three models:

1. Multi-task Elastic-Net
2. Nuclear-Norm minimization
3. Multi-layer Perceptron

In [4]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import time
import matplotlib.pyplot as plt
import cPickle as pkl
import warnings
warnings.filterwarnings('ignore')

from fancyimpute import KNN
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import MultiTaskElasticNet
from minimal.estimators import NNMRegressor
from sklearn.neural_network import MLPRegressor

# import distributed.joblib
# from sklearn.externals.joblib import parallel_backend

## 1. Loading training and test data



In [5]:
data_tr = pd.read_csv('../../data/AISM/vvr_dataset/vvr_training_data.csv', header=0, index_col=0)
labels_tr = pd.read_csv('../../data/AISM/vvr_dataset/vvr_training_labels.csv', header=0, index_col=0)
print(data_tr.shape)
print(labels_tr.shape)

data_ts = pd.read_csv('../../data/AISM/vvr_dataset/vvr_test_data.csv', header=0, index_col=0)
labels_ts = pd.read_csv('../../data/AISM/vvr_dataset/vvr_test_labels.csv', header=0, index_col=0)
print(data_ts.shape)
print(labels_ts.shape)

(1946, 145)
(1946, 34)
(714, 145)
(714, 34)


### 1.1 Imputing

In [6]:
# Impute tr data
data_tr = pd.DataFrame(data=KNN(k=3).complete(data_tr.values), index=data_tr.index, columns=data_tr.columns)
labels_tr = pd.DataFrame(data=KNN(k=3).complete(labels_tr.values), index=labels_tr.index, columns=labels_tr.columns)

# Impute ts data
data_ts = pd.DataFrame(data=KNN(k=3).complete(data_ts.values), index=data_ts.index, columns=data_ts.columns)
labels_ts = pd.DataFrame(data=KNN(k=3).complete(labels_ts.values), index=labels_ts.index, columns=labels_ts.columns)

Imputing row 1/1946 with 0 missing, elapsed time: 2.504
Imputing row 101/1946 with 8 missing, elapsed time: 2.521
Imputing row 201/1946 with 2 missing, elapsed time: 2.525
Imputing row 301/1946 with 3 missing, elapsed time: 2.529
Imputing row 401/1946 with 2 missing, elapsed time: 2.534
Imputing row 501/1946 with 0 missing, elapsed time: 2.537
Imputing row 601/1946 with 0 missing, elapsed time: 2.541
Imputing row 701/1946 with 1 missing, elapsed time: 2.544
Imputing row 801/1946 with 0 missing, elapsed time: 2.546
Imputing row 901/1946 with 2 missing, elapsed time: 2.550
Imputing row 1001/1946 with 0 missing, elapsed time: 2.554
Imputing row 1101/1946 with 3 missing, elapsed time: 2.556
Imputing row 1201/1946 with 1 missing, elapsed time: 2.559
Imputing row 1301/1946 with 0 missing, elapsed time: 2.561
Imputing row 1401/1946 with 0 missing, elapsed time: 2.565
Imputing row 1501/1946 with 2 missing, elapsed time: 2.569
Imputing row 1601/1946 with 4 missing, elapsed time: 2.573
Imputing 

In [7]:
pp = MinMaxScaler(feature_range=(0, 1))
X_tr = pp.fit_transform(data_tr.values)
X_ts = pp.transform(data_ts.values)

# Impute labels
pp = MinMaxScaler(feature_range=(0, 1))
Y_tr = pp.fit_transform(labels_tr.values)
Y_ts = pp.transform(labels_ts.values)

In [8]:
# Dumpt the results into a pkl file
with open('__vvrdata.pkl', 'wb') as f:
    pkl.dump({'X_tr': X_tr, 'Y_tr': Y_tr, 'X_ts': X_ts, 'Y_ts': Y_ts},f)

## 2. Define pipeline and model parameters

In [23]:
models = {                         
          'MLP': {'model': MLPRegressor(),
                  'params': {'hidden_layer_sizes': [[2**i] for i in range(10, 16)],
                             'alpha': np.logspace(-5, 1, 10),
                             'early_stopping': [True]}},
           
          'ENET': {'model': MultiTaskElasticNet(),
                   'params': {'l1_ratio': np.linspace(1e-3, 1, 20),
                              'alpha': np.logspace(-3, 2, 20)}},
    
          'NNM': {'model': NNMRegressor(),
                  'params': {'alpha': np.logspace(-5, 1, 20)}},
    }

def modelCV(model):
    return GridSearchCV(models[model]['model'], param_grid=models[model]['params'], verbose=1)

## 3. Models competition

In [25]:
# for model in models:
model = 'NNM'

print('Running {} ...'.format(model))
pipe = modelCV(model)

pipe.fit(X_tr, Y_tr)

# Save it
with open(model+'VVR_estimator.pkl', 'wb') as dd:
    pkl.dump(pipe, dd)

# Measure scores
Y_pred = pipe.predict(X_ts)
_err = metrics.mean_absolute_error(Y_ts, Y_pred)
print('Mean abs err: {:2.3f}'.format(_err))

Running NNM ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.5min finished


Mean abs err: 0.101


In [26]:
# Score containers
errors = {}

for model in models:
    with open(model+'VVR_estimator.pkl', 'rb') as f:
        mdl = pkl.load(f)
    
    # Measure scores
    Y_pred = mdl.predict(X_ts)
    errors[model] = metrics.mean_absolute_error(Y_ts, Y_pred)
    print('[{}] Mean abs err: {:2.3f}'.format(model, errors[model]))

[MLP] Mean abs err: 0.107
[NNM] Mean abs err: 0.101
[ENET] Mean abs err: 0.099
