In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from IPython import display

import os
import sys
import imp
import time
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

import torchvision as tv
import lib.pytorch_trainer as ptt

use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)
print('torch', torch.version.__version__)
print('Python', sys.version)


GPU available: False
torch 0.2.0_4
Python 3.6.0 |Anaconda custom (x86_64)| (default, Dec 23 2016, 13:19:00) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


In [2]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit
import scipy.stats as st


In [3]:
# MNIST Dataset
# =============
use_test_dataset = False
n_samples = 6000
n_splits = 6

def torch_datasets_to_sklearn_cv_data(train_ds, valid_ds):
    n_train, n_valid = len(train_ds), len(valid_ds)
    x, y = train_ds[0]
    all_tuples = list(train_ds) + list(valid_ds)
    all_labels = np.array([y for _, y in all_tuples], np.int)
    all_data = torch.cat([w.view(1, *x.shape) for w, _ in all_tuples], 0).numpy()
    valid_fold = np.zeros_like(all_labels)
    valid_fold[:n_train] = -1
    psplit = PredefinedSplit(valid_fold)
    return all_data, all_labels, psplit

train_ds = tv.datasets.MNIST('/data/datasets/MNIST/', train=True, 
                             transform=tv.transforms.ToTensor(), 
                             target_transform=None, 
                             download=True)

test_ds = tv.datasets.MNIST('/data/datasets/MNIST/', train=False, 
                             transform=tv.transforms.ToTensor(), 
                             target_transform=None, 
                             download=True)

if use_test_dataset:
    # using the test dataset as a fixed validation set (only one split)
    all_data, all_labels, psplit = torch_datasets_to_sklearn_cv_data(train_ds, valid_ds)
    
else:
    all_labels = np.array([y for _, y in list(train_ds)], np.int)[:n_samples]
    all_data = torch.cat([w.view(1, 1, 28, 28) for w, _ in list(train_ds)], 0).numpy()[:n_samples]
    psplit = n_splits
    
print(all_data.shape, all_data.min(), all_data.max(), '***', 
      all_labels.shape, all_labels.min(), all_labels.max())

(6000, 1, 28, 28) 0.0 1.0 *** (6000,) 0 9


In [4]:
class MyNet(nn.Module):
    # Input size is (-1, 1, 28, 28)
    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1 = nn.Conv2d(1, 6, 5)         # in_channels, out_channels, kernel_size
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool(x)
        x = x.view(-1, 16 * 4 * 4)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

    
class SklEstimator(BaseEstimator):
    
    skl_id = 0
    fit_num = 0
    
    def __init__(self, model_class=None, criterion_class='CrossEntropyLoss', optim_class='SGD', 
                 optim_lr=0.001, optim_momentum=0.9, weight_decay=0, 
                 sched_step=10, sched_gamma=1.0, mb_size=16, n_epochs=100, verbose=True):
        self.par_model_class = model_class
        self.par_criterion_class = criterion_class
        self.par_optim_class = optim_class
        self.par_optim_lr = optim_lr
        self.par_optim_momentum = optim_momentum
        self.par_weight_decay = weight_decay
        self.par_sched_step = sched_step
        self.par_sched_gamma = sched_gamma
        self.par_mb_size = mb_size
        self.par_n_epochs = n_epochs
        self.par_verbose = verbose
        
    def _initialize(self):
        SklEstimator.skl_id += 1
        self.idd = 'skl_model_{}'.format(SklEstimator.skl_id)
        
        if self.par_model_class is None:
            raise Exception('Model not specified.')
        
        self.model = self.par_model_class()
        
        if self.par_criterion_class == 'CrossEntropyLoss':
            self.criterion = nn.CrossEntropyLoss()
        elif self.par_criterion_class == 'MSELoss':
            self.criterion = nn.MSELoss()
        else:
            self.criterion = None
            raise Exception("Calma...")
            
        if self.par_optim_class == 'Adam':
            self.optim = torch.optim.Adam(self.model.parameters(), lr=self.par_optim_lr, 
                                          weight_decay=self.par_weight_decay)
        elif self.par_optim_class == 'SGD':
            self.optim = torch.optim.SGD(self.model.parameters(), lr=self.par_optim_lr, 
                                         momentum=self.par_optim_momentum, nesterov=True,
                                         weight_decay=self.par_weight_decay)
        else:
            self.optim = None
            raise Exception("Calma...")
            
        if self.par_sched_gamma < 1.0:
            self.lr_sched = StepLR(self.optim, step_size=self.par_sched_step, gamma=self.par_sched_gamma)
        else:
            self.lr_sched = None
        
        if self.par_verbose > 0:
            callbacks = [ptt.PrintCallback()]
        else:
            callbacks = None
        
        self.trainer = ptt.DeepNetTrainer(model=self.model, 
                                          criterion=self.criterion, 
                                          optimizer=self.optim, 
                                          lr_scheduler=self.lr_sched, 
                                          callbacks=callbacks)
    
    def get_params(self, deep):
        params = []
        for k, v in self.__dict__.items():
            if k.startswith('par_'):
                params.append((k[4:], v))
        return dict(params)
    
    def set_params(self, **params):
        for k, v in params.items():
            setattr(self, 'par_' + k, v)
        self._initialize()
        return self
    
    def fit(self, Xtrain, ytrain):
        SklEstimator.fit_num += 1
        print('\n***** Fit #{} *****'.format(SklEstimator.fit_num))
        Xtra = torch.from_numpy(Xtrain)
        ytra = torch.from_numpy(ytrain)
        self.trainer.fit(self.par_n_epochs, Xtra, ytra, batch_size=self.par_mb_size, shuffle=True)
    
    def score(self, Xtrain, ytrain):
        Xtra = torch.from_numpy(Xtrain)
        ytra = torch.from_numpy(ytrain)
        loss = self.trainer.score(Xtra, ytra, batch_size=self.par_mb_size)
        print('***** Score = {:.5f} [{} samples] *****'.format(loss, ytra.shape[0]))
        return loss


In [5]:
skl = SklEstimator(MyNet)
skl.get_params(0)

{'criterion_class': 'CrossEntropyLoss',
 'mb_size': 16,
 'model_class': __main__.MyNet,
 'n_epochs': 100,
 'optim_class': 'SGD',
 'optim_lr': 0.001,
 'optim_momentum': 0.9,
 'sched_gamma': 1.0,
 'sched_step': 10,
 'verbose': True,
 'weight_decay': 0}

In [6]:
# st.uniform(loc, scale)  ==> This distribution is constant between loc and loc + scale.

parameters = {
    'model_class':     [MyNet],
    'optim_class':     ['Adam', 'SGD'], 
    'optim_lr':        st.uniform(1e-4, 5e-3),    # 0.0001:0.0051
    'sched_step':      [10], 
    'sched_gamma':     st.uniform(0.50, 0.45),    # 0.5:0.95
    'mb_size':         [50, 100],
    'n_epochs':        [10],
}
validator = RandomizedSearchCV(SklEstimator(verbose=0), 
                               param_distributions=parameters, 
                               cv=psplit,
                               n_iter=3, 
                               verbose=1)


In [7]:
try:
    validator.fit(all_data, all_labels)

except KeyboardInterrupt:
    print('Interrupted!')

Fitting 6 folds for each of 3 candidates, totalling 18 fits

***** Fit #1 *****
***** Score = -2.29325 [1000 samples] *****
***** Score = -2.29310 [5000 samples] *****

***** Fit #2 *****
***** Score = -2.28175 [1000 samples] *****
***** Score = -2.28125 [5000 samples] *****

***** Fit #3 *****
***** Score = -2.27691 [1000 samples] *****
***** Score = -2.27611 [5000 samples] *****

***** Fit #4 *****
***** Score = -2.29336 [1000 samples] *****
***** Score = -2.29033 [5000 samples] *****

***** Fit #5 *****
***** Score = -2.28387 [1000 samples] *****
***** Score = -2.28515 [5000 samples] *****

***** Fit #6 *****
***** Score = -2.28599 [1000 samples] *****
***** Score = -2.28813 [5000 samples] *****

***** Fit #7 *****
***** Score = -0.40038 [1000 samples] *****
***** Score = -0.32466 [5000 samples] *****

***** Fit #8 *****
***** Score = -0.47856 [1000 samples] *****
***** Score = -0.42569 [5000 samples] *****

***** Fit #9 *****
***** Score = -0.57728 [1000 samples] *****
***** Score 

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  5.1min finished


In [8]:
validator.best_params_

{'mb_size': 50,
 'model_class': __main__.MyNet,
 'n_epochs': 10,
 'optim_class': 'SGD',
 'optim_lr': 0.0029121376750100353,
 'sched_gamma': 0.86446472221909865,
 'sched_step': 10}

In [9]:
validator.best_index_, validator.best_score_

(2, -0.2553615566963951)

In [10]:
validator.best_estimator_.score(all_data, all_labels)

***** Score = -0.18972 [6000 samples] *****


-0.18972252715999882

In [11]:
validator.cv_results_

{'mean_fit_time': array([ 15.75113988,  16.45204739,  16.95270749]),
 'mean_score_time': array([ 0.14573598,  0.19698481,  0.13030883]),
 'mean_test_score': array([-2.28585592, -0.48102179, -0.25536156]),
 'mean_train_score': array([-2.28567681, -0.45009571, -0.2050667 ]),
 'param_mb_size': masked_array(data = [100 100 50],
              mask = [False False False],
        fill_value = ?),
 'param_model_class': masked_array(data = [<class '__main__.MyNet'> <class '__main__.MyNet'> <class '__main__.MyNet'>],
              mask = [False False False],
        fill_value = ?),
 'param_n_epochs': masked_array(data = [10 10 10],
              mask = [False False False],
        fill_value = ?),
 'param_optim_class': masked_array(data = ['SGD' 'SGD' 'SGD'],
              mask = [False False False],
        fill_value = ?),
 'param_optim_lr': masked_array(data = [0.00091694046720705471 0.0037516121906576104 0.0029121376750100353],
              mask = [False False False],
        fill_value = 