In [3]:
import numpy as np
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
import time
import random
import os
import datetime

from matplotlib import pyplot as plt

import torch, torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable
from torch import autograd

from paths import (path_to_save_remote, 
                   path_to_save_local,
                   port_to_remote)

from params_5d_gaussians import (random_seed,
                                 batch_size,
                                 num_samples_in_cluster,
                                 dim,
                                 num_gaussian_per_dim,
                                 coord_limits,
                                 sigma, 
                                 train_dataset_size,
                                 n_dim,
                                 n_layers_d,
                                 n_layers_g,
                                 n_hid_d,
                                 n_hid_g,
                                 n_out,
                                 normalize_to_0_1,
                                 loss_type,
                                 lr_init,
                                 betas,
                                 use_gradient_penalty,
                                 Lambda,
                                 num_epochs,
                                 num_epoch_for_save,
                                 batch_size_sample,
                                 k_g,
                                 k_d,
                                 mode,
                                 proj_list,
                                 n_calib_pts,
                                 plot_mhgan,
                                 device)
from utils import (prepare_gaussians, 
                   prepare_train_batches,
                   prepare_dataloader, 
                   logging)

from gan_fc_models import (Generator_fc, 
                           Discriminator_fc, 
                           weights_init_1, 
                           weights_init_2)

from gan_train import train_gan

In [4]:
# Build the training set from the settings imported from params_5d_gaussians.
X_train = prepare_gaussians(num_samples_in_cluster = num_samples_in_cluster, 
                            dim = dim, 
                            num_gaussian_per_dim = num_gaussian_per_dim, 
                            coord_limits = coord_limits, 
                            sigma = sigma,
                            random_seed = random_seed)
# No scaler is used here; later cells branch on `scaler is not None`.
scaler = None 
train_dataloader = prepare_dataloader(X_train, batch_size, 
                                      random_seed=random_seed)

# Instantiate generator and discriminator and move both to `device`.
G = Generator_fc(n_dim=n_dim, 
                 n_layers=n_layers_g,
                 n_hid=n_hid_g,
                 n_out=n_out,
                 non_linear=nn.ReLU(),
                 device=device).to(device)
# NOTE(review): D is built with n_in=n_dim (the size G is given as `n_dim`),
# while later cells feed D the output of G, which is sized by `n_out`.
# Presumably n_dim == n_out in params_5d_gaussians -- TODO confirm.
D = Discriminator_fc(n_in=n_dim, 
                     n_layers=n_layers_d,
                     n_hid=n_hid_d,
                     non_linear=nn.ReLU(),
                     device=device).to(device)
# Both networks are initialised with weights_init_2 and the same random_seed.
G.init_weights(weights_init_2, random_seed=random_seed)
D.init_weights(weights_init_2, random_seed=random_seed)

In [5]:
X_train.shape

(243000, 5)

In [6]:
from mh_2d_sampling import mh_sampling

# Feed the sampler the scaled training set when a scaler is configured,
# otherwise the training set as-is.
X_train_scale = X_train if scaler is None else scaler.transform(X_train)

print("Start to do MH sampling....")
type_calibrator = 'iso'
X_mh = mh_sampling(
    X_train_scale, G, D, G.device, n_calib_pts,
    batch_size_sample=batch_size_sample,
    normalize_to_0_1=normalize_to_0_1,
    type_calibrator=type_calibrator,
)

Start to do MH sampling....


AssertionError: 

In [11]:
import mh
import classification as cl
import numpy as np
import pandas as pd
import os
import torch
from scipy.special import expit


def discriminator_analysis(scores_fake_df, scores_real_df, ref_method,
                           calib_dict,
                           dump_fname=None,
                           label='label'):
    '''
    Merge fake and real discriminator scores into one labelled data set and
    run the calibrators on it.

    scores_fake_df : DataFrame, shape (n, n_discriminators)
        passed to `cl.combine_class_df` as the negative class
    scores_real_df : DataFrame, shape (n, n_discriminators)
        passed to `cl.combine_class_df` as the positive class
    ref_method : (str, str)
        flattened with `cl.flat`; the flattened value is not used afterwards
    calib_dict : forwarded to `cl.calibrate_pred_df` as `calibrators`
    dump_fname : str or None
        when given, the predictions plus a label column are written there as csv
    label : str
        name of the label column in the csv dump

    Returns (pred_df, clf_df): the prediction frame with flattened column
    names and the third value returned by `cl.calibrate_pred_df`.
    '''
    combined_df, labels = cl.combine_class_df(pos_class_df=scores_real_df,
                                              neg_class_df=scores_fake_df)
    calibrated_df, labels, clf_df = cl.calibrate_pred_df(
        combined_df, labels, calibrators=calib_dict)

    # Flat names keep the frame compatible with the benchmark tools.
    calibrated_df.columns = cl.flat_cols(calibrated_df.columns)
    ref_method = cl.flat(ref_method)

    # The calibration diagnostic and the per-discriminator performance
    # summary that used to follow are disabled in this version.

    if dump_fname is None:
        return calibrated_df, clf_df

    # Keep a csv copy of the predictions for later analysis.
    dump_df = pd.DataFrame(calibrated_df, copy=True)
    dump_df[label] = labels
    dump_df.to_csv(dump_fname, header=True, index=False)
    return calibrated_df, clf_df


In [12]:
def validate_scores(scores):
    '''
    Check a dict of discriminator scores and convert it to a DataFrame.

    scores : dict of str -> ndarray of shape (batch_size,)
        every array must be 1-D, floating point, with all values in [0, 1]

    Returns a DataFrame with one column per key of `scores`.
    Raises AssertionError naming the offending key and the failed check.
    '''
    assert isinstance(scores, dict), \
        'scores must be a dict, got %s' % type(scores).__name__
    for name, sv in scores.items():
        assert isinstance(sv, np.ndarray), \
            'scores[%r] must be a numpy array, got %s' % (name, type(sv).__name__)
        assert sv.dtype.kind == 'f', \
            'scores[%r] must be floating point, got dtype %s' % (name, sv.dtype)
        assert sv.ndim == 1, \
            'scores[%r] must be 1-D, got shape %r' % (name, sv.shape)
        # The message is only built on failure, so min/max on an empty array
        # (which trivially passes the range check) is never evaluated.
        assert np.all(0 <= sv) and np.all(sv <= 1), (
            'scores[%r] must lie in [0, 1], got min=%s max=%s '
            '(apply a sigmoid first if these are raw logits)'
            % (name, sv.min(), sv.max()))
    return pd.DataFrame(scores)

def validate_X(X):
    '''
    Check a batch of samples and return it unchanged.

    X : ndarray, shape (batch_size, dim)
        must be floating point and contain only finite values

    Raises AssertionError if X is not a float ndarray or holds NaN/inf, and
    ValueError if X is not 2-D.
    '''
    assert isinstance(X, np.ndarray), \
        'X must be a numpy array, got %s' % type(X).__name__
    assert X.dtype.kind == 'f', \
        'X must be floating point, got dtype %s' % X.dtype
    # A wrong rank used to surface as a tuple-unpacking ValueError; keep the
    # exception type but say what is actually wrong.
    if X.ndim != 2:
        raise ValueError('X must be 2-D (batch_size, dim), got shape %r'
                         % (X.shape,))
    assert np.all(np.isfinite(X)), 'X contains non-finite values (NaN or inf)'
    return X

def validate(R):
    '''
    Validate one (X, scores) pair as returned by a generate-and-score callable.

    R : tuple (X, scores)
        X : ndarray, shape (batch_size, dim), checked by `validate_X`
        scores : dict of str -> ndarray of shape (batch_size,), checked and
            converted to a DataFrame by `validate_scores`

    Returns (X, scores) where scores is now a DataFrame with one row per
    sample. Raises AssertionError if the two do not have the same length.
    '''
    X, scores = R
    X = validate_X(X)
    scores = validate_scores(scores)
    assert len(X) == len(scores), \
        'X has %d rows but scores has %d' % (len(X), len(scores))
    return X, scores

def batched_gen_and_disc(gen_and_disc, n_batches, batch_size):
    '''
    Collect `n_batches` validated batches from `gen_and_disc` and stack them.
    Pytorch might run out of memory if we asked for n_batches*batch_size
    samples in a single call.

    gen_and_disc : callable, batch_size -> (X, scores) compliant with `validate`
    n_batches : int
        number of calls to make; must be at least 1
    batch_size : int
        passed unchanged to every `gen_and_disc` call

    Returns
    X : ndarray, the batches concatenated along axis 0
    scores : DataFrame, the batches concatenated row-wise with a fresh index

    Raises ValueError if n_batches < 1.
    '''
    # With zero batches there is nothing to concatenate; say so explicitly
    # instead of failing later on an empty unpack.
    if n_batches < 1:
        raise ValueError('n_batches must be at least 1, got %r' % (n_batches,))

    batches = [validate(gen_and_disc(batch_size)) for _ in range(n_batches)]
    X = np.concatenate([x for x, _ in batches], axis=0)
    scores = pd.concat([s for _, s in batches], axis=0, ignore_index=True)
    return X, scores

def enhance_samples(scores_df, scores_max, scores_real_df, clf_df,
                    pickers):
    '''
    Return the selected sample index (among a batch of n samples) for each
    picker, for every (discriminator, calibrator) pair.

    scores_df : DataFrame, shape (n, n_discriminators)
        scores of the candidate samples
    scores_max : indexable by discriminator name
        run through the calibrator and handed to each picker as `score_max`
    scores_real_df : DataFrame, shape (m, n_discriminators)
        one randomly chosen row supplies the starting score of every chain
    clf_df : Series, shape (n_classifiers x n_calibrators,)
        looked up as clf_df[(disc_name, calib_name)]; each entry must offer
        a `predict` method
    pickers : dict of str -> callable
        called as picker(scores, score_max=...) and must return (index, alpha)

    Returns (picked, cap_out, alpha), three DataFrames indexed by picker name
    with one column per entry of clf_df.index:
        picked : chosen index into the rows of scores_df
        cap_out : True where the picker returned position 0 (the real
            starting score) and was re-run on the candidate scores only
        alpha : second value returned by the picker (from the re-run if any)
    '''
    assert len(scores_df.columns.names) == 1
    assert list(scores_df.columns) == list(scores_real_df.columns)

    # One real row, shared by all discriminators/calibrators/pickers below.
    init_idx = np.random.choice(len(scores_real_df))

    # Result tables, pre-filled with defaults and overwritten in the loop.
    picked = pd.DataFrame(data=0, index=pickers.keys(), columns=clf_df.index,
                          dtype=int)
    cap_out = pd.DataFrame(data=False,
                           index=pickers.keys(), columns=clf_df.index,
                           dtype=bool)
    alpha = pd.DataFrame(data=np.nan,
                         index=pickers.keys(), columns=clf_df.index,
                         dtype=float)
    for disc_name in sorted(scores_df.columns):
        assert isinstance(disc_name, str)
        s0 = scores_real_df[disc_name].values[init_idx]
        assert np.ndim(s0) == 0
        for calib_name in sorted(clf_df[disc_name].index):
            assert isinstance(calib_name, str)
            #print(f"calibrator name = {calib_name}, discriminator name = {disc_name}")
            calibrator = clf_df[(disc_name, calib_name)]
            # Position 0 is the real starting score, positions 1.. are the
            # candidate scores; all are calibrated together.
            s_ = np.concatenate(([s0], scores_df[disc_name].values))
            s_ = calibrator.predict(s_)
            s_max, = calibrator.predict(np.array([scores_max[disc_name]]))
            for picker_name in sorted(pickers.keys()):
                assert isinstance(picker_name, str)
                #print(f"picker name = {picker_name}")
                idx, aa = pickers[picker_name](s_, score_max=s_max)

                if idx == 0:
                    # Try again but init from first fake
                    cap_out.loc[picker_name, (disc_name, calib_name)] = True
                    idx, aa = pickers[picker_name](s_[1:], score_max=s_max)
                else:
                    # Shift back to an index into scores_df (position 0 was
                    # the real score).
                    idx = idx - 1
                assert idx >= 0

                picked.loc[picker_name, (disc_name, calib_name)] = idx
                alpha.loc[picker_name, (disc_name, calib_name)] = aa
    
    return picked, cap_out, alpha

def enhance_samples_series(g_d_f, scores_real_df, clf_df,
                           pickers, n_samples=16,
                           batch_size = 16,
                           chain_batches = 10,
                           max_est_batches = 156):
    '''
    Call enhance_samples `n_samples` times to build up a batch of selected
    samples. The used samples X are stored separately from the indices
    selected by each method, which saves memory when several methods pick
    the same sample.

    g_d_f : callable returning (X, scores) compliant with `validate`
    scores_real_df, clf_df, pickers : forwarded unchanged to `enhance_samples`
    n_samples : int, > 0
        number of enhance_samples rounds, i.e. rows of the returned frames
    batch_size : int
        batch size used for every call of `g_d_f`
    chain_batches : int
        number of batches generated per round for the pickers
    max_est_batches : int
        number of batches generated once up front to estimate the maximum
        score per discriminator (the "M" of the DRS pilot samples)

    Returns
    X : ndarray of the samples picked by at least one method
    picked : DataFrame, shape (n_samples, n_methods), indices into X
    cap_out, alpha : DataFrames of the same shape, stacked from the
        per-round outputs of `enhance_samples`
    '''
    assert n_samples > 0, 'n_samples must be positive'

    # The progress bar is optional: fall back to plain iteration when tqdm
    # is not installed instead of failing on an undefined name.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    _, scores_max = batched_gen_and_disc(g_d_f, max_est_batches, batch_size)
    scores_max = scores_max.max(axis=0)

    X = []
    picked = [None] * n_samples
    cap_out = [None] * n_samples
    alpha = [None] * n_samples
    for round_idx in tqdm(range(n_samples)):
        X_, scores_fake_df = \
            batched_gen_and_disc(g_d_f, chain_batches, batch_size)
        picked_, cc, aa = \
            enhance_samples(scores_fake_df, scores_max, scores_real_df, clf_df,
                            pickers=pickers)
        picked_ = picked_.unstack()  # Convert to series
        # Only the used samples are kept, so the picked indices are remapped
        # onto the compact list of unique used rows.
        assert np.ndim(picked_.values) == 1
        used_idx, idx_new = np.unique(picked_.values, return_inverse=True)
        picked_ = pd.Series(data=idx_new, index=picked_.index)

        # Offset by the number of samples already stored so the indices
        # point into the growing list X.
        picked[round_idx] = len(X) + picked_
        X.extend(X_[used_idx])  # Unravel first index to list

        cap_out[round_idx] = cc.unstack()
        alpha[round_idx] = aa.unstack()

    X = np.asarray(X)
    picked = pd.concat(picked, axis=1).T
    assert picked.shape == (n_samples, len(picked_))
    cap_out = pd.concat(cap_out, axis=1).T
    assert cap_out.shape == (n_samples, len(picked_))
    alpha = pd.concat(alpha, axis=1).T
    assert alpha.shape == (n_samples, len(picked_))
    return X, picked, cap_out, alpha

In [13]:
@torch.no_grad()
def mh_sampling(X_train, G, D, device, n_calib_pts, 
                batch_size_sample,
                normalize_to_0_1=True,
                type_calibrator='iso'):
    '''
    Draw samples from G and select among them with the Metropolis-Hastings
    picker `mh.mh_sample`, using calibrated scores of D.

    X_train : ndarray, shape (n, dim)
        real data; `n_calib_pts` rows are drawn from it for calibration
    G : callable
        generator; called on standard-normal noise of shape
        (batch, X_train.shape[1]), i.e. the noise size is taken from the
        data width
    D : callable
        discriminator; column 0 of its output is used as the score
    device : torch device the noise and calibration data are placed on
    n_calib_pts : int
        number of real rows drawn by `np.random.choice` (default settings,
        so with replacement); must be at least `batch_size_sample`
    batch_size_sample : int
        batch size for the fake calibration scores and also the number of
        selection rounds passed to `enhance_samples_series`
    normalize_to_0_1 : bool
        apply `expit` to the outputs of D; the scores are checked by
        `validate_scores`, which requires values in [0, 1]
    type_calibrator : str
        one of 'iso', 'raw', 'linear', 'beta1', 'beta2'

    Returns X : ndarray of the selected samples.
    Raises TypeError for an unknown `type_calibrator` and ValueError when
    n_calib_pts is smaller than batch_size_sample.
    '''
    # Resolve the calibrator first so a bad name fails before any model runs.
    calibrator_attrs = {'iso': 'Isotonic',
                        'raw': 'Identity',
                        'linear': 'Linear',
                        'beta1': 'Beta1',
                        'beta2': 'Beta2'}
    if type_calibrator not in calibrator_attrs:
        raise TypeError('Unknown calibrator type')
    calib_dict = {type_calibrator: getattr(cl, calibrator_attrs[type_calibrator])}

    BASE_D = 'base'

    def disc_scores(x):
        # Score a batch with D, optionally squashing the outputs with expit.
        raw = D(x).detach().cpu().numpy()[:, 0]
        return expit(raw) if normalize_to_0_1 else raw

    # Scores of real data, used to train the calibrator.
    calib_ids = np.random.choice(np.arange(X_train.shape[0]), n_calib_pts)
    real_calib_data = torch.FloatTensor(X_train[calib_ids]).to(device)
    scores_real = {BASE_D: disc_scores(real_calib_data)}
    scores_real_df = validate_scores(scores_real)

    # Generate whole batches only; a remainder of real scores is left unmatched.
    n_real_batches = len(scores_real[BASE_D]) // batch_size_sample
    if n_real_batches < 1:
        raise ValueError('n_calib_pts (%d) must be at least batch_size_sample (%d)'
                         % (len(scores_real[BASE_D]), batch_size_sample))

    n_dim = X_train.shape[1]

    def gen_disc_f(batch_size_fixed_):
        # Generate one batch and score it; returns (samples, scores dict).
        noise = torch.randn(batch_size_fixed_, n_dim, device=device)
        x = G(noise).detach()
        scores = {BASE_D: disc_scores(x)}
        return x.cpu().numpy(), scores

    # Scores of generated data, the other class for the calibrator.
    _, scores_fake_df = batched_gen_and_disc(gen_disc_f, n_real_batches,
                                             batch_size_sample)
    ref_method = (BASE_D, 'raw')
    _, clf_df = discriminator_analysis(scores_fake_df, scores_real_df,
                                       ref_method, calib_dict=calib_dict)

    pickers = {'MH': mh.mh_sample}
    X, _, _, _ = enhance_samples_series(gen_disc_f,
                                        scores_real_df,
                                        clf_df,
                                        pickers,
                                        n_samples=batch_size_sample)
    return X

In [14]:
# Feed the sampler the scaled training set when a scaler is configured,
# otherwise the training set as-is.
X_train_scale = X_train if scaler is None else scaler.transform(X_train)

print("Start to do MH sampling....")
type_calibrator = 'iso'
X_mh = mh_sampling(
    X_train_scale, G, D, G.device, n_calib_pts,
    batch_size_sample=batch_size_sample,
    normalize_to_0_1=normalize_to_0_1,
    type_calibrator=type_calibrator,
)

Start to do MH sampling....


AssertionError: 

In [15]:
# Step-by-step replay of the first half of mh_sampling at notebook scope.
# Draw n_calib_pts row indices of X_train and wrap those rows in a tensor.
calib_ids = np.random.choice(np.arange(X_train.shape[0]), n_calib_pts)
real_calib_data = [torch.FloatTensor(X_train[calib_ids])]

BASE_D = 'base'
scores_real = {}
# Column 0 of the discriminator output is taken as the score of each row.
scores_real[BASE_D] = np.concatenate([D(data.to(device)).detach().cpu().numpy()[:, 0] for data in real_calib_data])
# NOTE(review): expit is applied unconditionally here, whereas mh_sampling
# and gen_disc_f only apply it when normalize_to_0_1 is set -- confirm that
# real and fake scores end up on the same scale.
scores_real[BASE_D] = expit(scores_real[BASE_D])
scores_real_df = validate_scores(scores_real)
# Number of full batches of size batch_size_sample; `rem` is not used below.
n_real_batches, rem = divmod(len(scores_real[BASE_D]), batch_size_sample)

# NOTE(review): this rebinds the notebook-level `n_dim` that was imported
# from params_5d_gaussians to the width of X_train.
n_dim = X_train.shape[1]


def gen_disc_f(batch_size_fixed_):
    '''
    Generate one batch with G and score it with D.

    batch_size_fixed_ : int, number of noise rows to draw

    Returns (x, scores): the generated batch as an ndarray and a dict mapping
    BASE_D to column 0 of the discriminator output, passed through expit
    when normalize_to_0_1 is set.
    '''
    latent = torch.randn(batch_size_fixed_, n_dim, device=device)
    generated = G(latent).detach()

    base_scores = D(generated).detach().cpu().numpy()[:, 0]
    if normalize_to_0_1:
        base_scores = expit(base_scores)

    return generated.cpu().numpy(), {BASE_D: base_scores}

# Generate n_real_batches batches of fake samples and keep only their scores.
_, scores_fake_df = batched_gen_and_disc(gen_disc_f, n_real_batches, batch_size_sample)
ref_method = (BASE_D, 'raw')
# `incep_ref` is not used by any cell shown here.
incep_ref = BASE_D + '_iso_base'
#score_fname = os.path.join(outf, '%d_scores.csv' % epoch)
# Same calibrator choice as type_calibrator='iso' in mh_sampling.
calib_dict = {'iso': cl.Isotonic}

In [16]:
scores_fake_df

Unnamed: 0,base
0,0.546832
1,0.560116
2,0.577192
3,0.553612
4,0.531129
...,...
19995,0.625313
19996,0.550318
19997,0.502336
19998,0.514151


In [17]:
scores_real_df

Unnamed: 0,base
0,0.553219
1,0.594622
2,0.557208
3,0.653977
4,0.595332
...,...
24295,0.548325
24296,0.591191
24297,0.607150
24298,0.324111
