In [1]:
import sys
assert sys.version_info >= (3, 5)

In [2]:
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

import numpy as np
import time

In [3]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

#check os.environ ld_library_path is the same here as when I do it in python via terminal, if I get issues

#sometimes I can't select the GPU. In this case, try: https://forums.fast.ai/t/tip-limiting-tensorflow-to-one-gpu/1995

In [16]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
tf.__version__


'2.1.0'

In [4]:
K = keras.backend


In [5]:
mnist = keras.datasets.mnist
(X_train_full, y_train_full), (X_test,y_test) = mnist.load_data()

X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0

y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

X_test = X_test / 255.0

#1000 labelled points in train set for this example

X_train_la = X_train[:1000]
X_train_un = X_train[1000:]

y_train_la = y_train[:1000]
y_train_un = np.empty((X_train_la.shape[0]))


In [6]:
y_train_la = y_train_la.reshape(-1,1)

In [7]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
y_train_la = encoder.fit_transform(y_train_la)

In [8]:
y_train_la=y_train_la.toarray()

In [9]:
y_valid = y_valid.reshape(-1,1)
y_valid = encoder.transform(y_valid)
y_valid = y_valid.toarray()

# Encoder #

In [10]:
tf.random.set_seed(42)
np.random.seed(42)

#reparameterization trick
class Sampling(keras.layers.Layer):
    def call(self, inputs):
        mean, log_var = inputs
        return K.random_normal(tf.shape(log_var)) * K.exp(log_var/2) + mean

codings_size = 50

x_in = keras.layers.Input(shape=[28, 28])
f = keras.layers.Flatten()(x_in)
z = keras.layers.Dense(600, activation="softplus")(f)
z = keras.layers.Dense(300, activation="softplus")(z)

codings_mean = keras.layers.Dense(codings_size)(z)
codings_log_var = keras.layers.Dense(codings_size)(z)
codings = Sampling()([codings_mean, codings_log_var])

 

variational_encoder = keras.models.Model(
    inputs=[x_in], outputs=[codings_mean, codings_log_var, codings])

## Classifier ##

In [11]:

y_classifier = keras.layers.Dense(300, activation="selu")(f)
y_classifier = keras.layers.Dense(100, activation="selu")(y_classifier)
y_pred = keras.layers.Dense(10,activation="softmax")(y_classifier) 

classifier = keras.models.Model(
    inputs=[x_in], outputs=[y_pred])

# Decoder #

In [12]:

latent = keras.layers.Input(shape=[codings_size])
y = keras.layers.Input(shape=[10])

l_merged = keras.layers.concatenate([latent,y])
x = keras.layers.Dense(300, activation="softplus")(l_merged)
x = keras.layers.Dense(600, activation="softplus")(x)
x = keras.layers.Dense(28 * 28, activation="sigmoid")(x)
x_out = keras.layers.Reshape([28,28])(x)


variational_decoder = keras.models.Model(inputs=[latent,y], outputs=[x_out])

In [13]:
variational_decoder.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 60)           0           input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 300)          18300       concatenate[0][0]          

# Compile model# 

In [14]:
#labelled vae
_,_, codings = variational_encoder(x_in)
y_pred = classifier(x_in)
reconstructions = variational_decoder([codings,y])
label_vae = keras.models.Model(inputs=(x_in,y), outputs=(reconstructions,y_pred))


In [15]:
#unlabelled vae
_,_, codings = variational_encoder(x_in)
y_pred = classifier(x_in)
reconstructions_un = variational_decoder([codings,y_pred])
unlabel_vae = keras.models.Model(inputs=x_in, outputs=reconstructions_un)

In [18]:
def labelled_loss_reconstruction(codings_log_var,codings_mean):
    def loss_functions_labelled(x, x_decoded_mean):
        x = K.reshape(x,[-1,28*28])
        x_decoded_mean = K.reshape(x_decoded_mean,[-1,28*28])
        xent_loss = 28*28*keras.losses.binary_crossentropy(x, x_decoded_mean)        
        kl_loss = - 0.5 * K.sum(1 + codings_log_var - K.square(codings_mean) - K.exp(codings_log_var), axis=-1)
        return K.mean(xent_loss + kl_loss)
    return loss_functions_labelled 

def unlabelled_loss_reconstruction(codings_log_var,codings_mean,y_pred):
    def loss_functions_unlabelled(x,x_decoded_mean):
        x = K.reshape(x,[-1,28*28])
        x_decoded_mean = K.reshape(x_decoded_mean,[-1,28*28])
        kl_loss = - 0.5 * K.sum(1 + codings_log_var - K.square(codings_mean) - K.exp(codings_log_var), axis=-1)
        xent_loss = 28*28*keras.losses.binary_crossentropy(x, x_decoded_mean)        
        entropy = keras.losses.categorical_crossentropy(y_pred,y_pred)
        loss = K.mean(kl_loss + xent_loss)
        #need to check below. We are summing over y, but we are assuming that the loss term is independent of y
        #which is not the case. How to do it though? https://github.com/bjlkeng/sandbox/issues/3
        #and how to do it for regression?
        return K.mean(K.sum(y_pred*loss,axis=-1)) + K.mean(entropy)
    return loss_functions_unlabelled

def labelled_cls_loss(y, y_pred,N=1000):
    alpha = 0.1*N
    cat_xent_loss = keras.losses.categorical_crossentropy(y, y_pred)
    return alpha*K.mean(cat_xent_loss)


In [19]:
label_vae.compile(loss=[labelled_loss_reconstruction(codings_log_var,codings_mean)
                        ,labelled_cls_loss], optimizer="rmsprop", experimental_run_tf_function=False)

In [20]:
unlabel_vae.compile(loss=unlabelled_loss_reconstruction(codings_log_var,codings_mean,y_pred),
                    optimizer="rmsprop", experimental_run_tf_function=False)

In [48]:
#for debugging

history = label_vae.fit(
    [X_train_la,y_train_la], [X_train_la,y_train_la], epochs=2, #batch_size=128,
    validation_data=([X_valid,y_valid], [X_valid,y_valid])
)

Train on 1000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


In [49]:
#for debugging

history = unlabel_vae.fit(
    [X_train_un], [X_train_un], epochs=2, #batch_size=128,
    validation_data=([X_valid], [X_valid])
)

Train on 54000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


# Create mini-batches # 

This is not the official implementation, as it performs the gradient update after each batch of labelled and then after each batch of unlabelled (as opposed to calculating the loss for everything in the batch and then performing the update on it).

Omer's implementation may help with doing the gradient updates properly.

In [47]:
#This method hasn't been finished by me. See below.
batch_size = 64
proportion_unlabelled = X_train_un.shape[0]/(X_train_la.shape[0] + X_train_un.shape[0])
proportion_unlabelled

unlabelled_points_per_batch = np.ceil((proportion_unlabelled*batch_size))

labelled_points_per_batch = batch_size - unlabelled_points_per_batch

max_batches_with_labelled_points_per_batch = np.floor(X_train_la.shape[0]/labelled_points_per_batch)
print(max_batches_with_labelled_points_per_batch)

max_batches_with_unlabelled_points_per_batch = np.floor(X_train_un.shape[0]/unlabelled_points_per_batch)
print(max_batches_with_unlabelled_points_per_batch)

num_complete_batches = int(np.floor((X_train_la.shape[0] + X_train_un.shape[0])/batch_size)) #regardless of composition
print(num_complete_batches)

#unlabelled points
number_of_points_in_complete_batches = max_batches_with_unlabelled_points_per_batch*unlabelled_points_per_batch
print(number_of_points_in_complete_batches)

max_full_batch = np.max(max_batches_with_unlabelled_points_per_batch,max_batches_with_labelled_points_per_batch)

for i in range(max_full_batch):
    index_start = i*unlabelled_points_per_batch
    index_end = (i+1)*unlabelled_points_per_batch
    batch_unlabelled_X = X_train_un[index_start:index_end]
    loss = label_vae.train_on_batch(batch....)
    last_index_unlabelled = index_end
    
    index_start2 = i*labelled_points_per_batch
    index_end2 = (i+1)*labelled_points_per_batch
    batch_labelled_X = X_train_la[index_start2:index_end2]
    batch_labelled_y = y_train_la[index_start2:index_end2]
    loss += unlabelled_vae.train_on_batch(....)
    last_index_labelled = index_end2
    
#this will train it on batches that are 'full'

#then need to deal with the remaining 

if max_full_batch*unlabelled_points_per_batch < X_train_un.shape[0]:
    if X_train_un.shape[0] - max_full_batch*unlabelled_points_per_batch <= batch_size: #then just one batch remaining
        batch = X_train_un[last_index:]
        loss += label_vae.train_on_batch(batch....)
    else:
        #make batches and then train on the remaining ones
        
#repeat for unlabelled
        

#does train on batch shuffle the batches? if not then I need to shuffle after each epoch. can just shuffle the indices
    

53991.0


In [21]:
#omer method - just picks things at random so each epoch may not necessarily go through every single point.
#but easier implementation for now

def create_batch(x_label, y_label, x_unlabel, batch_s=64):
    '''
    Creates batches of labelled and unlabelled data. The total number of points in both batches is equal to batch_s. 
    
    '''
    proportion_labelled = x_label.shape[0]/(x_label.shape[0] + x_unlabel.shape[0])
    
    shape_label = x_label.shape[0]
    label_per_batch = int(np.ceil(proportion_labelled*batch_s))
    batch_idx_la = np.random.choice(list(range(shape_label)), label_per_batch)
    batch_x_la = (x_label[batch_idx_la, :])
    batch_y_la = (y_label[batch_idx_la])
    
    shape_unlabel = x_unlabel.shape[0]
    unlabel_per_batch = batch_s - label_per_batch
    batch_idx_un = np.random.choice(list(range(shape_unlabel)), unlabel_per_batch)
    batch_x_un = (x_unlabel[batch_idx_un, :])
                          
            
    return batch_x_la, batch_y_la, batch_x_un

# Train #

In [22]:
def progress_bar(iteration, total, size=30):
    running = iteration < total
    c = ">" if running else "="
    p = (size - 1) * iteration // total
    fmt = "{{:-{}d}}/{{}} [{{}}]".format(len(str(total)))
    params = [iteration, total, "=" * p + c + "." * (size - p - 1)]
    return fmt.format(*params)

In [50]:
def print_status_bar(iteration, total, loss, metrics=None, size=30):
    metrics = " - ".join(["Loss for batch: {:.4f}".format(loss)])
    end = "" if iteration < total else "\n"
    print("\r{} - {}".format(progress_bar(iteration, total), metrics), end=end)
    
def print_status_bar_epoch(iteration, total, loss, validation_loss, metrics=None, size=30):
    metrics = " - ".join(["Loss: {:.4f} Validation loss: {:.4f} ".format(loss,validation_loss)])
    end = "" if iteration < total else "\n"
    print("\r{} - {}".format(progress_bar(iteration, total), metrics), end=end)
    
#could make these functions into just one which works for both if I want. See the Geron textbook.

In [48]:
def fit_model(X_train_la, y_train_la, X_train_un,epochs,X_valid_la, y_valid_la,batch_size=64):

    
    start = time.time()
    history = []
    
    validation_loss = []
    
    batches_per_epoch = int(np.floor((X_train_la.shape[0] + X_train_un.shape[0])/batch_size))

    for epoch in range(epochs):
            
            print("Epoch {}/{}".format(epoch,epochs))
            
            for i in range(batches_per_epoch):

                batch_x_la, batch_y_la, batch_x_un = create_batch(X_train_la,y_train_la,X_train_un,batch_size)

                loss = unlabel_vae.train_on_batch(batch_x_un,batch_x_un)

                loss += label_vae.train_on_batch([batch_x_la,batch_y_la],
                                                [batch_x_la,batch_y_la])[0] #selecting the overall loss term
                

                history.append(loss)
                print_status_bar(i*batch_size,X_train_la.shape[0] + X_train_un.shape[0],loss)
                
            val_loss = unlabel_vae.evaluate(X_train_un,X_train_un,verbose=0);
            
            val_loss += label_vae.evaluate([X_train_la,y_train_la],[X_train_la,y_train_la],verbose=0)[0];
            
            validation_loss.append(val_loss)
                
            print_status_bar_epoch(X_train_la.shape[0] + X_train_un.shape[0]
                             ,(X_train_la.shape[0] + X_train_un.shape[0]),loss,val_loss)
            
            #need to check this callback
            patience = 10
            latest_val_loss = validation_loss[:-patience]
            if all(i<=val_loss for i in latest_val_loss) is True:
                break
            
            
                
    done = time.time()
    elapsed = done-start
    print("Elapsed: ",elapsed)
    print("Final training loss: ",loss)
    
    return history

In [49]:
history = fit_model(X_train_la, y_train_la, X_train_un,1,X_valid,y_valid,batch_size=64)

Epoch 0/1
Elapsed:  15.52076244354248
Final training loss:  194.06216
