In [1]:
import sys
assert sys.version_info >= (3, 5)

In [1]:
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

import numpy as np

In [2]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

#check os.environ ld_library_path is the same here as when I do it in python via terminal, if I get issues

#sometimes I can't select the GPU. In this case, try: https://forums.fast.ai/t/tip-limiting-tensorflow-to-one-gpu/1995

In [16]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
tf.__version__


'2.1.0'

In [3]:
K = keras.backend


In [4]:
mnist = keras.datasets.mnist
(X_train_full, y_train_full), (X_test,y_test) = mnist.load_data()

X_valid, X_train = X_train_full[:5000] / 255.0, X_train_full[5000:] / 255.0

y_valid, y_train = y_train_full[:5000], y_train_full[5000:]

X_test = X_test / 255.0

#1000 labelled points in train set for this example

X_train_la = X_train[:1000]
X_train_un = X_train[1000:]

y_train_la = y_train[:1000]
y_train_un = np.empty((X_train_la.shape[0]))


In [5]:
y_train_la = y_train_la.reshape(-1,1)

In [6]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
y_train_la = encoder.fit_transform(y_train_la)

In [7]:
y_train_la=y_train_la.toarray()

In [8]:
y_valid = y_valid.reshape(-1,1)
y_valid = encoder.transform(y_valid)
y_valid = y_valid.toarray()

# Encoder #

In [9]:
tf.random.set_seed(42)
np.random.seed(42)

#reparameterization trick
class Sampling(keras.layers.Layer):
    def call(self, inputs):
        mean, log_var = inputs
        return K.random_normal(tf.shape(log_var)) * K.exp(log_var/2) + mean

codings_size = 50

x_in = keras.layers.Input(shape=[28, 28])
#y_in = keras.layers.Input(shape=[1],name="labelled_y") #check this shape. maybe need to reshape y by .reshape(-1,1)
f = keras.layers.Flatten()(x_in)
z = keras.layers.Dense(600, activation="softplus")(f)
z = keras.layers.Dense(300, activation="softplus")(z)

codings_mean = keras.layers.Dense(codings_size)(z)
codings_log_var = keras.layers.Dense(codings_size)(z)
codings = Sampling()([codings_mean, codings_log_var])

 

variational_encoder = keras.models.Model(
    inputs=[x_in], outputs=[codings_mean, codings_log_var, codings])

## Classifier ##

In [10]:

y_classifier = keras.layers.Dense(300, activation="selu")(f)
y_classifier = keras.layers.Dense(100, activation="selu")(y_classifier)
y_pred = keras.layers.Dense(10,activation="softmax")(y_classifier) 

classifier = keras.models.Model(
    inputs=[x_in], outputs=[y_pred])

# Decoder #

In [11]:
#labelled decoder
#l_z = keras.layers.Input(shape=[codings_size])
#l_y = keras.layers.Input(shape=[1]) #this is the y_in
#l_merged = keras.layers.concatenate([l_y,l_z])
latent = keras.layers.Input(shape=[codings_size])
y = keras.layers.Input(shape=[10])

l_merged = keras.layers.concatenate([latent,y])
x = keras.layers.Dense(300, activation="softplus")(l_merged)
x = keras.layers.Dense(600, activation="softplus")(x)
x = keras.layers.Dense(28 * 28, activation="sigmoid")(x)
x_out = keras.layers.Reshape([28,28])(x)


#unlabelled decoder
#u_z = keras.layers.Input(shape=[codings_size])
#u_merged = keras.layers.concatenate([y_pred,u_z])
#x = keras.layers.Dense(300, activation="softplus")(u_merged)
#x = keras.layers.Dense(600, activation="softplus")(x)
#x = keras.layers.Dense(28 * 28, activation="sigmoid")(x)
#u_x_out = keras.layers.Reshape([28,28])(x)


variational_decoder = keras.models.Model(inputs=[latent,y], outputs=[x_out])

In [12]:
variational_decoder.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 10)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 60)           0           input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 300)          18300       concatenate[0][0]          

# Compile model# 

In [34]:
#labelled vae
_,_, codings = variational_encoder(x_in)
y_pred = classifier(x_in)
reconstructions = variational_decoder([codings,y])
label_vae = keras.models.Model(inputs=(x_in,y), outputs=(reconstructions,y_pred))

#label_vae = keras.models.Model(inputs=[x_in,y_in], outputs=[reconstructions,y_pred])
#label_vae = keras.models.Model(inputs=[x_in,y_in], outputs=[l_x_out,y_pred])

#def dummy_loss(x, x_decoded):
   # kl = -0.5*K.sum(1 + codings_log_var - K.exp(codings_log_var) - K.square(codings_mean), axis=-1)
    #return K.mean(kl)

#def cls_loss(y, y_pred, N=1000):
   # alpha = 0.1*N
   # return alpha * keras.metrics.categorical_crossentropy(y, y_pred)

#label_vae.compile(loss=[dummy_loss,cls_loss], optimizer="rmsprop")



In [33]:
#unlabelled vae
_,_, codings = variational_encoder(x_in)
y_pred = classifier(x_in)
reconstructions_un = variational_decoder([codings,y_pred])
unlabel_vae = keras.models.Model(inputs=x_in, outputs=reconstructions_un)

In [31]:
def labelled_loss_reconstruction(codings_log_var,codings_mean):
    def loss_functions_labelled(x, x_decoded_mean):
        xent_loss = 28*28*keras.losses.binary_crossentropy(x, x_decoded_mean)        
        kl_loss = - 0.5 * K.sum(1 + codings_log_var - K.square(codings_mean) - K.exp(codings_log_var), axis=-1)
        return xent_loss  + K.mean(kl_loss)
    return loss_functions_labelled 

def unlabelled_loss_reconstruction(codings_log_var,codings_mean,y_pred):
    def loss_functions_unlabelled(x,x_decoded_mean):
        kl_loss = - 0.5 * K.sum(1 + codings_log_var - K.square(codings_mean) - K.exp(codings_log_var), axis=-1)
        xent_loss = 28*28*keras.losses.binary_crossentropy(x, x_decoded_mean)        
        entropy = keras.losses.categorical_crossentropy(y_pred,y_pred)
        loss = kl_loss + xent_loss
        #need to check below. We are summing over y, but we are assuming that the loss term is independent of y
        #which is not the case. How to do it though? and how to do it for regression?
        return K.mean(K.sum(y_pred*loss,axis=-1)) + entropy
    return loss_functions_unlabelled

# def kl_loss(x, x_decoded_mean, codings_mean=codings_mean, codings_log_var=codings_log_var):
#     kl_loss = - 0.5 * K.sum(1. + codings_log_var - K.square(codings_mean) - K.exp(codings_log_var), axis=-1)
#     return K.mean(kl_loss)

# def reconstruction_loss(x, x_decoded_mean):
#     xent_loss = 28*28*keras.losses.binary_crossentropy(x, x_decoded_mean)        
#     return xent_loss

# def labeled_vae_loss(x, x_decoded_mean):
#     return reconstruction_loss(x, x_decoded_mean) + kl_loss(x, x_decoded_mean)

def labelled_cls_loss(y, y_pred,N=1000):
    alpha = 0.1*N
    cat_xent_loss = keras.losses.categorical_crossentropy(y, y_pred)
    return alpha*cat_xent_loss 

# def unlabeled_vae_loss(x, x_decoded_mean):
#     entropy = keras.losses.categorical_crossentropy(y_pred, y_pred)
#     # This is probably not correct, see discussion here: https://github.com/bjlkeng/sandbox/issues/3
#     labeled_loss = logxy_loss(x, x_decoded_mean) + kl_loss(x, x_decoded_mean)
    
#     return K.mean(K.sum(y_pred * labeled_loss, axis=-1)) + entropy

#kl_loss = -0.5*K.sum(1 + codings_log_var - K.exp(codings_log_var) - K.square(codings_mean), axis=-1)
#label_vae.add_loss(K.mean(kl_loss))
#unlabel_vae.add_loss(K.mean(kl_loss))


#unlabel_vae.compile(loss=xent, optimizer="rmsprop")

In [None]:
label_vae.compile(loss=[labelled_loss_reconstruction(codings_log_var,codings_mean)
                        ,labelled_cls_loss], optimizer="rmsprop", experimental_run_tf_function=False)

unlabel_vae.compile(loss=loss_functions_unlabelled(codings_log_var,codings_mean,y_pred),
                    optimizer="rmsprop", experimental_run_tf_function=False)

In [35]:
label_vae.layers[1].name

'model_7'

In [39]:
variational_decoder.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 10)]         0                                            
__________________________________________________________________________________________________
concatenate_3 (Concatenate)     (None, 60)           0           input_9[0][0]                    
                                                                 input_10[0][0]                   
__________________________________________________________________________________________________
dense_23 (Dense)                (None, 300)          18300       concatenate_3[0][0]        

In [44]:
variational_decoder.layers[3].get_weights()[0]

array([[-0.09232819, -0.02540258, -0.04296383, ..., -0.0084036 ,
        -0.02555051,  0.0707534 ],
       [ 0.04802527, -0.03397375,  0.08732726, ...,  0.01954718,
        -0.04615254, -0.0089247 ],
       [ 0.00515142, -0.03057862,  0.02474202, ..., -0.10528823,
         0.04132915, -0.12675573],
       ...,
       [-0.10764248, -0.01161164,  0.11734843, ...,  0.0612096 ,
         0.03693882, -0.12346322],
       [-0.01044971,  0.00875248, -0.10003087, ...,  0.11242384,
         0.02709192, -0.08294469],
       [ 0.11261559, -0.00051823,  0.04721989, ...,  0.05338033,
        -0.08965599,  0.06663036]], dtype=float32)

In [32]:
history = label_vae.fit(
    [X_train_la,y_train_la], [X_train_la,y_train_la], epochs=2, #batch_size=128,
    validation_data=([X_valid,y_valid], [X_valid,y_valid])
)

Train on 1000 samples, validate on 5000 samples
Epoch 1/2
  32/1000 [..............................] - ETA: 23s

_SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'model_4/Identity_1:0' shape=(None, 50) dtype=float32>, <tf.Tensor 'model_4/Identity:0' shape=(None, 50) dtype=float32>]

In [48]:
variational_decoder.layers[3].get_weights()[0]

array([[-0.07822745, -0.03029954, -0.0353783 , ...,  0.01632262,
        -0.01634867,  0.08539767],
       [ 0.05397691, -0.03482179,  0.09450759, ...,  0.00685038,
        -0.04446088, -0.00280934],
       [-0.00926459, -0.01847685,  0.00953186, ..., -0.12741618,
         0.04164019, -0.13967206],
       ...,
       [-0.06770626, -0.01606098,  0.08281747, ...,  0.09970969,
         0.06407818, -0.12844528],
       [-0.00827528, -0.00751313, -0.10342819, ...,  0.08580919,
         0.00274213, -0.12007884],
       [ 0.1334239 , -0.01653085,  0.07267592, ...,  0.0648092 ,
        -0.05652961,  0.11391716]], dtype=float32)

In [57]:
history = unlabel_vae.fit(
    [X_train_un], [X_train_un], epochs=2, #batch_size=128,
    validation_data=([X_valid], [X_valid])
)

Train on 54000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


In [50]:
variational_decoder.layers[3].get_weights()[0]

array([[-0.4085583 ,  0.21380813,  0.1844387 , ..., -0.19951518,
         0.22544022,  0.09068006],
       [ 0.03381001, -0.03907054,  0.05096668, ..., -0.01590356,
         0.0084176 ,  0.05683507],
       [ 0.02828356, -0.06043579,  0.02203771, ..., -0.08185209,
         0.02327271, -0.11006324],
       ...,
       [-0.50188917,  0.29346964, -0.38844398, ...,  0.31701285,
         0.64155203, -0.48085108],
       [ 0.11821338, -0.36241302, -0.5141053 , ...,  0.31004784,
        -0.42054015, -0.90871555],
       [ 0.6186034 , -0.24054687,  0.632534  , ...,  0.24116048,
        -0.16839913,  0.766121  ]], dtype=float32)

# Create mini-batches # 

This is not the official implementation, as it performs the gradient update after each batch of labelled and then after each batch of unlabelled (as opposed to calculating the loss for everything in the batch and then performing the update on it).

In [10]:
X_train_la.shape

(1000, 28, 28)

In [11]:
X_train_un.shape

(54000, 28, 28)

In [16]:
batch_size = 64
proportion_unlabelled = X_train_un.shape[0]/(X_train_la.shape[0] + X_train_un.shape[0])

In [17]:
proportion_unlabelled

0.9818181818181818

In [37]:
unlabelled_points_per_batch = np.ceil((proportion_unlabelled*batch_size))

In [38]:
unlabelled_points_per_batch

63.0

In [39]:
labelled_points_per_batch = batch_size - unlabelled_points_per_batch

In [46]:
max_batches_with_labelled_points_per_batch = np.floor(X_train_la.shape[0]/labelled_points_per_batch)
print(max_batches_with_labelled_points_per_batch)

1000.0


In [44]:
max_batches_with_unlabelled_points_per_batch = np.floor(X_train_un.shape[0]/unlabelled_points_per_batch)
print(max_batches_with_unlabelled_points_per_batch)

857.0


In [43]:
num_complete_batches = int(np.floor((X_train_la.shape[0] + X_train_un.shape[0])/batch_size)) #regardless of composition
print(num_complete_batches)

859


In [47]:
#unlabelled points
number_of_points_in_complete_batches = max_batches_with_unlabelled_points_per_batch*unlabelled_points_per_batch
print(number_of_points_in_complete_batches)

max_full_batch = np.max(max_batches_with_unlabelled_points_per_batch,max_batches_with_labelled_points_per_batch)

for i in range(max_full_batch):
    index_start = i*unlabelled_points_per_batch
    index_end = (i+1)*unlabelled_points_per_batch
    batch_unlabelled_X = X_train_un[index_start:index_end]
    loss = label_vae.train_on_batch(batch....)
    
    index_start2 = i*labelled_points_per_batch
    index_end2 = (i+1)*labelled_points_per_batch
    batch_labelled_X = X_train_la[index_start:index_end]
    batch_labelled_y = y_train_la[index_start:index_end]
    loss += unlabelled_vae.train_on_batch(....)
    
#this will train it on batches that are 'full'

#then need to deal with the remaining 

#does train on batch shuffle the batches? if not then I need to shuffle after each epoch. can just shuffle the indices
    

53991.0


In [57]:
size_remaining_batch = ((X_train_la.shape[0] + X_train_un.shape[0]) - (max_full_batch*batch_size))

NameError: name 'max_full_batch' is not defined

In [33]:
remaining_labelled = X_train_la.shape[0] - labelled_points_per_batch*num_complete_batches

In [34]:
remaining_labelled

-718.0

In [None]:
early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,restore_best_weights=True)

history = variational_ae.fit(X_train, X_train, epochs=200, batch_size=32,
                             validation_data=(X_valid, X_valid),  callbacks=[early_stopping_cb])

# Rough working #

In [49]:
##### check vae from keras
codings_size=10

#reparameterization trick
class Sampling(keras.layers.Layer):
    def call(self, inputs):
        mean, log_var = inputs
        return K.random_normal(tf.shape(log_var)) * K.exp(log_var/2) + mean
    
inputs = keras.layers.Input(shape=[28, 28])
z = keras.layers.Flatten()(inputs)
z = keras.layers.Dense(150, activation="selu")(z)
z = keras.layers.Dense(100, activation="selu")(z)
codings_mean = keras.layers.Dense(codings_size)(z)  # μ
codings_log_var = keras.layers.Dense(codings_size)(z)  # γ
codings = Sampling()([codings_mean, codings_log_var])
variational_encoder = keras.Model(
inputs=[inputs], outputs=[codings_mean, codings_log_var, codings])


In [45]:
codings_mean

<tf.Tensor 'dense_2/Identity:0' shape=(None, 10) dtype=float32>

In [10]:
variational_encoder.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 28, 28)]     0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 784)          0           input_1[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 150)          117750      flatten[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 100)          15100       dense[0][0]                      
______________________________________________________________________________________________

In [50]:
decoder_inputs = keras.layers.Input(shape=[codings_size])
x = keras.layers.Dense(100, activation="selu")(decoder_inputs)
x = keras.layers.Dense(150, activation="selu")(x)
x = keras.layers.Dense(28 * 28, activation="sigmoid")(x)
outputs = keras.layers.Reshape([28, 28])(x)
variational_decoder = keras.Model(inputs=[decoder_inputs], outputs=[outputs])

variational_decoder.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 10)]              0         
_________________________________________________________________
dense_11 (Dense)             (None, 100)               1100      
_________________________________________________________________
dense_12 (Dense)             (None, 150)               15150     
_________________________________________________________________
dense_13 (Dense)             (None, 784)               118384    
_________________________________________________________________
reshape_1 (Reshape)          (None, 28, 28)            0         
Total params: 134,634
Trainable params: 134,634
Non-trainable params: 0
_________________________________________________________________


In [51]:
_, _, codings = variational_encoder(inputs)
reconstructions = variational_decoder(codings)
variational_ae = keras.Model(inputs=[inputs], outputs=[reconstructions])


In [13]:
variational_ae.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 28, 28)]          0         
_________________________________________________________________
model (Model)                [(None, 10), (None, 10),  134870    
_________________________________________________________________
model_1 (Model)              (None, 28, 28)            134634    
Total params: 269,504
Trainable params: 269,504
Non-trainable params: 0
_________________________________________________________________


In [None]:
import tensorflow as tf

In [38]:
reconstructions.shape

TensorShape([None, 28, 28])

In [42]:
with tf.name_scope('weird_loss'): 
    xent_loss = tf.losses.binary_crossentropy(inputs, reconstructions)
    kl_loss = tf.reduce_mean(tf.reduce_sum(1 + codings_log_var - tf.square(codings_mean) - tf.exp(codings_log_var), axis=1))
    loss = xent_loss + kl_loss

In [58]:
codings_log_var.shape

TensorShape([None, 10])

In [56]:
def loss(codings_log_var):

    def vae_loss(x, x_decoded_mean):
    
        xent_loss = tf.losses.binary_crossentropy(x, x_decoded_mean)
        kl_loss = K.sum(codings_log_var,axis=-1)
    #tf.sum(1 + codings_log_var - tf.square(codings_mean) - tf.exp(codings_log_var))
    kl_loss = (- 0.5 * K.sum(1 + codings_log_var - K.square(codings_mean) - K.exp(codings_log_var), axis=-1), dtype=xent_loss)
    #return #tf.add(xent_loss, K.mean(kl_loss/784))
        return K.mean(kl_loss) + xent_loss
    return vae_loss


variational_ae.compile(optimizer='rmsprop', loss=loss(codings_log_var),experimental_run_tf_function=False)


In [46]:
optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate = 0.001).minimize(loss, var_list = [codings_log_var, codings_mean])

TypeError: loss() missing 2 required positional arguments: 'codings_mean' and 'codings_log_var'

In [101]:
tf.config.experimental_run_functions_eagerly(True)


In [57]:
history = variational_ae.fit(X_train, X_train, epochs=1, batch_size=128,
validation_data=[X_valid, X_valid])

Train on 55000 samples, validate on 5000 samples
