In [19]:
import numpy as np
import pandas as pd
import pickle as pkl

import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.metrics import log_loss
from sklearn.decomposition import PCA, NMF, FastICA
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

%tensorflow_version 1.x
import tensorflow as tf

from keras import backend as K
from keras import optimizers, metrics
from keras.layers import Input, Dense, Lambda, Activation, Dropout, Layer
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.regularizers import l1
from keras import activations
from keras import backend as K
from keras.utils import plot_model
from keras.callbacks import Callback

In [2]:
# Mount Google Drive (Colab only) so that the /content/drive/MyDrive/... paths
# read and written by later cells resolve.
from google.colab import drive
drive.mount("/content/drive/")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
###########################################################
###################### PCA, ICA, NMF ######################
###########################################################

def get_cost_reconstruction(X, model, k_list):
    """Fit a linear decomposition at several ranks and score its reconstruction.

    Parameters
    ----------
    X : numpy.ndarray
        Data matrix; ``len(X)`` is used as the upper bound for the number of
        components and ``X.reshape`` is used when scoring.
    model : str
        Which decomposition to run: ``"pca"``, ``"ica"`` or ``"nmf"``.
    k_list : iterable of int
        Candidate numbers of components. Values larger than ``len(X)`` are
        skipped (no entry is produced for them).

    Returns
    -------
    bce_loss : list
        ``log_loss`` between ``X`` cast to int and the flattened
        reconstruction, one entry per fitted ``k``.
    l2_error : list
        ``np.linalg.norm(X - reconstruction)``, one entry per fitted ``k``.
    output : dict
        Maps each fitted ``k`` to the estimator's ``components_``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names (previously this
        silently returned empty results).
    """
    # Factories are lambdas so no estimator is created until a rank is accepted.
    estimator_factories = {
        "pca": lambda k: PCA(n_components=k, random_state=4),
        "ica": lambda k: FastICA(n_components=k, random_state=4, max_iter=400),
        "nmf": lambda k: NMF(n_components=k, random_state=4, max_iter=400),
    }
    if model not in estimator_factories:
        raise ValueError(
            "Unknown model {!r}; expected one of {}".format(
                model, sorted(estimator_factories)))
    make_estimator = estimator_factories[model]

    bce_loss = []
    l2_error = []
    output = {}

    print("Calculating Reconstruction Error for:", model.upper())
    # Short pause so the message above is flushed before tqdm draws its bar.
    time.sleep(1)

    for k in tqdm(k_list):
        if k > len(X):
            continue
        # Use a separate name: the original rebound the `model` parameter here.
        estimator = make_estimator(k)
        estimator.fit(X)
        reduced = estimator.transform(X)
        reconstructed = estimator.inverse_transform(reduced)
        bce_loss.append(log_loss(X.reshape(-1,).astype(int), reconstructed.reshape(-1,)))
        l2_error.append(np.linalg.norm(X - reconstructed))
        output[k] = estimator.components_

    return bce_loss, l2_error, output

In [4]:
###########################################################
############## Autoencoder Utility Classes ################
###########################################################

# From tybalt.utils.base
class BaseModel:
    """Shared helpers for the autoencoder wrappers in this notebook.

    The methods read attributes that subclasses are expected to set:
    ``full_model``, ``encoder``, ``decoder`` and, after training, ``hist``.
    """

    def __init__(self):
        pass

    def get_summary(self):
        """Print the Keras summary of the full model."""
        self.full_model.summary()

    def visualize_architecture(self, output_file):
        """Write a diagram of the full model to ``output_file``."""
        plot_model(self.full_model, to_file=output_file)

    def visualize_training(self, output_file=None):
        """Plot the training history; save it if ``output_file`` is given, else show it."""
        axes = pd.DataFrame(self.hist.history).plot()
        axes.set_xlabel('Epochs')
        axes.set_ylabel('Loss')
        figure = axes.get_figure()
        if not output_file:
            figure.show()
            return
        figure.savefig(output_file)

    def get_weights(self, decoder=True):
        """Return one list of weight arrays per layer.

        With ``decoder=True`` the decoder layers are read as-is; otherwise the
        encoder layers are read and every array is transposed.
        """
        if decoder:
            return [layer.get_weights() for layer in self.decoder.layers]
        return [
            [np.transpose(array) for array in layer.get_weights()]
            for layer in self.encoder.layers
        ]

    def save_models(self, encoder_file, decoder_file):
        """Save the encoder and the decoder to the two given paths."""
        self.encoder.save(encoder_file)
        self.decoder.save(decoder_file)


# From tybalt.utils.vae_utils
def approx_keras_binary_cross_entropy(x, z, p, epsilon=1e-07):
    """NumPy approximation of a scaled binary cross entropy.

    Parameters
    ----------
    x : array-like
        Predicted values; clipped to ``[epsilon, 1 - epsilon]`` and converted
        to logits before the loss is evaluated.
    z : array-like
        Target values, same shape as ``x``.
    p : number
        Scale factor applied to the per-row mean loss.
    epsilon : float
        Clipping bound that keeps the logit finite.

    Returns
    -------
    float
        Mean over rows of ``p * mean(-logit * z + log(1 + exp(logit)))``.
    """
    # np.array copies, so the caller's data is never modified.
    x = np.array(x)
    z = np.array(z)

    # Integer/bool input would truncate epsilon to 0 during clipping and give
    # log(0) = -inf; promote it to float first. Float dtypes are kept as-is.
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)

    # Clip away exact 0/1 so the logit below stays finite
    x = np.clip(x, epsilon, 1 - epsilon)

    # Perform logit
    x = np.log(x / (1 - x))

    # Return approximate binary cross entropy
    return np.mean(p * np.mean(- x * z + np.log(1 + np.exp(x)), axis=-1))


class WarmUpCallback(Callback):
    """Raise ``beta`` by ``kappa`` at the end of each epoch until it reaches 1.

    Parameters
    ----------
    beta : Keras backend variable
        Read with ``K.get_value`` and updated in place with ``K.set_value``.
    kappa : float
        Amount added to ``beta`` after each epoch.
    """

    def __init__(self, beta, kappa):
        super(WarmUpCallback, self).__init__()
        self.beta = beta
        self.kappa = kappa

    def on_epoch_end(self, epoch, logs=None):
        current = K.get_value(self.beta)
        # The original test was `<= 1`, which added kappa once more after beta
        # had already reached 1 (0 -> 1 -> 2 with kappa=1). Stop at 1 instead.
        if current < 1:
            K.set_value(self.beta, min(current + self.kappa, 1.0))


class LossCallback(Callback):
    """Record reconstruction loss and the remaining loss after every epoch.

    Parameters
    ----------
    training_data : array-like
        Data passed through ``encoder_cbk`` then ``decoder_cbk`` each epoch.
    original_dim : number
        Scale factor forwarded as ``p`` to ``approx_keras_binary_cross_entropy``.
    encoder_cbk, decoder_cbk : objects with a ``predict`` method
        The encoder and decoder models used to reconstruct ``training_data``.

    After training, ``xent_loss`` and ``kl_loss`` hold one value per epoch;
    ``kl_loss`` is the logged ``'loss'`` minus the reconstruction loss.
    """

    def __init__(self, training_data, original_dim, encoder_cbk, decoder_cbk):
        # Initialise the base Callback before adding our own attributes.
        super(LossCallback, self).__init__()
        self.training_data = training_data
        self.original_dim = original_dim
        self.encoder_cbk = encoder_cbk
        self.decoder_cbk = decoder_cbk

    def on_train_begin(self, logs=None):
        self.xent_loss = []
        self.kl_loss = []

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        recon = self.decoder_cbk.predict(self.encoder_cbk.predict(self.training_data))
        xent_loss = approx_keras_binary_cross_entropy(x=recon, z=self.training_data, p=self.original_dim)
        full_loss = logs.get('loss')
        self.xent_loss.append(xent_loss)
        self.kl_loss.append(full_loss - xent_loss)

In [5]:
###########################################################
############## Denoising Autoencoder (DAE) ################
###########################################################

# From tybalt.utils.adage_utils
class TiedWeightsDecoder(Layer):
    """Decoder layer that reuses the weights of an encoder layer.

    ``call`` subtracts the encoder's second weight (its bias term) from the
    input and multiplies by the transpose of its first weight (the weight
    matrix), then applies ``activation``.
    """

    def __init__(self, output_dim, encoder, activation=None, **kwargs):
        # Attributes are assigned before Layer.__init__, as in the original.
        self.output_dim = output_dim
        self.encoder = encoder
        self.activation = activations.get(activation)
        super(TiedWeightsDecoder, self).__init__(**kwargs)

    def build(self, input_shape):
        # Keep a reference to the encoder's weight list; nothing new is created.
        self.kernel = self.encoder.weights
        super(TiedWeightsDecoder, self).build(input_shape)

    def call(self, x):
        # Encoder weights: [weight_matrix, bias_term]
        encoder_bias = self.encoder.weights[1]
        encoder_kernel = self.encoder.weights[0]
        output = K.dot(x - encoder_bias, K.transpose(encoder_kernel))
        if self.activation is None:
            return output
        return self.activation(output)

    def compute_output_shape(self, input_shape):
        batch_dim = input_shape[0]
        return (batch_dim, self.output_dim)


class Adage(BaseModel):
    """Denoising autoencoder (ADAGE) wrapper around a Keras model.

    Parameters
    ----------
    original_dim : int
        Number of input features.
    latent_dim : int
        Size of the hidden (encoded) layer.
    noise : float
        Rate passed to the ``Dropout`` layer.
    batch_size, epochs : int
        Passed to ``fit``.
    sparsity : float
        L1 activity-regularisation strength on the encoded layer.
    learning_rate : float
        Learning rate of the optimizer.
    loss : str
        Loss passed to ``compile``.
    optimizer : str
        ``'adam'`` or ``'adadelta'``.
    tied_weights : bool
        If True the decoder is a ``TiedWeightsDecoder`` sharing encoder weights.
    verbose : bool
        Passed to ``fit``.
    """

    # Optimizer names accepted by `_compile_adage`.
    _SUPPORTED_OPTIMIZERS = ('adadelta', 'adam')

    def __init__(self, original_dim, latent_dim, noise=0.05, batch_size=50,
                 epochs=100, sparsity=0, learning_rate=0.0005, loss='mse',
                 optimizer='adam', tied_weights=True, verbose=True):
        BaseModel.__init__(self)
        self.model_name = 'ADAGE'
        self.original_dim = original_dim
        self.latent_dim = latent_dim
        self.noise = noise
        self.batch_size = batch_size
        self.epochs = epochs
        self.sparsity = sparsity
        self.learning_rate = learning_rate
        self.loss = loss
        self.optimizer = optimizer
        self.tied_weights = tied_weights
        self.verbose = verbose

    def _build_graph(self):
        # Build the Keras graph for an ADAGE model (untied weights):
        # input -> dropout -> dense -> relu -> dense(sigmoid)
        self.input_rnaseq = Input(shape=(self.original_dim, ))
        drop = Dropout(self.noise)(self.input_rnaseq)
        self.encoded = Dense(self.latent_dim, activity_regularizer=l1(self.sparsity))(drop)
        activation = Activation('relu')(self.encoded)
        decoded_rnaseq = Dense(self.original_dim, activation='sigmoid')(activation)

        self.full_model = Model(self.input_rnaseq, decoded_rnaseq)

    def _build_tied_weights_graph(self):
        # Build Keras graph for an ADAGE model with tied weights:
        # dense(relu) -> dropout -> TiedWeightsDecoder(sigmoid)
        self.encoded = Dense(self.latent_dim, input_shape=(self.original_dim, ), activity_regularizer=l1(self.sparsity), activation='relu')
        dropout_layer = Dropout(self.noise)
        self.tied_decoder = TiedWeightsDecoder(input_shape=(self.latent_dim, ), output_dim=self.original_dim, activation='sigmoid', encoder=self.encoded)
        self.full_model = Sequential()
        self.full_model.add(self.encoded)
        self.full_model.add(dropout_layer)
        self.full_model.add(self.tied_decoder)

    def _compile_adage(self):
        # Compile the autoencoder to prepare for training.
        # Any other optimizer name used to fall through both branches and fail
        # with an UnboundLocalError on `optim`; fail with a clear message instead.
        if self.optimizer == 'adadelta':
            optim = optimizers.Adadelta(lr=self.learning_rate)
        elif self.optimizer == 'adam':
            optim = optimizers.Adam(lr=self.learning_rate)
        else:
            raise ValueError(
                "Unsupported optimizer {!r}; expected one of {}".format(
                    self.optimizer, self._SUPPORTED_OPTIMIZERS))
        self.full_model.compile(optimizer=optim, loss=self.loss)

    def _connect_layers(self):
        # Separate out the encoder and decoder model
        encoded_input = Input(shape=(self.latent_dim, ))
        decoder_layer = self.full_model.layers[-1]
        self.decoder = Model(encoded_input, decoder_layer(encoded_input))

        if self.tied_weights:
            # The keras graph is built differently for a tied weight model
            # Build a model with input and output Tensors of the encoded layer
            self.encoder = Model(self.encoded.input, self.encoded.output)
        else:
            self.encoder = Model(self.input_rnaseq, self.encoded)

    def initialize_model(self):
        """Build the graph selected by ``tied_weights``, then connect and compile it."""
        if self.tied_weights:
            self._build_tied_weights_graph()
        else:
            self._build_graph()
        self._connect_layers()
        self._compile_adage()

    def train_adage(self, train_df, test_df, adage_comparable_loss=False):
        """Fit the autoencoder to reconstruct ``train_df``; validate on ``test_df``.

        The Keras history is stored in ``self.hist`` and, as a DataFrame, in
        ``self.history_df``. With ``adage_comparable_loss=True`` the history
        values are multiplied by ``original_dim``.
        """
        self.hist = self.full_model.fit(np.array(train_df), np.array(train_df),
                                        shuffle=True,
                                        epochs=self.epochs,
                                        verbose=self.verbose,
                                        batch_size=self.batch_size,
                                        validation_data=(np.array(test_df),
                                                         np.array(test_df)))
        self.history_df = pd.DataFrame(self.hist.history)

        # ADAGE loss is a mean over all features - to make this value more
        # comparable to the VAE reconstruction loss, multiply by num genes
        if adage_comparable_loss:
            self.history_df = self.history_df * self.original_dim

    def compress(self, df):
        """Encode ``df`` and return a DataFrame indexed like ``df`` with columns 1..latent_dim."""
        encoded_df = self.encoder.predict(np.array(df))
        encoded_df = pd.DataFrame(encoded_df, index=df.index, columns=range(1, self.latent_dim + 1))
        return encoded_df

In [6]:
###########################################################
############## Variational Autoencoder (VAE) ##############
###########################################################

class VariationalLayer(Layer):
    """Layer that attaches the VAE loss (reconstruction + beta * KL) to the model.

    Parameters
    ----------
    var_layer, mean_layer : tensors
        Encoded log-variance and mean tensors used in the KL term.
    original_dim : int
        Scale factor applied to the reconstruction loss.
    beta : Keras backend variable or number
        Weight of the KL term.
    loss : str
        ``'binary_crossentropy'`` or ``'mse'``.
    """

    def __init__(self, var_layer, mean_layer, original_dim, beta, loss, **kwargs):
        # NOTE(review): attributes are assigned before Layer.__init__, exactly as
        # in the original; confirm that `beta` is not registered as a trainable
        # weight of this layer in the Keras version in use.
        self.is_placeholder = True
        self.var_layer = var_layer
        self.mean_layer = mean_layer
        self.original_dim = original_dim
        self.beta = beta
        self.loss = loss
        super(VariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x_input, x_decoded):
        """Return the mean of ``recon_loss + beta * kl_loss`` as a tensor."""
        if self.loss == 'binary_crossentropy':
            recon_loss = self.original_dim * \
                         metrics.binary_crossentropy(x_input, x_decoded)
        elif self.loss == 'mse':
            recon_loss = self.original_dim * \
                         metrics.mean_squared_error(x_input, x_decoded)
        else:
            # Previously fell through and failed later on an unbound `recon_loss`.
            raise ValueError(
                "Unsupported loss {!r}; expected 'binary_crossentropy' or "
                "'mse'".format(self.loss))

        kl_loss = - 0.5 * K.sum(1 + self.var_layer -
                                K.square(self.mean_layer) -
                                K.exp(self.var_layer), axis=-1)

        # Use beta symbolically. The original called K.get_value(self.beta)
        # here, which baked the value beta had while the graph was being built
        # into the loss as a constant, so later K.set_value updates (the
        # warm-up) never changed the KL weight.
        return K.mean(recon_loss + (self.beta * kl_loss))

    def call(self, inputs):
        x, x_decoded = inputs
        loss = self.vae_loss(x, x_decoded)
        self.add_loss(loss, inputs=inputs)
        # We won't actually use the output.
        return x


class VAE(BaseModel):
    """Base class for the variational autoencoders defined in this notebook.

    Subclasses provide ``_build_encoder_layer``, ``_build_decoder_layer``,
    ``_compile_vae`` and ``_connect_layers``, and set ``epsilon_std``,
    ``latent_dim``, ``model_name`` and ``encoder``.
    """

    def __init__(self):
        BaseModel.__init__(self)

    def _sampling(self, args):
        # Signature required by the Keras Lambda layer: one argument holding
        # the pair (mean, log variance).
        mu, log_var = args

        # Standard-normal noise with the same shape as the mean tensor
        noise = K.random_normal(shape=tf.shape(mu), mean=0., stddev=self.epsilon_std)

        # Reparameterised sample: random, yet differentiable with respect to
        # the mean and the log variance
        sigma = K.exp(log_var / 2)
        return mu + sigma * noise

    def initialize_model(self):
        """Build encoder and decoder, compile the VAE, then split out sub-models."""
        self._build_encoder_layer()
        self._build_decoder_layer()
        self._compile_vae()
        self._connect_layers()

    def compress(self, df):
        """Encode ``df`` into the latent space and return it as a DataFrame.

        For a model named ``'cTybalt'`` the input is a list ``[rnaseq_df, y_df]``
        and the index is taken from its first element.
        """
        latent = self.encoder.predict_on_batch(df)
        named_index = df[0].index if self.model_name == 'cTybalt' else df.index
        latent_columns = range(1, self.latent_dim + 1)
        return pd.DataFrame(latent, columns=latent_columns, index=named_index)


class Tybalt(VAE):
    """Single-hidden-layer variational autoencoder.

    Parameters
    ----------
    original_dim : int
        Number of input features.
    latent_dim : int
        Size of the latent space.
    batch_size, epochs : int
        Passed to ``fit``.
    learning_rate : float
        Learning rate of the Adam optimizer.
    kappa : float
        Per-epoch increment handed to ``WarmUpCallback``.
    epsilon_std : float
        Standard deviation of the sampling noise.
    beta : Keras backend variable, optional
        KL weight. When omitted, a fresh ``K.variable(0)`` is created for this
        instance.
    loss : str
        Reconstruction loss name forwarded to ``VariationalLayer``.
    verbose : bool
        Passed to ``fit``.
    """

    def __init__(self, original_dim, latent_dim, batch_size=50, epochs=50,
                 learning_rate=0.0005, kappa=1, epsilon_std=1.0,
                 beta=None, loss='binary_crossentropy',
                 verbose=True):
        VAE.__init__(self)
        self.model_name = 'Tybalt'
        self.original_dim = original_dim
        self.latent_dim = latent_dim
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.kappa = kappa
        self.epsilon_std = epsilon_std
        # A `beta=K.variable(0)` default is evaluated once, at definition time,
        # so every instance relying on it shared (and mutated, through the
        # warm-up callback) the same variable. Create one per instance instead.
        self.beta = K.variable(0) if beta is None else beta
        self.loss = loss
        self.verbose = verbose

    def _build_encoder_layer(self):
        # Input place holder for RNAseq data with specific input size
        self.rnaseq_input = Input(shape=(self.original_dim, ))

        # Input layer is compressed into a mean and log variance vector of
        # size `latent_dim`. Each layer is initialized with glorot uniform
        # weights and each step (dense connections, batch norm, and relu
        # activation) are funneled separately.
        # Each vector is connected to the rnaseq input tensor

        # input layer to latent mean layer
        z_mean = Dense(self.latent_dim, kernel_initializer='glorot_uniform')(self.rnaseq_input)
        z_mean_batchnorm = BatchNormalization()(z_mean)
        self.z_mean_encoded = Activation('relu')(z_mean_batchnorm)

        # input layer to latent log variance layer
        z_var = Dense(self.latent_dim, kernel_initializer='glorot_uniform')(self.rnaseq_input)
        z_var_batchnorm = BatchNormalization()(z_var)
        self.z_var_encoded = Activation('relu')(z_var_batchnorm)

        # return the encoded and randomly sampled z vector
        # Takes two keras layers as input to the custom sampling function layer
        self.z = Lambda(self._sampling, output_shape=(self.latent_dim, ))([self.z_mean_encoded, self.z_var_encoded])

    def _build_decoder_layer(self):
        # The decoding layer is much simpler: a single dense layer with
        # sigmoid activation
        self.decoder_model = Sequential()
        self.decoder_model.add(Dense(self.original_dim, activation='sigmoid', input_dim=self.latent_dim))
        self.rnaseq_reconstruct = self.decoder_model(self.z)

    def _compile_vae(self):
        # The loss is attached by VariationalLayer (via add_loss), so the model
        # is compiled with loss=None.
        adam = optimizers.Adam(lr=self.learning_rate)
        vae_layer = VariationalLayer(var_layer=self.z_var_encoded,
                                     mean_layer=self.z_mean_encoded,
                                     original_dim=self.original_dim,
                                     beta=self.beta, loss=self.loss)([self.rnaseq_input, self.rnaseq_reconstruct])
        self.full_model = Model(self.rnaseq_input, vae_layer)
        self.full_model.compile(optimizer=adam, loss=None,
                                loss_weights=[self.beta])

    def _connect_layers(self):
        # Encoder maps the input to the encoded mean (no sampling)
        self.encoder = Model(self.rnaseq_input, self.z_mean_encoded)

        # Stand-alone decoder that reuses the trained decoder layers
        decoder_input = Input(shape=(self.latent_dim, ))
        _x_decoded_mean = self.decoder_model(decoder_input)
        self.decoder = Model(decoder_input, _x_decoded_mean)

    def train_vae(self, train_df, test_df, separate_loss=False):
        """Fit the VAE on ``train_df``; validate on ``test_df``.

        The Keras history is stored in ``self.hist`` and ``self.history_df``.
        With ``separate_loss=True`` a ``LossCallback`` is added and its
        per-epoch values are stored as the ``recon`` and ``kl`` columns.
        """
        cbks = [WarmUpCallback(self.beta, self.kappa)]
        if separate_loss:
            tybalt_loss_cbk = LossCallback(training_data=np.array(train_df), encoder_cbk=self.encoder, decoder_cbk=self.decoder, original_dim=self.original_dim)
            cbks.append(tybalt_loss_cbk)

        self.hist = self.full_model.fit(np.array(train_df),
                                        shuffle=True,
                                        epochs=self.epochs,
                                        batch_size=self.batch_size,
                                        verbose=self.verbose,
                                        validation_data=(np.array(test_df), None),
                                        callbacks=cbks)
        self.history_df = pd.DataFrame(self.hist.history)

        if separate_loss:
            self.history_df = self.history_df.assign(recon=tybalt_loss_cbk.xent_loss)
            self.history_df = self.history_df.assign(kl=tybalt_loss_cbk.kl_loss)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [7]:
# Load the normalized expression table and the sample -> condition/cell-type mapping.
# NOTE(review): the column layout of both CSVs is not visible here; the positional
# slices below (columns[:-1], columns[1:-3]) depend on it - confirm against the files.
common_norm_df = pd.read_csv('/content/drive/MyDrive/aacb_project/datasets/common_normalized.csv', index_col=0)
df_new = pd.read_csv("/content/drive/MyDrive/aacb_project/datasets/control_pcos_celltype_mapping.csv")

# Map every condition label to a binary class: 0 for the `normal` labels, 1 for the `pcos` labels.
# Note: the name `pcos` is rebound to a list of sample ids in a later cell.
normal = ["control", "obese", "pioglitazone", "lean", "valproic"]
pcos = ["PCOS", "PCOS_lean", "PCOS_obese", "PCOS_insulin_res"]
replacement = {}
for i in normal:
    replacement[i] = 0
for i in pcos:
    replacement[i] = 1

# For each sample_id in the expression table, find its row position in the mapping table.
# list.index raises ValueError if a sample_id is missing from the mapping and returns the
# first match if a sample_id appears more than once.
position = []
values = list(df_new["sample_id"])
for i,j in enumerate(common_norm_df["sample_id"]):
    position.append(values.index(j))

# Reorder the mapping rows to follow the expression table, then renumber them 0..n-1.
# presumably df_new has a default RangeIndex so that .loc[position] selects by row number - verify
df_new = df_new.loc[position]
df_new = df_new.reset_index()
df_new = df_new.drop("index", axis=1)

# Join the expression values with every mapping column except the last one.
result = pd.merge(common_norm_df, df_new[df_new.columns[:-1]], how='inner', on='sample_id')
gene_feature_ids = result.columns[1:-3]

# Replace the textual "PCOS/Control" column with a binary "PCOS" column.
# The assignment aligns df_new to result on the row index.
result["PCOS"] = df_new["PCOS/Control"]
result.drop(["PCOS/Control"], axis=1, inplace=True)
result.replace({"PCOS":replacement}, inplace=True)

# Recomputed after the column changes above; this is the value later cells see.
# NOTE(review): assumes the last three columns of `result` are non-gene metadata - confirm.
gene_feature_ids = result.columns[1:-3]

# Fraction of samples held out as a test set in the autoencoder training cells.
test_set_percent = 0.1

In [8]:
# Split the merged table into one DataFrame per cell type.
cell_type_groups = [
    pd.DataFrame(group)
    for _, group in result.groupby(['cell_type'], as_index=False)
]
# Label-only view (class and cell type) of every group.
cell_type_dfs = [group[['PCOS', 'cell_type']] for group in cell_type_groups]

# Count PCOS (1) and control (0) samples within each cell type.
cell_type_pcos_controls = []
for cell_type_df in cell_type_groups:
    class_labels = list(cell_type_df['PCOS'])
    cell_type = list(cell_type_df['cell_type'])[0]
    numPCOS = class_labels.count(1)
    numControl = class_labels.count(0)
    summary_row = pd.DataFrame(
        [[cell_type, numPCOS, numControl]],
        columns=['cell_type', 'PCOS', 'Control'],
    )
    cell_type_pcos_controls.append(summary_row)

cell_type_pcos_controls_df = pd.concat(cell_type_pcos_controls, ignore_index=True)
print(cell_type_pcos_controls_df)

                 cell_type  PCOS  Control
0                  adipose     8        7
1                  cumulus    12       11
2              endothelial     3        4
3               epithelial     4        3
4                granulosa     7        3
5              mesenchymal     3        4
6                  stromal     4        4
7                    theca    10       16
8  vastus_lateralis_muscle    26       46


In [9]:
# Sample ids of each class, taken from the binary PCOS column of `result`.
pcos = result.loc[result["PCOS"] == 1, "sample_id"].tolist()
control = result.loc[result["PCOS"] == 0, "sample_id"].tolist()
for group_label, sample_ids in (("PCOS samples:", pcos), ("Control samples:", control)):
    print(group_label, len(sample_ids))

PCOS samples: 77
Control samples: 98


In [10]:
# Rows of the expression table for each class; the feature block is every
# column except the first and the last one.
X1 = common_norm_df.loc[common_norm_df['sample_id'].isin(pcos)]
X2 = common_norm_df.loc[common_norm_df['sample_id'].isin(control)]

X_pcos_df = X1.loc[:, X1.columns[1:-1]]
X_control_df = X2.loc[:, X2.columns[1:-1]]

X_pcos = X_pcos_df.to_numpy()
X_control = X_control_df.to_numpy()

print(*(table.shape for table in (X_pcos, X_pcos_df, X_control, X_control_df)))

(77, 1667) (77, 1667) (98, 1667) (98, 1667)


In [11]:
# Candidate latent dimensionalities for the sweep.
# The ranges below expand to 28 values in total (see the printed list).

k_list = [
    *range(2, 10),
    *range(10, 20, 2),
    *range(20, 50, 5),
    *range(50, 61, 10),
    78,
    *range(80, 100, 10),
    *range(100, 176, 25),
]

print("Latent dimensions:")
print(k_list)

# Algorithms compared in the sweep.
model_list = ["pca", "ica", "nmf", "dae", "vae"]
print(model_list)

Latent dimensions:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, 50, 60, 78, 80, 90, 100, 125, 150, 175]
['pca', 'ica', 'nmf', 'dae', 'vae']


In [12]:
# One empty {k: weights} dictionary per algorithm, for each sample group.
z_dict_pcos = {algo: {} for algo in model_list}
z_dict_control = {algo: {} for algo in model_list}

In [13]:
# PCA, ICA, NMF Training and Saving Weights
#
# For each group, fit every linear model at every latent size and store the
# component matrix under z_dict_<group>[algo][k]. Sizes that were not fitted
# get a nested list of empty lists instead.

weight_list_pcos = {}
weight_list_control = {}

separator = "=" * 50

print(separator)
print("PCOS group")
for model in ["pca", "ica", "nmf"]:
    bce_p, l2_p, output_p = get_cost_reconstruction(X_pcos, model, k_list)
    weight_list_pcos[model] = output_p
print(separator)

for algo in ['pca', 'ica', 'nmf']:
    fitted = weight_list_pcos[algo]
    for k in k_list:
        z_dict_pcos[algo][k] = (
            fitted[k] if k in fitted
            else [[[] for _ in range(k)] for _ in range(len(gene_feature_ids))]
        )

print("Control group")
for model in ["pca", "ica", "nmf"]:
    bce_c, l2_c, output_c = get_cost_reconstruction(X_control, model, k_list)
    weight_list_control[model] = output_c
print(separator)

for algo in ['pca', 'ica', 'nmf']:
    fitted = weight_list_control[algo]
    for k in k_list:
        z_dict_control[algo][k] = (
            fitted[k] if k in fitted
            else [[[] for _ in range(k)] for _ in range(len(gene_feature_ids))]
        )

PCOS group
Calculating Reconstruction Error for: PCA


100%|██████████| 28/28 [00:01<00:00, 20.88it/s]


Calculating Reconstruction Error for: ICA


100%|██████████| 28/28 [00:02<00:00,  9.95it/s]


Calculating Reconstruction Error for: NMF


100%|██████████| 28/28 [00:28<00:00,  1.03s/it]


Control group
Calculating Reconstruction Error for: PCA


100%|██████████| 28/28 [00:01<00:00, 14.34it/s]


Calculating Reconstruction Error for: ICA


100%|██████████| 28/28 [00:03<00:00,  7.89it/s]


Calculating Reconstruction Error for: NMF


100%|██████████| 28/28 [01:03<00:00,  2.27s/it]




In [14]:
# VAE Training and Saving Weights
#
# For each group, train one Tybalt VAE per latent size and store the first
# weight array of the second decoder layer under z_dict_<group>['vae'][k].
# NOTE(review): the train/test split below is unseeded, so it differs between runs.
# NOTE(review): the tqdm log shows per-model time growing steadily across the
# sweep; consider resetting the Keras session between models.


def _fit_vae_decoder_weights(train_df, test_df, n_samples, latent_dims):
    """Train one Tybalt VAE per usable latent size.

    Latent sizes larger than ``n_samples`` are skipped. Returns a dict mapping
    each trained latent size to ``vae.get_weights()[1]`` (the weight list of
    the second decoder layer).
    """
    weights_by_k = {}
    for latent_dim in tqdm(latent_dims):
        if latent_dim > n_samples:
            continue
        vae = Tybalt(train_df.shape[1], latent_dim, batch_size=50, epochs=200,
                     learning_rate=0.005, kappa=1, epsilon_std=1.0,
                     beta=K.variable(0), loss='binary_crossentropy',
                     verbose=False)
        # Runs the encoder/decoder build, compile and connect steps in order.
        vae.initialize_model()
        vae.train_vae(train_df, test_df, separate_loss=False)
        weights_by_k[latent_dim] = vae.get_weights()[1]
    return weights_by_k


def _vae_placeholder(k):
    """Nested list of empty lists stored for latent sizes that were not trained."""
    return [[[] for _ in range(k)] for _ in range(len(gene_feature_ids))]


# PCOS

original_dim = X_pcos_df.shape[1]
pcos_test_df = X_pcos_df.sample(frac=test_set_percent)
pcos_train_df = X_pcos_df.drop(pcos_test_df.index)

vae_weights_by_k = _fit_vae_decoder_weights(pcos_train_df, pcos_test_df, len(X_pcos_df), k_list)
vae_weights_list = list(vae_weights_by_k.values())

# Look weights up by latent size rather than by list position, so a skipped
# size can never shift the entries that follow it.
for k in k_list:
    if k in vae_weights_by_k:
        z_dict_pcos['vae'][k] = vae_weights_by_k[k][0]
    else:
        z_dict_pcos['vae'][k] = _vae_placeholder(k)


# Control

original_dim = X_control_df.shape[1]
control_test_df = X_control_df.sample(frac=test_set_percent)
control_train_df = X_control_df.drop(control_test_df.index)

vae_weights_by_k2 = _fit_vae_decoder_weights(control_train_df, control_test_df, len(X_control_df), k_list)
vae_weights_list2 = list(vae_weights_by_k2.values())

for k in k_list:
    if k in vae_weights_by_k2:
        z_dict_control['vae'][k] = vae_weights_by_k2[k][0]
    else:
        z_dict_control['vae'][k] = _vae_placeholder(k)

  0%|          | 0/28 [00:00<?, ?it/s]

tracking <tf.Variable 'Variable_1:0' shape=() dtype=float32> beta
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



  4%|▎         | 1/28 [00:03<01:42,  3.80s/it]

tracking <tf.Variable 'Variable_2:0' shape=() dtype=float32> beta


  7%|▋         | 2/28 [00:07<01:41,  3.90s/it]

tracking <tf.Variable 'Variable_3:0' shape=() dtype=float32> beta


 11%|█         | 3/28 [00:12<01:43,  4.12s/it]

tracking <tf.Variable 'Variable_4:0' shape=() dtype=float32> beta


 14%|█▍        | 4/28 [00:17<01:44,  4.37s/it]

tracking <tf.Variable 'Variable_5:0' shape=() dtype=float32> beta


 18%|█▊        | 5/28 [00:22<01:47,  4.67s/it]

tracking <tf.Variable 'Variable_6:0' shape=() dtype=float32> beta


 21%|██▏       | 6/28 [00:28<01:51,  5.05s/it]

tracking <tf.Variable 'Variable_7:0' shape=() dtype=float32> beta


 25%|██▌       | 7/28 [00:35<01:55,  5.49s/it]

tracking <tf.Variable 'Variable_8:0' shape=() dtype=float32> beta


 29%|██▊       | 8/28 [00:42<01:59,  5.97s/it]

tracking <tf.Variable 'Variable_9:0' shape=() dtype=float32> beta


 32%|███▏      | 9/28 [00:49<02:02,  6.43s/it]

tracking <tf.Variable 'Variable_10:0' shape=() dtype=float32> beta


 36%|███▌      | 10/28 [00:57<02:03,  6.86s/it]

tracking <tf.Variable 'Variable_11:0' shape=() dtype=float32> beta


 39%|███▉      | 11/28 [01:06<02:05,  7.36s/it]

tracking <tf.Variable 'Variable_12:0' shape=() dtype=float32> beta


 43%|████▎     | 12/28 [01:15<02:05,  7.87s/it]

tracking <tf.Variable 'Variable_13:0' shape=() dtype=float32> beta


 46%|████▋     | 13/28 [01:25<02:05,  8.39s/it]

tracking <tf.Variable 'Variable_14:0' shape=() dtype=float32> beta


 50%|█████     | 14/28 [01:35<02:04,  8.92s/it]

tracking <tf.Variable 'Variable_15:0' shape=() dtype=float32> beta


 54%|█████▎    | 15/28 [01:45<02:02,  9.42s/it]

tracking <tf.Variable 'Variable_16:0' shape=() dtype=float32> beta


 57%|█████▋    | 16/28 [01:56<01:59,  9.92s/it]

tracking <tf.Variable 'Variable_17:0' shape=() dtype=float32> beta


 61%|██████    | 17/28 [02:08<01:56, 10.58s/it]

tracking <tf.Variable 'Variable_18:0' shape=() dtype=float32> beta


 64%|██████▍   | 18/28 [02:21<01:51, 11.17s/it]

tracking <tf.Variable 'Variable_19:0' shape=() dtype=float32> beta


 68%|██████▊   | 19/28 [02:34<01:46, 11.84s/it]

tracking <tf.Variable 'Variable_20:0' shape=() dtype=float32> beta


 71%|███████▏  | 20/28 [02:48<01:39, 12.47s/it]

tracking <tf.Variable 'Variable_21:0' shape=() dtype=float32> beta


100%|██████████| 28/28 [03:03<00:00,  6.57s/it]
  0%|          | 0/28 [00:00<?, ?it/s]

tracking <tf.Variable 'Variable_22:0' shape=() dtype=float32> beta


  4%|▎         | 1/28 [00:13<06:12, 13.81s/it]

tracking <tf.Variable 'Variable_23:0' shape=() dtype=float32> beta


  7%|▋         | 2/28 [00:28<06:02, 13.95s/it]

tracking <tf.Variable 'Variable_24:0' shape=() dtype=float32> beta


 11%|█         | 3/28 [00:42<05:54, 14.17s/it]

tracking <tf.Variable 'Variable_25:0' shape=() dtype=float32> beta


 14%|█▍        | 4/28 [00:58<05:49, 14.55s/it]

tracking <tf.Variable 'Variable_26:0' shape=() dtype=float32> beta


 18%|█▊        | 5/28 [01:14<05:44, 14.96s/it]

tracking <tf.Variable 'Variable_27:0' shape=() dtype=float32> beta


 21%|██▏       | 6/28 [01:30<05:37, 15.34s/it]

tracking <tf.Variable 'Variable_28:0' shape=() dtype=float32> beta


 25%|██▌       | 7/28 [01:47<05:32, 15.84s/it]

tracking <tf.Variable 'Variable_29:0' shape=() dtype=float32> beta


 29%|██▊       | 8/28 [02:04<05:26, 16.35s/it]

tracking <tf.Variable 'Variable_30:0' shape=() dtype=float32> beta


 32%|███▏      | 9/28 [02:22<05:19, 16.80s/it]

tracking <tf.Variable 'Variable_31:0' shape=() dtype=float32> beta


 36%|███▌      | 10/28 [02:41<05:11, 17.28s/it]

tracking <tf.Variable 'Variable_32:0' shape=() dtype=float32> beta


 39%|███▉      | 11/28 [03:00<05:02, 17.79s/it]

tracking <tf.Variable 'Variable_33:0' shape=() dtype=float32> beta


 43%|████▎     | 12/28 [03:19<04:53, 18.34s/it]

tracking <tf.Variable 'Variable_34:0' shape=() dtype=float32> beta


 46%|████▋     | 13/28 [03:39<04:42, 18.85s/it]

tracking <tf.Variable 'Variable_35:0' shape=() dtype=float32> beta


 50%|█████     | 14/28 [04:00<04:30, 19.33s/it]

tracking <tf.Variable 'Variable_36:0' shape=() dtype=float32> beta


 54%|█████▎    | 15/28 [04:21<04:19, 19.94s/it]

tracking <tf.Variable 'Variable_37:0' shape=() dtype=float32> beta


 57%|█████▋    | 16/28 [04:43<04:05, 20.44s/it]

tracking <tf.Variable 'Variable_38:0' shape=() dtype=float32> beta


 61%|██████    | 17/28 [05:06<03:52, 21.17s/it]

tracking <tf.Variable 'Variable_39:0' shape=() dtype=float32> beta


 64%|██████▍   | 18/28 [05:29<03:39, 21.99s/it]

tracking <tf.Variable 'Variable_40:0' shape=() dtype=float32> beta


 68%|██████▊   | 19/28 [05:54<03:24, 22.74s/it]

tracking <tf.Variable 'Variable_41:0' shape=() dtype=float32> beta


 71%|███████▏  | 20/28 [06:19<03:08, 23.53s/it]

tracking <tf.Variable 'Variable_42:0' shape=() dtype=float32> beta


 75%|███████▌  | 21/28 [06:46<02:50, 24.34s/it]

tracking <tf.Variable 'Variable_43:0' shape=() dtype=float32> beta


 79%|███████▊  | 22/28 [07:13<02:31, 25.21s/it]

tracking <tf.Variable 'Variable_44:0' shape=() dtype=float32> beta


 82%|████████▏ | 23/28 [07:41<02:10, 26.00s/it]

tracking <tf.Variable 'Variable_45:0' shape=() dtype=float32> beta


100%|██████████| 28/28 [08:09<00:00, 17.48s/it]


In [15]:
# DAE Training and Saving Weights
#
# For each group, train one tied-weights ADAGE model per latent size and store
# the transposed first weight array of the second decoder layer under
# z_dict_<group>['dae'][k].
# NOTE(review): the train/test split below is unseeded, so it differs between runs.


def _fit_dae_decoder_weights(train_df, test_df, n_samples, latent_dims):
    """Train one tied-weights Adage model per usable latent size.

    Latent sizes larger than ``n_samples`` are skipped. Returns a dict mapping
    each trained latent size to ``dae.get_weights()[1]`` (the weight list of
    the second decoder layer).
    """
    weights_by_k = {}
    for latent_dim in tqdm(latent_dims):
        if latent_dim > n_samples:
            continue
        dae = Adage(train_df.shape[1], latent_dim, noise=0.05, batch_size=50,
                    epochs=100, sparsity=0, learning_rate=0.0005, loss='mse',
                    optimizer='adam', tied_weights=True, verbose=False)
        # initialize_model() already builds, connects and compiles the
        # tied-weights graph; the extra manual build calls that used to precede
        # it only created a model that was immediately discarded.
        dae.initialize_model()
        dae.train_adage(train_df, test_df)
        weights_by_k[latent_dim] = dae.get_weights()[1]
    return weights_by_k


def _dae_placeholder(k):
    """Nested list of empty lists stored for latent sizes that were not trained."""
    return [[[] for _ in range(k)] for _ in range(len(gene_feature_ids))]


# PCOS

original_dim = X_pcos_df.shape[1]
pcos_test_df = X_pcos_df.sample(frac=test_set_percent)
pcos_train_df = X_pcos_df.drop(pcos_test_df.index)

dae_weights_by_k = _fit_dae_decoder_weights(pcos_train_df, pcos_test_df, len(X_pcos_df), k_list)
dae_weights_list = list(dae_weights_by_k.values())

# Look weights up by latent size rather than by list position, so a skipped
# size can never shift the entries that follow it.
for k in k_list:
    if k in dae_weights_by_k:
        z_dict_pcos['dae'][k] = dae_weights_by_k[k][0].transpose()
    else:
        z_dict_pcos['dae'][k] = _dae_placeholder(k)


# Control

original_dim = X_control_df.shape[1]
control_test_df = X_control_df.sample(frac=test_set_percent)
control_train_df = X_control_df.drop(control_test_df.index)

# Train on the control split. The original passed pcos_train_df/pcos_test_df
# here, so the "control" weights were actually learned from PCOS samples.
dae_weights_by_k2 = _fit_dae_decoder_weights(control_train_df, control_test_df, len(X_control_df), k_list)
dae_weights_list2 = list(dae_weights_by_k2.values())

for k in k_list:
    if k in dae_weights_by_k2:
        z_dict_control['dae'][k] = dae_weights_by_k2[k][0].transpose()
    else:
        z_dict_control['dae'][k] = _dae_placeholder(k)

100%|██████████| 28/28 [04:50<00:00, 10.36s/it]
100%|██████████| 28/28 [06:39<00:00, 14.25s/it]


In [16]:
# Persist both {algo: {k: weights}} dictionaries as pickle files on Drive.
for pickle_path, payload in (
    ('/content/drive/MyDrive/aacb_project/datasets/z_dict_pcos.p', z_dict_pcos),
    ('/content/drive/MyDrive/aacb_project/datasets/z_dict_control.p', z_dict_control),
):
    with open(pickle_path, 'wb') as handle:
        pkl.dump(payload, handle)