In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, leaf)
    for folder, _, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf

# Show which TensorFlow release this kernel is running.
print(tf.__version__)


In [None]:
import tensorflow as tf

class Autoencoder(object):
    """Fully connected autoencoder built from raw ``tf.Variable`` weights.

    The encoder maps ``n_input -> n_hidden1 -> n_hidden2 -> n_hidden3`` and the
    decoder maps ``n_hidden3 -> n_hidden2 -> n_hidden1 -> n_input``. Every layer
    applies ``transfer_function`` except the output layer, which applies a sigmoid.

    Args:
        n_input: Number of input features.
        n_hidden1: Width of the first encoder layer (and second decoder layer).
        n_hidden2: Width of the second encoder layer (and first decoder layer).
        n_hidden3: Width of the encoder output.
        transfer_function: Activation applied after each hidden layer.
        optimizer: Optimizer used by ``partial_fit``. When ``None`` (default) a
            fresh ``tf.keras.optimizers.Adam()`` is created for this instance.
    """

    def __init__(self, n_input, n_hidden1, n_hidden2, n_hidden3, transfer_function=tf.nn.softplus, optimizer=None):
        self.n_input = n_input
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.transfer = transfer_function

        # Define encoder weights and biases
        self.W1 = tf.Variable(tf.random.normal([self.n_input, self.n_hidden1], dtype=tf.float32))
        self.b1 = tf.Variable(tf.zeros([self.n_hidden1], dtype=tf.float32))
        self.W2 = tf.Variable(tf.random.normal([self.n_hidden1, self.n_hidden2], dtype=tf.float32))
        self.b2 = tf.Variable(tf.zeros([self.n_hidden2], dtype=tf.float32))
        self.W3 = tf.Variable(tf.random.normal([self.n_hidden2, self.n_hidden3], dtype=tf.float32))
        self.b3 = tf.Variable(tf.zeros([self.n_hidden3], dtype=tf.float32))

        # Define decoder weights and biases
        self.W4 = tf.Variable(tf.random.normal([self.n_hidden3, self.n_hidden2], dtype=tf.float32))
        self.b4 = tf.Variable(tf.zeros([self.n_hidden2], dtype=tf.float32))
        self.W5 = tf.Variable(tf.random.normal([self.n_hidden2, self.n_hidden1], dtype=tf.float32))
        self.b5 = tf.Variable(tf.zeros([self.n_hidden1], dtype=tf.float32))
        self.W6 = tf.Variable(tf.random.normal([self.n_hidden1, self.n_input], dtype=tf.float32))
        self.b6 = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32))

        # A default written as `optimizer=Adam()` in the signature is evaluated once,
        # when the class is defined, so every instance would share one optimizer
        # (and its per-variable state). Build the default per instance instead.
        self.optimizer = optimizer if optimizer is not None else tf.keras.optimizers.Adam()

    def encoder(self, x):
        """Return the ``n_hidden3``-wide encoding of the batch ``x``."""
        hidden1 = self.transfer(tf.add(tf.matmul(x, self.W1), self.b1))
        hidden2 = self.transfer(tf.add(tf.matmul(hidden1, self.W2), self.b2))
        hidden3 = self.transfer(tf.add(tf.matmul(hidden2, self.W3), self.b3))
        return hidden3

    def decoder(self, encoded):
        """Map an encoding back to input space; the output passes through a sigmoid."""
        hidden1 = self.transfer(tf.add(tf.matmul(encoded, self.W4), self.b4))
        hidden2 = self.transfer(tf.add(tf.matmul(hidden1, self.W5), self.b5))
        reconstruction = tf.nn.sigmoid(tf.add(tf.matmul(hidden2, self.W6), self.b6))
        return reconstruction

    def call(self, x):
        """Encode then decode ``x`` and return the reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def compute_loss(self, x):
        """Return the mean squared reconstruction error over all elements of ``x``."""
        reconstruction = self.call(x)
        loss = tf.reduce_mean(tf.square(x - reconstruction))
        return loss

    def partial_fit(self, X):
        """Run one optimizer step on the batch ``X`` and return the pre-step loss tensor."""
        with tf.GradientTape() as tape:
            cost = self.compute_loss(X)
        trainable_vars = [self.W1, self.b1, self.W2, self.b2, self.W3, self.b3,
                          self.W4, self.b4, self.W5, self.b5, self.W6, self.b6]
        grads = tape.gradient(cost, trainable_vars)
        self.optimizer.apply_gradients(zip(grads, trainable_vars))
        return cost


In [None]:
import tensorflow as tf
import numpy as np

class MaskingNoiseAutoencoder(object):
    """Single-hidden-layer denoising autoencoder with masking (dropout) noise.

    Implemented with eager TensorFlow 2 operations. The previous version mixed
    TF1 idioms (``self.sess.run``, placeholder-style inputs) with a Keras
    functional model: ``self.sess`` was never created, the ``keep_prob`` input had
    no shape, and a Python scalar was fed as a model input. The public methods
    keep their names and arguments.

    Args:
        n_input: Number of input features.
        n_hidden: Width of the hidden layer.
        transfer_function: Activation applied to the hidden layer.
        optimizer: Optimizer used by ``partial_fit``. When ``None`` (default) a
            fresh ``tf.keras.optimizers.Adam()`` is created for this instance.
        dropout_probability: Despite the name, this is the probability of KEEPING
            an input during training: the dropout rate is ``1 - dropout_probability``.
            It must be greater than 0 so the resulting rate stays below 1.
    """

    def __init__(self, n_input, n_hidden, transfer_function=tf.nn.softplus, optimizer=None,
                 dropout_probability=0.95):
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.transfer = transfer_function
        self.dropout_probability = dropout_probability

        self.weights = self._initialize_weights()

        # Build the default optimizer per instance; a default instance in the
        # signature would be shared by every autoencoder created without one.
        self.optimizer = optimizer if optimizer is not None else tf.keras.optimizers.Adam()

    def _initialize_weights(self):
        """Create the weights: ``w1`` is random normal; ``b1``, ``w2`` and ``b2`` start at zero."""
        all_weights = {}
        all_weights['w1'] = tf.Variable(tf.random.normal([self.n_input, self.n_hidden], dtype=tf.float32))
        all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
        all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
        all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32))
        return all_weights

    @staticmethod
    def _as_float32(X):
        """Convert array-like input to a float32 tensor so it matches the weights' dtype."""
        return tf.cast(tf.convert_to_tensor(X), tf.float32)

    def _encode(self, x, keep_prob):
        """Apply masking noise with the given keep probability, then the hidden layer."""
        noisy = tf.nn.dropout(x, rate=1.0 - keep_prob)
        return self.transfer(tf.add(tf.matmul(noisy, self.weights['w1']), self.weights['b1']))

    def _decode(self, hidden):
        """Linear decoder from hidden space back to input space."""
        return tf.add(tf.matmul(hidden, self.weights['w2']), self.weights['b2'])

    def _mse(self, x, keep_prob):
        """Mean squared error between ``x`` and its reconstruction from a noisy copy."""
        reconstruction = self._decode(self._encode(x, keep_prob))
        return tf.reduce_mean(tf.square(reconstruction - x))

    def partial_fit(self, X):
        """Run one optimizer step on batch ``X`` (with masking noise); return the MSE as a float."""
        x = self._as_float32(X)
        trainable_vars = list(self.weights.values())
        with tf.GradientTape() as tape:
            cost = self._mse(x, self.dropout_probability)
        grads = tape.gradient(cost, trainable_vars)
        self.optimizer.apply_gradients(zip(grads, trainable_vars))
        return float(cost)

    def calc_total_cost(self, X):
        """Return the noise-free reconstruction MSE on ``X`` as a float."""
        return float(self._mse(self._as_float32(X), 1.0))

    def transform(self, X):
        """Return the noise-free hidden representation of ``X`` as a NumPy array."""
        return self._encode(self._as_float32(X), 1.0).numpy()

    def generate(self, hidden=None):
        """Decode ``hidden``; when omitted, decode one standard-normal hidden vector."""
        if hidden is None:
            hidden = np.random.normal(size=[1, self.n_hidden])
        return self._decode(self._as_float32(hidden)).numpy()

    def reconstruct(self, X):
        """Return the noise-free reconstruction of ``X`` as a NumPy array."""
        x = self._as_float32(X)
        return self._decode(self._encode(x, 1.0)).numpy()

    def getWeights(self):
        """Return the encoder weight matrix ``w1`` as a NumPy array."""
        return self.weights['w1'].numpy()

    def getBiases(self):
        """Return the encoder bias vector ``b1`` as a NumPy array."""
        return self.weights['b1'].numpy()


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.preprocessing as prep
import csv

class Autoencoder(tf.keras.Model):
    """Keras autoencoder with three hidden Dense layers and a linear output layer.

    The forward pass is ``Dense(n_hidden1) -> Dense(n_hidden2) -> Dense(n_hidden3)
    -> Dense(n_input)``; the hidden layers use ``transfer_function`` and the output
    layer has no activation.

    Args:
        n_input: Number of input (and output) features.
        n_hidden1: Units in the first hidden layer.
        n_hidden2: Units in the second hidden layer.
        n_hidden3: Units in the third hidden layer.
        transfer_function: Activation for the hidden layers.
        optimizer: Optimizer stored on ``self.optimizer`` for custom training
            loops. When ``None`` (default) a fresh ``tf.keras.optimizers.Adam()``
            is created for this instance.
    """

    def __init__(self, n_input, n_hidden1, n_hidden2, n_hidden3, transfer_function=tf.nn.softplus,
                 optimizer=None):
        super().__init__()
        self.n_input = n_input
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.transfer = transfer_function

        # Encoder layers
        self.hidden1 = tf.keras.layers.Dense(self.n_hidden1, activation=self.transfer)
        self.hidden2 = tf.keras.layers.Dense(self.n_hidden2, activation=self.transfer)
        self.hidden3 = tf.keras.layers.Dense(self.n_hidden3, activation=self.transfer)

        # Decoder layers
        self.reconstruction = tf.keras.layers.Dense(self.n_input)

        # Optimizer: created per instance. A default instance in the signature is
        # evaluated once at class-definition time and would be shared (with its
        # per-variable state) by every model built without an explicit optimizer.
        self.optimizer = optimizer if optimizer is not None else tf.keras.optimizers.Adam()

    def call(self, inputs):
        """Return the reconstruction of ``inputs``."""
        # Encoder
        encoded = self.hidden1(inputs)
        encoded = self.hidden2(encoded)
        encoded = self.hidden3(encoded)

        # Decoder
        reconstructed = self.reconstruction(encoded)
        return reconstructed

# Helper function to standard scale data
def standard_scale(X_train, X_test):
    """Standardize both inputs with a StandardScaler fitted on ``X_train`` only.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = prep.StandardScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

# Helper function to get random block from data
def get_random_block_from_data(data, batch_size):
    """Return a contiguous block of ``batch_size`` rows starting at a random offset.

    Args:
        data: Tensor (or array-like accepted by ``tf.shape``) sliced along axis 0.
        batch_size: Number of consecutive rows to return; expected to be at most
            the number of rows in ``data``.

    Returns:
        ``data[start:start + batch_size]`` for a start index drawn uniformly from
        ``0 .. n_rows - batch_size`` inclusive.
    """
    n_rows = tf.shape(data)[0]
    # `maxval` is exclusive for integer dtypes, so add 1: otherwise the last valid
    # start (n_rows - batch_size) is never drawn, and batch_size == n_rows yields an
    # empty range [0, 0) instead of the single valid start 0.
    start_index = tf.random.uniform(shape=[], minval=0, maxval=n_rows - batch_size + 1, dtype=tf.int32)
    return data[start_index:(start_index + batch_size)]

# Paths and data loading (adjust paths accordingly)
path1 = '/kaggle/input/ace-dataset/features/'
# Trimming differs per table: `iloc[:, 1:]` drops only the first column,
# `iloc[1:, 1:]` drops the first row as well, and `iloc[:, :]` keeps everything.
# NOTE(review): presumably the dropped column is an ID/index and the dropped row is a
# stray header line -- confirm against the CSVs. np.column_stack below requires every
# table to end up with the same number of rows.
AAC = pd.read_csv(path1 + 'ACE_AAC.csv').iloc[:, 1:]
ASDC = pd.read_csv(path1 + 'ACE_ASDC.csv').iloc[:, 1:]
OPF_7bit_type_1 = pd.read_csv(path1 + 'opf_7bit_type_1_features.csv').iloc[1:, 1:]
OPF_7bit_type_2 = pd.read_csv(path1 + 'opf_7bit_type_2_features.csv').iloc[1:, 1:]
OPF_7bit_type_3 = pd.read_csv(path1 + 'opf_7bit_type_3_features.csv').iloc[1:, 1:]
OPF_10bit = pd.read_csv(path1 + 'opf_10bit_features.csv').iloc[1:, 1:]
esmv1 = pd.read_csv(path1 + 'esmv1_feat_ACE.csv').iloc[:, :]
esm2 = pd.read_csv(path1 + 'esm2_t6_8M_feat_ACE.csv').iloc[:, :]
prot_t5 = pd.read_csv(path1 + 'ACE_embeddings_prot_t5_xl_bfd.csv').iloc[1:, 1:]

# Concatenate all features into a single array (one row per sample)
all_feat = np.column_stack((AAC, ASDC, OPF_7bit_type_1, OPF_7bit_type_2, OPF_7bit_type_3, OPF_10bit, esmv1, esm2, prot_t5))

# Standard scale the data (the second return value is the same data and is discarded)
X_train, _ = standard_scale(all_feat, all_feat)

# Define parameters
# The batch is one row short of the full data set: get_random_block_from_data draws
# its start index from [0, n_rows - batch_size), which must be a non-empty range.
bs = X_train.shape[0] - 1
num = X_train.shape[1]
n_samples, _ = np.shape(X_train)
training_epochs = 1000
batch_size = bs
display_step = 1

if batch_size < 1:
    raise ValueError("Need at least two samples: batch_size is n_samples - 1")

# Number of batches per epoch; constant, so computed once instead of every epoch.
total_batch = int(n_samples / batch_size)

# Initialize and train autoencoder
autoencoder = Autoencoder(
    n_input=num,
    n_hidden1=800,
    n_hidden2=200,
    n_hidden3=800,
    transfer_function=tf.nn.softplus,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# Convert X_train to TensorFlow tensor
X_train_tf = tf.convert_to_tensor(X_train, dtype=tf.float32)

for epoch in range(training_epochs):
    avg_cost = 0.

    # Loop over all batches
    for i in range(total_batch):
        batch_xs = get_random_block_from_data(X_train_tf, batch_size)

        # Fit training using batch data
        with tf.GradientTape() as tape:
            reconstruction = autoencoder(batch_xs)
            loss = tf.reduce_mean(tf.square(batch_xs - reconstruction))

        gradients = tape.gradient(loss, autoencoder.trainable_variables)
        autoencoder.optimizer.apply_gradients(zip(gradients, autoencoder.trainable_variables))

        # Accumulate the batch loss weighted by the fraction of samples it covers
        avg_cost += loss / n_samples * batch_size

    # Display logs per epoch step
    if epoch % display_step == 0:
        # float() accepts both the eager tensor and the initial Python float
        print("Epoch:", '%d,' % (epoch + 1),
              "Cost:", "{:.9f}".format(float(avg_cost)))

# Compile the model so the built-in evaluate()/predict() can be used.
# This replaces `autoencoder.optimizer`; no further training happens below.
autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')  # Adjust optimizer and loss function as needed

# Evaluate the loss on the training set
loss = autoencoder.evaluate(X_train_tf, X_train_tf)

print("Total loss: ", loss)

# Reconstruct the training data (the variable name says "test", but this is X_train)
X_test_reconstruct = autoencoder.predict(X_train_tf)

# Save reconstructed data to CSV, one sample per row
with open('ting_auto.csv', 'w', newline='') as fout:
    writer = csv.writer(fout, delimiter=',')
    writer.writerows(X_test_reconstruct)



In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.preprocessing as prep
import csv

class Autoencoder(tf.keras.Model):
    """Keras autoencoder built from a three-layer encoder and a three-layer decoder.

    Encoder: ``Dense(n_hidden1) -> Dense(n_hidden2) -> Dense(n_hidden3)``.
    Decoder: ``Dense(n_hidden2) -> Dense(n_hidden1) -> Dense(n_input, linear)``.
    Note that the encoder's output width is ``n_hidden3``; ``n_hidden2`` is only
    the narrowest layer when ``n_hidden3`` is larger than it.

    Args:
        n_input: Number of input (and output) features.
        n_hidden1: Units in the outer hidden layers.
        n_hidden2: Units in the middle hidden layers; must be smaller than ``n_input``.
        n_hidden3: Units in the encoder's last layer.
        transfer_function: Activation for all hidden layers.
        optimizer: Optimizer stored on ``self.optimizer`` for custom training
            loops. When ``None`` (default) a fresh ``tf.keras.optimizers.Adam()``
            is created for this instance.

    Raises:
        AssertionError: If ``n_hidden2`` is not smaller than ``n_input``.
    """

    def __init__(self, n_input, n_hidden1, n_hidden2, n_hidden3, transfer_function=tf.nn.softplus,
                 optimizer=None):
        super().__init__()
        self.n_input = n_input
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3
        self.transfer = transfer_function

        # Ensure n_hidden2 is smaller than n_input
        assert n_hidden2 < n_input, "n_hidden2 must be smaller than n_input"

        # Encoder layers
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Dense(self.n_hidden1, activation=self.transfer),
            tf.keras.layers.Dense(self.n_hidden2, activation=self.transfer),
            tf.keras.layers.Dense(self.n_hidden3, activation=self.transfer)
        ])

        # Decoder layers
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(self.n_hidden2, activation=self.transfer),
            tf.keras.layers.Dense(self.n_hidden1, activation=self.transfer),
            tf.keras.layers.Dense(self.n_input, activation='linear')
        ])

        # Optimizer: created per instance. A default instance in the signature is
        # evaluated once at class-definition time and would be shared (with its
        # per-variable state) by every model built without an explicit optimizer.
        self.optimizer = optimizer if optimizer is not None else tf.keras.optimizers.Adam()

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encoder followed by decoder)."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

# Helper function to standard scale data
def standard_scale(X_train, X_test):
    """Fit a StandardScaler on ``X_train`` and apply it to both inputs.

    Returns:
        Tuple of the scaled training data and the scaled test data, in that order.
    """
    fitted_scaler = prep.StandardScaler().fit(X_train)
    scaled_train, scaled_test = (fitted_scaler.transform(part) for part in (X_train, X_test))
    return scaled_train, scaled_test

# Paths and data loading (adjust paths accordingly)
path1 = '/kaggle/input/ace-dataset/features/'
# Trimming differs per table: `iloc[:, 1:]` drops only the first column,
# `iloc[1:, 1:]` drops the first row as well, and `iloc[:, :]` keeps everything.
# NOTE(review): presumably the dropped column is an ID/index and the dropped row is a
# stray header line -- confirm against the CSVs. np.column_stack below requires every
# table to end up with the same number of rows.
AAC = pd.read_csv(path1 + 'ACE_AAC.csv').iloc[:, 1:]
ASDC = pd.read_csv(path1 + 'ACE_ASDC.csv').iloc[:, 1:]
OPF_7bit_type_1 = pd.read_csv(path1 + 'opf_7bit_type_1_features.csv').iloc[1:, 1:]
OPF_7bit_type_2 = pd.read_csv(path1 + 'opf_7bit_type_2_features.csv').iloc[1:, 1:]
OPF_7bit_type_3 = pd.read_csv(path1 + 'opf_7bit_type_3_features.csv').iloc[1:, 1:]
OPF_10bit = pd.read_csv(path1 + 'opf_10bit_features.csv').iloc[1:, 1:]
esmv1 = pd.read_csv(path1 + 'esmv1_feat_ACE.csv').iloc[:, :]
esm2 = pd.read_csv(path1 + 'esm2_t6_8M_feat_ACE.csv').iloc[:, :]
prot_t5 = pd.read_csv(path1 + 'ACE_embeddings_prot_t5_xl_bfd.csv').iloc[1:, 1:]

# Concatenate all features into a single array (one row per sample)
all_feat = np.column_stack((AAC, ASDC, OPF_7bit_type_1, OPF_7bit_type_2, OPF_7bit_type_3, OPF_10bit, esmv1, esm2, prot_t5))

# Standard scale the data (the second return value is the same data and is discarded)
X_train, _ = standard_scale(all_feat, all_feat)

# Define parameters
num_features = X_train.shape[1]
n_samples, _ = np.shape(X_train)
training_epochs = 1000
# Full-batch training. The previous value (n_samples - 1) combined with the
# sequential slicing below meant the last row was never used for training.
batch_size = n_samples
display_step = 1

# Ceiling division so a trailing partial batch is still visited if batch_size
# is later set to a value that does not divide n_samples.
total_batch = (n_samples + batch_size - 1) // batch_size

# Initialize and train autoencoder
autoencoder = Autoencoder(
    n_input=num_features,
    n_hidden1=800,
    n_hidden2=200,
    n_hidden3=800,
    transfer_function=tf.nn.softplus,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# Convert X_train to TensorFlow tensor
X_train_tf = tf.convert_to_tensor(X_train, dtype=tf.float32)

for epoch in range(training_epochs):
    avg_cost = 0.

    # Loop over all batches
    for i in range(total_batch):
        batch_xs = X_train_tf[i * batch_size:(i + 1) * batch_size]

        # Fit training using batch data
        with tf.GradientTape() as tape:
            reconstruction = autoencoder(batch_xs)
            loss = tf.reduce_mean(tf.square(batch_xs - reconstruction))

        gradients = tape.gradient(loss, autoencoder.trainable_variables)
        autoencoder.optimizer.apply_gradients(zip(gradients, autoencoder.trainable_variables))

        # Weight each batch loss by the number of rows it actually contains
        avg_cost += loss * int(batch_xs.shape[0]) / n_samples

    # Display logs per epoch step
    if epoch % display_step == 0:
        print("Epoch:", '%d,' % (epoch + 1),
              "Cost:", "{:.9f}".format(float(avg_cost)))

# Compile the model so the built-in evaluate()/predict() can be used.
# This replaces `autoencoder.optimizer`; no further training happens below.
autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# Evaluate the loss on the training set
loss = autoencoder.evaluate(X_train_tf, X_train_tf)
print("Total loss: ", loss)

# Reconstruct the training data (the variable name says "test", but this is X_train)
X_test_reconstruct = autoencoder.predict(X_train_tf)

# Save reconstructed data to CSV, one sample per row
with open('ting_auto.csv', 'w', newline='') as fout:
    writer = csv.writer(fout, delimiter=',')
    writer.writerows(X_test_reconstruct)


2024-06-30 08:48:09.521747: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 08:48:09.521884: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 08:48:09.676266: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch: 1, Cost: 1.191434026
Epoch: 2, Cost: 1.040842652
Epoch: 3, Cost: 1.022527575
Epoch: 4, Cost: 1.016745567
Epoch: 5, Cost: 1.004657269
Epoch: 6, Cost: 0.987540066
Epoch: 7, Cost: 0.973439813
Epoch: 8, Cost: 0.963098109
Epoch: 9, Cost: 0.955717266
Epoch: 10, Cost: 0.950278699
Epoch: 11, Cost: 0.945162535
Epoch: 12, Cost: 0.938891172
Epoch: 13, Cost: 0.931070387
Epoch: 14, Cost: 0.920981169
Epoch: 15, Cost: 0.909448683
Epoch: 16, Cost: 0.897775710
Epoch: 17, Cost: 0.886345387
Epoch: 18, Cost: 0.874743521
Epoch: 19, Cost: 0.863596737
Epoch: 20, Cost: 0.853292584
Epoch: 21, Cost: 0.843355954
Epoch: 22, Cost: 0.833392143
Epoch: 23, Cost: 0.823311210
Epoch: 24, Cost: 0.812482119
Epoch: 25, Cost: 0.800881565
Epoch: 26, Cost: 0.788570225
Epoch: 27, Cost: 0.775662363
Epoch: 28, Cost: 0.762386322
Epoch: 29, Cost: 0.749638319
Epoch: 30, Cost: 0.736678123
Epoch: 31, Cost: 0.720449626
Epoch: 32, Cost: 0.705861747
Epoch: 33, Cost: 0.694787860
Epoch: 34, Cost: 0.681813061
Epoch: 35, Cost: 0.6675

I0000 00:00:1719737362.941008     118 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1719737362.954956     118 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.0429  
Total loss:  0.04339296743273735
[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7s[0m 242ms/step

W0000 00:00:1719737363.529856     118 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step


In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.preprocessing as prep
import csv

class Autoencoder(tf.keras.Model):
    """Keras autoencoder with a two-layer encoder and a two-layer decoder.

    Encoder: ``Dense(n_hidden1) -> Dense(n_hidden2)``.
    Decoder: ``Dense(n_hidden1) -> Dense(n_input, linear)``.

    Args:
        n_input: Number of input (and output) features.
        n_hidden1: Units in the outer hidden layers.
        n_hidden2: Units in the encoder output; must be smaller than ``n_input``.
        n_hidden3: Stored on the instance but not used by any layer in this class.
        transfer_function: Activation for all hidden layers.
        optimizer: Optimizer stored on ``self.optimizer`` for custom training
            loops. When ``None`` (default) a fresh ``tf.keras.optimizers.Adam()``
            is created for this instance.

    Raises:
        AssertionError: If ``n_hidden2`` is not smaller than ``n_input``.
    """

    def __init__(self, n_input, n_hidden1, n_hidden2, n_hidden3, transfer_function=tf.nn.softplus,
                 optimizer=None):
        super().__init__()
        self.n_input = n_input
        self.n_hidden1 = n_hidden1
        self.n_hidden2 = n_hidden2
        self.n_hidden3 = n_hidden3  # kept for signature compatibility; no layer uses it
        self.transfer = transfer_function

        # Ensure n_hidden2 is smaller than n_input
        assert n_hidden2 < n_input, "n_hidden2 must be smaller than n_input"

        # Encoder layers
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Dense(self.n_hidden1, activation=self.transfer),
            tf.keras.layers.Dense(self.n_hidden2, activation=self.transfer)
        ])

        # Decoder layers
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(self.n_hidden1, activation=self.transfer),
            tf.keras.layers.Dense(self.n_input, activation='linear')
        ])

        # Optimizer: created per instance. A default instance in the signature is
        # evaluated once at class-definition time and would be shared (with its
        # per-variable state) by every model built without an explicit optimizer.
        self.optimizer = optimizer if optimizer is not None else tf.keras.optimizers.Adam()

    def call(self, inputs):
        """Return the reconstruction of ``inputs`` (encoder followed by decoder)."""
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        return decoded

# Helper function to standard scale data
def standard_scale(X_train, X_test):
    """Scale ``X_train`` and ``X_test`` using statistics learned from ``X_train``.

    A StandardScaler is fitted on the training data and then applied to each input.

    Returns:
        Two-element tuple: scaled training data, scaled test data.
    """
    train_fitted = prep.StandardScaler().fit(X_train)
    return tuple(train_fitted.transform(dataset) for dataset in (X_train, X_test))

# Paths and data loading (adjust paths accordingly)
path1 = '/kaggle/input/ace-dataset/features/'
# Trimming differs per table: `iloc[:, 1:]` drops only the first column,
# `iloc[1:, 1:]` drops the first row as well, and `iloc[:, :]` keeps everything.
# NOTE(review): presumably the dropped column is an ID/index and the dropped row is a
# stray header line -- confirm against the CSVs. np.column_stack below requires every
# table to end up with the same number of rows.
AAC = pd.read_csv(path1 + 'ACE_AAC.csv').iloc[:, 1:]
ASDC = pd.read_csv(path1 + 'ACE_ASDC.csv').iloc[:, 1:]
OPF_7bit_type_1 = pd.read_csv(path1 + 'opf_7bit_type_1_features.csv').iloc[1:, 1:]
OPF_7bit_type_2 = pd.read_csv(path1 + 'opf_7bit_type_2_features.csv').iloc[1:, 1:]
OPF_7bit_type_3 = pd.read_csv(path1 + 'opf_7bit_type_3_features.csv').iloc[1:, 1:]
OPF_10bit = pd.read_csv(path1 + 'opf_10bit_features.csv').iloc[1:, 1:]
esmv1 = pd.read_csv(path1 + 'esmv1_feat_ACE.csv').iloc[:, :]
esm2 = pd.read_csv(path1 + 'esm2_t6_8M_feat_ACE.csv').iloc[:, :]
prot_t5 = pd.read_csv(path1 + 'ACE_embeddings_prot_t5_xl_bfd.csv').iloc[1:, 1:]

# Concatenate all features into a single array (one row per sample)
all_feat = np.column_stack((AAC, ASDC, OPF_7bit_type_1, OPF_7bit_type_2, OPF_7bit_type_3, OPF_10bit, esmv1, esm2, prot_t5))

# Standard scale the data (the second return value is the same data and is discarded)
X_train, _ = standard_scale(all_feat, all_feat)

# Define parameters
num_features = X_train.shape[1]
n_samples, _ = np.shape(X_train)
training_epochs = 1000
# Full-batch training. The previous value (n_samples - 1) combined with the
# sequential slicing below meant the last row was never used for training.
batch_size = n_samples
display_step = 1

# Ceiling division so a trailing partial batch is still visited if batch_size
# is later set to a value that does not divide n_samples.
total_batch = (n_samples + batch_size - 1) // batch_size

# Initialize and train autoencoder
autoencoder = Autoencoder(
    n_input=num_features,
    n_hidden1=800,
    n_hidden2=200,  # Bottleneck layer with lower dimension
    n_hidden3=800,
    transfer_function=tf.nn.softplus,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# Convert X_train to TensorFlow tensor
X_train_tf = tf.convert_to_tensor(X_train, dtype=tf.float32)

for epoch in range(training_epochs):
    avg_cost = 0.

    # Loop over all batches
    for i in range(total_batch):
        batch_xs = X_train_tf[i * batch_size:(i + 1) * batch_size]

        # Fit training using batch data
        with tf.GradientTape() as tape:
            reconstruction = autoencoder(batch_xs)
            loss = tf.reduce_mean(tf.square(batch_xs - reconstruction))

        gradients = tape.gradient(loss, autoencoder.trainable_variables)
        autoencoder.optimizer.apply_gradients(zip(gradients, autoencoder.trainable_variables))

        # Weight each batch loss by the number of rows it actually contains
        avg_cost += loss * int(batch_xs.shape[0]) / n_samples

    # Display logs per epoch step
    if epoch % display_step == 0:
        print("Epoch:", '%d,' % (epoch + 1),
              "Cost:", "{:.9f}".format(float(avg_cost)))

# Compile the model so the built-in evaluate()/predict() can be used.
# This replaces `autoencoder.optimizer`; no further training happens below.
autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# Evaluate the loss on the training set
loss = autoencoder.evaluate(X_train_tf, X_train_tf)
print("Total loss: ", loss)

# Reconstruct the training data (the variable name says "test", but this is X_train)
X_test_reconstruct = autoencoder.predict(X_train_tf)

# Save reconstructed data to CSV, one sample per row
with open('ting_auto.csv', 'w', newline='') as fout:
    writer = csv.writer(fout, delimiter=',')
    writer.writerows(X_test_reconstruct)


Epoch: 1, Cost: 1.217963576
Epoch: 2, Cost: 1.037773252
Epoch: 3, Cost: 1.017764211
Epoch: 4, Cost: 1.011155009
Epoch: 5, Cost: 0.993533134
Epoch: 6, Cost: 0.967011273
Epoch: 7, Cost: 0.941068053
Epoch: 8, Cost: 0.918215811
Epoch: 9, Cost: 0.896862626
Epoch: 10, Cost: 0.876513362
Epoch: 11, Cost: 0.856943667
Epoch: 12, Cost: 0.837242305
Epoch: 13, Cost: 0.817873716
Epoch: 14, Cost: 0.799437642
Epoch: 15, Cost: 0.781643212
Epoch: 16, Cost: 0.763967633
Epoch: 17, Cost: 0.746426642
Epoch: 18, Cost: 0.729302764
Epoch: 19, Cost: 0.712934613
Epoch: 20, Cost: 0.696876347
Epoch: 21, Cost: 0.681015790
Epoch: 22, Cost: 0.665298462
Epoch: 23, Cost: 0.649683475
Epoch: 24, Cost: 0.634131491
Epoch: 25, Cost: 0.618766427
Epoch: 26, Cost: 0.603630602
Epoch: 27, Cost: 0.588757515
Epoch: 28, Cost: 0.574198186
Epoch: 29, Cost: 0.559990585
Epoch: 30, Cost: 0.546244025
Epoch: 31, Cost: 0.533082604
Epoch: 32, Cost: 0.520389259
Epoch: 33, Cost: 0.508247852
Epoch: 34, Cost: 0.496640861
Epoch: 35, Cost: 0.4855

W0000 00:00:1719737821.847025     116 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0177 
Total loss:  0.017482249066233635
[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 180ms/step

W0000 00:00:1719737822.280030     117 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [3]:
import math
import pandas as pd
import tensorflow as tf
import kerastuner.tuners as kt
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from tensorflow.keras import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

# data in google colab
# NOTE(review): these are Google Colab sample-data locations; the recorded run of this
# cell ended in FileNotFoundError for the training file, so the paths likely need to
# point at wherever the CSVs live in the current environment -- confirm.
TRAIN_DATA_PATH = '/content/sample_data/california_housing_train.csv'
TEST_DATA_PATH = '/content/sample_data/california_housing_test.csv'
TARGET_NAME = 'median_house_value'

# Read the training file first, then the test file.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

# Split each frame into features (every column except the target) and the target column.
x_train = train_data.drop(columns=TARGET_NAME)
y_train = train_data[TARGET_NAME]
x_test = test_data.drop(columns=TARGET_NAME)
y_test = test_data[TARGET_NAME]

#Scale the dataset using MinMaxScaler.

from sklearn.preprocessing import MinMaxScaler

def scale_datasets(x_train, x_test):
  """
  Min-max scale the train and test feature frames.

  A MinMaxScaler is fitted on x_train only and then applied to both frames.

  Args:
    x_train: DataFrame of training features; used to fit the scaler.
    x_test: DataFrame of test features; transformed with the fitted scaler.

  Returns:
    Tuple (x_train_scaled, x_test_scaled) of DataFrames that keep the column
    labels and the row index of their inputs.
  """
  min_max_scaler = MinMaxScaler()
  # Carry the original index over: rebuilding the frame without it silently resets
  # the rows to 0..n-1, which misaligns them with any target Series that still
  # uses the original labels.
  x_train_scaled = pd.DataFrame(
      min_max_scaler.fit_transform(x_train),
      columns=x_train.columns,
      index=x_train.index
  )
  x_test_scaled = pd.DataFrame(
      min_max_scaler.transform(x_test),
      columns=x_test.columns,
      index=x_test.index
  )
  return x_train_scaled, x_test_scaled
  
# Scale the feature frames; the scaler inside scale_datasets is fitted on x_train only.
x_train_scaled, x_test_scaled = scale_datasets(x_train, x_test)

  import kerastuner.tuners as kt


FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/california_housing_train.csv'

In [None]:
class AutoEncoders(Model):
  """Dense autoencoder: a 32-16-7 ReLU encoder and a 16-32-output_units decoder.

  The final decoder layer uses a sigmoid activation.

  Args:
    output_units: Width of the reconstruction (number of input features).
  """

  def __init__(self, output_units):

    super().__init__()
    self.encoder = Sequential(
        [
          Dense(32, activation="relu"),
          Dense(16, activation="relu"),
          Dense(7, activation="relu")
        ]
    )

    self.decoder = Sequential(
        [
          Dense(16, activation="relu"),
          Dense(32, activation="relu"),
          Dense(output_units, activation="sigmoid")
        ]
    )

  # `call` must be a method of the class: defined at module level it is never
  # attached to the model, so Keras has no forward pass to run during fit/predict.
  def call(self, inputs):
    """Encode then decode `inputs` and return the reconstruction."""
    encoded = self.encoder(inputs)
    decoded = self.decoder(encoded)
    return decoded
  
# One output unit per scaled feature column, so the reconstruction matches the input width.
auto_encoder = AutoEncoders(x_train_scaled.shape[1])

# Mean absolute error is both the training loss and the reported metric.
auto_encoder.compile(optimizer='adam', loss='mae', metrics=['mae'])

# Autoencoder training: the scaled features serve as both inputs and targets.
history = auto_encoder.fit(
    x=x_train_scaled,
    y=x_train_scaled,
    validation_data=(x_test_scaled, x_test_scaled),
    batch_size=32,
    epochs=15
)

In [None]:
# Use the encoder attribute directly. Looking it up with get_layer('sequential')
# depends on an auto-generated layer name, which changes once other Sequential
# models have been created earlier in the same session.
encoder_layer = auto_encoder.encoder

# Encoded training features, with columns named feature_0, feature_1, ...
reduced_df = pd.DataFrame(encoder_layer.predict(x_train_scaled)).add_prefix('feature_')