In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

modules

In [2]:
def encoder_embedder(timesteps, features, hidden_dim, num_layers):
    '''
    Encoder embedder, takes as input the actual sequences and returns the actual embeddings.

    Parameters
    ----------
    timesteps: int.
        Length of each input sequence.
    features: int.
        Number of values observed at each time step.
    hidden_dim: int.
        Number of units of each GRU layer.
    num_layers: int.
        Number of stacked GRU layers, must be at least one.

    Returns
    -------
    tf.keras.models.Model named 'encoder_embedder', mapping inputs with shape
    (timesteps, features) to outputs with shape (timesteps, hidden_dim).

    Raises
    ------
    ValueError if num_layers is smaller than one.
    '''
    # without at least one layer there is no output tensor to build the model from
    if num_layers < 1:
        raise ValueError(f'num_layers must be at least 1, got {num_layers}.')

    x = tf.keras.layers.Input(shape=(timesteps, features))

    # chain the recurrent layers, each one consuming the previous layer's full output sequence
    e = x
    for _ in range(num_layers):
        e = tf.keras.layers.GRU(units=hidden_dim, return_sequences=True)(e)

    return tf.keras.models.Model(x, e, name='encoder_embedder')


def encoder(timesteps, hidden_dim, num_layers):
    '''
    Encoder, takes as input the actual embeddings and returns the actual latent vector.

    Parameters
    ----------
    timesteps: int.
        Length of each input sequence.
    hidden_dim: int.
        Size of the input embeddings, number of units of each GRU layer and of the output layer.
    num_layers: int.
        Number of stacked GRU layers, can be zero, in which case the model reduces
        to the final dense layer.

    Returns
    -------
    tf.keras.models.Model named 'encoder', mapping inputs with shape
    (timesteps, hidden_dim) to outputs with the same shape.

    Raises
    ------
    ValueError if num_layers is negative.
    '''
    if num_layers < 0:
        raise ValueError(f'num_layers must be non-negative, got {num_layers}.')

    e = tf.keras.layers.Input(shape=(timesteps, hidden_dim))

    # start the chain from the input, such that the dense layer below always has
    # something to consume; TimeGAN builds this model with `num_layers - 1` layers,
    # which is zero when the full model uses a single layer
    h = e
    for _ in range(num_layers):
        h = tf.keras.layers.GRU(units=hidden_dim, return_sequences=True)(h)

    h = tf.keras.layers.Dense(units=hidden_dim)(h)
    return tf.keras.models.Model(e, h, name='encoder')


def decoder(timesteps, features, hidden_dim, num_layers):
    '''
    Decoder, takes as input the actual or synthetic latent vector and returns the reconstructed or synthetic sequences.

    Parameters
    ----------
    timesteps: int.
        Length of each input sequence.
    features: int.
        Number of values returned at each time step.
    hidden_dim: int.
        Size of the input latent vector and number of units of each hidden dense layer.
    num_layers: int.
        Number of hidden dense layers, can be zero, in which case the model reduces
        to the final linear layer.

    Returns
    -------
    tf.keras.models.Model named 'decoder', mapping inputs with shape
    (timesteps, hidden_dim) to outputs with shape (timesteps, features).

    Raises
    ------
    ValueError if num_layers is negative.
    '''
    if num_layers < 0:
        raise ValueError(f'num_layers must be non-negative, got {num_layers}.')

    h = tf.keras.layers.Input(shape=(timesteps, hidden_dim))

    # start the chain from the input, such that the output layer below always has something to consume
    y = h
    for _ in range(num_layers):
        # the same dense layer is applied independently at each time step
        y = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=hidden_dim, activation='relu'))(y)

    # linear projection back to the dimension of the sequences
    y = tf.keras.layers.Dense(units=features)(y)
    return tf.keras.models.Model(h, y, name='decoder')


def generator_embedder(timesteps, features, hidden_dim, num_layers):
    '''
    Generator embedder, takes as input the synthetic sequences and returns the synthetic embeddings.

    Parameters
    ----------
    timesteps: int.
        Length of each input sequence.
    features: int.
        Number of values observed at each time step.
    hidden_dim: int.
        Number of units of each GRU layer.
    num_layers: int.
        Number of stacked GRU layers, must be at least one.

    Returns
    -------
    tf.keras.models.Model named 'generator_embedder', mapping inputs with shape
    (timesteps, features) to outputs with shape (timesteps, hidden_dim).

    Raises
    ------
    ValueError if num_layers is smaller than one.
    '''
    # without at least one layer there is no output tensor to build the model from
    if num_layers < 1:
        raise ValueError(f'num_layers must be at least 1, got {num_layers}.')

    z = tf.keras.layers.Input(shape=(timesteps, features))

    # chain the recurrent layers, each one consuming the previous layer's full output sequence
    e = z
    for _ in range(num_layers):
        e = tf.keras.layers.GRU(units=hidden_dim, return_sequences=True)(e)

    return tf.keras.models.Model(z, e, name='generator_embedder')


def generator(timesteps, hidden_dim, num_layers):
    '''
    Generator, takes as input the synthetic embeddings and returns the synthetic latent vector.

    Parameters
    ----------
    timesteps: int.
        Length of each input sequence.
    hidden_dim: int.
        Size of the input embeddings, number of units of each GRU layer and of the output layer.
    num_layers: int.
        Number of stacked GRU layers, can be zero, in which case the model reduces
        to the final dense layer.

    Returns
    -------
    tf.keras.models.Model named 'generator', mapping inputs with shape
    (timesteps, hidden_dim) to outputs with the same shape.

    Raises
    ------
    ValueError if num_layers is negative.
    '''
    if num_layers < 0:
        raise ValueError(f'num_layers must be non-negative, got {num_layers}.')

    e = tf.keras.layers.Input(shape=(timesteps, hidden_dim))

    # start the chain from the input, such that the dense layer below always has
    # something to consume; TimeGAN builds this model with `num_layers - 1` layers,
    # which is zero when the full model uses a single layer
    h = e
    for _ in range(num_layers):
        h = tf.keras.layers.GRU(units=hidden_dim, return_sequences=True)(h)

    h = tf.keras.layers.Dense(units=hidden_dim)(h)
    return tf.keras.models.Model(e, h, name='generator')


def discriminator(timesteps, hidden_dim, num_layers):
    '''
    Discriminator, takes as input the actual or synthetic embedding or latent vector and returns the log-odds.

    Parameters
    ----------
    timesteps: int.
        Length of each input sequence.
    hidden_dim: int.
        Size of the input vectors and number of units of each GRU layer (per direction).
    num_layers: int.
        Number of stacked bidirectional GRU layers, must be at least one.

    Returns
    -------
    tf.keras.models.Model named 'discriminator', mapping inputs with shape
    (timesteps, hidden_dim) to a single unbounded score (logit) per sequence.

    Raises
    ------
    ValueError if num_layers is smaller than one.
    '''
    # the last recurrent layer is the one collapsing the time dimension, so at least one is needed
    if num_layers < 1:
        raise ValueError(f'num_layers must be at least 1, got {num_layers}.')

    h = tf.keras.layers.Input(shape=(timesteps, hidden_dim))

    p = h
    for layer_index in range(num_layers):
        # all layers but the last one return the full sequence, the last one returns only its final state
        is_last_layer = layer_index == num_layers - 1
        p = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(units=hidden_dim, return_sequences=not is_last_layer)
        )(p)

    # no activation: the output is a logit, consumed by a cross-entropy with `from_logits=True`
    p = tf.keras.layers.Dense(units=1)(p)
    return tf.keras.models.Model(h, p, name='discriminator')


def simulator(samples, timesteps, features):
    '''
    Simulator, draws the synthetic input sequences of the generator from a discretized Wiener process.

    A single path with `samples * timesteps` steps is simulated for each feature,
    standardized to zero mean and unit standard deviation over its whole length,
    and then cut into `samples` consecutive sequences of `timesteps` steps each.

    Returns a float32 tensor with shape (samples, timesteps, features).
    '''
    total_steps = samples * timesteps

    # independent standard normal increments, one row per time step
    increments = tf.random.normal(shape=(total_steps, features), mean=0, stddev=1, dtype=tf.float32)

    # cumulate the increments over time and rescale them by the square root of the path length
    path = tf.cumsum(increments, axis=0) / tf.sqrt(tf.cast(total_steps, dtype=tf.float32))

    # standardize each feature over the whole path
    path_mean = tf.reduce_mean(path, axis=0)
    path_std = tf.math.reduce_std(path, axis=0)
    standardized_path = (path - path_mean) / path_std

    # cut the path into consecutive sequences
    return tf.reshape(standardized_path, (samples, timesteps, features))


In [3]:
def time_series_to_sequences(time_series, timesteps):
    '''
    Reshape the time series as sequences.

    The time series is cut into consecutive, non-overlapping sequences of `timesteps`
    rows each. Trailing rows that do not fill a complete sequence are dropped, as an
    incomplete last sequence cannot be stacked with the others into a regular array.

    Parameters
    ----------
    time_series: array-like with shape (samples, features).
        Time series, with time along the first axis.
    timesteps: int.
        Number of rows in each sequence, must be at least one.

    Returns
    -------
    np.ndarray with shape (samples // timesteps, timesteps, features), holding a copy of the data.

    Raises
    ------
    ValueError if timesteps is smaller than one.
    '''
    if timesteps < 1:
        raise ValueError(f'timesteps must be at least 1, got {timesteps}.')

    time_series = np.asarray(time_series)

    # number of complete sequences and number of rows that they cover
    num_sequences = len(time_series) // timesteps
    num_rows = num_sequences * timesteps

    # split the leading axis into (sequence, step), leaving any trailing axes untouched;
    # the copy keeps the result independent of the input, as with the original stacking
    sequences = time_series[:num_rows].reshape((num_sequences, timesteps) + time_series.shape[1:]).copy()
    return sequences


def sequences_to_time_series(sequences):
    '''
    Reshape the sequences as time series, by joining them one after the other along the time axis.
    '''
    # iterating over the first axis yields the individual sequences, in order
    chunks = list(sequences)
    return np.concatenate(chunks, axis=0)


losses

In [60]:
@tf.function
def mean_squared_error(y_true, y_pred):
    '''
    Mean squared error, used for calculating the supervised loss and the reconstruction loss.

    The squared errors are summed over the last axis and then averaged over all remaining axes.
    '''
    # a trailing axis of size one is appended because the Keras MSE averages over the last axis:
    # averaging over a single element leaves the element-wise squared errors unchanged
    targets = tf.expand_dims(y_true, axis=-1)
    predictions = tf.expand_dims(y_pred, axis=-1)
    squared_errors = tf.keras.losses.MSE(y_true=targets, y_pred=predictions)

    # sum over the last axis, average over the others
    summed_errors = tf.reduce_sum(squared_errors, axis=-1)
    return tf.reduce_mean(summed_errors)


@tf.function
def binary_crossentropy(y_true, y_pred):
    '''
    Binary cross-entropy, used for calculating the unsupervised loss.

    `y_pred` is expected to hold log-odds (logits) rather than probabilities;
    the result is the average cross-entropy, returned as a scalar.
    '''
    return tf.reduce_mean(
        tf.keras.losses.binary_crossentropy(
            y_true=y_true,
            y_pred=y_pred,
            from_logits=True,
        )
    )


model implementation

In [5]:
class TimeGAN():
    '''
    Time-series generative adversarial network: an autoencoder, a generator and a
    discriminator, trained jointly on fixed-length sequences cut from a multivariate time series.
    '''

    def __init__(self,
                 x,
                 timesteps,
                 hidden_dim,
                 num_layers,
                 lambda_param,
                 eta_param,
                 learning_rate,
                 batch_size):
        '''
        Implementation of synthetic time series generation model introduced in Yoon, J., Jarrett, D. and Van der Schaar, M., 2019.
        Time-series generative adversarial networks. Advances in neural information processing systems, 32.

        Parameters
        ----------
        x: np.array with shape (samples, features).
            Time series used for training, with time along the first axis.
        timesteps: int.
            Length of the sequences in which the time series is cut.
        hidden_dim: int.
            Number of hidden units of each layer.
        num_layers: int.
            Number of layers of each of the three models.
        lambda_param: float.
            Weight of the supervised loss in the autoencoder loss.
        eta_param: float.
            Weight of the supervised loss in the generator loss.
        learning_rate: float.
            Learning rate of the three Adam optimizers.
        batch_size: int.
            Number of sequences in each training batch.
        '''
        
        # extract the length of the time series
        samples = x.shape[0]

        # extract the number of time series
        features = x.shape[1]

        # scale the time series
        mu = np.mean(x, axis=0)
        sigma = np.std(x, axis=0)

        # a constant time series has zero standard deviation, and dividing by it would fill
        # the scaled data (and therefore the losses) with NaN or inf: such series are only centred
        sigma = np.where(sigma == 0, 1.0, sigma)

        x = (x - mu) / sigma

        # reshape the time series as sequences
        x = time_series_to_sequences(time_series=x, timesteps=timesteps)
        
        # create the dataset; the shuffle buffer is sized with the length of the time series,
        # which is never smaller than the number of sequences, so the sequences are fully shuffled
        dataset = tf.data.Dataset.from_tensor_slices(x)
        dataset = dataset.cache().shuffle(samples).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
        
        # build the models; the sub-models are retrieved by name in the training loop,
        # the embedder takes one of the `num_layers` recurrent layers and the encoder
        # (or generator) takes the remaining ones
        autoencoder_model = tf.keras.models.Sequential([
            encoder_embedder(timesteps=timesteps, features=features, hidden_dim=hidden_dim, num_layers=1),
            encoder(timesteps=timesteps, hidden_dim=hidden_dim, num_layers=num_layers - 1),
            decoder(timesteps=timesteps, features=features, hidden_dim=hidden_dim, num_layers=num_layers)
        ])
    
        generator_model = tf.keras.models.Sequential([
            generator_embedder(timesteps=timesteps, features=features, hidden_dim=hidden_dim, num_layers=1),
            generator(timesteps=timesteps, hidden_dim=hidden_dim, num_layers=num_layers - 1),
        ])
        
        discriminator_model = discriminator(timesteps=timesteps, hidden_dim=hidden_dim, num_layers=num_layers)
        
        # instantiate the optimizers
        autoencoder_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        generator_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        
        # save the objects
        self.mu = mu
        self.sigma = sigma
        self.samples = samples
        self.timesteps = timesteps
        self.features = features
        self.lambda_param = lambda_param
        self.eta_param = eta_param
        self.dataset = dataset
        self.autoencoder_model = autoencoder_model
        self.generator_model = generator_model
        self.discriminator_model = discriminator_model
        self.autoencoder_optimizer = autoencoder_optimizer
        self.generator_optimizer = generator_optimizer
        self.discriminator_optimizer = discriminator_optimizer
    
    def fit(self, epochs, verbose=True):
        '''
        Train the model.

        Parameters
        ----------
        epochs: int.
            Number of passes over the training sequences.
        verbose: bool.
            If True, the losses of the last batch of each epoch are printed.
        '''
        
        # define the training loop
        @tf.function
        def train_step(data):
            # a single forward pass is recorded by three tapes, one for each model to update
            with tf.GradientTape() as autoencoder_tape, tf.GradientTape() as generator_tape, tf.GradientTape() as discriminator_tape:
                
                # get the actual sequences
                x = tf.cast(data, dtype=tf.float32)
                
                # generate the synthetic sequences
                z = simulator(samples=x.shape[0], timesteps=self.timesteps, features=self.features)

                # get the encoder outputs
                ex = self.autoencoder_model.get_layer('encoder_embedder')(x)     # actual embedding
                hx = self.autoencoder_model.get_layer('encoder')(ex)             # actual latent vector

                # get the generator outputs
                ez = self.generator_model.get_layer('generator_embedder')(z)     # synthetic embedding
                hz = self.generator_model.get_layer('generator')(ez)             # synthetic latent vector
                hx_hat = self.generator_model.get_layer('generator')(ex)         # conditional synthetic latent vector (i.e. given the actual embedding)
                
                # get the decoder outputs
                x_hat = self.autoencoder_model.get_layer('decoder')(hx)          # reconstructed sequences

                # get the discriminator outputs
                p_ex = self.discriminator_model(ex)                              # log-odds of actual embedding
                p_ez = self.discriminator_model(ez)                              # log-odds of synthetic embedding
                p_hx = self.discriminator_model(hx)                              # log-odds of actual latent vector
                p_hz = self.discriminator_model(hz)                              # log-odds of synthetic latent vector

                # calculate the supervised loss, comparing the actual latent vector at each
                # time step with the conditional synthetic latent vector at the previous one
                supervised_loss = mean_squared_error(hx[:, 1:, :], hx_hat[:, :-1, :])
                
                # calculate the autoencoder loss
                autoencoder_loss = mean_squared_error(x, x_hat) + \
                                   self.lambda_param * supervised_loss
                                   
                # calculate the generator loss (the synthetic data is labelled as actual)
                generator_loss = binary_crossentropy(tf.ones_like(p_hz), p_hz) + \
                                 binary_crossentropy(tf.ones_like(p_ez), p_ez) + \
                                 self.eta_param * supervised_loss

                # calculate the discriminator loss (synthetic data labelled zero, actual data labelled one)
                discriminator_loss = binary_crossentropy(tf.zeros_like(p_hz), p_hz) + \
                                     binary_crossentropy(tf.zeros_like(p_ez), p_ez) + \
                                     binary_crossentropy(tf.ones_like(p_hx), p_hx) + \
                                     binary_crossentropy(tf.ones_like(p_ex), p_ex)
            
            # calculate the gradients, each loss with respect to the weights of its own model only
            autoencoder_gradient = autoencoder_tape.gradient(autoencoder_loss, self.autoencoder_model.trainable_variables)
            generator_gradient = generator_tape.gradient(generator_loss, self.generator_model.trainable_variables)
            discriminator_gradient = discriminator_tape.gradient(discriminator_loss, self.discriminator_model.trainable_variables)
            
            # update the weights
            self.autoencoder_optimizer.apply_gradients(zip(autoencoder_gradient, self.autoencoder_model.trainable_variables))
            self.generator_optimizer.apply_gradients(zip(generator_gradient, self.generator_model.trainable_variables))
            self.discriminator_optimizer.apply_gradients(zip(discriminator_gradient, self.discriminator_model.trainable_variables))
            
            return autoencoder_loss, generator_loss, discriminator_loss

        # train the model
        for epoch in range(epochs):
            for data in self.dataset:
                autoencoder_loss, generator_loss, discriminator_loss = train_step(data)
            if verbose:
                print(
                    f'epoch: {1 + epoch} '
                    f'autoencoder_loss: {format(autoencoder_loss.numpy(), ".6f")} '
                    f'generator_loss: {format(generator_loss.numpy(), ".6f")} '
                    f'discriminator_loss: {format(discriminator_loss.numpy(), ".6f")}'
                )

    def reconstruct(self, x):
        '''
        Reconstruct the time series.

        Parameters
        ----------
        x: np.array with shape (samples, features).
            Time series to pass through the autoencoder, on the same scale as the training data.

        Returns
        -------
        np.array with the reconstructed time series, on the same scale as the training data.
        '''
        
        # scale the time series, with the statistics of the training data
        x = (x - self.mu) / self.sigma

        # reshape the time series as sequences
        x = time_series_to_sequences(time_series=x, timesteps=self.timesteps)

        # get the reconstructed sequences
        x_hat = self.autoencoder_model(x)
        
        # transform the reconstructed sequences back to time series
        x_hat = sequences_to_time_series(x_hat.numpy())
   
        # transform the reconstructed time series back to the original scale
        x_hat = self.mu + self.sigma * x_hat
        
        return x_hat
    
    def simulate(self, samples):
        '''
        Simulate the time series.

        Parameters
        ----------
        samples: int.
            Requested length of the simulated time series, must be at least `timesteps`.
            Only complete sequences are generated, so the returned length is `samples`
            rounded down to a multiple of `timesteps`.

        Returns
        -------
        np.array with the simulated time series, on the same scale as the training data.

        Raises
        ------
        ValueError if samples is smaller than the sequence length.
        '''

        # number of complete sequences to generate
        num_sequences = samples // self.timesteps

        # with no sequence to generate there is nothing to join into a time series,
        # which previously surfaced as an unrelated concatenation error
        if num_sequences < 1:
            raise ValueError(
                f'samples must be at least equal to timesteps ({self.timesteps}), got {samples}.'
            )
        
        # generate the synthetic sequences
        z = simulator(samples=num_sequences, timesteps=self.timesteps, features=self.features)
        
        # get the simulated sequences
        x_sim = self.autoencoder_model.get_layer('decoder')(self.generator_model(z))
    
        # transform the simulated sequences back to time series
        x_sim = sequences_to_time_series(x_sim.numpy())
    
        # transform the simulated time series back to the original scale
        x_sim = self.mu + self.sigma * x_sim
    
        return x_sim


plot model

In [6]:
def _add_panel(fig, series, row, name, dash=None):
    '''
    Add to the given row of the figure one faint line for each column of `series`,
    followed by a highlighted line with the average across the columns.
    '''
    
    # individual time series, kept out of the legend
    for i in range(series.shape[1]):
        fig.add_trace(
            go.Scatter(
                y=series[:, i],
                showlegend=False,
                mode='lines',
                line=dict(
                    color='rgba(175,184,193,0.2)',
                    width=0.5
                )
            ),
            row=row,
            col=1
        )
    
    # the dash style is only set when requested, otherwise the line is left solid
    average_line = dict(
        color='#0969da',
        width=1,
        shape='spline',
    )
    if dash is not None:
        average_line['dash'] = dash
    
    # average across the time series at each time step
    fig.add_trace(
        go.Scatter(
            y=np.mean(series, axis=1),
            name=name,
            showlegend=True,
            mode='lines',
            line=average_line
        ),
        row=row,
        col=1
    )


def plot(actual, reconstructed, synthetic):
    '''
    Plot the actual, reconstructed and synthetic time series.

    Parameters
    ----------
    actual: np.array with shape (samples, features).
        Actual time series.
    reconstructed: np.array with shape (samples, features).
        Reconstructed time series.
    synthetic: np.array with shape (samples, features).
        Synthetic time series.

    Returns
    -------
    Plotly figure with three stacked panels, one for each input, sharing the same
    y-axis range, which is derived from the actual time series.
    '''
    
    fig = make_subplots(
        subplot_titles=['Actual', 'Reconstructed', 'Synthetic'],
        vertical_spacing=0.15,
        rows=3,
        cols=1
    )
    
    fig.update_layout(
        plot_bgcolor='white',
        paper_bgcolor='white',
        margin=dict(t=60, b=60, l=30, r=30),
        font=dict(
            color='#1b1f24',
            size=8,
        ),
        legend=dict(
            traceorder='normal',
            font=dict(
                color='#1b1f24',
                size=10,
            ),
            x=0,
            y=-0.1,
            orientation='h'
        ),
    )
    
    fig.update_annotations(
        font=dict(
            color='#1b1f24',
            size=12,
        )
    )
    
    # plot the actual, reconstructed and synthetic time series
    _add_panel(fig, actual, row=1, name='Actual Avg.')
    _add_panel(fig, reconstructed, row=2, name='Reconstructed Avg.', dash='dash')
    _add_panel(fig, synthetic, row=3, name='Synthetic Avg.', dash='dot')
    
    # common y-axis range: the extremes of the actual data, each pushed outwards by 10% of
    # its magnitude; scaling by 0.9 and 1.1 directly would instead move a negative minimum
    # or maximum towards zero and cut the data off
    lower = np.min(actual)
    upper = np.max(actual)
    y_range = [lower - 0.1 * abs(lower), upper + 0.1 * abs(upper)]
    
    for i in [1, 2, 3]:
        fig.update_xaxes(
            title='Time',
            color='#424a53',
            tickfont=dict(
                color='#6e7781',
                size=6,
            ),
            linecolor='#eaeef2',
            mirror=True,
            showgrid=False,
            row=i,
            col=1
        )
        
        fig.update_yaxes(
            range=y_range,
            title='Value',
            color='#424a53',
            tickfont=dict(
                color='#6e7781',
                size=6,
            ),
            linecolor='#eaeef2',
            mirror=True,
            showgrid=False,
            zeroline=False,
            row=i,
            col=1
        )
    
    return fig


In [24]:
# NOTE(review): this import lives outside the import cell at the top of the notebook;
# the cell must be run before the data-preparation cell below, which uses `scaler`.
from sklearn.preprocessing import MinMaxScaler
# min-max scaler with default settings, used to rescale the columns of the input data
scaler = MinMaxScaler()

In [67]:
# Configuration of the experiment.
SEED = 42                # seed of the random number generators, for reproducible runs
TIMESTEPS = 18           # length of the sequences seen by the model
TRAIN_SIZE = 234         # number of leading rows used for training (13 sequences of 18 steps)
DROPPED_TAIL_ROWS = 8    # number of trailing rows discarded from the raw data

# Fix the seeds: the weight initialization, the shuffling of the batches
# and the simulated noise are all random.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load the data
data = pd.read_excel("maindata.xlsx")
data = data.astype(np.float64)

# Discard the trailing rows
# NOTE(review): presumably done to make the length a multiple of TIMESTEPS - confirm against the file
data = data.iloc[:-DROPPED_TAIL_ROWS]

# Fit the scaler on the training rows only, such that no information about the test rows
# (nor about the discarded rows) leaks into the training data, then scale all rows with it
scaler.fit(data.iloc[:TRAIN_SIZE])
data = scaler.transform(data)

# Split data into training and testing sets
x_train = data[:TRAIN_SIZE]
x_test = data[TRAIN_SIZE:]

# Fit the model to the training data
model = TimeGAN(
    x=x_train,
    timesteps=TIMESTEPS,
    hidden_dim=64,
    num_layers=3,
    lambda_param=0.1,
    eta_param=10,
    learning_rate=0.001,
    batch_size=16
)

model.fit(
    epochs=2000,
    verbose=True
)

# Reconstruct the test data
x_hat = model.reconstruct(x=x_test)

# Generate the synthetic data
x_sim = model.simulate(samples=len(x_test))

# Plot the actual, reconstructed and synthetic data
fig = plot(actual=x_test, reconstructed=x_hat, synthetic=x_sim)
# fig.write_image('results.png', scale=4, height=900, width=700)
fig.show()


epoch: 1 autoencoder_loss: 18.321800 generator_loss: 22.621742 discriminator_loss: 2.785777
epoch: 2 autoencoder_loss: 17.865599 generator_loss: 17.195457 discriminator_loss: 2.753920
epoch: 3 autoencoder_loss: 17.514521 generator_loss: 14.935451 discriminator_loss: 2.710884
epoch: 4 autoencoder_loss: 17.172216 generator_loss: 14.836706 discriminator_loss: 2.680139
epoch: 5 autoencoder_loss: 16.796722 generator_loss: 16.190586 discriminator_loss: 2.653770
epoch: 6 autoencoder_loss: 16.389910 generator_loss: 18.849909 discriminator_loss: 2.621807
epoch: 7 autoencoder_loss: 15.958915 generator_loss: 22.632713 discriminator_loss: 2.546427
epoch: 8 autoencoder_loss: 15.518735 generator_loss: 27.005619 discriminator_loss: 2.543513
epoch: 9 autoencoder_loss: 15.058032 generator_loss: 31.560852 discriminator_loss: 2.459703
epoch: 10 autoencoder_loss: 14.558754 generator_loss: 35.513809 discriminator_loss: 2.404084
epoch: 11 autoencoder_loss: 14.017208 generator_loss: 38.664059 discriminator_l