# Tutorial 2 - GANs

Let's generate some (fake) NORMAL data based on Tutorial 1.

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

# Fixed seed value; presumably meant for reproducibility, but it is not
# referenced by any of the cells visible in this notebook.
random_state = 42

# Get the data

In [None]:
# Load the data set from "inland.csv" (path relative to the working directory)
# into a pandas DataFrame.
inland = pd.read_csv("inland.csv")


In [None]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
inland.shape

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

##  Identify the numerical and categorical columns

In [None]:
# Show the dtype of every column, to see which are numeric and which are object-typed.
inland.dtypes

In [None]:
# Split the column names by dtype:
#   - numeric columns are the ones handled by the scaling pipeline below
#   - object (string) columns are treated as categorical
numeric_columns = inland.select_dtypes(include="number").columns.tolist()
categorical_columns = inland.select_dtypes(include="object").columns.tolist()

In [None]:
# Show the names of the columns detected as numeric.
numeric_columns

In [None]:
# Show the names of the columns detected as categorical (object dtype).
categorical_columns

# Pipeline

In [None]:
# Numeric preprocessing, applied in order:
#   1. fill missing values with the column median
#   2. standardize each column with StandardScaler
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [None]:
# Apply the numeric pipeline to the numeric columns only.
# remainder='passthrough' forwards every other column unchanged; it is
# optional, you don't have to use it.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_columns)],
    remainder='passthrough',
)

# Transform: fit_transform() for NORMAL data

In [None]:
# Fit the preprocessor on the NORMAL data and transform that same data in one step
normal_x = preprocessor.fit_transform(inland)

# Display the transformed data
normal_x

In [None]:
# Show the (rows, columns) dimensions of the transformed data; the column count
# is the number of variables the GAN below has to generate.
normal_x.shape

# GAN

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
codings_size = 50   # length of the random noise vector fed to the generator

# Number of variables in the real data set. Derived from the preprocessed data
# so that the generator output and the discriminator input always match it.
n_features = normal_x.shape[1]

# Generator: maps a noise vector to one synthetic row. The last layer has no
# activation, so its outputs are unbounded.
generator = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[codings_size]),   # shape must be a sequence, not a bare int
    keras.layers.Dense(40, activation="relu"),
    keras.layers.Dense(20, activation="relu"),
    keras.layers.Dense(n_features, activation=None)
])

# Discriminator: takes one row (real or generated) and outputs the estimated
# probability that the row is real.
discriminator = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[n_features]),
    keras.layers.Dense(5, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")  # Real/fake prediction
])

# Combined model: noise -> generator -> discriminator. The training loop reads
# the two sub-models back from gan.layers, so the order here matters.
gan = keras.models.Sequential([generator, discriminator])



In [None]:
# Compile the discriminator first, while it is still trainable, so that its own
# train_on_batch() calls update its weights.
discriminator.compile(loss="binary_crossentropy", optimizer="Adam")
# Freeze the discriminator before compiling the combined model, so that training
# through `gan` is meant to update only the generator.
# NOTE(review): this relies on Keras honouring the `trainable` value that was set
# when each model was compiled - confirm for the installed Keras version.
discriminator.trainable = False
gan.compile(loss="binary_crossentropy", optimizer="Adam")

In [None]:
batch_size = 32

# Use a shuffle buffer as large as the data set so each epoch is a full uniform
# shuffle; a fixed buffer of 1000 only shuffles locally when there are more rows.
# max(..., 1) keeps the buffer size valid even for an empty array.
shuffle_buffer_size = max(normal_x.shape[0], 1)
dataset = tf.data.Dataset.from_tensor_slices(normal_x).shuffle(shuffle_buffer_size)

# drop_remainder=True keeps every batch at exactly batch_size rows, which the
# training loop relies on when it builds its label tensors.
dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(1)

In [None]:
def train_gan(gan, dataset, batch_size, codings_size, n_epochs=10):
    """Train a GAN by alternating discriminator and generator updates.

    Args:
        gan: Combined model whose ``layers`` are exactly
            ``[generator, discriminator]``, in that order.
        dataset: Iterable of real-data batches. Every batch must contain
            exactly ``batch_size`` rows (e.g. batched with
            ``drop_remainder=True``), because the label tensors are sized
            from ``batch_size``.
        batch_size: Number of real rows per batch; the same number of fake
            rows is generated for every discriminator step.
        codings_size: Length of each random noise vector fed to the generator.
        n_epochs: Number of passes over ``dataset``.
    """
    generator, discriminator = gan.layers

    # The labels are identical for every batch, so build them once.
    # Discriminator step: fake rows first (label 0), then real rows (label 1).
    y1 = tf.constant([[0.]] * batch_size + [[1.]] * batch_size)
    # Generator step: all fakes are labelled 1, i.e. the generator is trained
    # to make the discriminator call its output "real".
    y2 = tf.constant([[1.]] * batch_size)

    for epoch in range(n_epochs):
        for X_batch in dataset:
            # phase 1 - training the discriminator
            noise = tf.random.normal(shape=[batch_size, codings_size])
            # Cast to the real batch's dtype so tf.concat accepts both tensors
            generated_data = tf.cast(generator(noise), X_batch.dtype)
            X_fake_and_real = tf.concat([generated_data, X_batch], axis=0)
            discriminator.train_on_batch(X_fake_and_real, y1)

            # phase 2 - training the generator
            noise = tf.random.normal(shape=[batch_size, codings_size])
            gan.train_on_batch(noise, y2)
        # Report 1-based progress so the last line reads n_epochs/n_epochs
        print("Epoch: {}/{}".format(epoch + 1, n_epochs))
        

In [None]:
# Run the adversarial training loop on the prepared dataset
train_gan(gan, dataset, batch_size, codings_size, n_epochs=10)

# NOTE: 10 epochs are not enough! Use a larger n_epochs.

# Generate new data using trained generator

In [None]:
# Draw a single noise vector and let the trained generator turn it into one
# synthetic row, cast to float64.
noise = tf.random.normal(shape=(1, codings_size))
generated_data = tf.cast(generator(noise), dtype=tf.float64)

In [None]:
# Display the generated row (a tensor with one row, since one noise vector was sampled).
generated_data

# How can you check the validity of the "fake" data?

Save the generated data, go back to Tutorial 1, and feed it into the autoencoder. If the reconstruction error is low, the generated data can be considered "normal".