In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score,\
                            accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
#from imblearn.over_sampling import SMOTE
#from imblearn.pipeline import Pipeline

import os
from collections import Counter

# Seed NumPy's global RNG so the np.random.* draws in later cells are repeatable.
# NOTE(review): no TensorFlow seed is set in the visible cells, so model
# initialisation/training may still vary between runs — confirm if that matters.
np.random.seed(34)
# Directory prefix that the read_csv calls prepend to 'train.csv'.
path = '../input/tabular-playground-series-sep-2021/'

In [None]:
#

# Data Exploration and Cleaning

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Raw training table, indexed by the 'id' column.
dataset_train1 = pd.read_csv(f'{path}train.csv', index_col='id')
#dataset_test1 = pd.read_csv(f'{path}test.csv', index_col='id')

# Target kept as a one-column DataFrame.
y = dataset_train1.claim.to_frame()

# Working copy of the full table; 'claim' stays in as a column
# (the variant that drops it is commented out).
#dataset_train = dataset_train1.drop(['claim'], axis=1)
dataset_train = dataset_train1.copy()

# Extra feature 'nan': per-row count of missing values, divided by the largest
# such count so that it tops out at 1. It is appended as the LAST column.
dataset_train['nan'] = (
    dataset_train.isnull().sum(axis=1).pipe(lambda counts: counts / counts.max())
)

#dataset_test1['nan'] = dataset_test1.isnull().sum(axis=1)
#dataset_test1['nan'] = dataset_test1['nan']/dataset_test1['nan'].max()

In [None]:
# Feature-only variant of the table: 'claim' removed, then the same normalised
# per-row missing-value count appended as a trailing 'nan' column.
dataset_train3 = dataset_train1.drop(columns=['claim'])
dataset_train3['nan'] = (
    dataset_train3.isnull().sum(axis=1).pipe(lambda counts: counts / counts.max())
)

In [None]:
from sklearn.preprocessing import QuantileTransformer, KBinsDiscretizer

from sklearn.impute import SimpleImputer
# Replace missing values with the mean of their column.
# fit_transform is applied to the whole of dataset_train (which still contains
# the 'claim' and 'nan' columns) and its result is assigned back to the same
# name, so from this cell on dataset_train holds the imputer's output rather
# than the original DataFrame.
imputer = SimpleImputer(strategy='mean')
dataset_train = imputer.fit_transform(dataset_train)
#dataset_test = imputer.transform(dataset_test1)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every column with a MinMaxScaler using its default settings; the
# fitted scaler is kept because a later cell calls x_scaler.inverse_transform
# on the generated samples.
# NOTE(review): MinMaxScaler's default feature_range is (0, 1), while the
# generator defined below ends in a tanh layer (outputs in [-1, 1]) — confirm
# the range mismatch is intended.
x_scaler = MinMaxScaler()
dataset_train_sc = x_scaler.fit_transform(dataset_train)
#dataset_test_sc = x_scaler.transform(dataset_test1)

In [None]:
# Positional hold-out: the trailing 20% of rows become the test partition and
# everything before them the training partition (rows are not shuffled first).
N_split = int(len(dataset_train_sc) * 0.2)
X_train, X_test = dataset_train_sc[:-N_split, :], dataset_train_sc[-N_split:, :]
y_train, y_test = y.iloc[:-N_split], y.iloc[-N_split:]

# Using GANs to generate new data

In [None]:
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply, Concatenate
from tensorflow.keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.initializers import RandomNormal
import tensorflow.keras.backend as K
from sklearn.utils import shuffle

In [None]:
class cGAN():
    """Conditional GAN that generates flat feature vectors for a given class label.

    The generator maps ``(noise, label)`` to a vector of length ``out_shape``;
    the discriminator scores ``(vector, label)`` pairs through a sigmoid unit.
    In both networks the label is embedded and multiplied element-wise into the
    other input. ``combined`` chains generator and discriminator and is the
    model used for generator updates.
    """

    def __init__(self):
        """Build and compile the discriminator, the generator and the stacked model."""
        self.latent_dim = 120    # length of the noise vector fed to the generator
        self.out_shape = 120     # length of a real / generated sample vector
        self.num_classes = 2     # vocabulary size of the label embeddings
        self.clip_value = 0.01   # not read by any method of this class

        # One optimizer instance per compiled model. A single instance shared by
        # two models with different trainable variables is rejected by newer
        # Keras optimizers; separate instances also work on older versions.
        d_optimizer = Adam(0.00001, 0.5)
        g_optimizer = Adam(0.00001, 0.5)

        # build discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss=['binary_crossentropy'],
                                   optimizer=d_optimizer,
                                   metrics=['accuracy'])

        # build generator
        self.generator = self.build_generator()

        # symbolic generator pass: (noise, label) -> generated sample
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,))
        gen_samples = self.generator([noise, label])

        # the discriminator is marked non-trainable before the stacked model is compiled
        self.discriminator.trainable = False

        # passing generated samples through the discriminator
        valid = self.discriminator([gen_samples, label])

        # stacked model: (noise, label) -> discriminator verdict on the generated sample
        self.combined = Model([noise, label], valid)
        self.combined.compile(loss=['binary_crossentropy'],
                              optimizer=g_optimizer,
                              metrics=['accuracy'])
        self.combined.summary()

    def wasserstein_loss(self, y_true, y_pred):
        """Return ``mean(y_true * y_pred)``.

        Not used as a loss by any model compiled in this class.
        """
        return K.mean(y_true * y_pred)

    def build_generator(self):
        """Return the generator ``Model([noise, label]) -> sample``.

        The label is embedded into a ``latent_dim``-long vector, multiplied
        element-wise with the noise, and the product is pushed through three
        Dense/LeakyReLU/BatchNorm stages (64, 128, 256 units) and a final
        ``tanh`` Dense layer of ``out_shape`` units.
        """
        model = Sequential()

        model.add(Dense(64, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(128))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        # tanh bounds every generated feature to [-1, 1].
        # NOTE(review): confirm this matches the range of the real samples
        # passed to train().
        model.add(Dense(self.out_shape, activation='tanh'))
        model.summary()

        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.latent_dim)(label))

        # condition the noise on the label
        model_input = multiply([noise, label_embedding])
        gen_sample = model(model_input)

        return Model([noise, label], gen_sample, name="Generator")

    def build_discriminator(self):
        """Return the discriminator ``Model([sample, label]) -> validity``.

        The label is embedded into an ``out_shape``-long vector, multiplied
        element-wise with the sample, and the product is pushed through Dense
        layers of 256, 128 and 64 units (LeakyReLU, dropout 0.3 after the last
        two) and a single sigmoid unit.
        """
        init = RandomNormal(mean=0.0, stddev=0.02)
        model = Sequential()

        model.add(Dense(256, input_dim=self.out_shape, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))

        model.add(Dense(128, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.3))

        model.add(Dense(64, kernel_initializer=init))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.3))

        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        gen_sample = Input(shape=(self.out_shape,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_classes, self.out_shape)(label))

        # condition the sample on the label
        model_input = multiply([gen_sample, label_embedding])
        validity = model(model_input)

        return Model(inputs=[gen_sample, label], outputs=validity, name="Discriminator")

    def train(self, X_train, y_train, pos_index, neg_index, epochs, batch_size=32,
              sample_interval=50, n_pos=8):
        """Alternate discriminator and generator updates, one batch per iteration.

        Parameters
        ----------
        X_train : array indexable by an integer array; rows are real samples.
        y_train : array indexable by the same integer array; labels of the rows
            of ``X_train``.
        pos_index, neg_index : 1-D integer arrays with the row positions of the
            positive and the negative class.
        epochs : number of iterations (each iteration trains on ONE batch).
        batch_size : rows per batch.
        sample_interval : progress is printed every ``sample_interval`` iterations.
        n_pos : rows drawn from ``pos_index`` for each batch; the remainder of
            the batch is drawn from ``neg_index``. Clamped to ``[0, batch_size]``
            so that batches smaller than ``n_pos`` no longer fail.
        """
        n_pos = max(0, min(n_pos, batch_size))
        n_neg = batch_size - n_pos

        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):

            # Real batch: n_pos rows of the positive class, the rest negative
            # (both drawn with replacement), in random order.
            idx1 = np.random.choice(pos_index, n_pos)
            idx0 = np.random.choice(neg_index, n_neg)
            idx = np.random.permutation(np.concatenate((idx1, idx0)))
            samples, labels = X_train[idx], y_train[idx]

            # Sample noise as generator input
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

            # Generate a batch of fake samples carrying the same labels as the
            # real batch; verbose=0 avoids a progress bar on every iteration.
            gen_samples = self.generator.predict([noise, labels], verbose=0)

            # Label smoothing for the first part of training. The targets must
            # stay inside [0, 1] for binary cross-entropy: real targets are
            # pulled down into [0.9, 1.0], fake targets pushed up into [0.0, 0.1].
            if epoch < epochs//1.5:
                valid_smooth = valid - (np.random.random(valid.shape)*0.1)
                fake_smooth = fake + (np.random.random(fake.shape)*0.1)
            else:
                valid_smooth = valid
                fake_smooth = fake

            # Train the discriminator on the real and on the generated batch
            self.discriminator.trainable = True
            d_loss_real = self.discriminator.train_on_batch([samples, labels], valid_smooth)
            d_loss_fake = self.discriminator.train_on_batch([gen_samples, labels], fake_smooth)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train the generator through the stacked model: random labels as
            # the condition, "valid" as the target the generator tries to reach.
            self.discriminator.trainable = False
            sampled_labels = np.random.randint(0, 2, batch_size).reshape(-1, 1)
            g_loss = self.combined.train_on_batch([noise, sampled_labels], valid)

            # Report progress
            if (epoch+1)%sample_interval==0:
                print (f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}] [G loss: {g_loss}]")

In [None]:
# Convert the training labels to a NumPy array (cGAN.train indexes y_train
# with an integer array) and display the result.
y_train = np.array(y_train)
y_train

In [None]:
# Labels as a single column, then the row positions of each class
# (label 1 -> pos_index, label 0 -> neg_index).
y_train = y_train.reshape(-1, 1)
pos_index, neg_index = (np.nonzero(y_train == label)[0] for label in (1, 0))

In [None]:
# Instantiate the cGAN; the constructor builds and compiles the networks and
# prints their summaries.
cgan = cGAN()

In [None]:
# Run 10000 training iterations (one batch each) with train()'s default
# batch_size and sample_interval.
cgan.train(X_train, y_train, pos_index, neg_index, epochs=10000)

In [None]:
# generating new samples
noise = np.random.normal(0, 1, (200000, 120))
sampled_labels = np.ones(200000).reshape(-1, 1)

gen_samples = cgan.generator.predict([noise, sampled_labels])
gen_samples = x_scaler.inverse_transform(gen_samples)
print(gen_samples.shape)

In [None]:
# Number of columns in the feature-only table (train.csv without 'claim', plus 'nan').
len(dataset_train3.columns)

In [None]:
# Binarise the generated target: values >= 0.5 become 1, everything else 0.
# gen_samples keeps the column order the GAN was trained on, i.e. the columns
# of train.csv (which include 'claim') followed by the appended 'nan' feature.
# The last column is therefore 'nan', not 'claim', so the position of 'claim'
# is looked up instead of assuming index -1.
claim_idx = dataset_train1.columns.get_loc('claim')
gen_samples[:, claim_idx] = (gen_samples[:, claim_idx] >= 0.5).astype(gen_samples.dtype)

In [None]:
# Inspect rows 210-349 of the last column of the generated samples.
gen_samples[210:350,-1]


In [None]:
# Inspect every column of the last generated sample.
gen_samples[-1,:]

In [None]:
# Re-read train.csv into dataset_train1 (same call as the initial load).
dataset_train1 = pd.read_csv(f'{path}train.csv', index_col='id')

In [None]:
# Column labels for the generated samples, in the order the GAN was trained on:
# the columns of train.csv (features get generic names '0', '1', ...; 'claim'
# sits wherever it is in the file) followed by the appended 'nan' feature.
# 'nan' is the LAST column, so the labels must not end with 'claim'.
n_features = len(dataset_train1.columns) - 1  # every train.csv column except 'claim'
cols = [f'{i}' for i in range(n_features)]
cols.insert(dataset_train1.columns.get_loc('claim'), 'claim')
cols.append('nan')

In [None]:
# Wrap the generated array in a DataFrame labelled with `cols`, and display it.
gen_df = pd.DataFrame(data = gen_samples, columns=cols)
gen_df

In [None]:
# Number of generated rows.
len(gen_df)

In [None]:
# Write the generated samples to gen_dataset.csv without the index column.
# NOTE(review): this rebinds `path`, which earlier cells use as the input
# directory prefix for train.csv; re-running those cells after this one would
# look for 'gen_dataset.csvtrain.csv'. Consider a separate name for the output file.
path = 'gen_dataset.csv'
gen_df.to_csv(path, index=False)

In [None]:
# Draw a fresh (200000, 120) standard-normal noise matrix (overwriting the
# previous `noise`) and display it.
noise = np.random.normal(0, 1, (200000, 120))
noise
