## Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb

## Datasets

In [4]:
df1 = pd.read_csv("/content/drive/MyDrive/24hr/PS1/Datasets/1A/PS1A_train.csv")
df2 = pd.read_csv("/content/drive/MyDrive/24hr/PS1/Datasets/1B/PS1B_train.csv")
df3 = pd.read_csv("/content/drive/MyDrive/24hr/PS1/Datasets/1C/PS1C_train.csv")
df4 = pd.read_csv("/content/drive/MyDrive/24hr/PS1/Datasets/1D/PS1D_train.csv")

## Method : Datageneration + LightGBM

In [None]:
def method1(df,tol,num_samples_frac):

  #==================== VAE CLASS DEFINITION ===================================
  class VAE(nn.Module):
      def __init__(self, input_dim, latent_dim):
          super(VAE, self).__init__()
          self.input_dim = input_dim
          self.latent_dim = latent_dim
          
          self.encoder = nn.Sequential(
              nn.Linear(input_dim, 128),
              nn.ReLU(),
              nn.Linear(128, 64),
              nn.ReLU(),
              nn.Linear(64, latent_dim*2)
          )
          
          self.decoder = nn.Sequential(
              nn.Linear(latent_dim, 64),
              nn.ReLU(),
              nn.Linear(64, 128),
              nn.ReLU(),
              nn.Linear(128, input_dim)
          )
      
      def reparameterize(self, mu, logvar):
          std = torch.exp(0.5*logvar)
          eps = torch.randn_like(std)
          return mu + eps*std
      
      def forward(self, x):
          h = self.encoder(x)
          mu, logvar = torch.chunk(h, 2, dim=-1)
          z = self.reparameterize(mu, logvar)
          return self.decoder(z), mu, logvar


  # ======== CLASS VAM =============================================

    
  # Define the VAM model
  class VAM(nn.Module):
      def __init__(self, input_dim, latent_dim, hidden_dim):
          super(VAM, self).__init__()
          self.input_dim = input_dim
          self.latent_dim = latent_dim
          self.hidden_dim = hidden_dim

          # Encoder layers
          self.encoder = nn.Sequential(
              nn.Linear(input_dim, hidden_dim),
              nn.ReLU(),
              nn.Linear(hidden_dim, latent_dim * 2)
          )

          # Decoder layers
          self.decoder = nn.Sequential(
              nn.Linear(latent_dim, hidden_dim),
              nn.ReLU(),
              nn.Linear(hidden_dim, input_dim)
          )

          # Discriminator layers
          self.discriminator = nn.Sequential(
              nn.Linear(latent_dim, hidden_dim),
              nn.ReLU(),
              nn.Linear(hidden_dim, 1),
              nn.Sigmoid()
          )

      def encode(self, x):
          mu, logvar = torch.chunk(self.encoder(x), 2, dim=1)
          return mu, logvar

      def reparameterize(self, mu, logvar):
          std = torch.exp(0.5 * logvar)
          eps = torch.randn_like(std)
          z = mu + eps * std
          return z

      def decode(self, z):
          x_hat = self.decoder(z)
          return x_hat

      def discriminate(self, z):
          d = self.discriminator(z)
          return d

      def forward(self, x):
          mu, logvar = self.encode(x)
          z = self.reparameterize(mu, logvar)
          x_hat = self.decode(z)
          d = self.discriminate(z)
          return x_hat, mu, logvar, d

  # =================== INITIAL SPLITTING INTO TRAIN AND TEST AND COMBINING X AND Y (TRAIN) ==============
  train_init, test= train_test_split(df, test_size=0.2, random_state=42)
  
  X_train_init = train_init[:,:-1]
  y_train_init = train_init[:,-1]
  X_test = test[:,:-1]
  y_test = test[:,-1]
  y_train_init = pd.get_dummies(y_train_init).values
  y_test = pd.get_dummies(y_test).values

  X_y_train = np.concatenate((X_train_init,y_train_init),axis=1)
  # Convert the data to PyTorch tensors
  X_y_train_tensor = torch.Tensor(X_y_train)
  # Define the PyTorch dataloaders
  X_y_dataset = torch.utils.data.TensorDataset(X_y_train_tensor)
  X_y_loader = torch.utils.data.DataLoader(X_y_dataset, batch_size=128, shuffle=True)

  # =============== Train the VAE model ====================================================
  vae = VAM(input_dim=X_y_train.shape[1], latent_dim=2,hidden_dim=6)
  optimizer = torch.optim.Adam(vae.parameters(), lr=1e-4)
  def vam_loss(recon_x, x, mu, logvar):
      # Reconstruction loss
      recon_loss = F.mse_loss(recon_x, x, reduction='sum')    
      # KL divergence
      kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())   
      # Total loss
      loss = recon_loss + kl_divergence
      return loss

  num_epochs = 1000
  losses=[]
  for epoch in range(num_epochs):
      for batch_idx, (data,) in enumerate(X_y_loader):
          optimizer.zero_grad()
          recon_batch, mu, logvar,d = vae(data)
          loss = vam_loss(recon_batch, data, mu, logvar)
          loss.backward()
          optimizer.step()
      losses.append(loss.item())
      # if epoch % 20 == 0:
      #   print('Epoch [{}/{}], Loss: {}'.format(epoch+1, num_epochs, loss.item()))
      if len(losses)>100 and (abs(losses[-1]-losses[-2]))/abs(losses[-2]) < tol:
        print(epoch,"done")
        break

    
  #============= Generate synthetic data using the VAE model ================
  import math
  with torch.no_grad():
      z = torch.randn(math.floor(num_samples_frac*X_train_init.shape[0]), 2)
      synthetic_data_tensor = vae.decoder(z)
  synthetic_data = synthetic_data_tensor.numpy()
  # Concatenate the original and synthetic data
  X_y_combined = np.concatenate((X_y_train, synthetic_data), axis=0)
  X_train = X_y_combined[:,:-y_train_init.shape[1]]
  y_train = X_y_combined[:,-y_train_init.shape[1]:]


  #=========== LGBM with only org training data================
  print('LGBM with only org data')
  # Train an XGBoost classifier on the train set
  xgb_model = lgb.LGBMClassifier()
  xgb_model.fit(X_train_init, np.argmax(y_train_init,axis=1))

  # Predict on the test set and evaluate the accuracy
  y_pred = xgb_model.predict(X_test)
  accuracy = accuracy_score(np.argmax(y_test,axis=1), y_pred)
  print('Test Accuracy: {:.4f}'.format(accuracy))


  #=========== LGBM with org + syn training data================
  print('LGBM with org + syn data')
  # Train an XGBoost classifier on the train set
  xgb_model = lgb.LGBMClassifier()
  xgb_model.fit(X_train, np.argmax(y_train,axis=1))

  # Predict on the test set and evaluate the accuracy
  y_pred = xgb_model.predict(X_test)
  accuracy = accuracy_score(np.argmax(y_test,axis=1), y_pred)
  print('Test Accuracy: {:.4f}'.format(accuracy))

  return accuracy

## Tuning everything except D1

In [None]:
tols = [100,50,10,5,1,0.5,0.1,0.05,0.01]
num_samples_fracs = [100,50,10,5,1,0.5,0.1,0.05,0.01]

In [None]:
# D2
best_acc=0
best_params=None
for tol in tols:
  for num_samples_frac in num_samples_fracs:
    print("\n=====",tol,num_samples_frac)
    acc = method1(df2.values,tol,num_samples_frac)
    if acc>best_acc:
      best_acc=acc
      best_params=[tol,num_samples_frac]
print("Best tol and frac",best_params)


===== 100 100
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.7097

===== 100 50
101 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.6613

===== 100 10
100 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.7097

===== 100 5
100 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.6774

===== 100 1
100 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.6452

===== 100 0.5
101 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.6613

===== 100 0.1
100 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.6613

===== 100 0.05
100 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM with org + syn data
Test Accuracy: 0.7097

===== 100 0.01
100 done
LGBM with only org data
Test Accuracy: 0.6613
LGBM wi

In [None]:
# D3

best_acc=0
best_params=None
for tol in tols:
  for num_samples_frac in num_samples_fracs:
    print("\n=====",tol,num_samples_frac)
    acc = method1(df3.values,tol,num_samples_frac)
    if acc>best_acc:
      best_acc=acc
      best_params=[tol,num_samples_frac]
print("Best tol and frac",best_params)



===== 100 100
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9350

===== 100 50
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 100 10
100 done
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9550

===== 100 5
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 100 1
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 100 0.5
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 100 0.1
100 done
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 100 0.05
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 100 0.01
LGBM with only org data
Test Accuracy: 0.9950
LGBM with org + syn data
Test Accuracy: 0.9950

===== 50 100


In [None]:
# D4

best_acc=0
best_params=None
for tol in tols:
  for num_samples_frac in num_samples_fracs:
    print("\n=====",tol,num_samples_frac)
    acc = method1(df4.values,tol,num_samples_frac)
    if acc>best_acc:
      best_acc=acc
      best_params=[tol,num_samples_frac]
print("Best tol and frac",best_params)



===== 100 100
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.0539

===== 100 50
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.0000

===== 100 10
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.1737

===== 100 5
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.2455

===== 100 1
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.2515

===== 100 0.5
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.2395

===== 100 0.1
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.2335

===== 100 0.05
100 done
LGBM with only org data
Test Accuracy: 0.2395
LGBM with org + syn data
Test Accuracy: 0.2275

===== 100 0.01
100 done
LGBM with only org data
Test Accuracy: 0.239

In [None]:
# D1

tols = [10,1,0.1]
num_samples_fracs = [10,1,0.1]

best_acc=0
best_params=None
for tol in tols:
  for num_samples_frac in num_samples_fracs:
    print("\n=====",tol,num_samples_frac)
    acc = method1(df1.values,tol,num_samples_frac)
    if acc>best_acc:
      best_acc=acc
      best_params=[tol,num_samples_frac]
print("Best tol and frac",best_params)



===== 10 10
100 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0043

===== 10 1
100 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0072

===== 10 0.1
100 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0079

===== 1 10
100 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0014

===== 1 1
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0072

===== 1 0.1
100 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0072

===== 0.1 10
101 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0072

===== 0.1 1
125 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn data
Test Accuracy: 0.0036

===== 0.1 0.1
105 done
LGBM with only org data
Test Accuracy: 0.0036
LGBM with org + syn d