# Import csv dataset

In [None]:
from google.colab import files

# Upload the training-data CSV into the Colab runtime.
# Rows containing null values are dropped by the loading cell further down,
# so apply any custom missing-value handling to the file *before* uploading it.
files.upload()

In [None]:
dataset_name = '...'      # file name of the .csv uploaded above
discrete_columns = []     # names of the categorical columns that should be embedded
attributes_to_ignore = []     # columns of unique values (names, ids, ...) that carry no information and make learning harder

num_features = 15     # number of embedding dimensions learned per categorical value
num_negatives = 5     # number of negative samples drawn per positive (center, target) pair

In [None]:
import pandas as pd

train_data = pd.read_csv(dataset_name)
# Remove the ignored attributes *before* dropping nulls: a missing value in a
# column that is discarded anyway must not cost us an otherwise complete row.
train_data = train_data.drop(columns=attributes_to_ignore)
train_data = train_data.dropna().reset_index(drop=True)
# Keep only the requested discrete columns that exist in the data, in the
# order in which they appear in the file.
discrete_columns = [c for c in train_data.columns if c in discrete_columns]

# View of the categorical part of the data; this is what the embeddings are learned from.
discrete_df = train_data[discrete_columns]

# Embeddings


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import random
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset


class EmbeddingsNN(nn.Module):
    """Learn dense embeddings for categorical values with negative sampling.

    Every distinct value of every discrete column is a "word". For each row,
    the values of two different columns form a positive (center, target)
    pair; negative targets are drawn from the word frequencies raised to the
    power 3/4. The score of a pair is ``sigmoid(E[center] . theta[:, target])``
    and is trained with binary cross-entropy.

    Args:
        discrete_df: DataFrame holding only the categorical columns.
        num_features: Embedding dimension (columns of ``E``).
        num_negatives: Negative samples drawn per positive pair.
        pairs_pc: Fraction of the ordered column pairs sampled for each batch.
        batch_size: Number of DataFrame rows processed per optimisation step.
        num_epochs: Number of passes over ``discrete_df`` (default 150).
    """

    def __init__(self, discrete_df, num_features, num_negatives, pairs_pc, batch_size = 4000, num_epochs=150):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.criterion = nn.BCELoss()
        self.num_epochs = num_epochs
        self.num_negatives = num_negatives
        self.pairs_pc = pairs_pc
        self.batch_size = batch_size
        # Unused; retained (with its original spelling) for backward compatibility.
        self.negtive_samples = []
        # Keep the training data on the instance so training does not depend
        # on a module-level variable that happens to share its name.
        self.discrete_df = discrete_df
        self.discrete_columns = discrete_df.columns
        self.num_words = self.compute_num_words(discrete_df)
        self.num_features = num_features
        self.f_matrix = self.get_f_matrix(self.word_occurance_pairs)
        # All ordered pairs of distinct columns: (center column, target column).
        self.column_pairs = [
            (col1, col2)
            for col1 in self.discrete_columns
            for col2 in self.discrete_columns
            if col1 != col2
        ]

        # E is the matrix we want to learn; theta holds the target-side weights.
        self.one_hot_matrix = torch.eye(self.num_words, device=self.device)
        self.E = nn.Parameter(torch.empty(self.num_words, self.num_features, device=self.device))
        self.theta = nn.Parameter(torch.empty(self.num_features, self.num_words, device=self.device))

        nn.init.xavier_uniform_(self.E)
        nn.init.xavier_uniform_(self.theta)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001, weight_decay=1e-5)

        self.to(self.device)

    def forward(self, idx_c, idx_t, batched=False):
        """Return ``sigmoid(E[idx_c] . theta[:, idx_t])``.

        Args:
            idx_c: Center word index, or a 1-D tensor of indices when ``batched``.
            idx_t: Target word index, or a 1-D tensor of indices when ``batched``.
            batched: If True, score the pairs element-wise and return one
                probability per pair; otherwise score a single pair.
        """
        e_c = self.E[idx_c]
        theta_t = self.theta[:, idx_t]
        if batched:
            # e_c is (batch, features); theta_t is (features, batch).
            return torch.sigmoid(torch.sum(e_c * theta_t.T, dim=1))
        return torch.sigmoid(torch.dot(e_c, theta_t))

    def compute_num_words(self, discrete_df):
        """Build the vocabulary and return its size.

        Populates ``word_occurance_pairs`` (``[value, count]`` per word, in
        column order), ``values_per_column``, ``words``, ``word_to_idx`` and
        ``column_word_to_idx``.
        """
        self.word_occurance_pairs = []
        self.values_per_column = []
        # Per-column lookup: the flat ``word_to_idx`` below maps a value that
        # occurs in several columns to a single (the last) index, which would
        # silently merge unrelated words during training.
        self.column_word_to_idx = {}
        for column in self.discrete_columns:
            value_to_idx = {}
            for value, count in discrete_df[column].value_counts().items():
                value_to_idx[value] = len(self.word_occurance_pairs)
                self.word_occurance_pairs.append([value, count])
            self.column_word_to_idx[column] = value_to_idx
            self.values_per_column.append(len(value_to_idx))

        self.words = [word for word, _ in self.word_occurance_pairs]
        # NOTE(review): kept for existing callers; ambiguous when two columns share a value.
        self.word_to_idx = {word: idx for idx, word in enumerate(self.words)}
        return len(self.words)

    def get_f_matrix(self, word_occurance_pairs):
        """Return ``[[index, probability], ...]`` used for negative sampling.

        Counts are converted to relative frequencies *in place* (the second
        entry of every pair is overwritten), raised to the power 3/4 and
        renormalised so that the probabilities sum to one.
        """
        total = sum(count for _, count in word_occurance_pairs)
        if total == 0:
            # No words at all: nothing to sample from.
            return []
        for pair in word_occurance_pairs:
            pair[1] = pair[1] / total
        smoothed = [pair[1] ** (3 / 4) for pair in word_occurance_pairs]
        normaliser = sum(smoothed)
        return [[idx, weight / normaliser] for idx, weight in enumerate(smoothed)]

    def get_training_pairs(self, pairs_pc, total_batch):
        """Return an ``(N, 2)`` tensor of positive (center, target) word indices.

        A random subset of the ordered column pairs is chosen; for each chosen
        pair every row of ``total_batch`` contributes one positive example.

        Raises:
            ValueError: If fewer than two discrete columns are available.
        """
        if not self.column_pairs:
            raise ValueError("At least two discrete columns are required to build training pairs")
        # Always use at least one column pair: int() truncation would otherwise
        # select none for small column counts and leave nothing to concatenate.
        num_pairs = int(len(self.column_pairs) * pairs_pc)
        num_pairs = min(len(self.column_pairs), max(1, num_pairs))
        chosen_pairs = random.sample(self.column_pairs, k=num_pairs)

        training_pairs = []
        for center_col, target_col in chosen_pairs:
            centers = total_batch[center_col].map(self.column_word_to_idx[center_col])
            targets = total_batch[target_col].map(self.column_word_to_idx[target_col])
            training_pairs.append(torch.stack([
                torch.tensor(centers.values, device=self.device),
                torch.tensor(targets.values, device=self.device),
            ], dim=1))

        # Concatenate the pairs from all selected column combinations.
        return torch.cat(training_pairs, dim=0)

    def get_negative_samples(self, exclude_idx, num_negatives):
        """Draw ``num_negatives`` word indices per entry of ``exclude_idx``.

        Returns a ``(len(exclude_idx), num_negatives)`` tensor. Only the
        *length* of ``exclude_idx`` is used: the true targets are not removed
        from the candidates, so a negative may coincide with its positive.
        """
        numbers, probabilities = zip(*self.f_matrix)
        neg_samples = [
            random.choices(numbers, weights=probabilities, k=len(exclude_idx))
            for _ in range(num_negatives)
        ]
        return torch.tensor(neg_samples, device=self.device).T

    def fit(self):
        """Run the optimisation loop over the DataFrame given to ``__init__``."""
        data = self.discrete_df
        num_samples = len(data)

        for epoch in range(self.num_epochs):
            total_loss = 0.0

            for start in range(0, num_samples, self.batch_size):
                batch_df = data.iloc[start:start + self.batch_size]

                training_pairs = self.get_training_pairs(self.pairs_pc, batch_df)
                centers = training_pairs[:, 0]
                targets = training_pairs[:, 1]
                negatives = self.get_negative_samples(targets, self.num_negatives)

                # Repeat every center once per negative so both line up element-wise.
                neg_centers = centers.repeat_interleave(negatives.shape[1])
                neg_targets = negatives.flatten()

                self.optimizer.zero_grad()

                pos_scores = self(centers, targets, batched=True)
                pos_loss = self.criterion(pos_scores, torch.ones_like(pos_scores))

                neg_scores = self(neg_centers, neg_targets, batched=True)
                neg_loss = self.criterion(neg_scores, torch.zeros_like(neg_scores))

                loss = pos_loss + neg_loss
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    def train(self, mode=None):
        """Train the embeddings, or switch the module's training mode.

        Called without arguments this runs the optimisation loop (see
        ``fit``), as before. Called with a boolean it behaves like
        ``nn.Module.train(mode)``; this keeps ``eval()``, which internally
        calls ``self.train(False)``, working.
        """
        if mode is not None:
            return super().train(mode)
        self.fit()



In [None]:
pair_prob = 0.3  # fraction of the ordered (center, target) column pairs that is sampled for every training batch
# Build the embedding model from the categorical columns and run its training loop.
ebd = EmbeddingsNN(discrete_df, num_features, num_negatives, pair_prob)
ebd.train()

In [None]:
# Squash the learned matrix with tanh and transpose it so that every column
# holds the embedding vector of one categorical value (one row per feature).
E = torch.tanh(ebd.E.T).detach().cpu().numpy()
embeddings_df = pd.DataFrame(E, columns = ebd.words)

In [None]:
# Standardise each embedding vector (each column) to zero mean and unit standard deviation.
embeddings_df = (embeddings_df - embeddings_df.mean()) / embeddings_df.std()

In [None]:
# Normalize the embeddings into [-1, 1] by dividing by the largest absolute entry.
# The scale is deliberately NOT stored in a variable called `max`: that would
# shadow the builtin max() for the rest of the notebook session.
max_abs_value = embeddings_df.abs().max().max()
embeddings_df = (embeddings_df / max_abs_value).astype('float32')

# Apply embeddings on words
def word_to_vec(word):
    """Return the embedding vector of `word`, or None if it has no embedding.

    The lookup goes against the columns of the module-level `embeddings_df`.
    Membership is tested on the column Index directly instead of converting
    the columns to a list on every call, which made each lookup linear in the
    vocabulary size.
    """
    if word in embeddings_df.columns:
        return embeddings_df[word].values
    return None

# Construct the new dataframe with continuous and embedded columns
def construct_dataset(train_df):
    """Replace every discrete column of a copy of `train_df` by its embeddings.

    Returns the transformed copy together with a list that has one entry per
    column: the column name for discrete columns and None for all others.
    Discrete columns are those listed in the module-level `discrete_columns`.
    """
    embedded = train_df.copy()
    positions = []
    for name in embedded:
        is_discrete = name in discrete_columns
        positions.append(name if is_discrete else None)
        if is_discrete:
            # Each cell becomes the embedding vector of its categorical value.
            embedded[name] = embedded[name].apply(word_to_vec)
    return embedded, positions

# Embed the categorical columns of the cleaned training data.
new_df, cat_cols_pos = construct_dataset(train_data)

In [None]:
# Flatten every row: cells that hold an embedding vector are expanded into
# their components, scalar cells are kept as they are.
flattened_rows = []
for _, record in new_df.iterrows():
    cells = []
    for cell in record:
        if not isinstance(cell, (list, np.ndarray)):
            cells.append(cell)  # scalar value
        else:
            cells.extend(cell)  # list or array: unpack
    flattened_rows.append(cells)

new_rows = np.asarray(flattened_rows)

# eGAN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.nn import BatchNorm1d, Dropout, LeakyReLU, Linear, Module, ReLU, Sequential, functional
from torch.autograd import grad


class Residual(Module):
    """Linear -> BatchNorm1d -> ReLU block used by the generator.

    Despite its name the block returns only the transformed activations; the
    input is not added to or concatenated with the output.
    """

    def __init__(self, i, o):
        super().__init__()
        self.fc = Linear(i, o)
        self.bn = BatchNorm1d(o)
        self.relu = ReLU()

    def forward(self, input_):
        """Return ``relu(bn(fc(input_)))``."""
        return self.relu(self.bn(self.fc(input_)))


# ==== Generator ====
class Generator(nn.Module):
    """Map latent noise vectors to synthetic data rows.

    The network is a chain of ``Residual`` blocks, one per entry of
    ``generator_dims``, followed by a final linear projection to ``data_dim``.
    """

    def __init__(self, embedding_dim, generator_dims, data_dim):
        super().__init__()
        # Layer widths from the latent input through every hidden block.
        widths = [embedding_dim, *generator_dims]
        blocks = [Residual(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        self.seq = Sequential(*blocks, Linear(widths[-1], data_dim))

    def forward(self, input_):
        """Run `input_` through the generator and return the generated rows."""
        return self.seq(input_)


# ==== Discriminator ====
class Discriminator(nn.Module):
    """Critic network: three hidden Linear/LeakyReLU/Dropout stages and a linear head.

    The head outputs one unbounded score per row (no sigmoid is applied).
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        stages = []
        in_features = input_dim
        for _ in range(3):
            stages += [
                nn.Linear(in_features, hidden_dim),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ]
            in_features = hidden_dim
        stages.append(nn.Linear(hidden_dim, 1))
        self.model = nn.Sequential(*stages)

    def forward(self, x):
        """Return the critic score for every row of `x`."""
        return self.model(x)

def compute_gradient_penalty(critic, real_data, fake_data, device):
    """Return twice the mean squared deviation of the critic's gradient norm from 1.

    The gradient is evaluated at random per-row interpolations between
    `real_data` and `fake_data`. The result stays attached to the autograd
    graph so it can be back-propagated into the critic's parameters.
    """
    n_rows = real_data.size(0)
    # One mixing coefficient per row, broadcast across all features.
    mix = torch.rand(n_rows, 1, device=device).expand_as(real_data)

    blended = mix * real_data + (1 - mix) * fake_data
    blended.requires_grad_(True)

    scores = critic(blended)
    (grads,) = grad(
        outputs=scores,
        inputs=blended,
        grad_outputs=torch.ones_like(scores),
        create_graph=True,
        retain_graph=True,
    )

    norms = grads.view(n_rows, -1).norm(2, dim=1)
    return ((norms - 1) ** 2).mean() * 2


class eGAN():
    """GAN for tabular rows, trained with a Wasserstein-style critic loss.

    Args:
        embedding_dim: Size of the latent noise vector fed to the generator.
        generator_dims: Widths of the generator's hidden blocks.
        discriminator_dim: Width of the critic's hidden layers.
        discriminator_steps: Critic updates performed per generator update.
        epochs: Number of passes over the training data.
        batch_size: Rows per training batch.
        cuda: False for CPU, True for the default CUDA device, or a device
            string. CPU is used whenever CUDA is unavailable.
    """

    def __init__(self, embedding_dim=128, generator_dims=(256,512, 1024, 512, 256), discriminator_dim=256, discriminator_steps=2, epochs=600, batch_size=2000, cuda=True):
        self.embedding_dim = embedding_dim
        self.generator_dims = generator_dims
        self.discriminator_dim = discriminator_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.discriminator_steps = discriminator_steps
        # Not used by the Wasserstein losses in fit(); kept so existing code
        # that reads the attribute keeps working.
        self.criterion = nn.BCELoss()

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self.device = torch.device(device)

    def fit(self, data):
        """Train generator and critic on `data` (2-D array, one row per sample)."""
        self.data_dim = data.shape[1]
        self.data = torch.tensor(data, dtype=torch.float32).to(self.device)

        self.generator = Generator(self.embedding_dim, self.generator_dims, self.data_dim).to(self.device)
        self.discriminator = Discriminator(self.data_dim, self.discriminator_dim).to(self.device)

        self.optimizer_D = optim.Adam(self.discriminator.parameters(), lr=2e-4, betas=(0.5, 0.9))
        self.optimizer_G = optim.Adam(self.generator.parameters(), lr=1e-5, betas=(0.5, 0.9))

        data_loader = torch.utils.data.DataLoader(self.data, batch_size=self.batch_size)

        # Most recent losses; None until the corresponding update has run once.
        loss_D = None
        loss_G = None

        for epoch in range(self.epochs):
            for real_batch in data_loader:
                batch_size_curr = real_batch.size(0)
                if batch_size_curr < 2:
                    # A single-row batch cannot be used: BatchNorm1d in the
                    # generator rejects it in training mode and the variance
                    # term of the critic loss would be NaN.
                    continue

                # Train the critic.
                for _ in range(self.discriminator_steps):
                    d_real = self.discriminator(real_batch)

                    z = torch.randn(batch_size_curr, self.embedding_dim, device=self.device)
                    fake_data = self.generator(z).detach()
                    d_fake = self.discriminator(fake_data)

                    loss_real = -torch.mean(d_real)  # WGAN
                    loss_fake = torch.mean(d_fake)  # WGAN

                    gp = compute_gradient_penalty(self.discriminator, real_batch, fake_data, self.device)

                    # Wasserstein loss plus a small penalty on the difference
                    # between the variances of the real and fake scores.
                    loss_D = loss_real + loss_fake + 0.1*torch.abs(torch.var(d_real) - torch.var(d_fake))

                    self.optimizer_D.zero_grad()
                    # One backward pass over the summed objective yields the
                    # same gradients as two separate accumulating passes.
                    (loss_D + gp).backward()
                    self.optimizer_D.step()

                # Train the generator.
                z = torch.randn(batch_size_curr, self.embedding_dim, device=self.device)
                generated_data = self.generator(z)
                outputs = self.discriminator(generated_data)

                loss_G = -torch.mean(outputs)

                self.optimizer_G.zero_grad()
                loss_G.backward()
                self.optimizer_G.step()

            if epoch % 25 == 0 and loss_G is not None:
                # The reported critic loss excludes the gradient penalty.
                d_text = f"{loss_D.item():.4f}" if loss_D is not None else "n/a"
                print(f"Epoch {epoch} | Loss D: {d_text} | Loss G: {loss_G.item():.4f}")

    def sample(self, num_samples):
        """Generate `num_samples` rows as a NumPy array; requires a prior fit().

        NOTE(review): the generator is left in whatever mode it is in (training
        mode after fit()), so BatchNorm1d uses batch statistics here and
        `num_samples` presumably has to be at least 2 - confirm.
        """
        with torch.no_grad():
            z = torch.randn(num_samples, self.embedding_dim, device=self.device)
            generated_data = self.generator(z)
        return generated_data.cpu().numpy()

In [None]:
#Training
from sklearn.preprocessing import StandardScaler

# Log-transform the flattened rows before scaling.
# NOTE(review): log1p(x + 0.01) is undefined for x <= -1.01, so continuous
# columns with values at or below that produce -inf/NaN - confirm the data range.
l_nr = np.log1p(new_rows+0.01)  #For smoothing. It compresses large values and expands small ones

# Standardise every column; the fitted scaler is needed again to undo the
# transformation on generated samples.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(l_nr)

# Train the GAN with its default hyper-parameters.
egan = eGAN()
egan.fit(X_scaled)

In [None]:
def get_discrete_values(tmp, start, end):
    """Map generated embedding vectors back to their closest categorical value.

    Args:
        tmp: 2-D array with one generated embedding vector per row.
        start, end: Column slice of the module-level `embeddings_df` that
            holds the candidate values of the column being decoded.

    Returns:
        A list with, for every row of `tmp`, the label of the candidate whose
        embedding has the highest cosine similarity (the first one on ties),
        or None when no valid candidate exists.
    """
    candidates = embeddings_df.iloc[:, start:end]
    labels = list(candidates.columns)
    queries = np.asarray(tmp, dtype=np.float64)
    if not labels or len(queries) == 0:
        return [None] * len(queries)

    # One candidate vector per row: (n_values, n_features).
    vectors = candidates.to_numpy(dtype=np.float64).T

    # Cosine similarity of every query against every candidate in one matrix
    # product instead of a Python double loop that built a loss module per
    # comparison. The epsilon guards against zero-length vectors.
    eps = 1e-12
    dots = queries @ vectors.T
    query_sq = (queries ** 2).sum(axis=1, keepdims=True) + eps
    vector_sq = (vectors ** 2).sum(axis=1) + eps
    cosine = dots / np.sqrt(query_sq * vector_sq)

    # NaN similarities never win; a row without any valid score yields None.
    valid = ~np.isnan(cosine)
    best = np.where(valid, cosine, -np.inf).argmax(axis=1)
    return [
        labels[col] if row_valid.any() else None
        for col, row_valid in zip(best, valid)
    ]

def create_final_df(sample, columns, discrete_columns, num_features):
    """Rebuild a table with the original column layout from flattened samples.

    Continuous columns take one column of `sample`; every discrete column
    consumes `num_features` consecutive columns, which are decoded back to a
    categorical value with `get_discrete_values`. The candidate values of the
    k-th discrete column are located through `ebd.values_per_column`.
    """
    # Cumulative offsets of each discrete column's values inside embeddings_df.
    value_offsets = np.concatenate([np.zeros(1).astype(int), np.cumsum(ebd.values_per_column)])

    result = pd.DataFrame()
    cursor = 0          # next unread column of `sample`
    discrete_seen = 0   # number of discrete columns decoded so far
    for name in columns:
        if name not in discrete_columns:
            result[name] = sample[:, cursor]
            cursor += 1
            continue

        block = sample[:, cursor:cursor + num_features]
        first = value_offsets[discrete_seen]
        last = value_offsets[discrete_seen + 1]
        decoded = get_discrete_values(block, first, last)
        discrete_seen += 1
        result[name] = decoded
        cursor += num_features
    return result

In [None]:
d = egan.sample(2000)

# Undo the training-time preprocessing in reverse order: first the
# standardisation, then the log transform. Training used log1p(x + 0.01), so
# the exact inverse is expm1(y) - 0.01; without the shift every generated
# value would be off by 0.01.
samples = scaler.inverse_transform(d)
samples = np.expm1(samples) - 0.01

In [None]:
# Decode the generated samples back into a table with the original columns.
final_df = create_final_df(samples, new_df.columns, discrete_columns, num_features)

In [None]:
# Display the synthetic table as the cell output.
final_df