# Import csv dataset


In [None]:
from google.colab import files

# Upload the .csv dataset into the Colab runtime so the cells below can read it by file name.
files.upload()

In [None]:
dataset_name = '...'      # name of the .csv file uploaded above
# names of the categorical columns that will be replaced by learned embeddings
discrete_columns = ['...']
# columns dropped before training: identifier-like attributes (names, ids) whose
# values are unique per row carry no information and make learning harder
attributes_to_ignore = ['...']

num_features = 15    # length of the embedding vector learned for every categorical value
num_negatives = 5    # negative samples drawn per positive training pair

In [None]:
import pandas as pd

# Load the dataset, discard incomplete rows (re-numbering the index afterwards),
# and only then remove the attributes that should be ignored.
train_data = (
    pd.read_csv(dataset_name)
    .dropna()
    .reset_index(drop=True)
    .drop(columns=attributes_to_ignore)
)
# Keep the declared discrete columns that are present, in the DataFrame's column order.
discrete_columns = [column for column in train_data.columns if column in discrete_columns]

discrete_df = train_data[discrete_columns]

# Embeddings


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import random
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset


class EmbeddingsNN(nn.Module):
    """Learn dense embeddings for the categorical values of a DataFrame.

    Every distinct value of every column of ``discrete_df`` is treated as a
    "word" with one row in ``E`` (its embedding) and one column in ``theta``
    (its context weights). Two values that occur in the same row form a
    positive pair; negatives are drawn from the value frequencies raised to
    the power 3/4. The model is trained with binary cross-entropy on
    ``sigmoid(E[center] . theta[:, target])``.

    Args:
        discrete_df: DataFrame holding only the categorical columns.
        num_features: Length of each embedding vector.
        num_negatives: Negative samples drawn per positive pair.
        pairs_pc: Fraction of the ordered column pairs sampled for each batch.
        batch_size: Number of DataFrame rows per training batch.
    """

    def __init__(self, discrete_df, num_features, num_negatives, pairs_pc, batch_size = 4000):
        super().__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.criterion = nn.BCELoss()
        self.num_epochs = 150
        self.num_negatives = num_negatives
        self.pairs_pc = pairs_pc
        self.batch_size = batch_size
        self.negtive_samples = []  # unused; kept so existing attribute access keeps working
        # Keep the training data on the instance so train() does not depend on a global.
        self.discrete_df = discrete_df
        self.discrete_columns = discrete_df.columns
        self.num_words = self.compute_num_words(discrete_df)
        self.num_features = num_features
        self.f_matrix = self.get_f_matrix(self.word_occurance_pairs)
        # All ordered (center column, target column) combinations.
        self.column_pairs = [
            (col1, col2)
            for col1 in self.discrete_columns
            for col2 in self.discrete_columns
            if col1 != col2
        ]

        # E is the matrix we want to learn
        # NOTE: one_hot_matrix is not used by forward(); it is kept only as an attribute.
        self.one_hot_matrix = torch.eye(self.num_words, device=self.device)
        self.E = nn.Parameter(torch.empty(self.num_words, self.num_features, device=self.device))  # Matrix E (learnable)
        self.theta = nn.Parameter(torch.empty(self.num_features, self.num_words, device=self.device))  # Matrix θ (learnable)

        nn.init.xavier_uniform_(self.E)
        nn.init.xavier_uniform_(self.theta)

        self.optimizer = torch.optim.Adam(self.parameters(), lr=0.001, weight_decay=1e-5)

        self.to(self.device)

    def forward(self, idx_c, idx_t, batched=False):
        """Return sigmoid(e_c . theta_t) for center index(es) and target index(es).

        With ``batched=True`` both arguments are 1-D index tensors of equal
        length and one score per pair is returned; otherwise both are single
        indices and a scalar score is returned.
        """
        e_c = self.E[idx_c]
        theta_t = self.theta[:, idx_t]
        if batched:
            return torch.sigmoid(torch.sum((e_c * (theta_t.T)), dim=1))
        return torch.sigmoid(torch.dot(e_c, theta_t))

    def compute_num_words(self, discrete_df):
        """Build the vocabulary from ``discrete_df`` and return its size.

        Sets ``word_occurance_pairs`` ([value, count] per word, grouped by
        column in column order), ``values_per_column``, ``words``,
        ``word_to_idx`` and ``column_word_to_idx``.
        """
        self.word_occurance_pairs = []
        self.values_per_column = []
        # Per-column lookup tables: a value that appears in several columns is a
        # separate word in each of them, which the flat word_to_idx cannot express
        # (later columns would overwrite earlier ones).
        self.column_word_to_idx = {}
        for column in self.discrete_columns:
            counts = discrete_df[column].value_counts()
            offset = len(self.word_occurance_pairs)
            self.column_word_to_idx[column] = {
                value: offset + position for position, value in enumerate(counts.index)
            }
            self.word_occurance_pairs.extend([value, count] for value, count in counts.items())
            self.values_per_column.append(len(counts))

        self.words = [word for word, count in self.word_occurance_pairs]
        self.word_to_idx = {word: idx for idx, word in enumerate(self.words)}
        return len(self.words)

    def get_f_matrix(self, word_occurance_pairs):
        """Return ``[index, probability]`` rows of the negative-sampling distribution.

        The counts in ``word_occurance_pairs`` are converted to relative
        frequencies IN PLACE; each probability is ``freq ** 0.75`` divided by
        the sum of those powers over all words.
        """
        total = sum(pair[1] for pair in word_occurance_pairs)
        for pair in word_occurance_pairs:
            pair[1] = pair[1] / total
        sum_of_fs = sum(pair[1] ** (3 / 4) for pair in word_occurance_pairs)
        return [
            [idx, pair[1] ** (3 / 4) / sum_of_fs]
            for idx, pair in enumerate(word_occurance_pairs)
        ]

    def get_training_pairs(self, pairs_pc, total_batch):
        """Return an (N, 2) tensor of (center index, target index) positive pairs.

        A random subset of the ordered column pairs is drawn (fraction
        ``pairs_pc``, but always at least one and at most all of them); for
        each drawn pair every row of ``total_batch`` contributes one pair.

        Raises:
            ValueError: if there are fewer than two discrete columns.
        """
        if not self.column_pairs:
            raise ValueError("At least two discrete columns are required to build training pairs.")
        # int() truncation can yield 0 for few columns, which used to crash in torch.cat.
        num_pairs = int(len(self.column_pairs) * pairs_pc)
        num_pairs = min(len(self.column_pairs), max(1, num_pairs))
        pairs_set = random.sample(self.column_pairs, k=num_pairs)

        training_pairs = []
        for c, t in pairs_set:
            centers = total_batch[c].map(self.column_word_to_idx[c])
            targets = total_batch[t].map(self.column_word_to_idx[t])
            pairs = torch.stack([
                torch.tensor(centers.values, device=self.device),
                torch.tensor(targets.values, device=self.device)
            ], dim=1)
            training_pairs.append(pairs)

        # Concatenate all pairs from all column combinations
        return torch.cat(training_pairs, dim=0)

    def get_negative_samples(self, exclude_idx, num_negatives):
        """Draw ``num_negatives`` word indices per entry of ``exclude_idx``.

        Returns a tensor of shape ``(len(exclude_idx), num_negatives)``.
        NOTE: only the length of ``exclude_idx`` is used; the indices it holds
        are not actually excluded from the draw.
        """
        numbers, probabilities = zip(*self.f_matrix)
        batch_len = len(exclude_idx)
        # One call draws the same sequence as num_negatives separate calls, but
        # builds the cumulative weights only once.
        draws = random.choices(numbers, weights=probabilities, k=num_negatives * batch_len)
        samples = torch.tensor(draws, dtype=torch.long, device=self.device)
        return samples.reshape(num_negatives, batch_len).T

    def train(self, mode=None):
        """Fit the embeddings, or switch the module's training mode.

        Called without arguments (``ebd.train()``) this runs the optimisation
        loop over the DataFrame given to the constructor and returns ``None``.
        Called with a boolean it behaves like ``nn.Module.train(mode)``, so
        ``eval()`` (which calls ``train(False)``) keeps working even though
        this method overrides the base-class one.
        """
        if mode is not None:
            return super().train(mode)

        data = self.discrete_df
        num_samples = len(data)

        for epoch in range(self.num_epochs):
            total_loss = 0

            for start in range(0, num_samples, self.batch_size):
                batch_df = data.iloc[start:start + self.batch_size]

                training_pairs = self.get_training_pairs(self.pairs_pc, batch_df)
                centers = training_pairs[:, 0]
                targets = training_pairs[:, 1]
                negatives = self.get_negative_samples(targets, self.num_negatives)

                # Repeat each center once per negative so it lines up with the
                # row-major flattening of the (pairs, num_negatives) sample matrix.
                neg_centers = centers.repeat_interleave(negatives.shape[1])
                neg_targets = negatives.flatten()

                self.optimizer.zero_grad()

                pos_scores = self.forward(centers, targets, batched=True)
                pos_loss = self.criterion(pos_scores, torch.ones_like(pos_scores))

                neg_scores = self.forward(neg_centers, neg_targets, batched=True)
                neg_loss = self.criterion(neg_scores, torch.zeros_like(neg_scores))

                loss = (pos_loss + neg_loss)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

In [None]:
pair_prob = 0.3  # fraction of the ordered column pairs that is randomly drawn for each training batch
ebd = EmbeddingsNN(discrete_df, num_features, num_negatives, pair_prob)
ebd.train()

In [None]:
# Squash the learned matrix with tanh and transpose it so that every column
# holds the embedding vector of one word (columns are labelled with the words).
E = torch.tanh(ebd.E.T).detach().cpu().numpy()
embeddings_df = pd.DataFrame(E, columns = ebd.words)

In [None]:
# Standardize each column (one word's embedding) to zero mean and unit standard deviation.
embeddings_df = (embeddings_df - embeddings_df.mean()) / embeddings_df.std()

In [None]:
# Normalize the embeddings into [-1, 1] by dividing by the largest absolute value.
# (Named max_abs so the builtin max() is not shadowed for the rest of the notebook.)
max_abs = embeddings_df.abs().max().max()
embeddings_df = (embeddings_df / max_abs).astype('float32')

# Apply embeddings on words
def word_to_vec(word):
    """Return the embedding vector stored for ``word`` in ``embeddings_df``.

    Returns ``None`` when ``word`` is not one of the embedded words.
    """
    # Membership is tested on the column index directly instead of rebuilding a
    # Python list of all words on every call.
    if word in embeddings_df.columns:
        return embeddings_df[word].values
    return None

# Construct the new dataframe with continuous and embedded columns
def construct_dataset(_df):
    """Replace every discrete column of ``_df`` by its values' embedding vectors.

    ``_df`` is modified in place and also returned. The second return value
    has one entry per column: the column name for discrete columns and
    ``None`` for all others.
    """
    cat_col_pos = []
    for column in _df:
        is_discrete = column in discrete_columns
        if is_discrete:
            _df[column] = _df[column].apply(word_to_vec)
        cat_col_pos.append(column if is_discrete else None)
    return _df, cat_col_pos

# construct_dataset overwrites columns in place, so pass a copy to keep train_data unchanged.
new_df, cat_cols_pos = construct_dataset(train_data.copy())

In [None]:
# Flatten every row into one numeric vector: embedded cells (lists/arrays) are
# spliced in element by element, scalar cells are kept as a single entry.
new_rows = []
for _, record in new_df.iterrows():
    flattened = []
    for cell in record:
        is_vector = isinstance(cell, (list, np.ndarray))
        flattened += list(cell) if is_vector else [cell]
    new_rows.append(flattened)

new_rows = np.array(new_rows)

# DDPM

Based on Algorithm 1 (Training) and Algorithm 2 (Sampling) of https://arxiv.org/pdf/2006.11239 (page 4).

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math



class DenoiseMLP(nn.Module):
    """MLP that predicts the noise contained in a noised sample at timestep t.

    Args:
        data_dim: Width of a data row (input and output size).
        time_dim: Width of the learned timestep embedding.
        time_scale: Value the raw timestep is divided by before it is
            embedded. Defaults to 1000, the constant that used to be
            hard-coded; pass the diffusion length T to map timesteps to [0, 1).
    """

    def __init__(self, data_dim, time_dim, time_scale=1000):
        super().__init__()
        self.time_scale = time_scale
        # Embeds the scalar (normalized) timestep into a time_dim-sized vector.
        self.time_embed = nn.Sequential(
            nn.Linear(1, time_dim), nn.ReLU(),
            nn.Linear(time_dim, time_dim)
        )
        # Denoiser trunk; operates on the data row concatenated with the time embedding.
        self.net = nn.Sequential(
            nn.Linear(data_dim + time_dim, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.LayerNorm(1024),
            nn.Linear(1024, 1024),
            nn.ReLU(),
            nn.LayerNorm(1024),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Linear(512, data_dim),
        )

    def forward(self, x, t):
        """Return the predicted noise for batch ``x`` (rows of width data_dim) at timesteps ``t``."""
        t = t.view(-1, 1).float() / self.time_scale  # Normalize timestep
        time_emb = self.time_embed(t)
        x = torch.cat([x, time_emb], dim=1)
        return self.net(x)


class eDDPM(nn.Module):
    """Denoising diffusion model for fixed-width numeric rows.

    Uses a linear beta schedule from 1e-4 to 0.02 over ``T`` steps and a
    ``DenoiseMLP`` that is trained to predict the noise added by the forward
    process.

    Args:
        data_dim: Width of a data row.
        time_dim: Width of the denoiser's timestep embedding.
        T: Number of diffusion steps.
        lr: Learning rate of the Adam optimizer.
        cuda: ``False`` for CPU, ``True`` for the default CUDA device, or a
            device string; falls back to CPU when CUDA is unavailable.
    """

    def __init__(self, data_dim, time_dim=128, T=400, lr=1e-4, cuda=True):
        super().__init__()
        self.T = T
        self.lr = lr
        self.data_dim = data_dim

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'
        self.device = torch.device(device)

        # Noise schedule and the derived quantities used by q_sample() and sample().
        self.betas = torch.linspace(1e-4, 0.02, T).to(self.device)
        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)

        self.denoiser = DenoiseMLP(data_dim, time_dim).to(self.device)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)

    def q_sample(self, x0, t, noise=None):
        """Return x_t: ``x0`` noised to the timesteps ``t`` (one timestep per row)."""
        if noise is None:
            noise = torch.randn_like(x0)
        return self.sqrt_alphas_cumprod[t].unsqueeze(1) * x0 + self.sqrt_one_minus_alphas_cumprod[t].unsqueeze(1) * noise

    def train(self, data=None, num_epochs=1000, batch_size=4000):
        """Fit the denoiser on ``data``, or switch the module's training mode.

        Args:
            data: 2-D numpy array of training rows. When it is omitted or a
                boolean, the call is forwarded to ``nn.Module.train`` so that
                ``eval()`` (which calls ``train(False)``) keeps working even
                though this method overrides the base-class one.
            num_epochs: Number of passes over ``data``.
            batch_size: Rows per optimisation step.

        Raises:
            ValueError: if ``data`` contains no rows.
        """
        if data is None or isinstance(data, bool):
            return super().train(True if data is None else data)

        data = torch.from_numpy(data).float().to(self.device)
        if data.shape[0] == 0:
            raise ValueError("Cannot train on an empty dataset.")

        # Reuse the optimizer created in __init__ (so repeated calls continue
        # training), but honour a learning rate changed after construction.
        optimizer = self.optimizer
        for group in optimizer.param_groups:
            group['lr'] = self.lr
        data_loader = torch.utils.data.DataLoader(data, batch_size)

        # Timesteps are drawn with linearly increasing weight, i.e. later
        # timesteps are sampled more often; the weights never change, so build them once.
        weights = torch.linspace(0.1, 1.0, self.T)

        for epoch in range(num_epochs):
            for batch in data_loader:
                t = torch.multinomial(weights, batch.shape[0], replacement=True).to(self.device)
                noise = torch.randn_like(batch)
                xt = self.q_sample(batch, t, noise)

                pred_noise = self.denoiser(xt, t)
                loss = F.mse_loss(pred_noise, noise)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if epoch % 20 == 0:
                # Reports the loss of the last batch of the epoch.
                print(f"Epoch {epoch}: loss {loss.item():.4f}")

    def sample(self, num_samples):
        """Generate ``num_samples`` rows by running the reverse process from pure noise."""
        with torch.no_grad():
            x_t = torch.randn((num_samples, self.data_dim), device=self.device)
            for t in reversed(range(self.T)):
                t_tensor = torch.full((num_samples,), t, device=self.device)
                pred_noise = self.denoiser(x_t, t_tensor)
                beta_t = self.betas[t]
                alpha_t = self.alphas[t]
                alpha_bar_t = self.alphas_cumprod[t]
                sigma_t = torch.sqrt(beta_t)

                # Mean of the reverse step given the predicted noise.
                x_t = (1 / torch.sqrt(alpha_t)) * (x_t - (1 - alpha_t) / torch.sqrt(1 - alpha_bar_t) * pred_noise)
                # Fresh noise is added on every step except the final one (t == 0).
                if t > 0:
                    x_t += sigma_t * torch.randn_like(x_t)
            return x_t

In [None]:
from sklearn.preprocessing import StandardScaler

# The 0.01 offset keeps inputs equal to -1 (the lower bound of the normalized
# embeddings) inside log1p's domain.
# NOTE(review): any input <= -1.01 still turns into NaN here, and the inverse
# transform applied to generated samples has to undo the same 0.01 offset — confirm.
l_nr = np.log1p(new_rows+0.01)  # For smoothing: compresses large values and expands small ones

# Standardize every feature to zero mean / unit variance before diffusion training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(l_nr)

# One model input dimension per flattened feature.
eddpm = eDDPM(l_nr.shape[1])
eddpm.train(X_scaled)

In [None]:
# Draw synthetic rows in the scaled training space.
d = eddpm.sample(5000)
d = d.detach().cpu().numpy()

# Undo the preprocessing in reverse order: first the standardization, then
# log1p(x + 0.01) — its inverse is expm1(y) - 0.01, so the offset must be removed too.
samples = scaler.inverse_transform(d)
samples = np.expm1(samples) - 0.01

In [None]:
def get_discrete_values(tmp, start, end):
    """Map generated embedding vectors back to the closest categorical value.

    Args:
        tmp: 2-D array with one generated embedding vector per row.
        start, end: Positional column range of ``embeddings_df`` that holds
            the candidate values (the words of one discrete column).

    Returns:
        A list with one entry per row of ``tmp``: the label of the candidate
        whose embedding has the highest cosine similarity to that row (the
        first one on ties), or ``None`` when no candidate yields a valid
        similarity.
    """
    candidates = embeddings_df.iloc[:, start:end]
    labels = list(candidates.columns)
    queries = np.asarray(tmp, dtype=np.float64)
    if not labels:
        return [None] * len(queries)

    vectors = candidates.to_numpy(dtype=np.float64).T  # one candidate embedding per row

    # Cosine similarity of every query against every candidate in one matrix
    # product, instead of one loss-module call per (row, candidate) pair.
    # The epsilon mirrors the one used by torch's cosine embedding loss.
    eps = 1e-12
    query_sq = (queries ** 2).sum(axis=1) + eps
    vector_sq = (vectors ** 2).sum(axis=1) + eps
    similarity = (queries @ vectors.T) / np.sqrt(np.outer(query_sq, vector_sq))

    # Invalid similarities (NaN) must never win the comparison.
    similarity = np.where(np.isnan(similarity), -np.inf, similarity)
    best = similarity.argmax(axis=1)
    return [
        labels[col] if np.isfinite(similarity[row, col]) else None
        for row, col in enumerate(best)
    ]


def create_final_df(sample, columns, discrete_columns, num_features):
    """Rebuild a tabular DataFrame from flattened generated rows.

    Walks ``columns`` in order: a discrete column consumes ``num_features``
    consecutive entries of each row of ``sample`` and is decoded with
    ``get_discrete_values``; any other column consumes a single entry, which
    is copied unchanged.
    """
    # Boundaries of each discrete column's words inside embeddings_df:
    # column k owns the positional range [value_offsets[k], value_offsets[k + 1]).
    value_offsets = np.concatenate(([0], np.cumsum(ebd.values_per_column)))

    result = pd.DataFrame()
    feature_pos = 0     # next unread position within a sample row
    discrete_seen = 0   # number of discrete columns decoded so far
    for name in columns:
        if name not in discrete_columns:
            result[name] = sample[:, feature_pos]
            feature_pos += 1
            continue

        embedded = sample[:, feature_pos:feature_pos + num_features]
        result[name] = get_discrete_values(
            embedded, value_offsets[discrete_seen], value_offsets[discrete_seen + 1]
        )
        discrete_seen += 1
        feature_pos += num_features
    return result

In [None]:
# Decode the generated samples back into a table with the column layout of new_df.
final_df = create_final_df(samples, new_df.columns, discrete_columns, num_features)

In [None]:
final_df