In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import pearsonr, ks_2samp
from scipy.spatial.distance import jensenshannon
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier

In [2]:
def cuda_check():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return device

In [3]:
def load_diabetes_data(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

In [42]:
def load_data_v2(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values

    # Identify numerical and categorical columns
    binary_categorical_cols = [col for col in df_base.columns if df_base[col].nunique() == 2 and df_base[col].dtype == 'int64']
    continuous_numerical_cols = [col for col in df_base.columns if col not in binary_categorical_cols and df_base[col].dtype in ['float64', 'int64']]

    continuous_data = df_base[continuous_numerical_cols].values.astype('float32')
    standardizer = preprocessing.StandardScaler()
    continuous_data = standardizer.fit_transform(continuous_data)

    binary_data = df_base[binary_categorical_cols].values.astype('float32')

    x = np.hstack((continuous_data, binary_data))
    # Convert to torch tensor and move to device
    x_train = torch.from_numpy(x).to(device)
    return x_train, standardizer, df_target, continuous_numerical_cols, binary_categorical_cols


In [43]:
def inverse_transform_custom(x_tensor, standardizer, continuous_numerical_cols, binary_categorical_cols):
    print(continuous_numerical_cols, binary_categorical_cols)
    x_array = x_tensor
    df = pd.DataFrame(x_array, columns=continuous_numerical_cols + binary_categorical_cols)
    # Separate continuous and binary data
    continuous_data = df[continuous_numerical_cols].values
    binary_data = df[binary_categorical_cols].values
    # Apply inverse transformation only to continuous data
    continuous_data = standardizer.inverse_transform(continuous_data)
    
    # Reconstruct the dataframe to maintain original order
    continuous_df = pd.DataFrame(continuous_data, columns=continuous_numerical_cols)
    binary_df = pd.DataFrame(binary_data, columns=binary_categorical_cols)

    processed_df = pd.concat([continuous_df, binary_df], axis=1)
    processed_df = processed_df[df.columns]

    return processed_df.values

In [44]:
def load_cardio_data(path, sep=";"):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

In [45]:
def load_bank_data(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

In [46]:
def load_abalone_data(path, sep=","):
    device = cuda_check()
    ont_hot_encoder_abalone = OneHotEncoder(sparse_output=False)
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    one_hot_encoded = ont_hot_encoder_abalone.fit_transform(df[categorical_columns])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=ont_hot_encoder_abalone.get_feature_names_out(categorical_columns))

    df_encoded = pd.concat([one_hot_df, df], axis=1)
    # Drop the original categorical columns
    df= df_encoded.drop(categorical_columns, axis=1)

    # Concatenate the one-hot encoded columns with the original DataFrame
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target, ont_hot_encoder_abalone

In [47]:
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    def __init__(self, path, data_loader_func ,sep=","):
        data = data_loader_func(path, sep)
        self.x, self.standardizer, self.outcome, self.continuous_numerical_cols, self.binary_categorical_cols = data
        self.len=self.x.shape[0]
            
    def __getitem__(self,index):
        return self.x[index]
    def __len__(self):
        return self.len

In [48]:
class Autoencoder(nn.Module):
    def __init__(self,D_in,H=50,H2=12,latent_dim=3):

        #Encoder
        super(Autoencoder,self).__init__()
        self.linear1=nn.Linear(D_in,H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2=nn.Linear(H,H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3=nn.Linear(H2,H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

#         # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
#        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

#         # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
#         self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
#         self.fc_bn4 = nn.BatchNorm1d(H2)

#         # Decoder
        self.linear4=nn.Linear(H2,H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5=nn.Linear(H2,H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6=nn.Linear(H,D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.gelu = nn.GELU()

    def encode(self, x):
        lin1 = self.gelu(self.lin_bn1(self.linear1(x)))
        lin2 = self.gelu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.gelu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.fc1(lin3))

        r1 = self.fc21(fc1) # Generating mu
        r2 = self.fc22(fc1) # Generating sigma

        return r1, r2

    def reparameterize(self, mu, logvar):
        if self.training:
            std = logvar.mul(0.5).exp_() # Convert it to std deviation
            eps = Variable(std.data.new(std.size()).normal_()) # Generate a noise of same size as std
            return eps.mul(std).add_(mu) # Perform reparameterization
        else:
            return mu

    def decode(self, z):
        fc3 = self.gelu(self.fc3(z)) # Not sure why these two are required. 
        fc4 = self.gelu(self.fc4(fc3))#.view(128, -1)

        lin4 = self.gelu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.gelu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))



    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        # self.decode(z) ist später recon_batch, mu ist mu und logvar ist logvar
        return self.decode(z), mu, logvar
    
    def embed(self,x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return z
    

In [49]:
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    # x_recon is the reconstruction batch created in the forward pass of the model, x is the original x batch, mu is mu, and logvar is logvar
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return loss_MSE + loss_KLD

## Declaring the Autoencoder

In [50]:
def declare_model(data_set, H, H2):
    device = cuda_check()
    D_in = data_set.x.shape[1]
    model = Autoencoder(D_in, H, H2).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_mse = customLoss()
    return model, loss_mse, optimizer

## Trian the Client

In [51]:
def train_encoder(DATA_PATH, loader_func, sep=","):
    device = cuda_check()
    data_set=DataBuilder(DATA_PATH, loader_func , sep)
    trainloader=DataLoader(dataset=data_set,batch_size=32)

    model, loss_mse, optimizer = declare_model(data_set, 50, 12)
    # Refactor
    epochs = 1000
    train_losses = []
    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0
        for batch_idx, data in enumerate(trainloader):
            data = data.to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(data)
            loss = loss_mse(recon_batch, data, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        if epoch % 2 == 0:
            print('====> Epoch: {} Average loss: {:.4f}'.format(
                epoch, train_loss / len(trainloader.dataset)))
            train_losses.append(train_loss / len(trainloader.dataset))
    return model, trainloader.dataset

In [92]:
model, standardizer = train_encoder('Data/diabetes.csv', load_diabetes_data, ",")

====> Epoch: 2 Average loss: 15.2255
====> Epoch: 4 Average loss: 13.2242
====> Epoch: 6 Average loss: 11.6673
====> Epoch: 8 Average loss: 10.8127
====> Epoch: 10 Average loss: 10.0439
====> Epoch: 12 Average loss: 9.6780
====> Epoch: 14 Average loss: 9.1477
====> Epoch: 16 Average loss: 8.7563
====> Epoch: 18 Average loss: 8.3776
====> Epoch: 20 Average loss: 8.1180
====> Epoch: 22 Average loss: 7.8314
====> Epoch: 24 Average loss: 7.7385
====> Epoch: 26 Average loss: 7.5789
====> Epoch: 28 Average loss: 7.3782
====> Epoch: 30 Average loss: 7.1825
====> Epoch: 32 Average loss: 7.1746
====> Epoch: 34 Average loss: 7.0268
====> Epoch: 36 Average loss: 6.9396
====> Epoch: 38 Average loss: 6.7982
====> Epoch: 40 Average loss: 6.8535
====> Epoch: 42 Average loss: 6.7480
====> Epoch: 44 Average loss: 6.5847
====> Epoch: 46 Average loss: 6.6423
====> Epoch: 48 Average loss: 6.5244
====> Epoch: 50 Average loss: 6.5967
====> Epoch: 52 Average loss: 6.4165
====> Epoch: 54 Average loss: 6.3871


In [52]:
def generate_latent_for_diabetes_ds(model, path="latent_data/diabetes_latent.csv"):
    DATA_PATH = "Data/diabetes.csv"
    df = load_diabetes_data(DATA_PATH, sep=",")
    actual_data = df[0]
    outcomes = df[2]
    outcomes_numeric = [1 if outcome == "b'tested_positive'" else 0 for outcome in outcomes]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes_numeric)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv(path, index=False)

In [94]:
generate_latent_for_diabetes_ds(model)

In [53]:
def reconstruction_of_latent(model, npy_x_data_file_path, npy_y_data_file_path, dataset):
    generated_latent_x = np.load(npy_x_data_file_path)
    generaated_latent_y = np.load(npy_y_data_file_path)
    generated_torch_data = torch.from_numpy(generated_latent_x).float()

    z = model.decode(generated_torch_data)

    # generated_data_x = standardizer.inverse_transform(z.cpu().detach().numpy())
    generated_data_x = inverse_transform_custom(z.detach().numpy(), dataset.standardizer , dataset.continuous_numerical_cols, dataset.binary_categorical_cols)
    return generated_data_x

In [314]:
x_real = load_diabetes_data("Data/diabetes.csv")
x_real = x_real[1].inverse_transform(x_real[0].cpu().detach().numpy())
x_real.shape

(768, 8)

In [315]:
x_syn = reconstruction_of_latent(model, 'X_num_train.npy', 'y_diabetes_train.npy', standardizer)

In [316]:
x_syn.shape

(10000, 2)

In [54]:
def compute_categorical_similarity(col_real, col_synthetic):
    # Compute Theil's U for categorical features
    p_real = pd.Series(col_real).value_counts(normalize=True)
    p_synthetic = pd.Series(col_synthetic).value_counts(normalize=True)
    u = (p_real * np.log(p_real / p_synthetic)).sum()
    return 1 - u
def column_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data, synthetic_data):
        correlation, _ = pearsonr(col_real, col_synthetic)
        similarity = correlation
        similarities.append(similarity)
    return np.mean(similarities)
def correlation_similarity(real_data, synthetic_data):
    real_corr = np.corrcoef(real_data, rowvar=False)
    synthetic_corr = np.corrcoef(synthetic_data, rowvar=False)
    correlation, _ = pearsonr(real_corr.flatten(), synthetic_corr.flatten())
    return correlation
def jensen_shannon_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute probability distributions and Jensen-Shannon divergence
        p_real = np.histogram(col_real, bins=10, density=True)[0]
        p_synthetic = np.histogram(col_synthetic, bins=10, density=True)[0]
        similarity = 1 - jensenshannon(p_real, p_synthetic)
        similarities.append(similarity)
    return np.mean(similarities)
def kolmogorov_smirnov_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute cumulative distributions and Kolmogorov-Smirnov distance
        similarity, _ = ks_2samp(col_real, col_synthetic)
        similarity = 1 - similarity
        similarities.append(similarity)
    return np.mean(similarities)
def propensity_mean_absolute_similarity(real_data, synthetic_data):
    # Train XGBoost classifier to discriminate between real and synthetic samples
    X = np.vstack([real_data, synthetic_data])
    y = np.concatenate([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    # Compute mean absolute error of classifier probabilities
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]
    error = mean_absolute_error(y_test, y_pred_proba)
    return 1 - error

In [55]:
def resemblance_measure(real_data, synthetic_data):
    resemblance_score = (
        column_similarity(real_data, synthetic_data) +
        correlation_similarity(real_data, synthetic_data) +
        jensen_shannon_similarity(real_data, synthetic_data) +
        kolmogorov_smirnov_similarity(real_data, synthetic_data) +
        propensity_mean_absolute_similarity(real_data, synthetic_data)
    ) / 5
    print("Resemblance Score:", resemblance_score)

In [309]:
real_data = np.random.randn(100, 5)
synthetic_data = np.random.randn(100, 5)
propensity_mean_absolute_similarity(synthetic_data, synthetic_data)

0.10504327449016271

In [293]:
resemblance_measure(x_syn[:768], x_real)

Resemblance Score: 0.7588793579549271


## Cardio Data

In [103]:
model_cardio, standardizer_cardio = train_encoder('Data/cardio_train.csv', load_diabetes_data, ";")

====> Epoch: 2 Average loss: 9.8872
====> Epoch: 4 Average loss: 9.6558
====> Epoch: 6 Average loss: 9.5732
====> Epoch: 8 Average loss: 9.4705
====> Epoch: 10 Average loss: 9.4279
====> Epoch: 12 Average loss: 9.3978
====> Epoch: 14 Average loss: 9.3661
====> Epoch: 16 Average loss: 9.3469
====> Epoch: 18 Average loss: 9.3409
====> Epoch: 20 Average loss: 9.3224
====> Epoch: 22 Average loss: 9.3005
====> Epoch: 24 Average loss: 9.2791
====> Epoch: 26 Average loss: 9.2800
====> Epoch: 28 Average loss: 9.2714
====> Epoch: 30 Average loss: 9.2637
====> Epoch: 32 Average loss: 9.2540
====> Epoch: 34 Average loss: 9.2464
====> Epoch: 36 Average loss: 9.2477
====> Epoch: 38 Average loss: 9.2233
====> Epoch: 40 Average loss: 9.2268
====> Epoch: 42 Average loss: 9.2185
====> Epoch: 44 Average loss: 9.2141
====> Epoch: 46 Average loss: 9.2151
====> Epoch: 48 Average loss: 9.2138
====> Epoch: 50 Average loss: 9.2058
====> Epoch: 52 Average loss: 9.1916
====> Epoch: 54 Average loss: 9.1927
====>

In [56]:
def generate_latent_for_cardio_ds(model):
    DATA_PATH = "Data/cardio_train.csv"
    df = load_diabetes_data(DATA_PATH, sep=";")
    actual_data = df[0]
    outcomes = df[2]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv('latent_data/cardio_latent.csv', index=False)

In [106]:
generate_latent_for_cardio_ds(model_cardio)

In [107]:
x_cardio_syn = reconstruction_of_latent(model_cardio, 'syn_latent/cardio_synthetic/X_num_unnorm.npy', 'syn_latent/cardio_synthetic/y_train.npy', standardizer_cardio)

In [111]:
x_cardio_syn.shape

(10000, 12)

In [110]:
x_cardio_real = load_diabetes_data("Data/cardio_train.csv", ";")
x_cardio_real = x_cardio_real[1].inverse_transform(x_cardio_real[0].cpu().detach().numpy())
x_cardio_real.shape

(70000, 12)

In [112]:
resemblance_measure(x_cardio_syn, x_cardio_real[:9999])

Resemblance Score: 0.7650878484389891


## Loan Medium Level data

In [114]:
model_bank, bank_standardizer = train_encoder("Data/bank.csv", load_bank_data, ",")

====> Epoch: 2 Average loss: 18.2724
====> Epoch: 4 Average loss: 14.4822
====> Epoch: 6 Average loss: 12.8783
====> Epoch: 8 Average loss: 12.2937
====> Epoch: 10 Average loss: 11.9030
====> Epoch: 12 Average loss: 11.7320
====> Epoch: 14 Average loss: 11.5080
====> Epoch: 16 Average loss: 11.3381
====> Epoch: 18 Average loss: 11.2089
====> Epoch: 20 Average loss: 11.1627
====> Epoch: 22 Average loss: 11.0722
====> Epoch: 24 Average loss: 11.0496
====> Epoch: 26 Average loss: 10.9655
====> Epoch: 28 Average loss: 10.9049
====> Epoch: 30 Average loss: 10.8614
====> Epoch: 32 Average loss: 10.7989
====> Epoch: 34 Average loss: 10.7828
====> Epoch: 36 Average loss: 10.7792
====> Epoch: 38 Average loss: 10.7873
====> Epoch: 40 Average loss: 10.6894
====> Epoch: 42 Average loss: 10.6745
====> Epoch: 44 Average loss: 10.6588
====> Epoch: 46 Average loss: 10.6109
====> Epoch: 48 Average loss: 10.6120
====> Epoch: 50 Average loss: 10.6297
====> Epoch: 52 Average loss: 10.6019
====> Epoch: 54 

In [120]:
def generate_latent_for_bank_ds(model):
    DATA_PATH = "Data/bank.csv"
    df = load_diabetes_data(DATA_PATH, sep=",")
    actual_data = df[0]
    outcomes = df[2]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv('latent_data/bank_latent.csv', index=False)

In [121]:
generate_latent_for_bank_ds(model_bank)

In [122]:
x_bank_syn = reconstruction_of_latent(model_bank, 'syn_latent/bank_synth/X_num_unnorm.npy', 'syn_latent/bank_synth/y_train.npy', bank_standardizer)

In [124]:
x_bank_real = load_bank_data("Data/bank.csv", ",")
x_bank_real = x_bank_real[1].inverse_transform(x_bank_real[0].cpu().detach().numpy())
x_bank_real.shape

(5000, 13)

In [125]:
x_bank_syn.shape

(10000, 13)

In [126]:
resemblance_measure(x_bank_syn[:4999], x_bank_real)

Resemblance Score: 0.8743903071420271


# Vertical Partitioning Experiment

### Split Columns in 3 clients
### Run Individual Auto Encoder
### Generate N Latents
### Get Generated Data From Tabddpm Latent

In [531]:
cardio_all_clients = pd.read_csv("Data/diabetes.csv")
cardio_all_clients

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,b'tested_positive'
1,1,85,66,29,0,26.6,0.351,31,b'tested_negative'
2,8,183,64,0,0,23.3,0.672,32,b'tested_positive'
3,1,89,66,23,94,28.1,0.167,21,b'tested_negative'
4,0,137,40,35,168,43.1,2.288,33,b'tested_positive'
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,b'tested_negative'
764,2,122,70,27,0,36.8,0.340,27,b'tested_negative'
765,5,121,72,23,112,26.2,0.245,30,b'tested_negative'
766,1,126,60,0,0,30.1,0.349,47,b'tested_positive'


In [532]:
df_label = cardio_all_clients.iloc[:, -1:]
df_client_1 = cardio_all_clients.iloc[:, :2]
df_client_2 = cardio_all_clients.iloc[:, 2:4]
df_client_3 = cardio_all_clients.iloc[:, 4:8]

In [533]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_label['class_binary'] = le.fit_transform(df_label['class'])
df_label = df_label.drop(columns=["class"])

df_label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_label['class_binary'] = le.fit_transform(df_label['class'])


Unnamed: 0,class_binary
0,1
1,0
2,1
3,0
4,1
...,...
763,0
764,0
765,0
766,1


In [534]:
df_client_1 = pd.concat([df_client_1, df_label], axis=1)

In [535]:
df_client_1.to_csv("client_n_data/diabetes_clients/client_1.csv", index=False)

In [536]:
df_client_2 = pd.concat([df_client_2, df_label], axis=1)

In [537]:
df_client_2.to_csv("client_n_data/diabetes_clients/client_2.csv", index=False)

In [538]:
df_client_3 = pd.concat([df_client_3, df_label], axis=1)

In [539]:
df_client_3.to_csv("client_n_data/diabetes_clients/client_3.csv", index=False)

In [540]:
model_c1, dataset = train_encoder("client_n_data/diabetes_clients/client_1.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 3.5405
====> Epoch: 4 Average loss: 3.0714
====> Epoch: 6 Average loss: 2.4556
====> Epoch: 8 Average loss: 2.2651
====> Epoch: 10 Average loss: 2.0781
====> Epoch: 12 Average loss: 2.0319
====> Epoch: 14 Average loss: 1.9991
====> Epoch: 16 Average loss: 1.8858
====> Epoch: 18 Average loss: 1.8025
====> Epoch: 20 Average loss: 1.8712
====> Epoch: 22 Average loss: 1.8883
====> Epoch: 24 Average loss: 1.8504
====> Epoch: 26 Average loss: 1.7753
====> Epoch: 28 Average loss: 1.8197
====> Epoch: 30 Average loss: 1.7261
====> Epoch: 32 Average loss: 1.7277
====> Epoch: 34 Average loss: 1.7315
====> Epoch: 36 Average loss: 1.7500
====> Epoch: 38 Average loss: 1.7442
====> Epoch: 40 Average loss: 1.7476
====> Epoch: 42 Average loss: 1.7582
====> Epoch: 44 Average loss: 1.7040
====> Epoch: 46 Average loss: 1.7155
====> Epoch: 48 Average loss: 1.7261
====> Epoch: 50 Average loss: 1.6683
====> Epoch: 52 Average loss: 1.7289
====> Epoch: 54 Average loss: 1.7096
====>

In [386]:
def generate_latent_client_n_for_diabetes_ds(model, source='client_n_data/diabetes_clients/client_1.csv', path="client_n_data/latent/client_1.csv"):
    DATA_PATH = source
    df = load_data_v2(DATA_PATH, sep=",")
    actual_data = df[0]
    outcomes = df[2]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv(path, index=False)

In [541]:
generate_latent_client_n_for_diabetes_ds(model_c1, 'client_n_data/diabetes_clients/client_1.csv', 'client_n_data/latent/client_1.csv')

In [543]:
model_c2, standardizer_c2 = train_encoder("client_n_data/diabetes_clients/client_2.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 3.9282
====> Epoch: 4 Average loss: 3.3327
====> Epoch: 6 Average loss: 2.8338
====> Epoch: 8 Average loss: 2.5488
====> Epoch: 10 Average loss: 2.3846
====> Epoch: 12 Average loss: 2.1060
====> Epoch: 14 Average loss: 1.9519
====> Epoch: 16 Average loss: 1.8267
====> Epoch: 18 Average loss: 1.8156
====> Epoch: 20 Average loss: 1.8492
====> Epoch: 22 Average loss: 1.8044
====> Epoch: 24 Average loss: 1.7568
====> Epoch: 26 Average loss: 1.7837
====> Epoch: 28 Average loss: 1.7341
====> Epoch: 30 Average loss: 1.7869
====> Epoch: 32 Average loss: 1.6593
====> Epoch: 34 Average loss: 1.7508
====> Epoch: 36 Average loss: 1.6803
====> Epoch: 38 Average loss: 1.6675
====> Epoch: 40 Average loss: 1.7030
====> Epoch: 42 Average loss: 1.6460
====> Epoch: 44 Average loss: 1.7300
====> Epoch: 46 Average loss: 1.6988
====> Epoch: 48 Average loss: 1.6349
====> Epoch: 50 Average loss: 1.7173
====> Epoch: 52 Average loss: 1.7015
====> Epoch: 54 Average loss: 1.7105
====>

In [544]:
generate_latent_client_n_for_diabetes_ds(model_c2, 'client_n_data/diabetes_clients/client_2.csv', 'client_n_data/latent/client_2.csv')

In [545]:
model_c3, standardizer_c3 = train_encoder("client_n_data/diabetes_clients/client_3.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 7.9732
====> Epoch: 4 Average loss: 7.4029
====> Epoch: 6 Average loss: 6.6994
====> Epoch: 8 Average loss: 5.3698
====> Epoch: 10 Average loss: 4.8490
====> Epoch: 12 Average loss: 4.4077
====> Epoch: 14 Average loss: 4.1814
====> Epoch: 16 Average loss: 4.0216
====> Epoch: 18 Average loss: 3.9864
====> Epoch: 20 Average loss: 3.9269
====> Epoch: 22 Average loss: 3.8874
====> Epoch: 24 Average loss: 3.7569
====> Epoch: 26 Average loss: 3.7931
====> Epoch: 28 Average loss: 3.7413
====> Epoch: 30 Average loss: 3.7414
====> Epoch: 32 Average loss: 3.6490
====> Epoch: 34 Average loss: 3.6011
====> Epoch: 36 Average loss: 3.6879
====> Epoch: 38 Average loss: 3.6091
====> Epoch: 40 Average loss: 3.5919
====> Epoch: 42 Average loss: 3.5029
====> Epoch: 44 Average loss: 3.5298
====> Epoch: 46 Average loss: 3.4747
====> Epoch: 48 Average loss: 3.4911
====> Epoch: 50 Average loss: 3.5061
====> Epoch: 52 Average loss: 3.4882
====> Epoch: 54 Average loss: 3.4882
====>

In [546]:
generate_latent_client_n_for_diabetes_ds(model_c3, 'client_n_data/diabetes_clients/client_3.csv', 'client_n_data/latent/client_3.csv')

### N Client Experiment

# Client 1 Reconstruction Check

In [553]:
diabetes_generated_latent = np.load("syn_data/syn_latent/diabetes generated/X_num_unnorm.npy")

In [554]:
diabetes_generated_latent.shape

(800, 9)

In [555]:
client_1_generated_latent = diabetes_generated_latent[:, 0:3]
np.save('syn_data/syn_latent/diabetes generated/X_c1_unnorm.npy', client_1_generated_latent)

In [556]:
client_2_generated_latent = diabetes_generated_latent[:, 3:6]
np.save('syn_data/syn_latent/diabetes generated/X_c2_unnorm.npy', client_2_generated_latent)

In [557]:
client_3_generated_latent =  diabetes_generated_latent[:, 6:9]
np.save('syn_data/syn_latent/diabetes generated/X_c3_unnorm.npy', client_3_generated_latent)

In [558]:
x_client_1_syn = reconstruction_of_latent(model_c1, "syn_data/syn_latent/diabetes generated/X_c1_unnorm.npy", "syn_data/syn_latent/diabetes generated/y_train.npy", dataset)

['preg', 'plas'] []


In [559]:
x_client_1_syn.shape

(800, 2)

In [560]:
x_diabetes_c1_real = load_diabetes_data("client_n_data/diabetes_clients/client_1.csv", ",")
x_diabetes_c1_real = x_diabetes_c1_real[1].inverse_transform(x_diabetes_c1_real[0].cpu().detach().numpy())
x_diabetes_c1_real.shape

(768, 2)

In [561]:
resemblance_measure(x_client_1_syn[:768], x_diabetes_c1_real)

Resemblance Score: 0.9113609494611784


In [562]:
jensen_shannon_similarity(x_client_1_syn[:768], x_diabetes_c1_real)

0.3285598678261618

# Client 2 Reconstruction Check

In [563]:
x_client_2_syn = reconstruction_of_latent(model_c2, "syn_data/syn_latent/diabetes generated/X_c2_unnorm.npy", "syn_data/syn_latent/diabetes generated/y_train.npy", standardizer_c2)

['pres', 'skin'] []


In [564]:
x_diabetes_c2_real = load_diabetes_data("client_n_data/diabetes_clients/client_2.csv", ",")
x_diabetes_c2_real = x_diabetes_c2_real[1].inverse_transform(x_diabetes_c2_real[0].cpu().detach().numpy())
x_diabetes_c2_real.shape

(768, 2)

In [565]:
resemblance_measure(x_diabetes_c2_real, x_client_2_syn[:768])
jensen_shannon_similarity(x_diabetes_c2_real, x_client_2_syn[:768])

Resemblance Score: 0.8878641888259153


0.4493680841694591

# Check for Client 3 resemblance

In [566]:
x_client_3_syn = reconstruction_of_latent(model_c3, "syn_data/syn_latent/diabetes generated/X_c3_unnorm.npy", "syn_data/syn_latent/diabetes generated/y_train.npy", standardizer_c3)

['insu', 'mass', 'pedi', 'age'] []


In [567]:
x_diabetes_c3_real = load_diabetes_data("client_n_data/diabetes_clients/client_3.csv", ",")
x_diabetes_c3_real = x_diabetes_c3_real[1].inverse_transform(x_diabetes_c3_real[0].cpu().detach().numpy())
x_diabetes_c3_real.shape

(768, 4)

In [568]:
resemblance_measure(x_client_3_syn[:768], x_diabetes_c3_real)


Resemblance Score: 0.6834076393911033


In [569]:
jensen_shannon_similarity(x_client_3_syn[:768], x_diabetes_c3_real)

0.3817520677806493

# Loan Data Client 

In [486]:
loan_data_all_client = pd.read_csv("Data/bank.csv")
loan_data_all_client

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0


In [487]:
df_label = loan_data_all_client.iloc[:, -1:]
df_client_1 = loan_data_all_client.iloc[:, :4]
df_client_2 = loan_data_all_client.iloc[:, 4:8]
df_client_3 = loan_data_all_client.iloc[:, 8:13]


In [488]:
df_client_1 = pd.concat([df_client_1, df_label], axis=1)
df_client_1.to_csv("client_n_data/bank_clients/bank_1.csv", index=False)
df_client_2 = pd.concat([df_client_2, df_label], axis=1)
df_client_2.to_csv("client_n_data/bank_clients/bank_2.csv", index=False)
df_client_3 = pd.concat([df_client_3, df_label], axis=1)
df_client_3.to_csv("client_n_data/bank_clients/bank_3.csv", index=False)

In [489]:
model_b1, dataset_b1 = train_encoder("client_n_data/bank_clients/bank_1.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 4.9692
====> Epoch: 4 Average loss: 3.9265
====> Epoch: 6 Average loss: 3.4615
====> Epoch: 8 Average loss: 3.3442
====> Epoch: 10 Average loss: 3.2889
====> Epoch: 12 Average loss: 3.2613
====> Epoch: 14 Average loss: 3.2300
====> Epoch: 16 Average loss: 3.1994
====> Epoch: 18 Average loss: 3.1300
====> Epoch: 20 Average loss: 3.1381
====> Epoch: 22 Average loss: 3.0808
====> Epoch: 24 Average loss: 3.0939
====> Epoch: 26 Average loss: 3.0972
====> Epoch: 28 Average loss: 3.0837
====> Epoch: 30 Average loss: 3.0672
====> Epoch: 32 Average loss: 3.0700
====> Epoch: 34 Average loss: 3.0944
====> Epoch: 36 Average loss: 3.0786
====> Epoch: 38 Average loss: 3.0689
====> Epoch: 40 Average loss: 3.0782
====> Epoch: 42 Average loss: 3.0759
====> Epoch: 44 Average loss: 3.0483
====> Epoch: 46 Average loss: 3.0619
====> Epoch: 48 Average loss: 3.0768
====> Epoch: 50 Average loss: 3.0688
====> Epoch: 52 Average loss: 3.0540
====> Epoch: 54 Average loss: 3.0623
====>

In [491]:
model_b2, dataset_b2 = train_encoder("client_n_data/bank_clients/bank_2.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 4.5884
====> Epoch: 4 Average loss: 3.7007
====> Epoch: 6 Average loss: 3.5828
====> Epoch: 8 Average loss: 3.5160
====> Epoch: 10 Average loss: 3.5145
====> Epoch: 12 Average loss: 3.4839
====> Epoch: 14 Average loss: 3.4773
====> Epoch: 16 Average loss: 3.4263
====> Epoch: 18 Average loss: 3.4767
====> Epoch: 20 Average loss: 3.4550
====> Epoch: 22 Average loss: 3.4763
====> Epoch: 24 Average loss: 3.4427
====> Epoch: 26 Average loss: 3.4420
====> Epoch: 28 Average loss: 3.4286
====> Epoch: 30 Average loss: 3.4584
====> Epoch: 32 Average loss: 3.4476
====> Epoch: 34 Average loss: 3.4403
====> Epoch: 36 Average loss: 3.4115
====> Epoch: 38 Average loss: 3.4350
====> Epoch: 40 Average loss: 3.4089
====> Epoch: 42 Average loss: 3.4242
====> Epoch: 44 Average loss: 3.4247
====> Epoch: 46 Average loss: 3.4150
====> Epoch: 48 Average loss: 3.4159
====> Epoch: 50 Average loss: 3.4147
====> Epoch: 52 Average loss: 3.4004
====> Epoch: 54 Average loss: 3.4001
====>

In [492]:
model_b3, dataset_b3 = train_encoder("client_n_data/bank_clients/bank_3.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 3.8010
====> Epoch: 4 Average loss: 2.3006
====> Epoch: 6 Average loss: 1.6693
====> Epoch: 8 Average loss: 1.4318
====> Epoch: 10 Average loss: 1.3062
====> Epoch: 12 Average loss: 1.2543
====> Epoch: 14 Average loss: 1.2379
====> Epoch: 16 Average loss: 1.2586
====> Epoch: 18 Average loss: 1.2152
====> Epoch: 20 Average loss: 1.2382
====> Epoch: 22 Average loss: 1.2347
====> Epoch: 24 Average loss: 1.2300
====> Epoch: 26 Average loss: 1.2194
====> Epoch: 28 Average loss: 1.2488
====> Epoch: 30 Average loss: 1.2294
====> Epoch: 32 Average loss: 1.2362
====> Epoch: 34 Average loss: 1.2366
====> Epoch: 36 Average loss: 1.2583
====> Epoch: 38 Average loss: 1.2294
====> Epoch: 40 Average loss: 1.2523
====> Epoch: 42 Average loss: 1.2345
====> Epoch: 44 Average loss: 1.2336
====> Epoch: 46 Average loss: 1.2346
====> Epoch: 48 Average loss: 1.2261
====> Epoch: 50 Average loss: 1.2371
====> Epoch: 52 Average loss: 1.2403
====> Epoch: 54 Average loss: 1.2358
====>

In [493]:
generate_latent_client_n_for_diabetes_ds(model_b1, 'client_n_data/bank_clients/bank_1.csv', 'client_n_data/latent/bank_1.csv')
generate_latent_client_n_for_diabetes_ds(model_b2, 'client_n_data/bank_clients/bank_2.csv', 'client_n_data/latent/bank_2.csv')
generate_latent_client_n_for_diabetes_ds(model_b3, 'client_n_data/bank_clients/bank_3.csv', 'client_n_data/latent/bank_3.csv')

# Cardio N Client 

In [570]:
cardio_data_all_client = pd.read_csv("Data/cardio_train.csv", sep=";")
cardio_data_all_client

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [548]:
df_label = cardio_data_all_client.iloc[:, -1:]
df_client_1 = cardio_data_all_client.iloc[:, :4]
df_client_2 = cardio_data_all_client.iloc[:, 4:8]
df_client_3 = cardio_data_all_client.iloc[:, 8:12]

In [549]:
df_client_1 = pd.concat([df_client_1, df_label], axis=1)
df_client_1.to_csv("client_n_data/cardio_clients/cardio_1.csv", index=False)

df_client_2 = pd.concat([df_client_2, df_label], axis=1)
df_client_2.to_csv("client_n_data/cardio_clients/cardio_2.csv", index=False)

df_client_3 = pd.concat([df_client_3, df_label], axis=1)
df_client_3.to_csv("client_n_data/cardio_clients/cardio_3.csv", index=False)

In [550]:
model_car1, standardizer_car1 = train_encoder("client_n_data/cardio_clients/cardio_1.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 2.9341
====> Epoch: 4 Average loss: 2.8984
====> Epoch: 6 Average loss: 2.8993
====> Epoch: 8 Average loss: 2.8941
====> Epoch: 10 Average loss: 2.8949
====> Epoch: 12 Average loss: 2.8887
====> Epoch: 14 Average loss: 2.8923
====> Epoch: 16 Average loss: 2.8933
====> Epoch: 18 Average loss: 2.8815
====> Epoch: 20 Average loss: 2.8928
====> Epoch: 22 Average loss: 2.8910
====> Epoch: 24 Average loss: 2.8858
====> Epoch: 26 Average loss: 2.8900
====> Epoch: 28 Average loss: 2.8879
====> Epoch: 30 Average loss: 2.8904
====> Epoch: 32 Average loss: 2.8890
====> Epoch: 34 Average loss: 2.8867
====> Epoch: 36 Average loss: 2.8889
====> Epoch: 38 Average loss: 2.8841
====> Epoch: 40 Average loss: 2.8829
====> Epoch: 42 Average loss: 2.8885
====> Epoch: 44 Average loss: 2.8913
====> Epoch: 46 Average loss: 2.8864
====> Epoch: 48 Average loss: 2.8905
====> Epoch: 50 Average loss: 2.8834
====> Epoch: 52 Average loss: 2.8849
====> Epoch: 54 Average loss: 2.8823
====>

In [571]:
model_car2, standardizer_car2 = train_encoder("client_n_data/cardio_clients/cardio_2.csv", load_data_v2, ",")
model_car3, standardizer_car3 = train_encoder("client_n_data/cardio_clients/cardio_3.csv", load_data_v2, ",")

====> Epoch: 2 Average loss: 3.6959
====> Epoch: 4 Average loss: 3.5846
====> Epoch: 6 Average loss: 3.5395
====> Epoch: 8 Average loss: 3.5112
====> Epoch: 10 Average loss: 3.5108
====> Epoch: 12 Average loss: 3.4936
====> Epoch: 14 Average loss: 3.4957
====> Epoch: 16 Average loss: 3.5019
====> Epoch: 18 Average loss: 3.4933
====> Epoch: 20 Average loss: 3.4925
====> Epoch: 22 Average loss: 3.4874
====> Epoch: 24 Average loss: 3.4898
====> Epoch: 26 Average loss: 3.4832
====> Epoch: 28 Average loss: 3.4852
====> Epoch: 30 Average loss: 3.4868
====> Epoch: 32 Average loss: 3.4802
====> Epoch: 34 Average loss: 3.4806
====> Epoch: 36 Average loss: 3.4865
====> Epoch: 38 Average loss: 3.4768
====> Epoch: 40 Average loss: 3.4813
====> Epoch: 42 Average loss: 3.4817
====> Epoch: 44 Average loss: 3.4768
====> Epoch: 46 Average loss: 3.4764
====> Epoch: 48 Average loss: 3.4842
====> Epoch: 50 Average loss: 3.4792
====> Epoch: 52 Average loss: 3.4819
====> Epoch: 54 Average loss: 3.4775
====>

In [572]:
generate_latent_client_n_for_diabetes_ds(model_car1, 'client_n_data/cardio_clients/cardio_1.csv', 'client_n_data/latent/cardio_1_n.csv')
generate_latent_client_n_for_diabetes_ds(model_car2, 'client_n_data/cardio_clients/cardio_2.csv', 'client_n_data/latent/cardio_2_n.csv')
generate_latent_client_n_for_diabetes_ds(model_car3, 'client_n_data/cardio_clients/cardio_3.csv', 'client_n_data/latent/cardio_3_n.csv')

# Credit Risk Synthetic Data Regeneration

In [494]:
bank_generated_latent = np.load("syn_data/syn_latent/bank_generated/X_num_unnorm.npy")
bank_generated_latent

array([[-0.82446558, -0.10738604, -0.70312679, ..., -0.14555997,
        -0.13987976,  0.14407737],
       [ 0.98803076,  0.26679164,  1.09171258, ...,  0.50479601,
         0.50116088, -0.46562918],
       [-0.60384326, -2.38580069, -1.05241235, ..., -0.16404833,
        -0.15464898,  0.11206154],
       ...,
       [ 0.07394192,  0.13784105,  0.17234722, ..., -0.14442296,
        -0.14367248,  0.15206941],
       [ 0.03557592,  0.95764422,  0.21473787, ...,  1.15029097,
         1.10996377, -1.15720295],
       [ 0.46487051, -0.16059849,  0.41815324, ..., -1.10103208,
        -1.09973648,  1.10311836]])

In [137]:
bank_generated_latent[:, 6:9]

array([[-5.19517304, -0.16316561,  0.05517056],
       [-5.18313103, -0.1585745 ,  0.06426594],
       [ 0.4720868 ,  0.32224879, -0.56347627],
       ...,
       [ 0.99620575,  5.16704597,  5.27336353],
       [ 0.48797082, -0.12506895, -0.69481049],
       [ 0.14273366, -1.04822629, -0.94084446]])

In [495]:
client_1_bank_generated_latent = bank_generated_latent[:, 0:3]
np.save('syn_data/syn_latent/bank_generated/X_c1_unnorm.npy', client_1_bank_generated_latent)
client_2_bank_generated_latent = bank_generated_latent[:, 3:6]
np.save('syn_data/syn_latent/bank_generated/X_c2_unnorm.npy', client_2_bank_generated_latent)
client_3_bank_generated_latent =  bank_generated_latent[:, 6:9]
np.save('syn_data/syn_latent/bank_generated/X_c3_unnorm.npy', client_3_bank_generated_latent)

### Bank Client 1

In [497]:
x_client_1_bank_syn = reconstruction_of_latent(model_b1, "syn_data/syn_latent/bank_generated/X_c1_unnorm.npy", "syn_data/syn_latent/bank_generated/y_train.npy", dataset_b1)

['ID', 'Age', 'Experience', 'Income']


In [499]:
x_client_1_bank_syn.shape

(5000, 4)

In [518]:
x_bank_c1_real = load_data_v2("client_n_data/bank_clients/bank_1.csv", ",")
x_bank_c1_real = x_bank_c1_real[1].inverse_transform(x_bank_c1_real[0].cpu().detach().numpy())
x_bank_c1_real.shape

(5000, 4)

In [519]:
resemblance_measure(x_client_1_bank_syn, x_bank_c1_real)

Resemblance Score: 0.9202213471471097


In [520]:
jensen_shannon_similarity(x_client_1_bank_syn, x_bank_c1_real)

0.6599078273534568

### Bank Client 2

In [527]:
x_client_2_bank_syn = reconstruction_of_latent(model_b2, "syn_data/syn_latent/bank_generated/X_c2_unnorm.npy", "syn_data/syn_latent/bank_generated/y_train.npy", dataset_b2)

['ZIP Code', 'Family', 'CCAvg', 'Education'] []


In [522]:
x_bank_c2_real = load_data_v2("client_n_data/bank_clients/bank_2.csv", ",")
x_bank_c2_real = x_bank_c2_real[1].inverse_transform(x_bank_c2_real[0].cpu().detach().numpy())
x_bank_c2_real.shape

(5000, 4)

In [523]:
resemblance_measure(x_client_2_bank_syn, x_bank_c2_real)

Resemblance Score: 0.8127648780405755


In [524]:
jensen_shannon_similarity(x_client_2_bank_syn, x_bank_c2_real)

0.5087392517175497

### Bank Client 3

In [528]:
x_client_3_bank_syn = reconstruction_of_latent(model_b3, "syn_data/syn_latent/bank_generated/X_c3_unnorm.npy", "syn_data/syn_latent/bank_generated/y_train.npy", dataset_b3)

['Mortgage'] ['Personal Loan', 'Securities Account', 'CD Account', 'Online']


In [530]:
x_client_3_bank_syn[0]

array([25.080957  ,  0.0762483 ,  0.10269814,  0.05266344,  0.59364074],
      dtype=float32)

In [516]:
x_bank_c3_real = load_data_v2("client_n_data/bank_clients/bank_3.csv", ",")
x_bank_c3_real = x_bank_c3_real[1].inverse_transform(x_bank_c3_real[0].cpu().detach().numpy())
x_bank_c3_real.shape

(5000, 5)

In [517]:
resemblance_measure(x_client_3_bank_syn, x_bank_c3_real)

Resemblance Score: 0.29929480879930015


In [512]:
jensen_shannon_similarity(x_client_3_bank_syn, x_bank_c3_real)

0.6017650031482227

# Cardio Reconstruction

In [171]:
cardio_n_generated_latent = np.load("syn_data/syn_latent/cardio_generated/X_num_unnorm.npy")
cardio_n_generated_latent.shape

(70000, 9)

In [172]:
client_1_cardio_generated_latent = bank_generated_latent[:, 0:3]
np.save('syn_data/syn_latent/cardio_generated/X_c1_unnorm.npy', client_1_cardio_generated_latent)
client_2_cardio_generated_latent = bank_generated_latent[:, 3:6]
np.save('syn_data/syn_latent/cardio_generated/X_c2_unnorm.npy', client_2_cardio_generated_latent)
client_3_cardio_generated_latent =  bank_generated_latent[:, 6:9]
np.save('syn_data/syn_latent/cardio_generated/X_c3_unnorm.npy', client_3_cardio_generated_latent)

# Cardio Client 1

In [173]:
x_client_1_cardio_syn = reconstruction_of_latent(model_car1, "syn_data/syn_latent/cardio_generated/X_c1_unnorm.npy", "syn_data/syn_latent/cardio_generated/y_train.npy", standardizer_car1)

In [174]:
x_cardio_c1_real = load_diabetes_data("client_n_data/cardio_clients/cardio_1.csv", ",")
x_cardio_c1_real = x_cardio_c1_real[1].inverse_transform(x_cardio_c1_real[0].cpu().detach().numpy())
x_cardio_c1_real.shape

(70000, 4)

In [190]:
x_cardio_c1_real

array([[1.9312500e-03, 1.8393000e+04, 2.0000000e+00, 1.6800000e+02],
       [1.0019312e+00, 2.0228000e+04, 1.0000000e+00, 1.5600000e+02],
       [2.0019312e+00, 1.8857000e+04, 1.0000000e+00, 1.6500000e+02],
       ...,
       [9.9996000e+04, 1.9066000e+04, 2.0000000e+00, 1.8300000e+02],
       [9.9998000e+04, 2.2431000e+04, 1.0000000e+00, 1.6300000e+02],
       [9.9999000e+04, 2.0540000e+04, 1.0000000e+00, 1.7000000e+02]],
      dtype=float32)

In [175]:
x_client_1_cardio_syn.shape

(5000, 4)

In [201]:
resemblance_measure(x_client_1_cardio_syn, x_cardio_c1_real[:5000])

Resemblance Score: 0.7337510021837952


# Cardio Client 2

In [178]:
x_client_2_cardio_syn = reconstruction_of_latent(model_car2, "syn_data/syn_latent/cardio_generated/X_c2_unnorm.npy", "syn_data/syn_latent/cardio_generated/y_train.npy", standardizer_car2)
x_cardio_c2_real = load_diabetes_data("client_n_data/cardio_clients/cardio_2.csv", ",")
x_cardio_c2_real = x_cardio_c2_real[1].inverse_transform(x_cardio_c2_real[0].cpu().detach().numpy())
x_cardio_c2_real.shape

(70000, 4)

In [186]:
x_client_2_cardio_syn

array([[ 82.66248  , 133.91621  ,  91.236404 ,   1.1893916],
       [ 68.02494  , 122.48908  ,  83.65555  ,   1.1430093],
       [ 74.71665  , 127.00881  ,  86.32303  ,   1.1424247],
       ...,
       [ 69.823616 , 123.12251  ,  84.845    ,   1.1328647],
       [ 66.235756 , 120.976974 ,  82.99318  ,   1.1562182],
       [ 71.67981  , 123.83279  ,  85.49577  ,   1.1278017]],
      dtype=float32)

In [202]:
resemblance_measure(x_client_2_cardio_syn, x_cardio_c2_real[:5000])

Resemblance Score: 0.7229961610931834


# Cardio Client 3

In [261]:
x_client_3_cardio_syn = reconstruction_of_latent(model_car3, "syn_data/syn_latent/cardio_generated/X_c3_unnorm.npy", "syn_data/syn_latent/cardio_generated/y_train.npy", standardizer_car3)
x_cardio_c3_real = load_diabetes_data("client_n_data/cardio_clients/cardio_3.csv", ",")
x_cardio_c3_real = x_cardio_c3_real[1].inverse_transform(x_cardio_c3_real[0].cpu().detach().numpy())
x_cardio_c3_real

array([[ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10,  1.0000000e+00],
       [ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10,  1.0000000e+00],
       [ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10, -9.0462819e-09],
       ...,
       [ 1.0000000e+00, -2.7639526e-09,  1.0000000e+00, -9.0462819e-09],
       [ 2.0000000e+00, -2.7639526e-09, -1.9243784e-10, -9.0462819e-09],
       [ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10,  1.0000000e+00]],
      dtype=float32)

In [189]:
x_cardio_c3_real

array([[ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10,  1.0000000e+00],
       [ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10,  1.0000000e+00],
       [ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10, -9.0462819e-09],
       ...,
       [ 1.0000000e+00, -2.7639526e-09,  1.0000000e+00, -9.0462819e-09],
       [ 2.0000000e+00, -2.7639526e-09, -1.9243784e-10, -9.0462819e-09],
       [ 1.0000000e+00, -2.7639526e-09, -1.9243784e-10,  1.0000000e+00]],
      dtype=float32)

In [200]:
resemblance_measure(x_client_3_cardio_syn, x_cardio_c3_real[:5000])

Resemblance Score: 0.6009089888327613


In [204]:
jensen_shannon_similarity(x_client_3_cardio_syn, x_cardio_c3_real[:5000])

0.5728722256751043

### New Thiels U

In [78]:
import pandas as pd
import numpy as np

def compute_categorical_similarity_v2(col_real, col_synthetic):
    # Ensure the input columns are of the same length
    if len(col_real) != len(col_synthetic):
        raise ValueError("Input columns must have the same length.")
    
    # Compute value counts (probabilities) for each unique value in the columns
    p_real = pd.Series(col_real).value_counts(normalize=True)
    p_synthetic = pd.Series(col_synthetic).value_counts(normalize=True)
    
    # Align the indices of the real and synthetic distributions
    p_real, p_synthetic = p_real.align(p_synthetic, fill_value=0)
    
    # Compute Theil's U
    epsilon = 1e-10  # Small value to avoid division by zero or log of zero
    p_real += epsilon
    p_synthetic += epsilon
    u = (p_real * np.log(p_real / p_synthetic)).sum()
    
    # Return similarity measure
    return 1 - u

# Example usage
# Create sample dataframes with columns having 0s and 1s
data_real = pd.DataFrame({
    'feature1': np.random.choice([0, 1], size=5000),
    'feature2': np.random.choice([0, 1], size=5000),
    'feature3': np.random.choice([0, 1], size=5000),
    'feature4': np.random.choice([0, 1], size=5000)
})

data_synthetic = pd.DataFrame({
    'feature1': np.random.choice([0, 1], size=5000),
    'feature2': np.random.choice([0, 1], size=5000),
    'feature3': np.random.choice([0, 1], size=5000),
    'feature4': np.random.choice([0, 1], size=5000)
})

# Compute similarity for each column
similarities = {}
for column in data_real.columns:
    similarity = compute_categorical_similarity(data_real[column], data_synthetic[column])
    similarities[column] = similarity

print('Categorical Similarities (Theil\'s U):', similarities)


Categorical Similarities (Theil's U): {'feature1': 0.9992314608332202, 'feature2': 0.9999459194865851, 'feature3': 0.9999711181199618, 'feature4': 0.9999128704433407}


In [258]:
df_test = pd.DataFrame(x_cardio_c3_real, columns=['col1', 'col2', 'col3', 'col4'])

df_test

Unnamed: 0,col1,col2,col3,col4
0,1.0,-2.763953e-09,-1.924378e-10,1.000000e+00
1,1.0,-2.763953e-09,-1.924378e-10,1.000000e+00
2,1.0,-2.763953e-09,-1.924378e-10,-9.046282e-09
3,1.0,-2.763953e-09,-1.924378e-10,1.000000e+00
4,1.0,-2.763953e-09,-1.924378e-10,-9.046282e-09
...,...,...,...,...
69995,1.0,1.000000e+00,-1.924378e-10,1.000000e+00
69996,2.0,-2.763953e-09,-1.924378e-10,1.000000e+00
69997,1.0,-2.763953e-09,1.000000e+00,-9.046282e-09
69998,2.0,-2.763953e-09,-1.924378e-10,-9.046282e-09


In [257]:
similarities = {}
x_client_3_cardio_syn_df = pd.DataFrame(x_client_3_cardio_syn, columns=['col1', 'col2', 'col3', 'col4'])
x_cardio_c3_real_df = pd.DataFrame( x_cardio_c3_real[:5000], columns=['col1', 'col2', 'col3', 'col4'])
for column in x_cardio_c3_real_df.columns:
    similarity = compute_categorical_similarity(x_cardio_c3_real_df[column], x_client_3_cardio_syn_df[column])
    similarities[column] = similarity

print('Categorical Similarities (Theil\'s U):', similarities)

Categorical Similarities (Theil's U): {'col1': -21.489038688118, 'col2': -21.726096630021438, 'col3': -21.811164839042853, 'col4': -21.51967653053544}


# Experiment On Load Data

In [327]:
'''Fixed Non Scaling of Categorical Data'''
def load_data_v2(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values

    # Identify numerical and categorical columns
    binary_categorical_cols = [col for col in df_base.columns if df_base[col].nunique() == 2 and df_base[col].dtype == 'int64']
    continuous_numerical_cols = [col for col in df_base.columns if col not in binary_categorical_cols and df_base[col].dtype in ['float64', 'int64']]

    continuous_data = df_base[continuous_numerical_cols].values.astype('float32')
    standardizer = preprocessing.StandardScaler()
    continuous_data = standardizer.fit_transform(continuous_data)

    binary_data = df_base[binary_categorical_cols].values.astype('int64')

    x = np.hstack((continuous_data, binary_data))
    # Convert to torch tensor and move to device
    x_train = torch.from_numpy(x).to(device)
    return x_train, standardizer, df_target, continuous_numerical_cols, binary_categorical_cols


(tensor([[-1.7321, -0.4361,  0.4435,  ...,  0.0000,  0.0000,  1.0000],
         [-1.7320,  0.3077, -1.0182,  ...,  0.0000,  0.0000,  1.0000],
         [-1.7320, -0.2480,  0.0780,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 1.7339, -0.1633,  2.2705,  ...,  0.0000,  1.0000,  0.0000],
         [ 1.7339,  1.2006, -0.1656,  ...,  0.0000,  0.0000,  0.0000],
         [ 1.7340,  0.4341,  0.6871,  ...,  0.0000,  0.0000,  1.0000]],
        dtype=torch.float64),
 StandardScaler(),
 array([0, 1, 1, ..., 1, 1, 0]),
 ['id', 'age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc'],
 ['gender', 'smoke', 'alco', 'active'])

In [351]:
def inverse_transform_custom(x_tensor, standardizer, continuous_numerical_cols, binary_categorical_cols):
    
    x_array = x_tensor.cpu().detach().numpy()
    df = pd.DataFrame(x_array, columns=continuous_numerical_cols + binary_categorical_cols)
    # Separate continuous and binary data
    continuous_data = df[continuous_numerical_cols].values
    binary_data = df[binary_categorical_cols].values
    # Apply inverse transformation only to continuous data
    continuous_data = standardizer.inverse_transform(continuous_data)
    
    # Reconstruct the dataframe to maintain original order
    continuous_df = pd.DataFrame(continuous_data, columns=continuous_numerical_cols)
    binary_df = pd.DataFrame(binary_data, columns=binary_categorical_cols)

    processed_df = pd.concat([continuous_df, binary_df], axis=1)
    processed_df = processed_df[df.columns]

    return processed_df.values

In [353]:
# inverse_transform_custom(x[0], x[1], x[3], x[4])[0]

array([3.30117282e-03, 1.83930000e+04, 1.68000000e+02, 6.19999999e+01,
       1.10000000e+02, 7.99999997e+01, 1.00000002e+00, 9.99999998e-01,
       2.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00])

# TVAE Experiments

## Single Table Metadarta API

In [59]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
meta = metadata.detect_from_csv('Data/bank.csv')

metadata.to_dict()

{'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
 'columns': {'ID': {'sdtype': 'id'},
  'Age': {'sdtype': 'numerical'},
  'Experience': {'sdtype': 'numerical'},
  'Income': {'sdtype': 'numerical'},
  'ZIP Code': {'sdtype': 'postcode', 'pii': True},
  'Family': {'sdtype': 'categorical'},
  'CCAvg': {'sdtype': 'numerical'},
  'Education': {'sdtype': 'categorical'},
  'Mortgage': {'sdtype': 'numerical'},
  'Personal Loan': {'sdtype': 'categorical'},
  'Securities Account': {'sdtype': 'categorical'},
  'CD Account': {'sdtype': 'categorical'},
  'Online': {'sdtype': 'categorical'},
  'CreditCard': {'sdtype': 'categorical'}},
 'primary_key': 'ID'}

In [58]:
from sdv.single_table import TVAESynthesizer
synthesizer = TVAESynthesizer(
    metadata, # required
    enforce_min_max_values=True,
    enforce_rounding=True,
    epochs=500
)

NameError: name 'metadata' is not defined

In [581]:
data_tvae = pd.read_csv('Data/diabetes.csv')
# data_tvae
syn_data = synthesizer.fit(data_tvae)

In [584]:
syn_data_data = synthesizer.sample(num_rows=768)

In [596]:
vae_model = synthesizer._model

In [598]:
vae_model

<ctgan.synthesizers.tvae.TVAE at 0x296672e80>

In [60]:
"""TVAE module."""

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, Module, Parameter, ReLU, Sequential
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

from ctgan.data_transformer import DataTransformer
from ctgan.synthesizers.base import BaseSynthesizer, random_state


class Encoder(Module):
    """Encoder for the TVAE.

    Args:
        data_dim (int):
            Dimensions of the data.
        compress_dims (tuple or list of ints):
            Size of each hidden layer.
        embedding_dim (int):
            Size of the output vector.
    """

    def __init__(self, data_dim, compress_dims, embedding_dim):
        super(Encoder, self).__init__()
        dim = data_dim
        seq = []
        for item in list(compress_dims):
            seq += [
                Linear(dim, item),
                ReLU()
            ]
            dim = item

        self.seq = Sequential(*seq)
        self.fc1 = Linear(dim, embedding_dim)
        self.fc2 = Linear(dim, embedding_dim)

    def forward(self, input_):
        """Encode the passed `input_`."""
        feature = self.seq(input_)
        mu = self.fc1(feature)
        logvar = self.fc2(feature)
        std = torch.exp(0.5 * logvar)
        return mu, std, logvar


class Decoder(Module):
    """Decoder for the TVAE.

    Args:
        embedding_dim (int):
            Size of the input vector.
        decompress_dims (tuple or list of ints):
            Size of each hidden layer.
        data_dim (int):
            Dimensions of the data.
    """

    def __init__(self, embedding_dim, decompress_dims, data_dim):
        super(Decoder, self).__init__()
        dim = embedding_dim
        seq = []
        for item in list(decompress_dims):
            seq += [Linear(dim, item), ReLU()]
            dim = item

        seq.append(Linear(dim, data_dim))
        self.seq = Sequential(*seq)
        self.sigma = Parameter(torch.ones(data_dim) * 0.1)

    def forward(self, input_):
        """Decode the passed `input_`."""
        return self.seq(input_), self.sigma


def _loss_function(recon_x, x, sigmas, mu, logvar, output_info, factor):
    st = 0
    loss = []
    for column_info in output_info:
        for span_info in column_info:
            if span_info.activation_fn != 'softmax':
                ed = st + span_info.dim
                std = sigmas[st]
                eq = x[:, st] - torch.tanh(recon_x[:, st])
                loss.append((eq ** 2 / 2 / (std ** 2)).sum())
                loss.append(torch.log(std) * x.size()[0])
                st = ed

            else:
                ed = st + span_info.dim
                loss.append(cross_entropy(
                    recon_x[:, st:ed], torch.argmax(x[:, st:ed], dim=-1), reduction='sum'))
                st = ed

    assert st == recon_x.size()[1]
    KLD = -0.5 * torch.sum(1 + logvar - mu**2 - logvar.exp())
    return sum(loss) * factor / x.size()[0], KLD / x.size()[0]


class TVAE(BaseSynthesizer):
    """TVAE."""

    def __init__(
        self,
        embedding_dim=3,
        compress_dims=(128, 128),
        decompress_dims=(128, 128),
        l2scale=1e-5,
        batch_size=500,
        epochs=300,
        loss_factor=2,
        cuda=False,
        verbose=False
    ):

        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.loss_factor = loss_factor
        self.epochs = epochs
        self.loss_values = pd.DataFrame(columns=['Epoch', 'Batch', 'Loss'])
        self.verbose = verbose

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

    @random_state
    def fit(self, train_data, discrete_columns=()):
        """Fit the TVAE Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)
        dataset = TensorDataset(torch.from_numpy(train_data.astype('float32')).to(self._device))
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=False)

        data_dim = self.transformer.output_dimensions
        encoder = Encoder(data_dim, self.compress_dims, self.embedding_dim).to(self._device)
        self.decoder = Decoder(self.embedding_dim, self.decompress_dims, data_dim).to(self._device)
        optimizerAE = Adam(
            list(encoder.parameters()) + list(self.decoder.parameters()),
            weight_decay=self.l2scale)

        self.loss_values = pd.DataFrame(columns=['Epoch', 'Batch', 'Loss'])
        iterator = tqdm(range(self.epochs), disable=(not self.verbose))
        if self.verbose:
            iterator_description = 'Loss: {loss:.3f}'
            iterator.set_description(iterator_description.format(loss=0))

        for i in iterator:
            latent_embeddings = []
            loss_values = []
            batch = []
            for id_, data in enumerate(loader):
                optimizerAE.zero_grad()
                real = data[0].to(self._device)
                mu, std, logvar = encoder(real)
                eps = torch.randn_like(std)
                emb = eps * std + mu
                latent_embeddings.append(emb.detach())
                rec, sigmas = self.decoder(emb)
                loss_1, loss_2 = _loss_function(
                    rec, real, sigmas, mu, logvar,
                    self.transformer.output_info_list, self.loss_factor
                )
                loss = loss_1 + loss_2
                loss.backward()
                optimizerAE.step()
                self.decoder.sigma.data.clamp_(0.01, 1.0)

                batch.append(id_)
                loss_values.append(loss.detach().cpu().item())

            epoch_loss_df = pd.DataFrame({
                'Epoch': [i] * len(batch),
                'Batch': batch,
                'Loss': loss_values
            })
            if not self.loss_values.empty:
                self.loss_values = pd.concat(
                    [self.loss_values, epoch_loss_df]
                ).reset_index(drop=True)
            else:
                self.loss_values = epoch_loss_df

            if self.verbose:
                iterator.set_description(
                    iterator_description.format(
                        loss=loss.detach().cpu().item()))
        return latent_embeddings

    @random_state
    def sample(self, samples, noise):
        """Sample data similar to the training data.

        Args:
            samples (int):
                Number of rows to sample.
            noise (tensor):
                Noise

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        print("Here")
        self.decoder.eval()
        steps = samples // self.batch_size + 1
        data = []
        for hoola in range(steps):
            # mean = torch.zeros(self.batch_size, self.embedding_dim)
            # std = mean + 1
            # noise = torch.normal(mean=mean, std=std).to(self._device)
            fake, sigmas = self.decoder(noise)
            fake = torch.tanh(fake)
            data.append(fake.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:samples]
        return self.transformer.inverse_transform(data, sigmas.detach().cpu().numpy())

    def set_device(self, device):
        """Set the `device` to be used ('GPU' or 'CPU)."""
        self._device = device
        self.decoder.to(self._device)

In [833]:
"""DNU"""
discrete_columns = [
    'Family',
    'Education',
    'Personal Loan',
    'Securities Account',
    'CD Account',
    'Online',
    'CreditCard'
]
# train_data_bank = pd.read_csv("Data/bank.csv")
train_data_bank = pd.read_csv("client_n_data/bank_clients/bank_1.csv")
tave_custom = TVAE()



In [834]:
"""DNU"""
model_tvae_custom = tave_custom.fit(train_data_bank, discrete_columns)

In [835]:
"""DNU"""
unbatched_data = torch.cat(model_tvae_custom, dim=0)

unbatched_data

tensor([[ 6.3777e-01, -2.3564e+00, -5.3486e-01],
        [-5.6708e-02, -1.3972e+00, -1.0256e+00],
        [ 4.2632e-01, -3.2646e-01, -2.2598e-03],
        ...,
        [ 1.6345e+00, -1.6921e+00,  1.0038e+00],
        [ 9.1276e-01, -9.5760e-01, -2.0631e+00],
        [-1.4227e+00, -1.6125e+00, -1.2580e+00]])

In [836]:
"""DNU"""
data_gen_tvae = tave_custom.sample(5000, unbatched_data)
data_gen_tvae

Here


Unnamed: 0,ID,Age,Experience,Income,CreditCard
0,403,57,33,34,0
1,178,57,26,52,0
2,3696,52,25,58,0
3,799,35,10,40,0
4,540,36,12,22,0
...,...,...,...,...,...
4995,4410,46,23,47,0
4996,647,50,26,43,0
4997,-79,55,30,50,0
4998,527,47,22,19,0


In [756]:
"""DNU"""
kolmogorov_smirnov_similarity(data_gen_tvae.to_numpy(), train_data_bank.to_numpy())

  similarity, _ = ks_2samp(col_real, col_synthetic)


0.8168571428571428

In [64]:
def categorical_column_indices(metadata_dict):
    categorical_indices = []
    columns = metadata_dict.get('columns', {})
    column_names = list(columns.keys())[:-1]  # Exclude the last key
    for index, column_name in enumerate(column_names):
        column_data = columns[column_name]
        if column_data.get('sdtype') == 'categorical':
            categorical_indices.append(index)
    return categorical_indices

In [62]:
def generate_and_save_latent(model, source='client_n_data/diabetes_clients/client_1.csv', path="client_n_data/latent/client_1.csv"):
    DATA_PATH = source
    df = pd.read_csv(DATA_PATH, sep=",")
    actual_data = df.iloc[:, :-1]
    outcomes = df.iloc[:, -1]

    latents = []
    metadata = SingleTableMetadata()
    meta = metadata.detect_from_csv(source)

    discrete_columns = categorical_column_indices(metadata.to_dict())
    print(discrete_columns)
    latents = model.fit(actual_data, discrete_columns)
    unbatched_latent = torch.cat(latents, dim=0)

    latents_df = pd.DataFrame(unbatched_latent)
    outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv(path, index=False)

# Meduim Dataset TVAE GEN

In [81]:
loan_data = TVAE()
generate_and_save_latent(loan_data, "client_n_data/bank_clients/bank_1.csv", path="client_n_data/latent/bank_tvae_1.csv")
loan_data_client2 = TVAE()
generate_and_save_latent(loan_data_client2, "client_n_data/bank_clients/bank_2.csv", path="client_n_data/latent/bank_tvae_2.csv")
loan_data_client3 = TVAE()
generate_and_save_latent(loan_data_client3, "client_n_data/bank_clients/bank_3.csv", path="client_n_data/latent/bank_tvae_3.csv")

[]
[1, 3]
[1, 2, 3, 4]


In [869]:
"""Pre Test"""
bank_client_3_generated_latent = pd.read_csv("client_n_data/latent/bank_tvae_3.csv")
bank_client_3_generated_latent = bank_client_3_generated_latent.iloc[:, :-1].to_numpy()
bank_client_3_actual_gen = loan_data_client3.sample(5000, torch.tensor(bank_client_3_generated_latent).float())
reeeeel = pd.read_csv("client_n_data/bank_clients/bank_3.csv", ).iloc[:, :-1]
jensen_shannon_similarity(bank_client_3_actual_gen.to_numpy(), reeeeel.to_numpy())

Here


0.9857763935745911

In [870]:
bank_gen_ddpm = np.load("syn_data/syn_latent/bank_gen_tvae/X_num_unnorm.npy")
bank_client_1_generated_latent = bank_gen_ddpm[:, 0:3]
bank_client_2_generated_latent = bank_gen_ddpm[:, 3:6]
bank_client_3_generated_latent = bank_gen_ddpm[:, 6:9]

In [879]:
bank_c1_actual_gen = loan_data.sample(5000, torch.tensor(bank_client_1_generated_latent).float())
bank_c2_actual_gen = loan_data_client2.sample(5000, torch.tensor(bank_client_2_generated_latent).float())
bank_c3_actual_gen = loan_data_client3.sample(5000, torch.tensor(bank_client_3_generated_latent).float())

Here
Here
Here


In [880]:
actual_bank_1_data_tvae = pd.read_csv("client_n_data/bank_clients/bank_1.csv")
actual_bank_1_data_tvae = actual_bank_1_data_tvae.iloc[: , :-1]

actual_bank_2_data_tvae = pd.read_csv("client_n_data/bank_clients/bank_2.csv")
actual_bank_2_data_tvae = actual_bank_2_data_tvae.iloc[: , :-1]

actual_bank_3_data_tvae = pd.read_csv("client_n_data/bank_clients/bank_3.csv")
actual_bank_3_data_tvae = actual_bank_3_data_tvae.iloc[: , :-1]


In [911]:
resemblance_measure(actual_bank_1_data_tvae.to_numpy(), bank_c1_actual_gen.to_numpy())

Resemblance Score: 0.8542637491600948


In [937]:
resemblance_measure(actual_bank_2_data_tvae.to_numpy(), bank_c2_actual_gen.to_numpy())

Resemblance Score: 0.9183453392812382


In [923]:
"""Thiel U"""
similarities = []
for col in actual_bank_3_data_tvae.columns:
    print(col)
    similarity = compute_categorical_similarity_v2(actual_bank_3_data_tvae[col], bank_c3_actual_gen[col])
    similarities.append(similarity)

similarities

Mortgage
Personal Loan
Securities Account
CD Account
Online


[-1.2243042930235055,
 0.9314255630121435,
 0.9979722162490516,
 -0.1626946771629656,
 0.9997764523998197]

In [939]:
jensen_shannon_similarity(actual_bank_3_data_tvae, bank_c3_actual_gen)

1.0

In [996]:
kolmogorov_smirnov_similarity(actual_bank_3_data_tvae.to_numpy(), bank_c3_actual_gen.to_numpy())

0.93956

In [997]:
propensity_mean_absolute_similarity(actual_bank_3_data_tvae.to_numpy(), bank_c3_actual_gen.to_numpy())

0.6143557148412802



nan

# Easy Diabetes TVAE GAN GENERATION

In [65]:
diabetes_client_1 = TVAE()
generate_and_save_latent(diabetes_client_1, "client_n_data/diabetes_clients/client_1.csv", path="client_n_data/latent/diabetes_tvae_1.csv")
diabetes_client_2 = TVAE()
generate_and_save_latent(diabetes_client_2, "client_n_data/diabetes_clients/client_2.csv", path="client_n_data/latent/diabetes_tvae_2.csv")
diabetes_client_3 = TVAE()
generate_and_save_latent(diabetes_client_3, "client_n_data/diabetes_clients/client_3.csv", path="client_n_data/latent/diabetes_tvae_3.csv")

[]
[]
[]


In [66]:
diabetes_gen_ddpm = np.load("syn_data/syn_latent/diabetes_gen_tvae/X_num_unnorm.npy")
diabetes_client_1_generated_latent = diabetes_gen_ddpm[:, 0:3]
diabetes_client_2_generated_latent = diabetes_gen_ddpm[:, 3:6]
diabetes_client_3_generated_latent = diabetes_gen_ddpm[:, 6:9]

In [67]:
diabetes_c1_actual_gen = diabetes_client_1.sample(5000, torch.tensor(diabetes_client_1_generated_latent).float())
diabetes_c2_actual_gen = diabetes_client_2.sample(5000, torch.tensor(diabetes_client_2_generated_latent).float())
diabetes_c3_actual_gen = diabetes_client_3.sample(5000, torch.tensor(diabetes_client_3_generated_latent).float())

Here
Here
Here


In [68]:
actual_diabetes_1_data_tvae = pd.read_csv("client_n_data/diabetes_clients/client_1.csv")
actual_diabetes_1_data_tvae = actual_diabetes_1_data_tvae.iloc[: , :-1]

actual_diabetes_2_data_tvae = pd.read_csv("client_n_data/diabetes_clients/client_2.csv")
actual_diabetes_2_data_tvae = actual_diabetes_2_data_tvae.iloc[: , :-1]

actual_diabetes_3_data_tvae = pd.read_csv("client_n_data/diabetes_clients/client_3.csv")
actual_diabetes_3_data_tvae = actual_diabetes_3_data_tvae.iloc[: , :-1]


In [69]:
resemblance_measure(actual_diabetes_1_data_tvae.to_numpy(), diabetes_c1_actual_gen.to_numpy())

Resemblance Score: 0.8959707295548046


In [72]:
correlation_similarity(actual_diabetes_2_data_tvae.to_numpy(), diabetes_c2_actual_gen.iloc[:768].to_numpy())

1.0

In [73]:
jensen_shannon_similarity(actual_diabetes_2_data_tvae.to_numpy(), diabetes_c2_actual_gen.iloc[:768].to_numpy())

0.5761575459734001

In [74]:
kolmogorov_smirnov_similarity(actual_diabetes_2_data_tvae.to_numpy(), diabetes_c2_actual_gen.iloc[:768].to_numpy())

0.904296875

In [75]:
propensity_mean_absolute_similarity(actual_diabetes_2_data_tvae.to_numpy(), diabetes_c2_actual_gen.iloc[:768].to_numpy())

0.6735895613231848

In [80]:
actual_diabetes_2_data_tvae_col = actual_diabetes_2_data_tvae
similarities = []
for col in actual_diabetes_2_data_tvae_col.columns:
    print(col)
    similarity = compute_categorical_similarity_v2(actual_diabetes_2_data_tvae_col[col], diabetes_c2_actual_gen.iloc[:768][col])
    similarities.append(similarity)

similarities

pres
skin


[-0.17488178100690988, 0.2500766114718487]

In [76]:
resemblance_measure(actual_diabetes_3_data_tvae.to_numpy(), diabetes_c3_actual_gen.to_numpy())

Resemblance Score: 0.7325254579089375


# Cardio TVAE GEN Data

In [999]:
cardio_client_1 = TVAE()
generate_and_save_latent(cardio_client_1, "client_n_data/cardio_clients/cardio_1.csv", path="client_n_data/latent/cardio_tvae_1.csv")

[2]


FileNotFoundError: [Errno 2] No such file or directory: 'client_n_data/cardio_clients/client_2.csv'

In [1000]:
cardio_client_2 = TVAE()
generate_and_save_latent(cardio_client_2, "client_n_data/cardio_clients/cardio_2.csv", path="client_n_data/latent/cardio_tvae_2.csv")
cardio_client_3 = TVAE()
generate_and_save_latent(cardio_client_3, "client_n_data/cardio_clients/cardio_3.csv", path="client_n_data/latent/cardio_tvae_3.csv")

[3]
[0, 1, 2, 3]


In [1003]:
cardio_gen_ddpm = np.load("syn_data/syn_latent/cardio_gen_tvae/X_num_unnorm.npy")
cardio_client_1_generated_latent = cardio_gen_ddpm[:, 0:3]
cardio_client_2_generated_latent = cardio_gen_ddpm[:, 3:6]
cardio_client_3_generated_latent = cardio_gen_ddpm[:, 6:9]

In [2]:
cardio_c1_actual_gen = cardio_client_1.sample(5000, torch.tensor(cardio_client_1_generated_latent).float())
cardio_c2_actual_gen = cardio_client_2.sample(5000, torch.tensor(cardio_client_2_generated_latent).float())
cardio_c3_actual_gen = cardio_client_3.sample(5000, torch.tensor(cardio_client_3_generated_latent).float())

NameError: name 'cardio_client_1' is not defined

In [1005]:
actual_cardio_1_data_tvae = pd.read_csv("client_n_data/cardio_clients/cardio_1.csv")
actual_cardio_1_data_tvae = actual_cardio_1_data_tvae.iloc[: , :-1]

actual_cardio_2_data_tvae = pd.read_csv("client_n_data/cardio_clients/cardio_2.csv")
actual_cardio_2_data_tvae = actual_cardio_2_data_tvae.iloc[: , :-1]

actual_cardio_3_data_tvae = pd.read_csv("client_n_data/cardio_clients/cardio_3.csv")
actual_cardio_3_data_tvae = actual_cardio_3_data_tvae.iloc[: , :-1]


In [1006]:
resemblance_measure(actual_cardio_1_data_tvae.to_numpy(), cardio_c1_actual_gen.to_numpy())

Resemblance Score: 0.6998977103783992


In [1007]:
resemblance_measure(actual_cardio_2_data_tvae.to_numpy(), cardio_c2_actual_gen.to_numpy())

Resemblance Score: 0.9487162136387917


In [1]:
cardio_c3_actual_gen

NameError: name 'cardio_c3_actual_gen' is not defined

In [1008]:
resemblance_measure(actual_cardio_3_data_tvae.to_numpy(), cardio_c3_actual_gen.to_numpy())



Resemblance Score: nan


In [1010]:
correlation_similarity(actual_cardio_3_data_tvae.to_numpy(), cardio_c3_actual_gen.to_numpy())

0.9962375673357506

In [1011]:
jensen_shannon_similarity(actual_cardio_3_data_tvae.to_numpy(), cardio_c3_actual_gen.to_numpy())

0.9790119081082638

In [1012]:
kolmogorov_smirnov_similarity(actual_cardio_3_data_tvae.to_numpy(), cardio_c3_actual_gen.to_numpy())

0.9844178571428571

In [1013]:
propensity_mean_absolute_similarity(actual_cardio_3_data_tvae.to_numpy(), cardio_c3_actual_gen.to_numpy())

0.8760279810349146

In [1028]:
actual_cardio_3_data_tvae.iloc[:5000, :].shape

(5000, 4)

In [1027]:
cardio_c3_actual_gen.iloc[:5000, :].shape

(5000, 4)

In [1029]:
"""Thiel U"""

actual_cardio_3_data_tvae_5k = actual_cardio_3_data_tvae.iloc[:5000, :]
similarities = []
for col in actual_cardio_3_data_tvae_5k.columns:
    print(col)
    similarity = compute_categorical_similarity_v2(actual_cardio_3_data_tvae_5k[col], cardio_c3_actual_gen[col])
    similarities.append(similarity)

similarities

gluc
smoke
alco
active


[0.9984389165488295,
 0.9965288791630152,
 0.9961797318412299,
 0.9986231736120271]

In [None]:
class CustomDecoder(nn.Module):
    def __init__(self, H2, continuous_features, categorical_features):
        super(CustomDecoder, self).__init__()
        self.H2 = H2
        self.continuous_features = continuous_features
        self.categorical_features = categorical_features
        
        # Heads for Continuous Features
        self.continuous_heads = nn.ModuleDict({
            feature: nn.Linear(H2, 2) for feature in continuous_features
        })
        
        # Heads for Categorical Features
        self.categorical_heads = nn.ModuleDict({
            feature: nn.Linear(H2, n_classes) for feature, n_classes in categorical_features.items()
        })

    def forward(self, x):
        outputs = {}
        
        # Process Continuous Features
        for feature in self.continuous_features:
            head = self.continuous_heads[feature]
            mean_var = head(x)
            mean, log_var = mean_var.chunk(2, dim=-1)  # Split into mean and variance
            outputs[feature] = (mean, torch.exp(log_var))  # Return mean and variance
        
        # Process Categorical Features
        for feature, n_classes in self.categorical_features.items():
            head = self.categorical_heads[feature]
            logits = head(x)
            probabilities = torch.softmax(logits, dim=-1)  # Convert logits to probabilities
            outputs[feature] = probabilities
        
        return outputs

# Baseline Tests

### CTGAN

#### Bank: 70

In [40]:
bank_ctgan = pd.read_csv("centralized_data_gen/bank_CTGAN.csv")#.iloc[:,1:]
bank_real = pd.read_csv("Data/bank.csv")
bank_ctgan['CCAvg'] = bank_ctgan['CCAvg'].str.replace(',', '.').astype(float)
bank_ctgan = bank_ctgan.iloc[:, 1:]

In [160]:
bank_ctgan

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,773153107,42,31,143,19052,2,3.4,3,510,1,1,1,1,0
1,295010557,28,30,196,55911,2,8.3,1,0,0,0,0,1,1
2,475410988,27,36,26,90925,4,0.5,1,0,0,1,0,1,0
3,421328571,26,38,73,31930,3,2.4,1,0,0,0,0,0,1
4,646285693,23,6,101,57866,1,3.3,1,221,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,977707134,60,26,114,51968,1,3.9,1,1,0,0,0,1,0
4996,662078397,58,2,107,97655,2,3.1,1,0,0,0,0,0,1
4997,883705473,36,20,127,43471,4,2.2,3,2,0,0,0,1,1
4998,453804152,34,16,55,58726,4,0.2,3,83,0,0,0,0,1


In [41]:
resemblance_measure(bank_real.to_numpy(), bank_ctgan.to_numpy())

Resemblance Score: 0.7058689733834619


#### Diabetes 76

In [37]:
diabetes_ctgan = pd.read_csv("centralized_data_gen/diabetes_CTGAN.csv")
diabetes_ctgan = diabetes_ctgan.iloc[:, 1:-1]
diabetes_ctgan

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,8.0,62.0,57.0,6.0,1.0,46.2,1.421,31.0
1,0.0,157.0,46.0,23.0,14.0,32.3,0.729,69.0
2,2.0,56.0,65.0,40.0,9.0,41.9,1.768,37.0
3,6.0,143.0,55.0,30.0,0.0,30.6,0.331,36.0
4,5.0,67.0,31.0,57.0,25.0,34.9,0.089,31.0
...,...,...,...,...,...,...,...,...
763,2.0,50.0,46.0,9.0,5.0,41.4,0.418,35.0
764,2.0,159.0,61.0,0.0,175.0,30.4,1.419,34.0
765,0.0,180.0,36.0,37.0,19.0,20.7,0.290,27.0
766,2.0,139.0,44.0,1.0,134.0,47.6,0.943,55.0


In [38]:
diabetes_real = pd.read_csv("Data/diabetes.csv")
diabetes_real = diabetes_real.iloc[:, :-1]
diabetes_real

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [39]:
resemblance_measure(diabetes_ctgan.to_numpy(), diabetes_real.to_numpy())

Resemblance Score: 0.7662461372023996


#### Cardio 86

In [31]:
cardio_tabddpm = pd.read_csv("centralized_data_gen/cardio_CTGAN.csv")
cardio_tabddpm = cardio_tabddpm.iloc[:, 1:]
cardio_tabddpm

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,817023912,21514,2,162,75.66,120,77,1,1,0,0,1,0
1,570334775,22218,1,165,79.82,102,66,1,1,0,0,0,0
2,382444845,19036,2,181,66.99,119,72,1,1,1,0,1,1
3,153988967,17731,2,169,80.62,119,55,1,1,0,0,1,0
4,842781231,19659,2,182,90.92,161,81,3,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,444794722,18378,1,162,102.39,130,69,1,1,0,0,1,0
69996,782843220,17490,2,164,87.00,105,70,1,1,0,0,1,0
69997,794133946,17616,1,156,69.51,152,75,1,1,0,0,1,1
69998,68294833,19647,1,149,76.67,138,92,1,1,0,0,1,0


In [32]:
cardio_real = pd.read_csv("Data/cardio_train.csv", sep=";")
cardio_real

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [33]:
resemblance_measure(cardio_tabddpm.to_numpy(), cardio_real.to_numpy())

Resemblance Score: 0.88658632849786


### Tabddpm

#### Diabetes: 78

In [28]:
diabetes_tabddpm = pd.read_csv("centralized_data_gen/dabetes_tabddpm_synth.csv", sep=",")
diabetes_tabddpm = diabetes_tabddpm.iloc[:, 1:]
diabetes_tabddpm

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.0,140.069267,73.183003,30.171821,0.000000,31.390122,0.250685,21.777885
1,7.0,144.043381,81.845612,0.000000,0.000000,32.743503,0.112416,48.877731
2,7.0,133.511609,73.295214,31.046656,0.000000,32.689396,0.233206,41.942528
3,1.0,79.758328,59.453588,34.690728,0.000000,32.313570,0.225102,21.000000
4,0.0,36.399746,55.132142,23.527988,0.000000,32.803863,0.133688,22.740768
...,...,...,...,...,...,...,...,...
763,5.0,96.574792,87.161192,15.851913,181.723822,34.685963,0.407741,30.221269
764,0.0,165.786133,80.099040,50.406801,213.809464,39.514917,1.470972,24.651495
765,0.0,0.000000,110.000000,60.000000,846.000000,67.100000,2.329000,81.000000
766,3.0,134.922533,66.897980,23.382913,252.922705,29.661195,0.285518,25.997983


In [29]:
diabetes_real = pd.read_csv("Data/diabetes.csv")
diabetes_real = diabetes_real.iloc[:, :-1]
diabetes_real

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [30]:
resemblance_measure(diabetes_tabddpm.to_numpy(), diabetes_real.to_numpy())

Resemblance Score: 0.7886061144859609


#### Bank 92

In [24]:
bank_tabddpm = pd.read_csv("centralized_data_gen/bank_tabddpm_synth.csv", sep=";")
bank_tabddpm["Age"] = bank_tabddpm["Age"].str.replace(",", ".").astype(float).round().astype(int)
bank_tabddpm["Experience"] = bank_tabddpm["Experience"].str.replace(",", ".").astype(float).round().astype(int)
bank_tabddpm["Income"] = bank_tabddpm["Income"].str.replace(",", ".").astype(float).round().astype(int)
bank_tabddpm["ZIP Code"] = bank_tabddpm["ZIP Code"].str.replace(",", ".").astype(float).round().astype(int)
bank_tabddpm["CCAvg"] = bank_tabddpm["CCAvg"].astype(float)


In [25]:
bank_tabddpm

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,30,6,43,92075,1,2.0,2,0,0,1,1,1
1,23,-2,10,90639,3,1.0,2,0,0,0,0,0
2,45,18,28,94553,1,15.0,3,101,0,0,0,1
3,27,7,8,95918,4,1.0,1,0,0,0,0,0
4,48,21,60,94007,1,49.0,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,67,41,83,92102,3,24.0,1,0,0,0,0,0
4996,23,-3,8,96651,2,7.0,1,0,0,0,0,0
4997,40,15,92,94096,2,19.0,1,0,0,0,0,0
4998,58,31,80,91747,1,57.0,1,0,0,0,0,1


In [26]:
bank_real = pd.read_csv("Data/bank.csv", sep=",")
bank_real = bank_real.iloc[:, 1:-1]
bank_real

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,25,1,49,91107,4,1.6,1,0,0,1,0,0
1,45,19,34,90089,3,1.5,1,0,0,1,0,0
2,39,15,11,94720,1,1.0,1,0,0,0,0,0
3,35,9,100,94112,1,2.7,2,0,0,0,0,0
4,35,8,45,91330,4,1.0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,92697,1,1.9,3,0,0,0,0,1
4996,30,4,15,92037,4,0.4,1,85,0,0,0,1
4997,63,39,24,93023,2,0.3,3,0,0,0,0,0
4998,65,40,49,90034,3,0.5,2,0,0,0,0,1


In [27]:
resemblance_measure(bank_real.to_numpy(), bank_tabddpm.to_numpy())

Resemblance Score: 0.9269400954686651


#### Cardio 95

In [82]:
cardio_tabddpm = pd.read_csv("centralized_data_gen/cardio_tabddpm_synth.csv", sep=";")
cardio_tabddpm["id"] = cardio_tabddpm["id"].str.replace(",", ".").astype(float).round().astype(int)
cardio_tabddpm["age"] = cardio_tabddpm["id"].astype(int)
cardio_tabddpm["height"] = cardio_tabddpm["height"].str.replace(",", ".").astype(float).round().astype(int)
cardio_tabddpm["weight"] = cardio_tabddpm["weight"].str.replace(",", ".").astype(float)
cardio_tabddpm["ap_hi"] = cardio_tabddpm["ap_hi"].str.replace(",", ".").astype(float).round().astype(int)



cardio_tabddpm = cardio_tabddpm.iloc[:, 1:]
cardio_tabddpm

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,96581,2,174,81.000000,130,80,1,1,0,0,0
1,77306,2,176,72.000000,120,70,1,1,0,0,1
2,3965,1,158,72.000000,120,80,1,1,0,0,1
3,18320,2,160,60.000000,120,80,1,1,0,0,0
4,1987,1,169,88.000000,110,70,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
69995,75354,2,165,61.198263,120,80,3,3,0,0,1
69996,61921,2,148,48.000000,130,90,1,1,1,0,1
69997,90037,2,165,70.000000,120,80,1,1,0,0,1
69998,11232,2,172,78.000000,120,80,1,1,1,1,0


In [83]:
cardio_real = pd.read_csv("Data/cardio_train.csv", sep=";")
cardio_real = cardio_real.iloc[:, 1:-1]
cardio_real

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393,2,168,62.0,110,80,1,1,0,0,1
1,20228,1,156,85.0,140,90,3,1,0,0,1
2,18857,1,165,64.0,130,70,3,1,0,0,0
3,17623,2,169,82.0,150,100,1,1,0,0,1
4,17474,1,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1
69996,22601,1,158,126.0,140,90,2,2,0,0,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0
69998,22431,1,163,72.0,135,80,1,2,0,0,0


In [84]:
resemblance_measure(cardio_tabddpm.to_numpy(), cardio_real.to_numpy())

Resemblance Score: 0.9543470153748727


# TABSyn

### Cardio - 89

In [87]:
cardio_tabsyn = pd.read_csv("centralized_data_gen/cardio_synthetic_tabsyn.csv")
cardio_tabsyn = cardio_tabsyn.iloc[:, 1:12]
cardio_tabsyn

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,20372.195,2,172.00000,97.00000,120.00000,80.0000,1,1,0,0,1
1,21135.533,2,177.00000,85.00000,120.00000,80.0000,1,1,1,0,1
2,21979.768,2,174.00000,66.00000,134.89389,80.0000,1,1,0,1,1
3,21754.598,2,165.00000,79.00000,140.00000,90.0000,1,1,0,0,1
4,15217.372,1,169.00000,57.00000,130.00000,80.0000,2,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,21256.832,2,176.00000,90.00000,120.00000,80.0000,1,1,0,0,1
69996,23423.223,1,139.22299,75.00000,130.00000,80.0000,3,1,0,0,1
69997,21869.270,1,167.00000,75.00000,120.00000,90.0000,1,1,0,0,1
69998,21123.682,1,165.00000,109.26121,160.00000,1109.1909,3,3,0,0,1


In [88]:
cardio_real = pd.read_csv("Data/cardio_train.csv", sep=";")
cardio_real = cardio_real.iloc[:, 1:-1]
cardio_real

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,18393,2,168,62.0,110,80,1,1,0,0,1
1,20228,1,156,85.0,140,90,3,1,0,0,1
2,18857,1,165,64.0,130,70,3,1,0,0,0
3,17623,2,169,82.0,150,100,1,1,0,0,1
4,17474,1,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1
69996,22601,1,158,126.0,140,90,2,2,0,0,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0
69998,22431,1,163,72.0,135,80,1,2,0,0,0


In [89]:
resemblance_measure(cardio_tabsyn.to_numpy(), cardio_real.to_numpy())

Resemblance Score: 0.8909749564838968


### Bank - 92

In [97]:
bank_tabsyn = pd.read_csv("centralized_data_gen/bank_tabsyn_synthetic.csv", sep=";")
bank_tabsyn = bank_tabsyn.iloc[:, 1:13]
bank_tabsyn


Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,38,10065128,5593166,9238774,1,08,2,0,0,1,1,1
1,47,21362486,15544936,95089984,0,2,1,3436373,0,0,0,1
2,53,31,53982037,9131117,0,018908173,1,1373138,0,0,0,0
3,60,34410744,4339613,9562139,0,11,1,14827242,0,0,0,1
4,41,14,7047527,9430456,0,18962245,1,17170033,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2716002,0,7937031,9583551,0,16,2,0,0,0,0,1
4996,55,25,10913015,95837234,0,11,3,24012534,0,0,0,1
4997,46,17454306,13411798,9286063,0,18,1,0,0,0,0,1
4998,29378838,5,3109678,9505176,0,033038783,1,0,0,0,0,0


In [94]:
bank_real = pd.read_csv("Data/bank.csv", sep=",")
bank_real = bank_real.iloc[:, 1:-1]
bank_real

Unnamed: 0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,25,1,49,91107,4,1.6,1,0,0,1,0,0
1,45,19,34,90089,3,1.5,1,0,0,1,0,0
2,39,15,11,94720,1,1.0,1,0,0,0,0,0
3,35,9,100,94112,1,2.7,2,0,0,0,0,0
4,35,8,45,91330,4,1.0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,92697,1,1.9,3,0,0,0,0,1
4996,30,4,15,92037,4,0.4,1,85,0,0,0,1
4997,63,39,24,93023,2,0.3,3,0,0,0,0,0
4998,65,40,49,90034,3,0.5,2,0,0,0,0,1


In [98]:
resemblance_measure(bank_real.to_numpy(), bank_tabsyn.to_numpy())

Resemblance Score: 0.9269400954686651


### Diabetes - 84

In [100]:
diabetes_tabsyn = pd.read_csv("centralized_data_gen/diabetes_synthetic_tabsyn.csv")
diabetes_tabsyn = diabetes_tabsyn.iloc[:, :8]
diabetes_tabsyn

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,4.705798,99.219070,68.331560,32.854282,0.00000,27.010769,0.186255,27.698145
1,16.443970,120.690850,87.298210,0.000000,0.00000,43.948967,0.263599,56.661964
2,9.463222,133.451500,62.535930,0.000000,0.00000,23.505415,0.127230,55.177315
3,4.000000,74.589380,69.129030,12.637459,0.00000,26.711203,0.340420,23.000000
4,4.000000,183.244490,64.735660,32.231907,224.37398,27.595322,0.249921,57.578530
...,...,...,...,...,...,...,...,...
763,0.743928,147.832150,63.244427,32.024063,132.08727,32.672330,0.760614,24.831545
764,4.000000,97.704155,73.403250,0.000000,0.00000,30.904074,0.144953,42.103560
765,5.116669,127.130470,83.693180,0.000000,0.00000,6.091141,0.161974,25.291174
766,6.050511,159.605290,87.569960,0.000000,0.00000,31.508020,0.176828,59.165478


In [101]:
diabetes_real = pd.read_csv("Data/diabetes.csv")
diabetes_real = diabetes_real.iloc[:, :-1]
diabetes_real

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [103]:
resemblance_measure(diabetes_real.to_numpy(), diabetes_tabsyn.to_numpy())

Resemblance Score: 0.8436536206167308
