In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim
from torch.autograd import Variable
import pandas as pd
import numpy as np
from sklearn import preprocessing
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import pearsonr, ks_2samp
from scipy.spatial.distance import jensenshannon
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier

In [20]:
def cuda_check():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    return device

In [21]:
def load_diabetes_data(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

In [22]:
def load_cardio_data(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

In [113]:
def load_bank_data(path, sep=","):
    device = cuda_check()
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target

In [23]:
def load_abalone_data(path, sep=","):
    device = cuda_check()
    ont_hot_encoder_abalone = OneHotEncoder(sparse_output=False)
    # read in from csv
    df = pd.read_csv(path, sep=sep)
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    one_hot_encoded = ont_hot_encoder_abalone.fit_transform(df[categorical_columns])
    one_hot_df = pd.DataFrame(one_hot_encoded, columns=ont_hot_encoder_abalone.get_feature_names_out(categorical_columns))

    df_encoded = pd.concat([one_hot_df, df], axis=1)
    # Drop the original categorical columns
    df= df_encoded.drop(categorical_columns, axis=1)

    # Concatenate the one-hot encoded columns with the original DataFrame
    df_base = df.iloc[:, :-1]
    df_target = df.iloc[:,-1].values
    x = df_base.values.reshape(-1, df_base.shape[1]).astype('float32')
    # stadardize values
    standardizer = preprocessing.StandardScaler()
    x_train = standardizer.fit_transform(x)
    x_train = torch.from_numpy(x_train).to(device)
    return x_train, standardizer, df_target, ont_hot_encoder_abalone

In [24]:
from torch.utils.data import Dataset, DataLoader
class DataBuilder(Dataset):
    def __init__(self, path, data_loader_func ,sep=","):
        data = data_loader_func(path, sep)
        self.x, self.standardizer, self.outcome = data[:3]
        self.len=self.x.shape[0]

        if len(data) > 3:
            self.one_hot_encoder_abalone = data[3]
        else:
            self.one_hot_encoder_abalone = None
            
    def __getitem__(self,index):
        return self.x[index]
    def __len__(self):
        return self.len

In [25]:
class Autoencoder(nn.Module):
    def __init__(self,D_in,H=50,H2=12,latent_dim=3):

        #Encoder
        super(Autoencoder,self).__init__()
        self.linear1=nn.Linear(D_in,H)
        self.lin_bn1 = nn.BatchNorm1d(num_features=H)
        self.linear2=nn.Linear(H,H2)
        self.lin_bn2 = nn.BatchNorm1d(num_features=H2)
        self.linear3=nn.Linear(H2,H2)
        self.lin_bn3 = nn.BatchNorm1d(num_features=H2)

#         # Latent vectors mu and sigma
        self.fc1 = nn.Linear(H2, latent_dim)
#        self.bn1 = nn.BatchNorm1d(num_features=latent_dim)
        self.fc21 = nn.Linear(latent_dim, latent_dim)
        self.fc22 = nn.Linear(latent_dim, latent_dim)

#         # Sampling vector
        self.fc3 = nn.Linear(latent_dim, latent_dim)
#         self.fc_bn3 = nn.BatchNorm1d(latent_dim)
        self.fc4 = nn.Linear(latent_dim, H2)
#         self.fc_bn4 = nn.BatchNorm1d(H2)

#         # Decoder
        self.linear4=nn.Linear(H2,H2)
        self.lin_bn4 = nn.BatchNorm1d(num_features=H2)
        self.linear5=nn.Linear(H2,H)
        self.lin_bn5 = nn.BatchNorm1d(num_features=H)
        self.linear6=nn.Linear(H,D_in)
        self.lin_bn6 = nn.BatchNorm1d(num_features=D_in)

        self.gelu = nn.GELU()

    def encode(self, x):
        lin1 = self.gelu(self.lin_bn1(self.linear1(x)))
        lin2 = self.gelu(self.lin_bn2(self.linear2(lin1)))
        lin3 = self.gelu(self.lin_bn3(self.linear3(lin2)))

        fc1 = F.relu(self.fc1(lin3))

        r1 = self.fc21(fc1) # Generating mu
        r2 = self.fc22(fc1) # Generating sigma

        return r1, r2

    def reparameterize(self, mu, logvar):
        if self.training:
            std = logvar.mul(0.5).exp_() # Convert it to std deviation
            eps = Variable(std.data.new(std.size()).normal_()) # Generate a noise of same size as std
            return eps.mul(std).add_(mu) # Perform reparameterization
        else:
            return mu

    def decode(self, z):
        fc3 = self.gelu(self.fc3(z)) # Not sure why these two are required. 
        fc4 = self.gelu(self.fc4(fc3))#.view(128, -1)

        lin4 = self.gelu(self.lin_bn4(self.linear4(fc4)))
        lin5 = self.gelu(self.lin_bn5(self.linear5(lin4)))
        return self.lin_bn6(self.linear6(lin5))



    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        # self.decode(z) ist später recon_batch, mu ist mu und logvar ist logvar
        return self.decode(z), mu, logvar
    
    def embed(self,x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return z
    

In [26]:
class customLoss(nn.Module):
    def __init__(self):
        super(customLoss, self).__init__()
        self.mse_loss = nn.MSELoss(reduction="sum")

    # x_recon is the reconstruction batch created in the forward pass of the model, x is the original x batch, mu is mu, and logvar is logvar
    def forward(self, x_recon, x, mu, logvar):
        loss_MSE = self.mse_loss(x_recon, x)
        loss_KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        return loss_MSE + loss_KLD

## Declaring the Autoencoder

In [27]:
# device = cuda_check()
# D_in = data_set.x.shape[1]
# H = 50
# H2 = 12
# model = Autoencoder(D_in, H, H2).to(device)
# optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [28]:
# loss_mse = customLoss()

In [29]:
def declare_model(data_set, H, H2):
    device = cuda_check()
    D_in = data_set.x.shape[1]
    model = Autoencoder(D_in, H, H2).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_mse = customLoss()
    return model, loss_mse, optimizer

## Trian the Client

In [91]:
def train_encoder(DATA_PATH, loader_func, sep=","):
    device = cuda_check()

    data_set=DataBuilder(DATA_PATH, loader_func , sep)
    trainloader=DataLoader(dataset=data_set,batch_size=32)


    model, loss_mse, optimizer = declare_model(data_set, 50, 12)
    # Refactor
    epochs = 1000
    train_losses = []
    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = 0
        for batch_idx, data in enumerate(trainloader):
            data = data.to(device)
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(data)
            loss = loss_mse(recon_batch, data, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        if epoch % 2 == 0:
            print('====> Epoch: {} Average loss: {:.4f}'.format(
                epoch, train_loss / len(trainloader.dataset)))
            train_losses.append(train_loss / len(trainloader.dataset))
    return model, trainloader.dataset.standardizer

In [92]:
model, standardizer = train_encoder('Data/diabetes.csv', load_diabetes_data, ",")

====> Epoch: 2 Average loss: 15.2255
====> Epoch: 4 Average loss: 13.2242
====> Epoch: 6 Average loss: 11.6673
====> Epoch: 8 Average loss: 10.8127
====> Epoch: 10 Average loss: 10.0439
====> Epoch: 12 Average loss: 9.6780
====> Epoch: 14 Average loss: 9.1477
====> Epoch: 16 Average loss: 8.7563
====> Epoch: 18 Average loss: 8.3776
====> Epoch: 20 Average loss: 8.1180
====> Epoch: 22 Average loss: 7.8314
====> Epoch: 24 Average loss: 7.7385
====> Epoch: 26 Average loss: 7.5789
====> Epoch: 28 Average loss: 7.3782
====> Epoch: 30 Average loss: 7.1825
====> Epoch: 32 Average loss: 7.1746
====> Epoch: 34 Average loss: 7.0268
====> Epoch: 36 Average loss: 6.9396
====> Epoch: 38 Average loss: 6.7982
====> Epoch: 40 Average loss: 6.8535
====> Epoch: 42 Average loss: 6.7480
====> Epoch: 44 Average loss: 6.5847
====> Epoch: 46 Average loss: 6.6423
====> Epoch: 48 Average loss: 6.5244
====> Epoch: 50 Average loss: 6.5967
====> Epoch: 52 Average loss: 6.4165
====> Epoch: 54 Average loss: 6.3871


In [93]:
def generate_latent_for_diabetes_ds(model):
    DATA_PATH = "Data/diabetes.csv"
    df = load_diabetes_data(DATA_PATH, sep=",")
    actual_data = df[0]
    outcomes = df[2]
    outcomes_numeric = [1 if outcome == "b'tested_positive'" else 0 for outcome in outcomes]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes_numeric)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv('latent_data/diabetes_latent.csv', index=False)

In [94]:
generate_latent_for_diabetes_ds(model)

In [95]:
def reconstruction_of_latent(model, npy_x_data_file_path, npy_y_data_file_path, standardizer):
    generated_latent_x = np.load(npy_x_data_file_path)
    generaated_latent_y = np.load(npy_y_data_file_path)
    generated_torch_data = torch.from_numpy(generated_latent_x).float()

    z = model.decode(generated_torch_data)

    generated_data_x = standardizer.inverse_transform(z.cpu().detach().numpy())
    return generated_data_x

In [96]:
x_real = load_diabetes_data("Data/diabetes.csv")
x_real = x_real[1].inverse_transform(x_real[0].cpu().detach().numpy())
x_real.shape

(768, 8)

In [97]:
x_syn = reconstruction_of_latent(model, 'X_num_train.npy', 'y_diabetes_train.npy', standardizer)

In [98]:
x_syn.shape

(10000, 8)

In [99]:
def compute_categorical_similarity(col_real, col_synthetic):
    # Compute Theil's U for categorical features
    p_real = pd.Series(col_real).value_counts(normalize=True)
    p_synthetic = pd.Series(col_synthetic).value_counts(normalize=True)
    u = (p_real * np.log(p_real / p_synthetic)).sum()
    return 1 - u
def column_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data, synthetic_data):
        correlation, _ = pearsonr(col_real, col_synthetic)
        similarity = correlation
        similarities.append(similarity)
    return np.mean(similarities)
def correlation_similarity(real_data, synthetic_data):
    real_corr = np.corrcoef(real_data, rowvar=False)
    synthetic_corr = np.corrcoef(synthetic_data, rowvar=False)
    correlation, _ = pearsonr(real_corr.flatten(), synthetic_corr.flatten())
    return correlation
def jensen_shannon_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute probability distributions and Jensen-Shannon divergence
        p_real = np.histogram(col_real, bins=10, density=True)[0]
        p_synthetic = np.histogram(col_synthetic, bins=10, density=True)[0]
        similarity = 1 - jensenshannon(p_real, p_synthetic)
        similarities.append(similarity)
    return np.mean(similarities)
def kolmogorov_smirnov_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute cumulative distributions and Kolmogorov-Smirnov distance
        _, p_value = ks_2samp(col_real, col_synthetic)
        similarity = 1 - p_value
        similarities.append(similarity)
    return np.mean(similarities)
def propensity_mean_absolute_similarity(real_data, synthetic_data):
    # Train XGBoost classifier to discriminate between real and synthetic samples
    X = np.vstack([real_data, synthetic_data])
    y = np.concatenate([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    # Compute mean absolute error of classifier probabilities
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]
    error = mean_absolute_error(y_test, y_pred_proba)
    return 1 - error

In [100]:
def resemblance_measure(real_data, synthetic_data):
    resemblance_score = (
        column_similarity(real_data, synthetic_data) +
        correlation_similarity(real_data, synthetic_data) +
        jensen_shannon_similarity(real_data, synthetic_data) +
        kolmogorov_smirnov_similarity(real_data, synthetic_data) +
        propensity_mean_absolute_similarity(real_data, synthetic_data)
    ) / 5
    print("Resemblance Score:", resemblance_score)

In [101]:
resemblance_measure(x_syn[:768], x_real)

Resemblance Score: 0.8612875825328349


## Cardio Data

In [103]:
model_cardio, standardizer_cardio = train_encoder('Data/cardio_train.csv', load_diabetes_data, ";")

====> Epoch: 2 Average loss: 9.8872
====> Epoch: 4 Average loss: 9.6558
====> Epoch: 6 Average loss: 9.5732
====> Epoch: 8 Average loss: 9.4705
====> Epoch: 10 Average loss: 9.4279
====> Epoch: 12 Average loss: 9.3978
====> Epoch: 14 Average loss: 9.3661
====> Epoch: 16 Average loss: 9.3469
====> Epoch: 18 Average loss: 9.3409
====> Epoch: 20 Average loss: 9.3224
====> Epoch: 22 Average loss: 9.3005
====> Epoch: 24 Average loss: 9.2791
====> Epoch: 26 Average loss: 9.2800
====> Epoch: 28 Average loss: 9.2714
====> Epoch: 30 Average loss: 9.2637
====> Epoch: 32 Average loss: 9.2540
====> Epoch: 34 Average loss: 9.2464
====> Epoch: 36 Average loss: 9.2477
====> Epoch: 38 Average loss: 9.2233
====> Epoch: 40 Average loss: 9.2268
====> Epoch: 42 Average loss: 9.2185
====> Epoch: 44 Average loss: 9.2141
====> Epoch: 46 Average loss: 9.2151
====> Epoch: 48 Average loss: 9.2138
====> Epoch: 50 Average loss: 9.2058
====> Epoch: 52 Average loss: 9.1916
====> Epoch: 54 Average loss: 9.1927
====>

In [105]:
def generate_latent_for_cardio_ds(model):
    DATA_PATH = "Data/cardio_train.csv"
    df = load_diabetes_data(DATA_PATH, sep=";")
    actual_data = df[0]
    outcomes = df[2]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv('latent_data/cardio_latent.csv', index=False)

In [106]:
generate_latent_for_cardio_ds(model_cardio)

In [107]:
x_cardio_syn = reconstruction_of_latent(model_cardio, 'syn_latent/cardio_synthetic/X_num_unnorm.npy', 'syn_latent/cardio_synthetic/y_train.npy', standardizer_cardio)

In [111]:
x_cardio_syn.shape

(10000, 12)

In [110]:
x_cardio_real = load_diabetes_data("Data/cardio_train.csv", ";")
x_cardio_real = x_cardio_real[1].inverse_transform(x_cardio_real[0].cpu().detach().numpy())
x_cardio_real.shape

(70000, 12)

In [112]:
resemblance_measure(x_cardio_syn, x_cardio_real[:9999])

Resemblance Score: 0.7650878484389891


## Loan Medium Level data

In [114]:
model_bank, bank_standardizer = train_encoder("Data/bank.csv", load_bank_data, ",")

====> Epoch: 2 Average loss: 18.2724
====> Epoch: 4 Average loss: 14.4822
====> Epoch: 6 Average loss: 12.8783
====> Epoch: 8 Average loss: 12.2937
====> Epoch: 10 Average loss: 11.9030
====> Epoch: 12 Average loss: 11.7320
====> Epoch: 14 Average loss: 11.5080
====> Epoch: 16 Average loss: 11.3381
====> Epoch: 18 Average loss: 11.2089
====> Epoch: 20 Average loss: 11.1627
====> Epoch: 22 Average loss: 11.0722
====> Epoch: 24 Average loss: 11.0496
====> Epoch: 26 Average loss: 10.9655
====> Epoch: 28 Average loss: 10.9049
====> Epoch: 30 Average loss: 10.8614
====> Epoch: 32 Average loss: 10.7989
====> Epoch: 34 Average loss: 10.7828
====> Epoch: 36 Average loss: 10.7792
====> Epoch: 38 Average loss: 10.7873
====> Epoch: 40 Average loss: 10.6894
====> Epoch: 42 Average loss: 10.6745
====> Epoch: 44 Average loss: 10.6588
====> Epoch: 46 Average loss: 10.6109
====> Epoch: 48 Average loss: 10.6120
====> Epoch: 50 Average loss: 10.6297
====> Epoch: 52 Average loss: 10.6019
====> Epoch: 54 

In [120]:
def generate_latent_for_bank_ds(model):
    DATA_PATH = "Data/bank.csv"
    df = load_diabetes_data(DATA_PATH, sep=",")
    actual_data = df[0]
    outcomes = df[2]

    latents = []
    model.eval()
    with torch.no_grad():
        for idx, e in enumerate(actual_data):
            sample = e.unsqueeze(0)  # Add batch dimension
            latent = model.embed(sample)  # Get the latent representation
            latents.append(latent.squeeze().cpu().numpy())

    latents_df = pd.DataFrame(latents)
    outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    data_with_outcomes.to_csv('latent_data/bank_latent.csv', index=False)

In [121]:
generate_latent_for_bank_ds(model_bank)

In [122]:
x_bank_syn = reconstruction_of_latent(model_bank, 'syn_latent/bank_synth/X_num_unnorm.npy', 'syn_latent/bank_synth/y_train.npy', bank_standardizer)

In [124]:
x_bank_real = load_diabetes_data("Data/bank.csv", ",")
x_bank_real = x_bank_real[1].inverse_transform(x_bank_real[0].cpu().detach().numpy())
x_bank_real.shape

(5000, 13)

In [125]:
x_bank_syn.shape

(10000, 13)

In [126]:
resemblance_measure(x_bank_syn[:4999], x_bank_real)

Resemblance Score: 0.8743903071420271
