In [21]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# dataset
df = pd.read_csv('WEB_TARGET.csv')

In [22]:
df.columns

Index(['Age', 'Sex', 'Ruptured', 'Circulation', 'Location', 'Ignore',
       'Max diameter in any dimension (mm)', 'Height (mm)', 'Width (mm)',
       'Other diameter', 'Neck (mm) orthogonal view 1',
       'Neck (mm) orthogonal view 2', 'Avg neck vol', 'Aneurysm 3D Volume',
       'WEB_1_Name', 'SL/Elongated', 'WEB_1_width', 'WEB_1_Height',
       'WEB Volume', 'WEB Device #2 name', 'WEB 2 Volume', 'Final WEB Volume',
       'Comments on why a second device needed',
       'Complete WEB failure/abandonment', 'Re-sizing required',
       'Stenting used to support WEB?', 'Composite outcome',
       'Procedure related ischemic stroke?', 'Intraop rupture', '3_mo_occ',
       '6_mo_occ', '1_year_occ'],
      dtype='object')

In [23]:

# Alternatively, calculate percentages
missing_percent = df.isna().mean() * 100
print("Percentage of Missing Values in Each Feature:")
print(missing_percent)

Percentage of Missing Values in Each Feature:
Age                                        0.000000
Sex                                        0.000000
Ruptured                                   0.000000
Circulation                                0.000000
Location                                   0.000000
Ignore                                    70.512821
Max diameter in any dimension (mm)         0.000000
Height (mm)                                0.000000
Width (mm)                                 0.000000
Other diameter                            55.128205
Neck (mm) orthogonal view 1                0.000000
Neck (mm) orthogonal view 2                0.000000
Avg neck vol                               0.000000
Aneurysm 3D Volume                         0.000000
WEB_1_Name                                 0.000000
SL/Elongated                               0.000000
WEB_1_width                                0.000000
WEB_1_Height                               0.000000
WEB Volume        

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler

# Features and data preparation
features = ['Age', 'Sex', 'Ruptured', 'Circulation',
            'Max diameter in any dimension (mm)', 'Height (mm)',
            'Width (mm)', 'Neck (mm) orthogonal view 1',
            'Neck (mm) orthogonal view 2', 'Avg neck vol', 'Aneurysm 3D Volume',
            'SL/Elongated', 'Final WEB Volume',
            'Complete WEB failure/abandonment', 'Re-sizing required',
            'Stenting used to support WEB?', 'Composite outcome',
            'Procedure related ischemic stroke?', 'Intraop rupture',
            '6_mo_occ']

data = df[features].copy()

ordinal_features = ['6_mo_occ']
binary_features = ['Sex', 'Ruptured', 'Circulation', 'SL/Elongated','Composite outcome',
                   'Complete WEB failure/abandonment', 'Re-sizing required',
                   'Stenting used to support WEB?', 'Intraop rupture',
                   'Procedure related ischemic stroke?']
continuous_features = [col for col in features if col not in binary_features + ordinal_features]

for col in ordinal_features:
    data[col].fillna(data[col].mode()[0], inplace=True)
data.fillna(data.mean(numeric_only=True), inplace=True)

scaler_continuous = MinMaxScaler()
scaler_ordinal = MinMaxScaler()
data[continuous_features] = scaler_continuous.fit_transform(data[continuous_features])
data[ordinal_features] = scaler_ordinal.fit_transform(data[ordinal_features])

X = data.values

# GAN parameters
latent_dim = 16
input_dim = X.shape[1]
binary_dim = len(binary_features)
continuous_dim = len(continuous_features)
ordinal_dim = len(ordinal_features)
epochs = 5000
batch_size = 16
learning_rate = 0.0002

# Conditional Generator
class ConditionalGenerator(nn.Module):
    def __init__(self, latent_dim, binary_dim, continuous_dim, ordinal_dim, initial_temperature=0.2):
        super(ConditionalGenerator, self).__init__()
        self.latent_dim = latent_dim
        self.binary_dim = binary_dim
        self.continuous_dim = continuous_dim
        self.ordinal_dim = ordinal_dim
        self.temperature = initial_temperature
        self.model = nn.Sequential(
            nn.Linear(latent_dim + binary_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, binary_dim + continuous_dim + ordinal_dim)
        )

    def forward(self, z, conditions):
        input = torch.cat([z, conditions], dim=1)
        logits = self.model(input)
        binary_logits = logits[:, :self.binary_dim]
        continuous_logits = logits[:, self.binary_dim:self.binary_dim + self.continuous_dim]
        ordinal_logits = logits[:, self.binary_dim + self.continuous_dim:]
        binary_output = torch.sigmoid(binary_logits / self.temperature)
        continuous_output = torch.sigmoid(continuous_logits)
        ordinal_output = torch.sigmoid(ordinal_logits)
        return torch.cat([binary_output, continuous_output, ordinal_output], dim=1)

    def update_temperature(self, new_temperature):
        self.temperature = new_temperature

# Conditional Discriminator
class ConditionalDiscriminator(nn.Module):
    def __init__(self, input_dim, binary_dim):
        super(ConditionalDiscriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim + binary_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1)
        )

    def forward(self, x, conditions):
        input = torch.cat([x, conditions], dim=1)
        return self.model(input)

# Hinge Loss
def hinge_loss_discriminator(real_scores, fake_scores):
    return torch.mean(torch.relu(1.0 - real_scores)) + torch.mean(torch.relu(1.0 + fake_scores))

def hinge_loss_generator(fake_scores):
    return -torch.mean(fake_scores)

# Gradient Penalty
def compute_gradient_penalty(discriminator, real_data, fake_data, conditions):
    alpha = torch.rand(real_data.size(0), 1).expand_as(real_data)
    interpolates = alpha * real_data + (1 - alpha) * fake_data
    interpolates = interpolates.requires_grad_(True)
    scores = discriminator(interpolates, conditions)
    gradients = torch.autograd.grad(
        outputs=scores,
        inputs=interpolates,
        grad_outputs=torch.ones_like(scores),
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )[0]
    gradient_norm = gradients.norm(2, dim=1)
    return ((gradient_norm - 1) ** 2).mean()

# Initialize generator and discriminator
generator = ConditionalGenerator(latent_dim, binary_dim, continuous_dim, ordinal_dim)
discriminator = ConditionalDiscriminator(input_dim, binary_dim)

# Optimizers
optimizer_g = optim.Adam(generator.parameters(), lr=learning_rate)
optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Binary conditions
binary_conditions = torch.tensor(X[:, :binary_dim], dtype=torch.float32)

# Training
initial_temperature = 0.2
final_temperature = 0.05
temperature_decay = (initial_temperature - final_temperature) / epochs

for epoch in range(epochs):
    current_temperature = max(final_temperature, initial_temperature - epoch * temperature_decay)
    generator.update_temperature(current_temperature)

    for _ in range(X.shape[0] // batch_size):
        real_idx = np.random.choice(X.shape[0], batch_size)
        real_data = torch.tensor(X[real_idx], dtype=torch.float32)
        real_conditions = binary_conditions[real_idx]

        noise = torch.randn(batch_size, latent_dim)
        fake_data = generator(noise, real_conditions)
        real_scores = discriminator(real_data, real_conditions)
        fake_scores = discriminator(fake_data.detach(), real_conditions)

        gp = compute_gradient_penalty(discriminator, real_data, fake_data, real_conditions)

        optimizer_d.zero_grad()
        loss_d = hinge_loss_discriminator(real_scores, fake_scores) + 10 * gp
        loss_d.backward()
        optimizer_d.step()

        noise = torch.randn(batch_size, latent_dim)
        fake_data = generator(noise, real_conditions)
        fake_scores = discriminator(fake_data, real_conditions)

        optimizer_g.zero_grad()
        loss_g = hinge_loss_generator(fake_scores)
        loss_g.backward()
        optimizer_g.step()

    if epoch % 500 == 0:
        print(f"Epoch {epoch}/{epochs} | Loss D: {loss_d.item():.4f} | Loss G: {loss_g.item():.4f} | Temp: {current_temperature:.4f}")


Epoch 0/5000 | Loss D: 9.6863 | Loss G: -0.0875 | Temp: 0.2000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


Epoch 500/5000 | Loss D: 1.1535 | Loss G: 0.3860 | Temp: 0.1850
Epoch 1000/5000 | Loss D: 1.7433 | Loss G: 0.0661 | Temp: 0.1700
Epoch 1500/5000 | Loss D: 1.7610 | Loss G: 0.0638 | Temp: 0.1550
Epoch 2000/5000 | Loss D: 1.6162 | Loss G: 0.1223 | Temp: 0.1400
Epoch 2500/5000 | Loss D: 1.6121 | Loss G: 0.1030 | Temp: 0.1250
Epoch 3000/5000 | Loss D: 1.8313 | Loss G: 0.0893 | Temp: 0.1100
Epoch 3500/5000 | Loss D: 1.7818 | Loss G: 0.0907 | Temp: 0.0950
Epoch 4000/5000 | Loss D: 1.8297 | Loss G: 0.1148 | Temp: 0.0800
Epoch 4500/5000 | Loss D: 1.4840 | Loss G: 0.2876 | Temp: 0.0650


In [29]:

# Generate synthetic data
num_samples = 1000
noise = torch.randn(num_samples, latent_dim)

if num_samples > len(binary_conditions):
    conditions = binary_conditions.repeat((num_samples // len(binary_conditions)) + 1, 1)[:num_samples]
else:
    conditions = binary_conditions[:num_samples]

synthetic_data = generator(noise, conditions).detach().numpy()

# Post-processing
synthetic_df = pd.DataFrame(synthetic_data, columns=features)
synthetic_df[continuous_features] = scaler_continuous.inverse_transform(synthetic_df[continuous_features])
synthetic_df[ordinal_features] = scaler_ordinal.inverse_transform(synthetic_df[ordinal_features])

for col in ordinal_features:
    synthetic_df[col] = synthetic_df[col].round().astype(int)

synthetic_df[binary_features] = synthetic_df[binary_features].round().astype(int)

print(synthetic_df.head())  

         Age  Sex  Ruptured  Circulation  Max diameter in any dimension (mm)  \
0  47.396488    1         1            1                            6.670673   
1  33.364552    1         1            1                            8.396546   
2  57.797279    0         0            1                           10.937903   
3  36.825123    0         1            1                            5.799216   
4  40.772968    0         0            1                            4.324586   

   Height (mm)  Width (mm)  Neck (mm) orthogonal view 1  \
0     6.401578    5.721988                     3.173919   
1     8.676097    7.632496                     6.792737   
2    10.773809    7.744838                     2.603350   
3     5.460171    5.722278                     2.411260   
4     4.232851    3.915308                     3.453340   

   Neck (mm) orthogonal view 2  Avg neck vol  Aneurysm 3D Volume  \
0                     2.732397      2.767143          125.095436   
1                     3.9578

In [30]:
import pandas as pd
from scipy.stats import ks_2samp, wasserstein_distance
from scipy.spatial.distance import jensenshannon
import numpy as np

def ks_test(original, synthetic):
    try:
        ks_stat, p_value = ks_2samp(original, synthetic)
        return {"KS Statistic": ks_stat, "p-value": p_value}
    except Exception as e:
        return {"KS Statistic": None, "p-value": None, "Error": str(e)}

def js_divergence(original, synthetic, bins=30):
    try:
        original_hist, _ = np.histogram(original, bins=bins, density=True)
        synthetic_hist, _ = np.histogram(synthetic, bins=bins, density=True)
        jsd = jensenshannon(original_hist, synthetic_hist)
        return jsd
    except Exception as e:
        return None

def emd(original, synthetic):
    try:
        emd_value = wasserstein_distance(original, synthetic)
        return emd_value
    except Exception as e:
        return None

# Kolmogorov-Smirnov Test
ks_results = {feature: ks_test(df[feature].dropna(), synthetic_df[feature].dropna()) for feature in continuous_features}

# Jensen-Shannon Divergence
jsd_results = {feature: js_divergence(df[feature].dropna(), synthetic_df[feature].dropna()) for feature in continuous_features}

# Earth Mover's Distance (Wasserstein Distance)
emd_results = {feature: emd(df[feature].dropna(), synthetic_df[feature].dropna()) for feature in continuous_features}

# Binary features comparison
binary_comparison = {feature: {"Original Proportion": df[feature].mean(skipna=True), "Synthetic Proportion": synthetic_df[feature].mean(skipna=True)} for feature in binary_features}

# Convert results to DataFrames for easier analysis
ks_results_df = pd.DataFrame(ks_results).T
jsd_results_df = pd.DataFrame(jsd_results, index=["Jensen-Shannon Divergence"]).T
emd_results_df = pd.DataFrame(emd_results, index=["Earth Mover's Distance"]).T
binary_comparison_df = pd.DataFrame(binary_comparison).T

print("Kolmogorov-Smirnov Test Results for Continuous Features:")
print(ks_results_df)

print("\nJensen-Shannon Divergence for Continuous Features:")
print(jsd_results_df)

print("\nEarth Mover's Distance for Continuous Features:")
print(emd_results_df)

print("\nComparison of Proportions for Binary Features:")
print(binary_comparison_df)


Kolmogorov-Smirnov Test Results for Continuous Features:
                                    KS Statistic   p-value
Age                                     0.054795  0.972620
Max diameter in any dimension (mm)      0.051231  0.985854
Height (mm)                             0.044462  0.997486
Width (mm)                              0.121641  0.215245
Neck (mm) orthogonal view 1             0.052436  0.982032
Neck (mm) orthogonal view 2             0.129564  0.160437
Avg neck vol                            0.102744  0.400803
Aneurysm 3D Volume                      0.089128  0.580284
Final WEB Volume                        0.099949  0.435020

Jensen-Shannon Divergence for Continuous Features:
                                    Jensen-Shannon Divergence
Age                                                  0.286426
Max diameter in any dimension (mm)                   0.199746
Height (mm)                                          0.194778
Width (mm)                                           

In [31]:
ks_results_ordinal = {}
jsd_results_ordinal = {}
freq_comparison_ordinal = {}

for feature in ordinal_features:
    try:
        # Drop missing values for both real and synthetic datasets
        original = df[feature].dropna()
        synthetic = synthetic_df[feature].dropna()

        # KS Test
        ks_stat, p_value = ks_2samp(original, synthetic)
        ks_results_ordinal[feature] = {"KS Statistic": ks_stat, "p-value": p_value}

        # Jensen-Shannon Divergence
        num_bins = max(len(original.unique()), len(synthetic.unique()))
        original_hist, _ = np.histogram(original, bins=num_bins, range=(original.min(), original.max()), density=True)
        synthetic_hist, _ = np.histogram(synthetic, bins=num_bins, range=(original.min(), original.max()), density=True)
        jsd = jensenshannon(original_hist, synthetic_hist)
        jsd_results_ordinal[feature] = jsd

        # Frequency Comparison
        original_freq = original.value_counts(normalize=True).sort_index()
        synthetic_freq = synthetic.value_counts(normalize=True).sort_index()
        freq_comparison = pd.DataFrame({'Original': original_freq, 'Synthetic': synthetic_freq}).fillna(0)
        freq_comparison_ordinal[feature] = freq_comparison

    except Exception as e:
        ks_results_ordinal[feature] = {"KS Statistic": None, "p-value": None, "Error": str(e)}
        jsd_results_ordinal[feature] = None
        freq_comparison_ordinal[feature] = None

ks_results_ordinal_df = pd.DataFrame(ks_results_ordinal).T
jsd_results_ordinal_df = pd.DataFrame(jsd_results_ordinal, index=["Jensen-Shannon Divergence"]).T

print("Kolmogorov-Smirnov Test Results for Ordinal Features:")
print(ks_results_ordinal_df)

print("\nJensen-Shannon Divergence for Ordinal Features:")
print(jsd_results_ordinal_df)

print("\nFrequency Comparison for Ordinal Features:")
for feature, comparison in freq_comparison_ordinal.items():
    if comparison is not None:
        print(f"\n{feature} Frequency Comparison:")
        print(comparison)
    else:
        print(f"\n{feature} Frequency Comparison: None")


Kolmogorov-Smirnov Test Results for Ordinal Features:
          KS Statistic   p-value
6_mo_occ      0.054051  0.975898

Jensen-Shannon Divergence for Ordinal Features:
          Jensen-Shannon Divergence
6_mo_occ                   0.106894

Frequency Comparison for Ordinal Features:

6_mo_occ Frequency Comparison:
          Original  Synthetic
6_mo_occ                     
0         0.538462      0.494
1         0.102564      0.130
2         0.076923      0.148
3         0.256410      0.183
4         0.025641      0.045


In [32]:
synthetic_df.to_csv('synthetic_data.csv', index=False)