In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)
data = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]

In [3]:
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [4]:
#data = pd.read_csv('train.csv', usecols=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'])

# Preprocess the dataset
data['Age'].fillna(data['Age'].median(), inplace=True)
data['Fare'].fillna(data['Fare'].median(), inplace=True)
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Convert data to PyTorch tensor
data_tensor = torch.tensor(data_scaled, dtype=torch.float32)

# Define the generator model
class Generator(nn.Module):
    def __init__(self, noise_dim, output_dim):
        super(Generator, self).__init__()
        self.noise_dim = noise_dim
        self.output_dim = output_dim

        self.fc1 = nn.Linear(noise_dim, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, output_dim)

    def forward(self, noise):
        x = torch.relu(self.fc1(noise))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        return x

# Define the discriminator model
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.input_dim = input_dim

        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, data):
        x = torch.relu(self.fc1(data))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

# Define hyperparameters
noise_dim = 32
output_dim = data_tensor.shape[1]
batch_size = 32
num_epochs = 10000
lr = 0.0002

# Initialize generator and discriminator
generator = Generator(noise_dim, output_dim)
discriminator = Discriminator(output_dim)

# Define loss function and optimizers
criterion = nn.BCELoss()
optimizer_g = optim.Adam(generator.parameters(), lr=lr)
optimizer_d = optim.Adam(discriminator.parameters(), lr=lr)

# Training loop
for epoch in range(num_epochs):
    for _ in range(data_tensor.size(0) // batch_size):
        # Train discriminator with real data
        real_batch = data_tensor[torch.randint(data_tensor.size(0), size=(batch_size,))]
        real_labels = torch.ones((batch_size, 1))
        real_output = discriminator(real_batch)
        loss_d_real = criterion(real_output, real_labels)

        # Train discriminator with fake data
        noise = torch.randn(batch_size, noise_dim)
        fake_batch = generator(noise)
        fake_labels = torch.zeros((batch_size, 1))
        fake_output = discriminator(fake_batch.detach())
        loss_d_fake = criterion(fake_output, fake_labels)

        # Update discriminator
        loss_d = loss_d_real + loss_d_fake
        optimizer_d.zero_grad()
        loss_d.backward()
        optimizer_d.step()

        # Train generator
        noise = torch.randn(batch_size, noise_dim)
        fake_batch = generator(noise)
        fake_labels = torch.ones((batch_size, 1))
        fake_output = discriminator(fake_batch)
        loss_g = criterion(fake_output, fake_labels)

        # Update generator
        optimizer_g.zero_grad()
        loss_g.backward()
        optimizer_g.step()

    if epoch % 100 == 0:
        print(f'Epoch [{epoch}/{num_epochs}] Loss D: {loss_d:.4f} Loss G: {loss_g:.4f}')



Epoch [0/10000] Loss D: 1.1569 Loss G: 0.7591
Epoch [100/10000] Loss D: 0.1910 Loss G: 2.5233
Epoch [200/10000] Loss D: 0.0454 Loss G: 3.7160
Epoch [300/10000] Loss D: 0.0016 Loss G: 6.6413
Epoch [400/10000] Loss D: 0.0003 Loss G: 8.4165
Epoch [500/10000] Loss D: 0.0001 Loss G: 10.0697
Epoch [600/10000] Loss D: 0.0000 Loss G: 10.8578
Epoch [700/10000] Loss D: 0.0000 Loss G: 12.6682
Epoch [800/10000] Loss D: 0.0000 Loss G: 14.1595
Epoch [900/10000] Loss D: 0.0000 Loss G: 15.8086
Epoch [1000/10000] Loss D: 0.0000 Loss G: 17.2173
Epoch [1100/10000] Loss D: 0.0000 Loss G: 18.5555
Epoch [1200/10000] Loss D: 0.0000 Loss G: 20.3449
Epoch [1300/10000] Loss D: 0.0000 Loss G: 21.4369
Epoch [1400/10000] Loss D: 0.0000 Loss G: 22.4632
Epoch [1500/10000] Loss D: 0.0000 Loss G: 23.3166
Epoch [1600/10000] Loss D: 0.0000 Loss G: 23.7917
Epoch [1700/10000] Loss D: 0.0000 Loss G: 24.3361
Epoch [1800/10000] Loss D: 0.0000 Loss G: 24.4022
Epoch [1900/10000] Loss D: 0.0000 Loss G: 24.7853
Epoch [2000/10000

In [5]:
# Generate synthetic samples
num_samples = 1000
noise = torch.randn(num_samples, noise_dim)
synthetic_data = generator(noise).detach().numpy()

# Convert synthetic data to a pandas DataFrame
synthetic_df = pd.DataFrame(synthetic_data, columns=data.columns)

# Print a few rows of the synthetic data
print(synthetic_df.head())

# Save synthetic data to a CSV file
synthetic_df.to_csv('synthetic_titanic_data.csv', index=False)

     Pclass  Sex  Age     SibSp     Parch      Fare
0  0.738130  1.0  1.0 -0.438775  1.000000 -0.341638
1  0.738596  1.0  1.0 -0.438155  1.000000 -0.338861
2  0.878274  1.0  1.0 -0.235755 -0.501316 -0.474498
3  0.878457  1.0  1.0 -0.236553 -0.501417 -0.474997
4  0.742956  1.0  1.0 -0.443310  1.000000 -0.342363


In [6]:
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.25
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.925
3,1,1,35.0,1,0,53.1
4,3,0,35.0,0,0,8.05


In [8]:
# Import necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# Define your Generator and Discriminator classes here

# Initialize the models, optimizers, and loss functions

# Define a function to generate synthetic data
def generate_synthetic_data(generator, num_samples):
    noise = torch.randn(num_samples, noise_dim)
    generated_data = generator(noise).detach().cpu().numpy()

    # Convert generated continuous data to discrete values
    generated_data = np.round(generated_data)  # Round to nearest integer

    # Apply inverse transformations to get back original scales
    generated_data = scaler.inverse_transform(generated_data)

    # Create a DataFrame with appropriate column names
    columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']  # Adjust based on your columns
    synthetic_df = pd.DataFrame(generated_data, columns=columns)

    return synthetic_df

# Generate synthetic data
num_samples = 1000  # Number of synthetic samples to generate
synthetic_data = generate_synthetic_data(generator, num_samples)

# Display the generated synthetic data
print(synthetic_data.head())


     Pclass       Sex       Age     SibSp     Parch       Fare
0  3.144244  0.830135  42.37397  0.523008 -0.424011  32.204208
1  3.144244  0.830135  42.37397  0.523008  1.187199  32.204208
2  3.144244  0.830135  42.37397  0.523008  1.187199  32.204208
3  3.144244  0.830135  42.37397  0.523008  1.187199  32.204208
4  3.144244  0.830135  42.37397  0.523008  1.187199  32.204208


In [9]:
data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,22.0,1,0,7.2500
1,1,1,38.0,1,0,71.2833
2,3,1,26.0,0,0,7.9250
3,1,1,35.0,1,0,53.1000
4,3,0,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,2,0,27.0,0,0,13.0000
887,1,1,19.0,0,0,30.0000
888,3,1,28.0,1,2,23.4500
889,1,0,26.0,0,0,30.0000
