In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

In [2]:
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preprocessing
data = data.drop(columns=['Name', 'Ticket', 'Cabin'])
data = data.dropna()
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])
data['Pclass'] = data['Pclass'].astype('category')
data = pd.get_dummies(data, columns=['Pclass'])

In [4]:
data.head(5)

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Embarked,Pclass_1,Pclass_2,Pclass_3
0,1,0,1,22.0,1,0,7.25,2,0,0,1
1,2,1,0,38.0,1,0,71.2833,0,1,0,0
2,3,1,0,26.0,0,0,7.925,2,0,0,1
3,4,1,0,35.0,1,0,53.1,2,1,0,0
4,5,0,1,35.0,0,0,8.05,2,0,0,1


In [5]:
# Split data into features and target
X = data.drop(columns='Survived')
y = data['Survived']

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

# Define VAE architecture
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim * 2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()
        )

    def encode(self, x):
        mean_logvar = self.encoder(x)
        mean = mean_logvar[:, :latent_dim]
        logvar = mean_logvar[:, latent_dim:]
        return mean, logvar

    def reparameterize(self, mean, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        z = mean + eps * std
        return z

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mean, logvar = self.encode(x)
        z = self.reparameterize(mean, logvar)
        recon_x = self.decode(z)
        return recon_x, mean, logvar

# Initialize VAE model
input_dim = X_scaled.shape[1]
latent_dim = 10
vae = VAE(input_dim, latent_dim)

# Define loss function
def loss_function(recon_x, x, mean, logvar):
    recon_loss = nn.BCELoss()(recon_x, x)
    kl_loss = -0.5 * torch.sum(1 + logvar - mean.pow(2) - logvar.exp())
    return recon_loss + kl_loss

# Define optimizer
optimizer = optim.Adam(vae.parameters(), lr=0.001)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Create DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Training loop
num_epochs = 100
vae.train()
for epoch in range(num_epochs):
    for batch_idx, (data_batch, _) in enumerate(train_loader):
        optimizer.zero_grad()
        recon_batch, mean, logvar = vae(data_batch)
        loss = loss_function(recon_batch, data_batch, mean, logvar)
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")



Epoch [1/100], Loss: 1.7320
Epoch [2/100], Loss: 0.9102
Epoch [3/100], Loss: 0.5158
Epoch [4/100], Loss: 0.3959
Epoch [5/100], Loss: 0.1548
Epoch [6/100], Loss: -0.0612
Epoch [7/100], Loss: 0.0843
Epoch [8/100], Loss: 0.0423
Epoch [9/100], Loss: 0.0952
Epoch [10/100], Loss: -0.0006
Epoch [11/100], Loss: -0.1765
Epoch [12/100], Loss: 0.4057
Epoch [13/100], Loss: -0.4080
Epoch [14/100], Loss: 0.3259
Epoch [15/100], Loss: -0.3624
Epoch [16/100], Loss: 0.0864
Epoch [17/100], Loss: -0.4319
Epoch [18/100], Loss: -0.0015
Epoch [19/100], Loss: 0.5906
Epoch [20/100], Loss: -0.1686
Epoch [21/100], Loss: -0.5015
Epoch [22/100], Loss: -0.3854
Epoch [23/100], Loss: -0.1229
Epoch [24/100], Loss: -0.2852
Epoch [25/100], Loss: -1.0443
Epoch [26/100], Loss: 0.0722
Epoch [27/100], Loss: -0.3509
Epoch [28/100], Loss: -0.3275
Epoch [29/100], Loss: -0.3905
Epoch [30/100], Loss: -0.4142
Epoch [31/100], Loss: -0.3238
Epoch [32/100], Loss: -1.2675
Epoch [33/100], Loss: 0.8948
Epoch [34/100], Loss: -0.3285
Epo

In [6]:
# Generate synthetic samples
vae.eval()
num_samples = 10
with torch.no_grad():
    latent_samples = torch.randn(num_samples, latent_dim)
    generated_samples = vae.decode(latent_samples)

# Denormalize generated samples
generated_samples = scaler.inverse_transform(generated_samples)

# Print generated samples
generated_df = pd.DataFrame(generated_samples, columns=X.columns)
print(generated_df)


   PassengerId       Sex        Age     SibSp     Parch       Fare  Embarked  \
0   459.198878  0.636236  29.642093  0.514045  0.432584  34.567251  1.636096   
1   462.617783  0.636236  29.642093  0.514045  0.432584  34.567251  1.635904   
2   459.009791  0.636236  29.642093  0.514045  0.432584  34.567251  1.636230   
3   458.483907  0.636236  29.642093  0.514045  0.432584  34.567251  1.611878   
4   468.815559  0.636236  29.642093  0.514045  0.432584  34.567251  1.635299   
5   465.978866  0.636236  29.642093  0.514045  0.432584  34.567251  1.637891   
6   464.253707  0.636236  29.642093  0.514045  0.432584  34.567251  1.653764   
7   455.762989  0.636236  29.642093  0.514045  0.432584  34.567251  1.645715   
8   454.439645  0.636236  29.642093  0.514045  0.432584  34.567251  1.623327   
9   459.368522  0.636236  29.642093  0.514045  0.432584  34.567251  1.636562   

   Pclass_1  Pclass_2  Pclass_3  
0  0.258427  0.247796  0.498596  
1  0.258427  0.256086  0.498596  
2  0.258427  0.24