In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.display import display

we need to split the input and output into numpy arrays, in order to apply the classifiers in scikit learn.

In [21]:
# Load the breast_cancer dataset
data = load_breast_cancer()
# Print number of records
print(f'Number of records: {len(data.data)}')
# display(data)
X = data.data
# display(X)
y = data.target

Number of records: 569


In [24]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'Number of records X_train : {len(X_train)}')
print(f'Number of records X_test: {len(X_test)}')
print(f'Number of records y_train: {len(y_train)}')
print(f'Number of records: {len(y_test)}')
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Number of records X_train : 455
Number of records X_test: 114
Number of records y_train: 455
Number of records: 114


In [25]:
# Convert to PyTorch tensors
import torch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

print(f'Number of records X_train_tensor : {len(X_train_tensor)}')
print(f'Number of records y_train_tensor: {len(y_train_tensor)}')
print(f'Number of records X_test_tensor: {len(X_test_tensor)}')
print(f'Number of records y_test_tensor: {len(y_test_tensor)}')

Number of records X_train_tensor : 455
Number of records y_train_tensor: 455
Number of records X_test_tensor: 114
Number of records y_test_tensor: 114


In [5]:
import torch.nn as nn
import torch.optim as optim

# Define the generator network
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.Linear(256, 512),
            nn.ReLU(True),
            nn.Linear(512, output_dim),
            nn.Tanh()
        )

    def forward(self, x):
        return self.main(x)

# Define the discriminator network
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)



In [29]:
# Initialize the networks
input_dim = X_train.shape[1]
generator = Generator(input_dim=100, output_dim=input_dim)
discriminator = Discriminator(input_dim=input_dim)
# Generate random noise
noise = torch.randn(1, 100)  # Batch size of 1, noise dimension of 100

# Generate data using the generator
generated_data = generator(noise)

# Pass the generated data through the discriminator
discriminator_output = discriminator(generated_data)

# Print the generated data and the discriminator's output
print("Generated Data:")
print(len(generated_data))

print("\nDiscriminator Output:")
print(len(discriminator_output))
# Loss function and optimizers
criterion = nn.BCELoss()
optimizer_g = optim.Adam(generator.parameters(), lr=0.0002)
optimizer_d = optim.Adam(discriminator.parameters(), lr=0.0002)

Generated Data:
1

Discriminator Output:
1


In [7]:
# Training loop
num_epochs = 100
batch_size = 64
fixed_noise = torch.randn(batch_size, 100)

for epoch in range(num_epochs):
    for i in range(0, X_train_tensor.size(0), batch_size):
        real_data = X_train_tensor[i:i+batch_size]
        batch_size = real_data.size(0)
        
        # Train discriminator
        real_labels = torch.ones(batch_size, 1)
        fake_labels = torch.zeros(batch_size, 1)
        
        outputs = discriminator(real_data)
        d_loss_real = criterion(outputs, real_labels)
        
        noise = torch.randn(batch_size, 100)
        fake_data = generator(noise)
        outputs = discriminator(fake_data.detach())
        d_loss_fake = criterion(outputs, fake_labels)
        
        d_loss = d_loss_real + d_loss_fake
        optimizer_d.zero_grad()
        d_loss.backward()
        optimizer_d.step()
        
        # Train generator
        noise = torch.randn(batch_size, 100)
        fake_data = generator(noise)
        outputs = discriminator(fake_data)
        g_loss = criterion(outputs, real_labels)
        
        optimizer_g.zero_grad()
        g_loss.backward()
        optimizer_g.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}')

Epoch [1/100], d_loss: 1.2788, g_loss: 0.6496
Epoch [2/100], d_loss: 0.7212, g_loss: 1.0268
Epoch [3/100], d_loss: 0.6144, g_loss: 2.3056
Epoch [4/100], d_loss: 0.4650, g_loss: 2.9835
Epoch [5/100], d_loss: 0.5818, g_loss: 3.4171
Epoch [6/100], d_loss: 0.0618, g_loss: 3.6727
Epoch [7/100], d_loss: 0.0289, g_loss: 4.5302
Epoch [8/100], d_loss: 0.2477, g_loss: 3.3322
Epoch [9/100], d_loss: 0.0740, g_loss: 4.5771
Epoch [10/100], d_loss: 0.0077, g_loss: 5.1036
Epoch [11/100], d_loss: 0.0310, g_loss: 5.7531
Epoch [12/100], d_loss: 0.1711, g_loss: 3.7552
Epoch [13/100], d_loss: 0.0569, g_loss: 4.0618
Epoch [14/100], d_loss: 0.2798, g_loss: 5.2390
Epoch [15/100], d_loss: 0.0948, g_loss: 4.1970
Epoch [16/100], d_loss: 0.0102, g_loss: 5.4206
Epoch [17/100], d_loss: 0.0041, g_loss: 6.3764
Epoch [18/100], d_loss: 0.0419, g_loss: 4.9792
Epoch [19/100], d_loss: 0.0774, g_loss: 4.1437
Epoch [20/100], d_loss: 0.1650, g_loss: 3.7880
Epoch [21/100], d_loss: 0.0157, g_loss: 4.7949
Epoch [22/100], d_loss

In [30]:
# Generate synthetic data
def generate_synthetic_data(generator, num_samples):
    noise = torch.randn(num_samples, 100)
    synthetic_data = generator(noise)
    return synthetic_data

# Generate 1000 synthetic samples
num_synthetic_samples = 1000
synthetic_data = generate_synthetic_data(generator, num_synthetic_samples)
print(f'Synthetic data shape: {synthetic_data.size()}')
# Combine synthetic data with real data
combined_X_train = torch.cat((X_train_tensor, synthetic_data), 0)
combined_y_train = torch.cat((y_train_tensor, torch.ones(num_synthetic_samples, 1)), 0)

# Print the number of total records
print(f'Total number of records: {combined_X_train.size(0)}')
print(f'Total number of records: {combined_y_train.size(0)}')


Synthetic data shape: torch.Size([1000, 30])
Total number of records: 1455
Total number of records: 1455


In [9]:
# Define a simple classifier
class SimpleClassifier(nn.Module):
    def __init__(self, input_dim):
        super(SimpleClassifier, self).__init__()
        self.main = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.main(x)

classifier = SimpleClassifier(input_dim=input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

# Training loop for the classifier
num_epochs = 50
for epoch in range(num_epochs):
    for i in range(0, combined_X_train.size(0), batch_size):
        inputs = combined_X_train[i:i+batch_size]
        labels = combined_y_train[i:i+batch_size]
        
        outputs = classifier(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/50], Loss: 0.0006
Epoch [2/50], Loss: 0.0006
Epoch [3/50], Loss: 0.0005
Epoch [4/50], Loss: 0.0004
Epoch [5/50], Loss: 0.0003
Epoch [6/50], Loss: 0.0003
Epoch [7/50], Loss: 0.0002
Epoch [8/50], Loss: 0.0002
Epoch [9/50], Loss: 0.0002
Epoch [10/50], Loss: 0.0001
Epoch [11/50], Loss: 0.0001
Epoch [12/50], Loss: 0.0000
Epoch [13/50], Loss: 0.0000
Epoch [14/50], Loss: 0.0000
Epoch [15/50], Loss: 0.0000
Epoch [16/50], Loss: 0.0000
Epoch [17/50], Loss: 0.0000
Epoch [18/50], Loss: 0.0000
Epoch [19/50], Loss: 0.0000
Epoch [20/50], Loss: 0.0000
Epoch [21/50], Loss: 0.0000
Epoch [22/50], Loss: 0.0000
Epoch [23/50], Loss: 0.0000
Epoch [24/50], Loss: 0.0000
Epoch [25/50], Loss: 0.0000
Epoch [26/50], Loss: 0.0000
Epoch [27/50], Loss: 0.0000
Epoch [28/50], Loss: 0.0000
Epoch [29/50], Loss: 0.0000
Epoch [30/50], Loss: 0.0000
Epoch [31/50], Loss: 0.0000
Epoch [32/50], Loss: 0.0000
Epoch [33/50], Loss: 0.0000
Epoch [34/50], Loss: 0.0000
Epoch [35/50], Loss: 0.0000
Epoch [36/50], Loss: 0.0000
E

Testing models

In [10]:
# Evaluate the classifier
classifier.eval()
with torch.no_grad():
    outputs = classifier(X_test_tensor)
    predicted = (outputs > 0.5).float()
    accuracy = (predicted == y_test_tensor).float().mean()
    print(f'Accuracy of the classifier on the test set: {accuracy.item() * 100:.2f}%')

Accuracy of the classifier on the test set: 96.49%


In [32]:
# Convert tensors to numpy arrays
combined_X_train_np = combined_X_train.detach().numpy()
combined_y_train_np = combined_y_train.detach().numpy()

# Create a DataFrame
combined_df = pd.DataFrame(combined_X_train_np, columns=data.feature_names)
combined_df['target'] = combined_y_train_np

# Write the DataFrame to a CSV file
combined_df.to_csv('combined_data.csv', index=False)