In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
data = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")
data_tensor = torch.tensor(data.values, dtype=torch.float32)
data_tensor

tensor([[  6.0000, 148.0000,  72.0000,  ...,   0.6270,  50.0000,   1.0000],
        [  1.0000,  85.0000,  66.0000,  ...,   0.3510,  31.0000,   0.0000],
        [  8.0000, 183.0000,  64.0000,  ...,   0.6720,  32.0000,   1.0000],
        ...,
        [  5.0000, 121.0000,  72.0000,  ...,   0.2450,  30.0000,   0.0000],
        [  1.0000, 126.0000,  60.0000,  ...,   0.3490,  47.0000,   1.0000],
        [  1.0000,  93.0000,  70.0000,  ...,   0.3150,  23.0000,   0.0000]])

## GAN

Recall the GAN objective
$$\min_\psi \max_\phi \,\ \mathbb E_{x \sim Pop}[ -\ln P_\psi(1 | x) ] + \mathbb E_{z \sim \mathcal N(0,1)} [- \ln P_\psi(0|G_\phi(z))  ] \,,$$
where $G_\phi$ is a network that maps gaussian noise $z \sim \mathcal N(0,1)$ to $G(z)$ with the same shape as $x$, and $P_\psi$ is modeled by another network (the discriminator) that maps real samples $x$ and 'fake' samples $G(z)$ to a distribution over $\{0,1\}$.

We will follow the common practice of adopting a different objective for the generator network $G$:
$$\min_\phi \,\ \mathbb E_{z \sim \mathcal N(0,1)} [- \ln P_\psi(1|G_\phi (z) )] $$ng $G$.

In [3]:
seed = 0
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

LATENT_DIM = 10
MAX_EPOCHS = 10000
BATCH_SIZE = 64

In [4]:
# define latent space
def create_latent_space(latent_dim=LATENT_DIM, batch_size=BATCH_SIZE):
    latent = torch.randn(batch_size, latent_dim) # already reshaped (n x latet_dim)
    return latent # batch x 9


# define generator
class Generator(nn.Module):
    def __init__(self, input_dim=LATENT_DIM, output_dim=9): # 9 features
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 15),
            nn.ReLU(),
            nn.Linear(15, 30),
            nn.ReLU(),
            nn.Linear(30, output_dim)
        )
    def forward(self, n_samples):
        latent_space = create_latent_space(latent_dim=LATENT_DIM, batch_size=n_samples)
        samples = self.model(latent_space)
        return samples


# define generated and real samples
def get_generated_samples(generator, batch_size=BATCH_SIZE):
    generated_samples = generator(batch_size)
    labels_for_generated_samples = torch.zeros(generated_samples.shape[0], 1)
    return generated_samples, labels_for_generated_samples
def get_real_samples(data, batch_size=BATCH_SIZE):
    samples = data.sample(batch_size, replace=True)
    real_samples = torch.tensor(samples.values, dtype=torch.float32)
    labels_for_real_samples = torch.ones(batch_size, 1)
    return real_samples, labels_for_real_samples


# define discriminator
class Discriminator(nn.Module): 
    def __init__(self, input_dim=9, output_dim=1): # output=1 => fake or real
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 25),
            nn.ReLU(),
            nn.Linear(25, 50),
            nn.ReLU(),
            nn.Linear(50, output_dim),
            nn.Sigmoid() # probs between 0 and 1
        )
    def forward(self, x):
        return self.model(x)

In [5]:
# get the generator and discriminator
generator = Generator() 
discriminator = Discriminator()

# define binary cross entropy loss (since this is real or fake classification)
gen_opt = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.999))
dis_opt = torch.optim.Adam(discriminator.parameters(), lr=0.00005, betas=(0.5, 0.999))
criterion = torch.nn.BCEWithLogitsLoss()
# criterion = torch.nn.BCELoss()

In [6]:
%%time

for epoch in range(MAX_EPOCHS):
    
    generated_samples, generated_labels = get_generated_samples(generator, batch_size=BATCH_SIZE)
    real_samples, real_labels = get_real_samples(data, batch_size=BATCH_SIZE)

    # reset gradients
    gen_opt.zero_grad()
    dis_opt.zero_grad()

    # Train G
    # logit raw scores predicted (higher=real, lower=generated)
    # this means, if higher, discriminator sees that generated~real
    pred_generated = discriminator(generated_samples)
    # we want the discriminator to classify the generated samples as real (trick the discriminator).
    gen_loss = criterion(pred_generated, real_labels)
    # backprop weights and gradients of generator
    gen_loss.backward()
    gen_opt.step()

    # Train D 
    # goal: want real samples => labeled 1 AND generated_samples => labeled 0
    # train the discrimninator on the real samples
    pred_real = discriminator(real_samples)
    dis_loss_real = criterion(pred_real, real_labels)
    # now train D on the generated samples
    # only passing the actual values of those samples, and not any gradient
    # information from the generator to the discriminator
    # (independent from each other)
    pred_generated = discriminator(generated_samples.detach())
    dis_loss_generated = criterion(pred_generated, generated_labels)

    # backprop
    dis_loss = (dis_loss_real + dis_loss_generated) / 2
    dis_loss.backward()
    dis_opt.step()

    if epoch % 1000 == 0:
        print(f"\niteration {epoch}")
        print(f'generator loss = {gen_loss.item():.3f}, discriminator loss = {dis_loss.item():.3f}')


iteration 0
generator loss = 0.473, discriminator loss = 0.821

iteration 1000
generator loss = 0.402, discriminator loss = 0.710

iteration 2000
generator loss = 0.402, discriminator loss = 0.710

iteration 3000
generator loss = 0.402, discriminator loss = 0.710

iteration 4000
generator loss = 0.389, discriminator loss = 0.724

iteration 5000
generator loss = 0.386, discriminator loss = 0.729

iteration 6000
generator loss = 0.385, discriminator loss = 0.731

iteration 7000
generator loss = 0.356, discriminator loss = 0.763

iteration 8000
generator loss = 0.361, discriminator loss = 0.760

iteration 9000
generator loss = 0.379, discriminator loss = 0.741
CPU times: user 1min 29s, sys: 158 ms, total: 1min 29s
Wall time: 45 s


In [7]:
generated_samples = generator(100)
generated_samples_np = generated_samples.detach().numpy()
df_generated_samples = pd.DataFrame(generated_samples_np, columns=data.columns)

df_generated_samples['Outcome'] = df_generated_samples['Outcome'] > df_generated_samples.Outcome.mean()
df_generated_samples["Outcome"] = df_generated_samples["Outcome"].astype(int)

df_generated_samples

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2.778385,124.370720,50.418991,23.068401,-0.521704,32.774773,-9.208471,39.540447,0
1,2.972859,136.036026,71.732605,32.107246,-1.007810,41.827599,-12.613653,44.087685,0
2,2.888418,100.156136,99.960388,44.587700,-4.726610,51.609230,-19.200035,40.799862,1
3,3.838099,116.195923,118.590347,53.922832,-5.077444,60.689579,-22.286690,48.179085,1
4,3.878526,108.524384,101.526047,45.766506,-3.524993,52.327671,-18.401426,42.892467,0
...,...,...,...,...,...,...,...,...,...
95,1.891182,91.348785,52.812572,24.798412,-3.143342,30.751760,-9.936153,32.101547,1
96,2.109382,85.802658,80.014572,37.787346,-5.157355,42.505005,-15.137621,35.837944,1
97,4.847567,254.151688,90.468102,42.215977,0.842429,58.608524,-14.597937,73.629684,0
98,3.640663,152.315125,156.229889,69.110641,-8.119746,79.391403,-30.202440,61.988037,0


In [8]:
df_generated_samples["Outcome"].sort_values()

0     0
36    0
37    0
38    0
41    0
     ..
16    1
65    1
68    1
71    1
99    1
Name: Outcome, Length: 100, dtype: int64

In [9]:
df_generated_samples["Outcome"].value_counts()

Outcome
0    53
1    47
Name: count, dtype: int64

In [10]:
data["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

Imbalanced data, differently distributed variables => https://arxiv.org/abs/1907.00503