# Learning Probability Density Functions using Data Only (GAN)

**Student:** Rohan Malhotra  
**Roll Number:** 102303437  

This notebook:
1. Loads NO₂ concentration data `x`
2. Transforms it to `z = x + a_r sin(b_r x)`
3. Trains a 1D GAN on samples of `z` only
4. Generates samples from the trained generator
5. Approximates the PDF using histogram and KDE


In [None]:
# If needed, uncomment to install dependencies
# !pip install pandas numpy matplotlib scikit-learn torch --quiet


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KernelDensity

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# One seed shared by NumPy and PyTorch so repeated runs draw the same numbers.
SEED = 42
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)


In [None]:
ROLL_NUMBER = 102303437

# Coefficients of the transform z = x + a_r * sin(b_r * x), both derived from
# remainders of the roll number.
rem_mod7 = ROLL_NUMBER % 7
rem_mod5 = ROLL_NUMBER % 5
a_r = 0.5 * rem_mod7
b_r = 0.3 * (rem_mod5 + 1)

print('ROLL_NUMBER =', ROLL_NUMBER)
print('a_r =', a_r)
print('b_r =', b_r)


## Load Dataset (NO₂ as feature x)

Download the Kaggle CSV and update `CSV_PATH`.


In [None]:
CSV_PATH = 'india_air_quality_data.csv'  # <- change this

# Stop early with an actionable message if the dataset has not been downloaded.
csv_found = os.path.exists(CSV_PATH)
if not csv_found:
    raise FileNotFoundError(f'File not found: {CSV_PATH}. Download Kaggle CSV and update CSV_PATH.')

df = pd.read_csv(CSV_PATH)

# Quick overview of what was loaded: dimensions, column names, first rows.
column_names = list(df.columns)
print('Shape:', df.shape)
print('Columns:', column_names)
df.head()


In [None]:
# Auto-detect the NO2 column (case-insensitive).
# The Unicode subscript spelling "NO₂" is folded to "NO2" before matching, so
# both spellings are matched as substrings (e.g. "no2", "NO2 (ug/m3)",
# "NO₂ (µg/m³)").
cands = [
    c for c in df.columns
    if 'no2' in str(c).strip().lower().replace('₂', '2')
]
print('Candidates:', cands)

if not cands:
    raise ValueError('No NO2 column found automatically. Set NO2_COL manually after checking df.columns.')

# The first match (in column order) is used; the candidate list is printed
# above so a wrong pick can be spotted and NO2_COL overridden by hand.
NO2_COL = cands[0]
print('Using NO2_COL =', NO2_COL)

# Non-numeric entries are coerced to NaN and dropped. The finiteness filter runs
# after the float32 cast so that non-finite values left by the cast are removed.
x = pd.to_numeric(df[NO2_COL], errors='coerce').dropna().values.astype(np.float32)
x = x[np.isfinite(x)]

print('Valid samples:', len(x))

# Without this guard the summary below fails inside np.min with an unhelpful
# "zero-size array" error when the chosen column has no usable numbers.
if len(x) == 0:
    raise ValueError(
        f'Column {NO2_COL!r} contains no finite numeric values. '
        'Set NO2_COL manually after checking df.columns.'
    )

print('x min/max/mean:', float(np.min(x)), float(np.max(x)), float(np.mean(x)))


In [None]:
# Transform x -> z using z = x + a_r * sin(b_r * x)
phase = b_r * x
z = (x + a_r * np.sin(phase)).astype(np.float32)

print('z min/max/mean:', float(np.min(z)), float(np.max(z)), float(np.mean(z)))

# Side-by-side density histograms: original feature on the left, transformed
# variable on the right.
plt.figure(figsize=(10,4))
panels = (
    (x, 'Original x (NO2)', 'x'),
    (z, 'Transformed z', 'z'),
)
for slot, (values, panel_title, axis_name) in enumerate(panels, start=1):
    plt.subplot(1, 2, slot)
    plt.hist(values, bins=60, density=True)
    plt.title(panel_title)
    plt.xlabel(axis_name)
    plt.ylabel('Density')
plt.tight_layout()
plt.show()


In [None]:
# Standardize z for stable GAN training.
# The small epsilon keeps the division finite when z has zero spread.
z_mean, z_std = z.mean(), z.std() + 1e-8
z_norm = ((z - z_mean) / z_std).reshape(-1, 1)

batch_size = 128
loader = DataLoader(
    TensorDataset(torch.tensor(z_norm, dtype=torch.float32)),
    batch_size=batch_size,
    shuffle=True,
    # Drop the ragged final batch only when at least one full batch exists.
    # With fewer than `batch_size` samples an unconditional drop_last=True
    # would leave the loader with zero batches, so training would silently
    # do nothing.
    drop_last=len(z_norm) >= batch_size,
)

print('Normalized z mean/std:', float(z_norm.mean()), float(z_norm.std()))
print('Batches per epoch:', len(loader))


## GAN Architecture (1D MLP GAN)

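- **Generator:** latent noise vector \(\epsilon \sim \mathcal{N}(0, I)\) of dimension `LATENT_DIM` → scalar fake sample (on the standardized `z` scale)
- **Discriminator:** scalar sample → probability that the sample is real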


In [None]:
LATENT_DIM = 8


class Generator(nn.Module):
    """MLP that maps a latent noise vector to a single scalar sample.

    Args:
        latent_dim: Size of the noise vector fed to the first linear layer.
    """

    def __init__(self, latent_dim=LATENT_DIM):
        super().__init__()
        # Hidden stack: latent_dim -> 32 -> 64 -> 32, each followed by
        # LeakyReLU(0.2); a final linear layer emits the unbounded scalar.
        widths = (latent_dim, 32, 64, 32)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.LeakyReLU(0.2))
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Return the network output for the noise batch `z`."""
        return self.net(z)

class Discriminator(nn.Module):
    """MLP that scores a scalar sample with a value in [0, 1] via a final sigmoid."""

    def __init__(self):
        super().__init__()
        # Hidden stack: 1 -> 32 -> 64 -> 32 with LeakyReLU(0.2) after each
        # layer, then a linear head squashed by a sigmoid.
        stack = []
        for n_in, n_out in ((1, 32), (32, 64), (64, 32)):
            stack += [nn.Linear(n_in, n_out), nn.LeakyReLU(0.2)]
        stack += [nn.Linear(32, 1), nn.Sigmoid()]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid score for each sample in `x`."""
        return self.net(x)

# Build both networks (generator first, then discriminator) and move them to
# the selected device.
G, D = Generator().to(device), Discriminator().to(device)

# Show the layer layout of each model.
for network in (G, D):
    print(network)


In [None]:
# Training setup
# Per-epoch average losses are collected here for the loss plot.
g_losses, d_losses = [], []

epochs = 300
lr = 1e-4

# Binary cross-entropy on the discriminator's sigmoid output.
criterion = nn.BCELoss()

# Both networks get their own Adam optimizer with identical hyperparameters.
adam_betas = (0.5, 0.999)
g_opt = optim.Adam(G.parameters(), lr=lr, betas=adam_betas)
d_opt = optim.Adam(D.parameters(), lr=lr, betas=adam_betas)


In [None]:
# Train GAN: alternate one discriminator update and one generator update per batch.
for epoch in range(1, epochs + 1):
    d_epoch, g_epoch = 0.0, 0.0

    for batch in loader:
        real_samples = batch[0].to(device)
        n_real = real_samples.size(0)

        # ---- Discriminator step ----
        d_opt.zero_grad()
        smooth_real = torch.full((n_real, 1), 0.9, device=device)  # label smoothing
        all_fake = torch.zeros((n_real, 1), device=device)

        loss_on_real = criterion(D(real_samples), smooth_real)

        latent = torch.randn(n_real, LATENT_DIM, device=device)
        # Detach so this step does not backpropagate into the generator.
        fake_samples = G(latent).detach()
        loss_on_fake = criterion(D(fake_samples), all_fake)

        d_loss = loss_on_real + loss_on_fake
        d_loss.backward()
        d_opt.step()

        # ---- Generator step ----
        # Fresh noise; the generator is scored against "real" targets of 1.
        g_opt.zero_grad()
        latent = torch.randn(n_real, LATENT_DIM, device=device)
        want_real = torch.ones((n_real, 1), device=device)
        g_loss = criterion(D(G(latent)), want_real)
        g_loss.backward()
        g_opt.step()

        d_epoch += d_loss.item()
        g_epoch += g_loss.item()

    # Average over the batches seen; max(1, ...) avoids dividing by zero when
    # the loader yields no batches.
    n_batches = max(1, len(loader))
    d_epoch /= n_batches
    g_epoch /= n_batches
    d_losses.append(d_epoch)
    g_losses.append(g_epoch)

    # Report on the first epoch and every 25th epoch after that.
    if epoch == 1 or epoch % 25 == 0:
        print(f'Epoch {epoch}/{epochs} | D={d_epoch:.4f} | G={g_epoch:.4f}')


In [None]:
# Training loss plot: per-epoch average loss of each network.
plt.figure(figsize=(8,4))
for history, curve_label in ((d_losses, 'D loss'), (g_losses, 'G loss')):
    plt.plot(history, label=curve_label)
plt.title('GAN Training Losses')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(alpha=0.3)
plt.legend()
plt.show()


In [None]:
# Generate samples from G and map back to original z scale
n_gen = 50000
G.eval()

# Sampling needs no gradients, so autograd is switched off for this pass.
with torch.no_grad():
    noise = torch.randn(n_gen, LATENT_DIM, device=device)
    generated = G(noise)

zf_norm = generated.cpu().numpy().reshape(-1)

# Undo the standardization applied before training.
zf = z_mean + (z_std * zf_norm)

zf_low, zf_high, zf_avg = float(np.min(zf)), float(np.max(zf)), float(np.mean(zf))
print('Generated z_f min/max/mean:', zf_low, zf_high, zf_avg)


In [None]:
# KDE-based PDF approximation p_h(z) from generated samples
# Evaluation grid spans the combined range of the real and generated samples.
grid_lo = min(z.min(), zf.min())
grid_hi = max(z.max(), zf.max())
grid = np.linspace(grid_lo, grid_hi, 1000).reshape(-1,1)

# Bandwidth is 15% of the real data's standard deviation, floored at 0.05.
real_spread = float(np.std(z))
bandwidth = max(0.05, 0.15 * real_spread)

# Fit on the generated samples only; score_samples returns log-densities.
kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
kde.fit(zf.reshape(-1,1))
logp = kde.score_samples(grid)
ph = np.exp(logp)

print('KDE bandwidth:', bandwidth)


In [None]:
# Final PDF plot (real vs generated + KDE estimate)
# Both histograms share the same binning and transparency so they overlay cleanly.
hist_style = dict(bins=80, density=True, alpha=0.35)

plt.figure(figsize=(10,5))
plt.hist(z, label='Real z (hist)', **hist_style)
plt.hist(zf, label='Generated z_f (hist)', **hist_style)
plt.plot(grid.squeeze(), ph, linewidth=2, label='Estimated PDF p_h(z) from GAN samples (KDE)')
plt.title('PDF Approximation of Transformed Variable z using GAN')
plt.xlabel('z')
plt.ylabel('Density')
plt.grid(alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
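# Optional: save artifacts for README/report
# NOTE: to capture the PDF plot, move the savefig call into the plotting cell,
# before plt.show(); run from this separate cell it typically saves an empty figure.
# plt.savefig('gan_pdf_plot.png', dpi=200)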
# pd.DataFrame({'epoch': np.arange(1, len(d_losses)+1), 'd_loss': d_losses, 'g_loss': g_losses}).to_csv('training_losses.csv', index=False)


## Observations (fill after execution)
Comment on:
- Mode coverage
- Training stability
- Quality of generated distribution
