In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, r2_score
from scipy import stats

In [None]:
# Load the dataset and split it into feature matrix / target vector

df = pd.read_csv('../CNt_Reinforced_Concrete_Final_Dataset.csv')
target_col = 'Compressive strength (MPa)'
# Every column other than the target is treated as an input feature.
feature_cols = list(filter(lambda name: name != target_col, df.columns))

X = df[feature_cols].to_numpy()
y = df[target_col].to_numpy().reshape(-1, 1)

# Standardize every column to zero mean / unit variance (z-score) for stable
# GAN training. The per-column statistics are kept because the generation
# step needs them to map synthetic rows back to the original units; the 1e-8
# term guards against division by zero on constant columns.
X_mean, y_mean = X.mean(axis=0), y.mean(axis=0)
X_std, y_std = X.std(axis=0), y.std(axis=0)

X_norm = (X - X_mean) / (X_std + 1e-8)
y_norm = (y - y_mean) / (y_std + 1e-8)

In [None]:
#2 WGAN-GP Model

class Generator(nn.Module):
    """MLP generator mapping latent noise vectors to synthetic data rows.

    Args:
        noise_dim: Size of the latent noise vector fed to the network.
        out_dim: Width of one generated row (features plus target).

    The output layer is linear (no squashing activation). The training rows
    are z-score standardized, so real values routinely fall outside
    [-1, 1]; a bounded activation such as Tanh would make it impossible for
    the generator to reproduce anything further than one standard deviation
    from the column mean.
    """

    def __init__(self, noise_dim=32, out_dim=9):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(noise_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            # Unbounded linear head so the full standardized range is reachable.
            nn.Linear(256, out_dim),
        )

    def forward(self, z):
        """Return a (batch, out_dim) tensor of generated rows for noise `z`."""
        return self.net(z)

class Critic(nn.Module):
    """MLP critic that assigns one unbounded scalar score to each input row.

    Args:
        in_dim: Width of one input row.
    """

    def __init__(self, in_dim=9):
        super().__init__()
        widths = (in_dim, 256, 128)
        layers = []
        # Hidden stack: Linear followed by LeakyReLU(0.2) for each width pair.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(n_in, n_out), nn.LeakyReLU(0.2)]
        # Final projection to a single score, with no activation.
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return a (batch, 1) tensor of critic scores for `x`."""
        scores = self.net(x)
        return scores

# Hyper-parameters (to be tuned as needed)
SEED = 42
NOISE_DIM = 32
BATCH_SIZE = 32
EPOCHS = 200
CRITIC_ITER = 5
LAMBDA_GP = 10
LR = 1e-4

# Seed the RNGs before any model is built so that weight initialisation,
# batch shuffling and noise sampling are repeatable from a fresh kernel.
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One training row is all features plus the target column.
DATA_DIM = X.shape[1] + 1

G = Generator(NOISE_DIM, DATA_DIM).to(device)
C = Critic(DATA_DIM).to(device)

opt_G = torch.optim.Adam(G.parameters(), lr=LR, betas=(0.0, 0.9))
opt_C = torch.optim.Adam(C.parameters(), lr=LR, betas=(0.0, 0.9))

# DataLoader over rows laid out as [normalized features..., normalized target]
dataset = TensorDataset(torch.tensor(np.c_[X_norm, y_norm], dtype=torch.float32))
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Gradient penalty
def gradient_penalty(critic, real, fake):
    """Compute the WGAN-GP gradient penalty for one batch.

    Random per-sample interpolates between `real` and `fake` are scored by
    `critic`; the penalty is the batch mean of (||grad||_2 - 1)^2, where the
    gradient is taken with respect to the interpolated inputs.

    Args:
        critic: Callable mapping a batch tensor to per-sample scores.
        real: Batch of real samples, shape (batch, ...).
        fake: Batch of generated samples with the same shape as `real`.

    Returns:
        Scalar tensor that is differentiable with respect to the critic's
        parameters.
    """
    # The penalty only regularises the critic, so cut any graph the inputs
    # carry (e.g. the generator's) to avoid backpropagating through it.
    real, fake = real.detach(), fake.detach()

    # One mixing coefficient per sample, broadcast over all remaining dims.
    # It is created on the inputs' own device rather than a module-level global.
    mix_shape = (real.size(0),) + (1,) * (real.dim() - 1)
    alpha = torch.rand(mix_shape, device=real.device, dtype=real.dtype)
    interp = (alpha * real + (1 - alpha) * fake).requires_grad_(True)

    d_interp = critic(interp)
    # create_graph=True keeps the gradient itself differentiable, which the
    # penalty needs in order to train the critic.
    grad = torch.autograd.grad(
        d_interp, interp, torch.ones_like(d_interp),
        create_graph=True,
    )[0]

    # Flatten non-batch dims so the norm is taken over each whole sample.
    grad_norm = grad.reshape(grad.size(0), -1).norm(2, dim=1)
    return ((grad_norm - 1) ** 2).mean()

In [None]:
# WGAN-GP training: CRITIC_ITER critic updates per batch, then one generator update.
for epoch in range(EPOCHS):
    for (real_batch,) in loader:
        real_batch = real_batch.to(device)
        n_rows = real_batch.size(0)

        # Update Critic
        for _ in range(CRITIC_ITER):
            noise = torch.randn(n_rows, NOISE_DIM, device=device)
            # The critic step never updates G, so sample fakes without
            # building (and later backpropagating through) the generator graph.
            with torch.no_grad():
                fake_batch = G(noise)
            d_real = C(real_batch)
            d_fake = C(fake_batch)
            gp = gradient_penalty(C, real_batch, fake_batch)
            # Wasserstein critic loss plus the weighted gradient penalty.
            loss_C = d_fake.mean() - d_real.mean() + LAMBDA_GP * gp

            opt_C.zero_grad()
            loss_C.backward()
            opt_C.step()

        # Update Generator: raise the critic's score on freshly generated rows
        noise = torch.randn(n_rows, NOISE_DIM, device=device)
        fake_batch = G(noise)
        loss_G = -C(fake_batch).mean()
        opt_G.zero_grad()
        loss_G.backward()
        opt_G.step()

    # Report the losses from the last batch of the epoch.
    print(f'Epoch {epoch+1}/{EPOCHS}  loss_C={loss_C.item():.4f}  loss_G={loss_G.item():.4f}')


In [None]:

# 4. Generate and de-normalize synthetic data (twice the real row count)

G.eval()
n_synth = 2 * len(df)
with torch.no_grad():
    z = torch.randn(n_synth, NOISE_DIM, device=device)
    synth_norm = G(z).cpu().numpy()

# Invert the standardization using the statistics of the real data.
synth_X = synth_norm[:, :-1] * (X_std + 1e-8) + X_mean
synth_y = synth_norm[:, -1:] * (y_std + 1e-8) + y_mean

# Generated rows are laid out as [features..., target], matching the training
# tensor. Label them in that order: df.columns would only be correct when the
# target happens to be the last CSV column. Then restore the original order.
synth_df = pd.DataFrame(np.c_[synth_X, synth_y], columns=feature_cols + [target_col])
synth_df = synth_df[list(df.columns)]

# Save
synth_df.to_csv('../CNT_Concrete_Synth_Wgan.csv', index=False)
print(f'Saved as CNT_Concrete_Synth_Wgan.csv')


In [None]:
# 5. Decision metrics: check whether the augmented data is suitable for use


def evaluate_augmentation(real, synth, target):
    """
    Compare a synthetic table with the real one and return a dict of metrics.

    Args:
        real: DataFrame of real samples (numeric columns, including `target`).
        synth: DataFrame of synthetic samples with the same column names.
        target: Name of the target column used for the predictive-utility test.

    Returns:
        dict with, in insertion order:
          - 'dmean_<col>' / 'dstd_<col>': absolute gap in mean / std (lower = closer)
          - 'KS_<col>': two-sample Kolmogorov-Smirnov statistic (lower = closer)
          - 'corr_mse': mean squared gap between correlation matrices (lower = closer)
          - 'pred_r2_on_real': R^2 of a model trained on synth, scored on real
            (higher = better)
          - 'pred_rmse_on_real': RMSE of that same model on real (lower = better)
    """
    metrics = {}
    columns = list(real.columns)

    # 1. Marginal statistics (mean & std distance)
    for col in columns:
        r, s = real[col], synth[col]
        metrics[f'dmean_{col}'] = abs(r.mean() - s.mean())
        metrics[f'dstd_{col}'] = abs(r.std() - s.std())

    # 2. Kolmogorov-Smirnov distance (distribution similarity)
    for col in columns:
        ks_stat, _ = stats.ks_2samp(real[col], synth[col])
        metrics[f'KS_{col}'] = ks_stat

    # 3. Pairwise correlation preservation. The synthetic columns are put in
    # the real frame's order so the two matrices are compared entry by entry.
    corr_real = real.corr().values
    corr_synth = synth[columns].corr().values
    metrics['corr_mse'] = np.mean((corr_real - corr_synth) ** 2)

    # 4. Predictive utility (most important)
    # Train a simple RF on synthetic data, test on real data. Both feature
    # frames use the same column order.
    feature_names = [c for c in columns if c != target]
    X_real, y_real = real[feature_names], real[target]
    X_synth, y_synth = synth[feature_names], synth[target]

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_synth, y_synth)
    preds = rf.predict(X_real)
    metrics['pred_r2_on_real'] = r2_score(y_real, preds)
    # root_mean_squared_error already returns the RMSE and takes no `squared`
    # keyword (that flag belonged to mean_squared_error).
    metrics['pred_rmse_on_real'] = root_mean_squared_error(y_real, preds)

    return metrics

# Score the synthetic table against the real one
score = evaluate_augmentation(df, synth_df, target_col)

print('\n QUALITY DECISION METRICS ')

for metric_name, metric_value in score.items():
    print(f'{metric_name:<20}  {metric_value:.5f}')

# Suggested thresholds: the synthetic data is accepted only if every check passes
utility_ok = score['pred_r2_on_real'] > 0.5
correlation_ok = score['corr_mse'] < 0.5
marginals_ok = all(score[f'KS_{c}'] < 0.3 for c in df.columns)
good_enough = utility_ok and correlation_ok and marginals_ok

print("OKAY" if good_enough else "NOT OKAY")
    