# CNN-based GAN Baseline (Full Pipeline)

**Features:**
1. **Training**: Tanh-based GAN to prevent mode collapse.
2. **Saving**: Saves a generator checkpoint every 50 epochs, plus the final ('latest') generator weights once training finishes.
3. **Generation**: Generates 10,000 synthetic samples.
4. **Post-Processing**: Concatenates with REAL Time Features to create a complete dataset compatible with Diffusion Models.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# --------------------
# 1. LOAD & PREPARE DATA
# --------------------
# Input CSV (one power column plus time-feature columns) and the directory
# that receives checkpoints and the synthetic dataset.
CSV_FILE = r'C:\Users\Raymond Tie\Desktop\DiffusionModel_NILM\Data\datasets\dishwasher_multivariate.csv'
OUTPUT_DIR = r'C:\Users\Raymond Tie\Desktop\DiffusionModel_NILM\Synthetic_Data\dishwasher'
os.makedirs(OUTPUT_DIR, exist_ok=True)

WINDOW_SIZE = 512  # samples per training window
BATCH_SIZE = 64

df = pd.read_csv(CSV_FILE)

# Identify Power Column: the first column whose name contains 'power' or
# 'dishwasher' (case-insensitive); if none matches, use the last column.
power_col = next((c for c in df.columns if 'power' in c.lower() or 'dishwasher' in c.lower()), df.columns[-1])
print(f"Target Power Column: {power_col}")

# Separate Power and Time Features
# Assume all other columns are Time Features
time_cols = [c for c in df.columns if c != power_col]
print(f"Time Columns: {time_cols}")

raw_power = df[power_col].values.astype(float)
raw_time = df[time_cols].values.astype(float)

# Normalize Power to [-1, 1]
p_min, p_max = raw_power.min(), raw_power.max()
# A constant column makes (p_max - p_min) zero and a column containing NaN
# makes both bounds NaN; either way the division below would silently fill the
# training data with NaN/inf. `not p_max > p_min` is true in both cases.
if not p_max > p_min:
    raise ValueError(
        f"Cannot normalize column '{power_col}': it must be finite and "
        f"non-constant (min={p_min}, max={p_max})."
    )
raw_power_norm = (raw_power - p_min) / (p_max - p_min)
raw_power_norm = raw_power_norm * 2 - 1

class NILM_Dataset(Dataset):
    """Sliding-window dataset pairing power windows with time-feature windows.

    Windows are cut from ``power`` and ``time`` at the same offsets. A window is
    kept only if its maximum power exceeds -0.9, which drops (almost) silent
    windows when power is normalized to [-1, 1].

    Args:
        power: 1-D NumPy array of power values.
        time: NumPy array of time features, indexed along axis 0 in step with
            ``power``.
        window_size: Samples per window. ``None`` (default) uses the
            module-level ``WINDOW_SIZE``.
        stride: Offset between consecutive window starts (default 64).

    Raises:
        ValueError: If ``window_size`` or ``stride`` is not positive.
    """

    def __init__(self, power, time, window_size=None, stride=64):
        if window_size is None:
            window_size = WINDOW_SIZE
        if window_size <= 0 or stride <= 0:
            raise ValueError("window_size and stride must be positive")

        self.windows = []
        # The `+ 1` keeps the last full window (start == len(power) - window_size);
        # without it a series of exactly `window_size` samples yields no windows.
        for start in range(0, len(power) - window_size + 1, stride):
            p_win = power[start:start + window_size]
            t_win = time[start:start + window_size]

            # Filter silent windows slightly
            if np.max(p_win) > -0.9:
                self.windows.append((p_win, t_win))
        print(f"Training Windows: {len(self.windows)}")

    def __len__(self):
        """Return the number of retained windows."""
        return len(self.windows)

    def __getitem__(self, idx):
        """Return ``(power, time)`` float32 tensors for window ``idx``.

        The power tensor gets a leading channel axis, i.e. shape
        ``(1, window_size)``; the time tensor keeps the shape of its slice.
        """
        p, t = self.windows[idx]
        return torch.from_numpy(p).float().unsqueeze(0), torch.from_numpy(t).float()

# Shuffled mini-batches over the windowed dataset; drop_last=True discards a
# trailing partial batch so every batch holds exactly BATCH_SIZE windows.
train_loader = DataLoader(
    NILM_Dataset(raw_power_norm, raw_time),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)

In [None]:
# --------------------
# 2. MODEL DEFINITION
# --------------------
class Generator(nn.Module):
    """1-D convolutional generator mapping a noise vector to a 512-sample window.

    A linear layer projects the latent vector to a (128, 16) feature map. Five
    x2 linear-upsampling stages then grow the length 16 -> 512 while the
    convolutions reduce the channels 128 -> 1. The final Tanh bounds the output
    to [-1, 1].

    Args:
        latent_dim: Size of the input noise vector (default 100).
    """

    def __init__(self, latent_dim=100):
        super().__init__()
        self.latent_dim = latent_dim
        self.fc = nn.Linear(latent_dim, 128 * 16)

        def up_block(in_c, out_c):
            # Double the sequence length, then convolve (kernel 3, padding 1
            # keeps the length) with BatchNorm + ReLU.
            return nn.Sequential(
                nn.Upsample(scale_factor=2, mode='linear', align_corners=False),
                nn.Conv1d(in_c, out_c, 3, 1, 1),
                nn.BatchNorm1d(out_c),
                nn.ReLU(True)
            )

        # Attribute names and layer order are kept as-is so state_dict keys
        # remain compatible with previously saved checkpoints.
        self.model = nn.Sequential(
            up_block(128, 64),
            up_block(64, 32),
            up_block(32, 16),
            up_block(16, 8),
            nn.Upsample(scale_factor=2, mode='linear', align_corners=False),
            nn.Conv1d(8, 1, 3, 1, 1),
            nn.Tanh()  # Output [-1, 1]
        )

    def forward(self, z):
        """Map noise ``z`` of shape (B, latent_dim) to a (B, 1, 512) tensor."""
        x = self.fc(z).view(-1, 128, 16)
        return self.model(x)

class Discriminator(nn.Module):
    """1-D convolutional discriminator producing a real/fake probability.

    Four stride-2 convolutions (kernel 4, padding 1) each halve the sequence
    length while widening the channels 1 -> 16 -> 32 -> 64 -> 128. The features
    are then average-pooled to one value per channel and passed through a
    linear layer and a sigmoid, so the output has shape (B, 1) in [0, 1].
    """

    def __init__(self):
        super().__init__()
        # The first stage has no BatchNorm; the remaining three do.
        stages = [nn.Conv1d(1, 16, 4, 2, 1), nn.LeakyReLU(0.2)]
        for c_in, c_out in ((16, 32), (32, 64), (64, 128)):
            stages.extend([
                nn.Conv1d(c_in, c_out, 4, 2, 1),
                nn.BatchNorm1d(c_out),
                nn.LeakyReLU(0.2),
            ])
        head = [
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.conv = nn.Sequential(*stages, *head)

    def forward(self, x):
        """Return the probability that each sample in ``x`` is real."""
        return self.conv(x)

# Build both networks on the selected device (generator first, then discriminator).
G = Generator().to(device)
D = Discriminator().to(device)

# Both players share the same Adam settings: lr 2e-4 with beta1 lowered to 0.5.
adam_settings = {"lr": 0.0002, "betas": (0.5, 0.999)}
opt_G = torch.optim.Adam(G.parameters(), **adam_settings)
opt_D = torch.optim.Adam(D.parameters(), **adam_settings)

# Binary cross-entropy applied to the discriminator's sigmoid probabilities.
criterion = nn.BCELoss()

In [None]:
# --------------------
# 3. TRAINING LOOP (With Save)
# --------------------
from IPython.display import clear_output

def plot_status(real, fake, epoch, d_loss, g_loss):
    """Redraw a progress plot comparing one real and one generated window.

    Clears the current notebook output, then plots the first sample and first
    channel of ``real`` and ``fake``, each mapped from [-1, 1] to [0, 1].

    Args:
        real: Batch of real windows; element ``[0, 0]`` is plotted.
        fake: Batch of generated windows; element ``[0, 0]`` is plotted.
        epoch: Epoch number shown in the figure title.
        d_loss: Discriminator loss shown in the title.
        g_loss: Generator loss shown in the title.
    """
    def first_curve(batch):
        # Take the first sample's first channel and map [-1, 1] -> [0, 1].
        return (batch[0, 0].detach().cpu().numpy() + 1) / 2

    clear_output(wait=True)
    real_curve = first_curve(real)
    fake_curve = first_curve(fake)

    plt.figure(figsize=(15, 6))
    plt.suptitle(f"Epoch {epoch} | D: {d_loss:.4f} G: {g_loss:.4f}")
    plt.subplot(1, 2, 1)
    curves = (
        (real_curve, 'Real (Rescaled)', 0.6),
        (fake_curve, 'Fake (Rescaled)', 0.8),
    )
    for series, label, alpha in curves:
        plt.plot(series, label=label, alpha=alpha)
    plt.legend()
    plt.ylim(-0.1, 1.1)
    plt.show()

# Adversarial training: for every batch, one discriminator update is followed
# by three generator updates. A progress plot is drawn every 5 epochs and the
# generator weights are checkpointed every 50 epochs and once more at the end.
print("Training...")
for epoch in range(1, 101): # 100 Epochs is usually enough for simple CNN
    # Only the power window `p` is used for training; the time features `t`
    # and `batch_idx` are not read inside the loop.
    for batch_idx, (p, t) in enumerate(train_loader):
        real = p.to(device)
        bs = real.size(0)
        
        # Train D
        # Real targets are 0.9 instead of 1.0 (one-sided label smoothing);
        # fake targets are 0. `fake.detach()` keeps the discriminator loss
        # from backpropagating into the generator. The two terms are averaged.
        opt_D.zero_grad()
        z = torch.randn(bs, 100).to(device)
        fake = G(z)
        loss_d = (criterion(D(real), torch.full((bs,1), 0.9).to(device)) + \
                  criterion(D(fake.detach()), torch.zeros(bs,1).to(device))) / 2
        loss_d.backward()
        opt_D.step()
        
        # Train G (3 times)
        # Each generator step draws fresh noise and is scored against target 1
        # ("real") by the discriminator; only opt_G steps here.
        for _ in range(3):
            opt_G.zero_grad()
            z = torch.randn(bs, 100).to(device)
            fake = G(z)
            loss_g = criterion(D(fake), torch.ones(bs,1).to(device))
            loss_g.backward()
            opt_G.step()
        
    # The plot uses whatever the LAST batch of the epoch left in `real`,
    # `fake`, `loss_d` and `loss_g`.
    # NOTE(review): if train_loader yields no batches these names are never
    # assigned and this line raises NameError — confirm the dataset is non-empty.
    if epoch % 5 == 0:
        plot_status(real, fake, epoch, loss_d.item(), loss_g.item())
        
    # Checkpoint every 50 epochs
    if epoch % 50 == 0:
        torch.save(G.state_dict(), os.path.join(OUTPUT_DIR, f'generator_epoch_{epoch}.pth'))

# Save Final Model
# Only the generator's state_dict is stored; the discriminator is not saved.
torch.save(G.state_dict(), os.path.join(OUTPUT_DIR, 'generator_latest.pth'))
print("✅ Training Complete. Model Saved.")

In [None]:
# --------------------
# 4. GENERATION & CONCATENATION
# --------------------
print("Generating Synthetic Data...")
G.eval()  # switch the generator's BatchNorm layers to their running statistics
NUM_SAMPLES = 10000
generated_samples = []

# Generated power is paired with REAL time features: each synthetic power
# window receives the time-feature window of a randomly drawn training window.

# One batch more than strictly needed is produced; the surplus is trimmed below.
num_batches = NUM_SAMPLES // BATCH_SIZE + 1
with torch.no_grad():
    for _ in range(num_batches):
        z = torch.randn(BATCH_SIZE, 100).to(device)
        fake_power = G(z).cpu().numpy()  # (B, 1, 512)

        # Map the Tanh output from [-1, 1] to [0, 1] (the original power scale
        # is NOT restored here) and drop the channel axis -> (B, 512).
        fake_power = ((fake_power + 1) / 2).squeeze(1)
        generated_samples.append(fake_power)

# Stack all batches and keep exactly NUM_SAMPLES rows -> (10000, 512)
all_power = np.concatenate(generated_samples, axis=0)[:NUM_SAMPLES]

# Draw training-window indices with replacement and collect only the
# time-feature half of each selected (power, time) pair.
possible_indices = list(range(len(train_loader.dataset)))
selected_indices = np.random.choice(possible_indices, NUM_SAMPLES, replace=True)
all_time_windows = [
    train_loader.dataset[idx][1].numpy()  # (512, TimeDim)
    for idx in selected_indices
]
all_time = np.array(all_time_windows)  # (10000, 512, TimeDim)

print(f"Generated Power Shape: {all_power.shape}")
print(f"Sampled Time Shape:    {all_time.shape}")

# --------------------
# 5. MERGE & SAVE
# --------------------
# Give power a trailing feature axis, then join along it:
# (N, L, 1) + (N, L, T) -> (N, L, 1+T), with power as feature 0.
all_power_expanded = all_power[:, :, np.newaxis]
final_data = np.concatenate([all_power_expanded, all_time], axis=2)

print(f"Final Dataset Shape: {final_data.shape}")

# Save as .npy
save_path = os.path.join(OUTPUT_DIR, 'synthetic_dishwasher.npy')
np.save(save_path, final_data)
print(f"✅ Saved Synthetic Dataset to: {save_path}")