<a href="https://colab.research.google.com/github/satheesh3180/load_breast_cancer-/blob/main/PROJECT_FINAL1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
#1. Imports

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

def generate_multivariate_series(n_steps=1500, seed=None):
    """Generate a synthetic multivariate time series as a DataFrame.

    The series combines a linear trend, a period-24 sine ("seasonality"),
    a period-365 sine ("temperature"), a Gaussian random walk ("volatility")
    and an "industrial" signal derived from seasonality and volatility.
    The target is the sum of these components plus white noise.

    Parameters
    ----------
    n_steps : int
        Number of time steps (rows) to generate.
    seed : int or None
        When given, random draws come from a dedicated
        ``np.random.default_rng(seed)`` so repeated calls return identical
        data. When ``None`` (default) the global ``np.random`` state is used,
        exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``n_steps`` rows with columns ``target``, ``seasonality``,
        ``temperature``, ``industrial`` and ``volatility``.
    """
    # Both the np.random module and a Generator expose normal(loc, scale, size).
    rng = np.random if seed is None else np.random.default_rng(seed)

    t = np.arange(n_steps)

    trend = 0.001 * t
    seasonality = np.sin(2 * np.pi * t / 24)
    temperature = np.sin(2 * np.pi * t / 365)
    # Cumulative sum of Gaussian steps -> random walk.
    volatility = rng.normal(0, 0.2, n_steps).cumsum()
    industrial = 0.5 * seasonality + 0.3 * volatility

    # Observation noise is drawn after the random walk (same draw order as before).
    noise = rng.normal(0, 0.1, n_steps)
    target = trend + seasonality + temperature + industrial + noise

    return pd.DataFrame({
        "target": target,
        "seasonality": seasonality,
        "temperature": temperature,
        "industrial": industrial,
        "volatility": volatility
    })

# Build the synthetic dataset once; later cells read it through the global `data`.
data = generate_multivariate_series()
print("Dataset shape:", data.shape)




Dataset shape: (1500, 5)


In [60]:
#🔹 3. Sequence Creation

def create_sequences(df, seq_len=48):
    """Slice a frame into overlapping windows and next-step targets.

    Window ``i`` holds rows ``i .. i + seq_len - 1`` of every column; its
    label is the ``"target"`` value of row ``i + seq_len``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``"target"`` column.
    seq_len : int
        Number of consecutive rows per window (must be >= 1).

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(df) - seq_len, seq_len, n_columns)``.
    y : numpy.ndarray
        Shape ``(len(df) - seq_len,)``.
        When the frame has ``seq_len`` rows or fewer, both arrays are empty
        but keep their trailing dimensions so ``X.shape[2]`` stays valid.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    KeyError
        If ``df`` has no ``"target"`` column.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    # Convert once: slicing a NumPy array is far cheaper than per-window .iloc calls.
    values = df.to_numpy()
    target_col = df.columns.get_loc("target")

    n_windows = len(df) - seq_len
    if n_windows <= 0:
        empty_X = np.empty((0, seq_len, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0,), dtype=values.dtype)
        return empty_X, empty_y

    X = np.stack([values[start:start + seq_len] for start in range(n_windows)])
    # Row i + seq_len is the label of window i, i.e. every row from seq_len onwards.
    y = values[seq_len:, target_col].copy()
    return X, y

# Standardise every column of `data` (the target included) before windowing.
# NOTE(review): the scaler is fitted on the full series and the visible cells
# never create a hold-out split, so downstream losses/metrics are in-sample.
scaler = StandardScaler()
scaled = scaler.fit_transform(data)

# Window the scaled frame using create_sequences' default seq_len.
X, y = create_sequences(pd.DataFrame(scaled, columns=data.columns))

# Convert the NumPy windows/labels to float32 tensors for the PyTorch models.
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

# NOTE: X, y and loader are reassigned by the grid-search cell further down.
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [61]:
#🔹 4. Transformer with Attention (FROM SCRATCH)

class TransformerBlock(nn.Module):
    """Self-attention block: attention -> add & norm -> MLP -> add & norm.

    The attention layer is built without ``batch_first``, so ``forward``
    expects a sequence-first tensor ``(seq_len, batch, embed_dim)``.
    """

    def __init__(self, embed_dim=64, heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, heads)
        self.norm1 = nn.LayerNorm(embed_dim)

        # Position-wise MLP with a fixed hidden width of 128.
        hidden_dim = 128
        mlp_layers = [
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, embed_dim),
        ]
        self.ff = nn.Sequential(*mlp_layers)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return ``(transformed_x, attention_weights)`` for self-attention over ``x``."""
        attended, attn_weights = self.attn(query=x, key=x, value=x)
        hidden = self.norm1(x + attended)      # residual + norm around attention
        refined = self.norm2(hidden + self.ff(hidden))  # residual + norm around MLP
        return refined, attn_weights

In [62]:
#🔹 5. Forecasting Model

class TimeSeriesTransformer(nn.Module):
    """Single-block forecaster: linear embedding -> TransformerBlock -> linear head.

    NOTE: a second class with this same name is defined in a later cell and
    shadows this one once that cell has run.
    """

    def __init__(self, n_features):
        super().__init__()
        self.embedding = nn.Linear(n_features, 64)
        self.transformer = TransformerBlock()
        self.fc = nn.Linear(64, 1)

    def forward(self, x):
        """Predict one value per sequence.

        Parameters
        ----------
        x : torch.Tensor
            Batch-first input ``(batch, seq_len, n_features)``.

        Returns
        -------
        tuple
            ``(predictions, attention_weights)``; ``predictions`` has shape
            ``(batch,)`` for every batch size, including 1.
        """
        x = self.embedding(x)
        x = x.permute(1, 0, 2)   # (seq, batch, features)
        x, attn = self.transformer(x)
        # Forecast from the representation of the last time step only.
        out = self.fc(x[-1])
        # squeeze(-1) drops only the size-1 output dim; a bare squeeze() would
        # also drop the batch dim for a single-sample batch and yield a 0-d tensor.
        return out.squeeze(-1), attn

In [63]:
#Custom Encoder–Decoder Transformer

#Padding & Masking

#Hyperparameter Optimization

#1. Padding + Masking utilities
def create_padding_mask(seq):
    """Flag the time steps of ``seq`` whose features are all exactly zero.

    ``seq`` is expected as ``(batch, seq_len, features)``; all-zero steps are
    treated as padding. Returns a boolean tensor ``(batch, seq_len)`` that is
    ``True`` at padded positions.
    """
    # The L1 magnitude over the feature axis is zero only when every feature is zero.
    step_magnitude = seq.abs().sum(dim=-1)
    return step_magnitude.eq(0)

#Custom Transformer Encoder Block
class EncoderBlock(nn.Module):
    """Batch-first self-attention encoder block with padding-mask support.

    Layout: masked self-attention -> add & norm -> MLP (hidden width 128)
    -> add & norm.
    """

    def __init__(self, embed_dim, heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)

        ff_hidden = 128
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, embed_dim),
        )
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, mask):
        """Encode ``x``; ``mask`` is passed to the attention as ``key_padding_mask``.

        Returns ``(encoded_x, attention_weights)``.
        """
        context, attn_weights = self.attn(
            query=x, key=x, value=x, key_padding_mask=mask
        )
        x = self.norm1(x + context)       # residual + norm around attention
        x = self.norm2(x + self.ff(x))    # residual + norm around MLP
        return x, attn_weights

#Custom Transformer Decoder Block
class DecoderBlock(nn.Module):
    """Cross-attention from decoder queries onto encoder output, then LayerNorm.

    There is no residual connection and no feed-forward sub-layer: the
    normalised attention output is returned directly.
    """

    def __init__(self, embed_dim, heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(embed_dim)

    def forward(self, x, enc_out, mask):
        """Attend from ``x`` (queries) to ``enc_out`` (keys and values).

        ``mask`` is forwarded as ``key_padding_mask``; the attention weights
        are discarded.
        """
        attended = self.attn(
            query=x, key=enc_out, value=enc_out, key_padding_mask=mask
        )[0]
        return self.norm(attended)

#Full Encoder–Decoder Model

class TimeSeriesTransformer(nn.Module):
    """Encoder–decoder forecaster producing one value per input sequence.

    Pipeline: linear embedding -> EncoderBlock -> DecoderBlock (queried with
    the last encoded step) -> linear head.
    """

    def __init__(self, n_features, embed_dim, heads):
        super().__init__()
        self.embedding = nn.Linear(n_features, embed_dim)
        self.encoder = EncoderBlock(embed_dim, heads)
        self.decoder = DecoderBlock(embed_dim, heads)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, src):
        """Forecast from a batch of sequences.

        Parameters
        ----------
        src : torch.Tensor
            Batch-first input ``(batch, seq_len, n_features)``; time steps
            whose features are all zero are treated as padding.

        Returns
        -------
        tuple
            ``(predictions, attention_weights)``; ``predictions`` has shape
            ``(batch,)`` for every batch size, including 1, and the weights
            are the encoder's self-attention weights.
        """
        # The mask is computed on the raw input, before embedding.
        mask = create_padding_mask(src)
        x = self.embedding(src)

        enc_out, attn_weights = self.encoder(x, mask)
        # The last encoded step (kept as a length-1 sequence) is the only decoder query.
        dec_out = self.decoder(enc_out[:, -1:].clone(), enc_out, mask)

        out = self.fc(dec_out.squeeze(1))
        # squeeze(-1) drops only the size-1 output dim; a bare squeeze() would
        # also drop the batch dim for a single-sample batch and yield a 0-d tensor.
        return out.squeeze(-1), attn_weights

In [64]:
# Hyperparameter Optimization (Grid Search)
#Parameters Tuned (as per question)

#Learning rate

#Sequence length

#Number of attention heads

# Search space for the grid search: every combination of these values is tried.
param_grid = dict(
    lr=[0.001, 0.0005],   # Adam learning rates
    seq_len=[24, 48],     # input window lengths
    heads=[2, 4],         # attention heads
)


In [65]:
#Training + Grid Search Loop

# Grid search over learning rate, sequence length and attention heads.
# NOTE(review): each configuration is trained for a single epoch and ranked by
# its mean *training* loss; there is no validation split and no fixed seed, so
# the ranking is noisy.
best_loss = float("inf")
best_params = None

# Windowing depends only on seq_len, so build each tensor pair once instead of
# repeating the pandas slicing for every (lr, heads) combination.
scaled_frame = pd.DataFrame(scaled, columns=data.columns)
windows_by_seq_len = {}
for seq_len in param_grid["seq_len"]:
    X_np, y_np = create_sequences(scaled_frame, seq_len)
    windows_by_seq_len[seq_len] = (
        torch.tensor(X_np, dtype=torch.float32),
        torch.tensor(y_np, dtype=torch.float32),
    )

for lr in param_grid["lr"]:
    for seq_len in param_grid["seq_len"]:
        for heads in param_grid["heads"]:

            # X, y and loader stay module-level names: later cells read them.
            X, y = windows_by_seq_len[seq_len]

            loader = DataLoader(
                TensorDataset(X, y), batch_size=32, shuffle=True
            )

            # Fresh model and optimizer per configuration.
            model = TimeSeriesTransformer(
                n_features=X.shape[2],
                embed_dim=64,
                heads=heads
            )

            optimizer = torch.optim.Adam(model.parameters(), lr=lr)
            loss_fn = nn.MSELoss()

            # One pass over the data; accumulate per-batch losses.
            total_loss = 0
            for xb, yb in loader:
                optimizer.zero_grad()
                preds, _ = model(xb)
                loss = loss_fn(preds, yb)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(loader)

            print(
                f"LR={lr}, Seq={seq_len}, Heads={heads} → Loss={avg_loss:.4f}"
            )

            # Keep the configuration with the lowest mean batch loss.
            if avg_loss < best_loss:
                best_loss = avg_loss
                best_params = (lr, seq_len, heads)


LR=0.001, Seq=24, Heads=2 → Loss=0.2317
LR=0.001, Seq=24, Heads=4 → Loss=0.2285
LR=0.001, Seq=48, Heads=2 → Loss=0.2195
LR=0.001, Seq=48, Heads=4 → Loss=0.2564
LR=0.0005, Seq=24, Heads=2 → Loss=0.2456
LR=0.0005, Seq=24, Heads=4 → Loss=0.3691
LR=0.0005, Seq=48, Heads=2 → Loss=0.3460
LR=0.0005, Seq=48, Heads=4 → Loss=0.2164


In [66]:
#Best Hyperparameters (Required Deliverable)

# Report the winning grid-search configuration.
# best_params is the (lr, seq_len, heads) tuple stored by the search loop.
print("\nBest Hyperparameters Found:")
print("Learning Rate:", best_params[0])
print("Sequence Length:", best_params[1])
print("Attention Heads:", best_params[2])
print("Best Loss:", best_loss)



Best Hyperparameters Found:
Learning Rate: 0.0005
Sequence Length: 48
Attention Heads: 4
Best Loss: 0.21640216769731563


In [67]:
#🔹 6. Training Loop

device = "cuda" if torch.cuda.is_available() else "cpu"

# Train with the configuration the grid search selected. The data is rebuilt
# from the tuned sequence length so this cell does not depend on whichever
# X / loader the last grid-search iteration happened to leave behind.
best_lr, best_seq_len, best_heads = best_params

X, y = create_sequences(
    pd.DataFrame(scaled, columns=data.columns),
    best_seq_len
)
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)

# embed_dim=64 matches the value used for every grid-search configuration.
model = TimeSeriesTransformer(
    n_features=X.shape[2], embed_dim=64, heads=best_heads
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=best_lr)
loss_fn = nn.MSELoss()

for epoch in range(5):
    total_loss = 0
    for xb, yb in loader:
        # Move each batch to the same device as the model.
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds, attn = model(xb)
        loss = loss_fn(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean batch loss for the epoch.
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(loader):.4f}")

Epoch 1 | Loss: 0.2125
Epoch 2 | Loss: 0.0552
Epoch 3 | Loss: 0.0422
Epoch 4 | Loss: 0.0407
Epoch 5 | Loss: 0.0362


In [68]:
#🔹 7. Evaluation Metrics

# Put the model in eval mode and disable gradient tracking for inference.
model.eval()
with torch.no_grad():
    # The whole of X goes through the model in a single forward pass.
    preds, attn = model(X.to(device))

# Bring predictions back to the CPU as NumPy arrays for scikit-learn.
preds = preds.cpu().numpy()
actual = y.numpy()

# NOTE(review): X and y are the same tensors the training DataLoader was built
# from, so these are in-sample errors. They are also in standardised units,
# because y comes from the StandardScaler output rather than the raw target.
mae = mean_absolute_error(actual, preds)
rmse = np.sqrt(mean_squared_error(actual, preds))

print("MAE:", mae)
print("RMSE:", rmse)
print("Attention weights shape:", attn.shape)


MAE: 0.15098091959953308
RMSE: 0.17781986025240762
Attention weights shape: torch.Size([1452, 48, 48])


In [69]:
#🔹 8. SARIMAX Baseline (Comparison)

from statsmodels.tsa.statespace.sarimax import SARIMAX

# Classical baseline fitted on the raw (unscaled) target column of `data`.
# NOTE(review): no seasonal_order is passed, so this is presumably a plain
# non-seasonal ARIMA(2,1,2) — confirm against the statsmodels defaults.
sarimax = SARIMAX(data["target"], order=(2,1,2))
sarimax_fit = sarimax.fit(disp=False)

# NOTE(review): AIC is not on the same scale as the MAE/RMSE reported for the
# transformer, so this value alone does not give a like-for-like comparison.
print("SARIMAX AIC:", sarimax_fit.aic)


SARIMAX AIC: -1280.9912060148326
