In [2]:
# --------------------- Imports ---------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from google.colab import drive

# --------------------- Matplotlib Setup ---------------------
mpl.rcParams.update({
    'font.size': 14,
    'axes.titlesize': 15,
    'axes.labelsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 11,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'figure.autolayout': True,
})

# --------------------- Load Data ---------------------
print("Mounting Google Drive and loading dataset...")
drive.mount('/content/drive')
total_capture_7k = pd.read_csv('drive/My Drive/C02 project/correlation_wide.csv')
print(f"Loaded dataset with shape: {total_capture_7k.shape}")

# --------------------- Identify Unique Static Parameter Sets ---------------------
static_cols = [
    'MikeSorghum', 'Quartz', 'Plagioclase', 'Apatite', 'Ilmenite',
    'Diopside_Mn', 'Diopside', 'Olivine', 'Alkali-feldspar',
    'Montmorillonite', 'Glass', 'temp', 'shift', 'year'
]

# Add timestep count per file_id
file_lengths = total_capture_7k.groupby('file_id').size().rename("num_timesteps").reset_index()
static_rows = total_capture_7k.groupby('file_id')[static_cols].first().reset_index()
static_rows = static_rows.merge(file_lengths, on='file_id')

# Filter only unique static parameter sets
unique_static_rows = static_rows.drop_duplicates(subset=static_cols)
unique_file_ids = unique_static_rows['file_id'].tolist()

# --------------------- Extract Time Series Data ---------------------
filtered_df = total_capture_7k[total_capture_7k['file_id'].isin(unique_file_ids)].copy()

# Truncate each group to 101 timesteps
filtered_df = filtered_df.groupby('file_id').head(101).reset_index(drop=True)

# --------------------- Static Feature Table ---------------------
Input_Link_Table = filtered_df.groupby('file_id').agg({col: 'first' for col in static_cols}).reset_index()
print(f"Static feature table created: Input_Link_Table.shape = {Input_Link_Table.shape}")

# --------------------- Time Series Structuring ---------------------
result = filtered_df[['Total_CO2_capture', 'year', 'file_id']]
file_ids = result['file_id'].unique()
num_file_ids = len(file_ids)
max_timesteps = 101
relevant_data = np.zeros((num_file_ids, max_timesteps))
file_id_order = np.zeros(num_file_ids)

for i, file_id in enumerate(file_ids):
    file_data = result[result['file_id'] == file_id]['Total_CO2_capture'].values
    relevant_data[i, :len(file_data)] = file_data
    file_id_order[i] = file_id
print(f"Time series matrix constructed: relevant_data.shape = {relevant_data.shape}")

# --------------------- Clustering ---------------------
scaler = StandardScaler()
normalized_data = scaler.fit_transform(relevant_data)
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(normalized_data)
print("Performed KMeans clustering into 8 clusters")

# Compute boundary stats
cluster_boundaries = []
for cluster_id in range(8):
    cluster_data = normalized_data[clusters == cluster_id]
    min_v = scaler.inverse_transform(np.min(cluster_data, axis=0).reshape(1, -1)).flatten()
    median_v = scaler.inverse_transform(np.median(cluster_data, axis=0).reshape(1, -1)).flatten()
    mean_v = scaler.inverse_transform(np.mean(cluster_data, axis=0).reshape(1, -1)).flatten()
    max_v = scaler.inverse_transform(np.max(cluster_data, axis=0).reshape(1, -1)).flatten()
    cluster_boundaries.append((min_v, median_v, mean_v, max_v))
cluster_boundaries = np.array(cluster_boundaries)
print(f"Cluster boundary stats calculated: cluster_boundaries.shape = {cluster_boundaries.shape}")

# --------------------- Merge Static Features with Clusters ---------------------
Clustering_link_table = pd.DataFrame({'file_id': file_id_order.astype(int), 'cluster': clusters})
Clustering_link_table = Clustering_link_table.sort_values(by='file_id').reset_index(drop=True)
merged_df = pd.merge(Input_Link_Table, Clustering_link_table, on='file_id')
print(f"Final input features (static + cluster): merged_df.shape = {merged_df.shape}")

# --------------------- Create Output Time Series DataFrame ---------------------
data = [[file_id_order[i].astype(int), t, relevant_data[i, t]] for i in range(len(file_id_order)) for t in range(max_timesteps)]
df_output = pd.DataFrame(data, columns=['file_id', 'timestep', 'CO2']).sort_values(by=['file_id', 'timestep'])
print(f"Final output time series: df_output.shape = {df_output.shape}")

# --------------------- Summary ---------------------
print("Data Preparation Summary:")
print(f"Static Input Table: merged_df [{merged_df.shape[0]} rows × {merged_df.shape[1]} columns]")
print(f"Time Series Output: df_output [{df_output.shape[0]} rows × 3 columns]")
print(f"Cluster Boundaries: cluster_boundaries [{cluster_boundaries.shape}]")


Mounting Google Drive and loading dataset...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataset with shape: (1192157, 17)
Static feature table created: Input_Link_Table.shape = (2703, 15)
Time series matrix constructed: relevant_data.shape = (2703, 101)
Performed KMeans clustering into 8 clusters
Cluster boundary stats calculated: cluster_boundaries.shape = (8, 4, 101)
Final input features (static + cluster): merged_df.shape = (2703, 16)
Final output time series: df_output.shape = (273003, 3)
Data Preparation Summary:
Static Input Table: merged_df [2703 rows × 16 columns]
Time Series Output: df_output [273003 rows × 3 columns]
Cluster Boundaries: cluster_boundaries [(8, 4, 101)]


**LSTM**

In [3]:
import torch
import torch.nn as nn

class LSTM_MIMO(nn.Module):
    def __init__(self, input_len, output_len, static_dim, hidden_dim=128, num_layers=2):
        super(LSTM_MIMO, self).__init__()
        self.input_len = input_len
        self.output_len = output_len

        self.static_fc = nn.Sequential(
            nn.Linear(static_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )

        self.lstm = nn.LSTM(input_size=1 + hidden_dim, hidden_size=hidden_dim,
                            num_layers=num_layers, batch_first=True)

        self.output_fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_len)
        )

    def forward(self, x_seq, x_static):
        # x_seq: [B, T, 1]
        # x_static: [B, static_dim]
        batch_size, seq_len, _ = x_seq.size()
        static_encoded = self.static_fc(x_static)  # [B, H]
        static_expanded = static_encoded.unsqueeze(1).expand(-1, seq_len, -1)  # [B, T, H]
        lstm_input = torch.cat([x_seq, static_expanded], dim=-1)  # [B, T, 1+H]
        lstm_out, _ = self.lstm(lstm_input)  # [B, T, H]
        last_hidden = lstm_out[:, -1, :]  # [B, H]
        out = self.output_fc(last_hidden)  # [B, output_len]
        return out


# Experiments - Overnight duration experimentation

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Setup
splits = [(80, 20), (60, 40), (50, 50), (40, 60), (20, 80), (10, 90), (5, 95)]
LSTM_mse = []

# For saving models
os.makedirs("drive/My Drive/DSSM-Models", exist_ok=True)

np.random.seed(42)
torch.manual_seed(42)

for train_pct, test_pct in splits:
    split_name = f"{train_pct}_{test_pct}"
    train_ids, test_ids = train_test_split(df_output['file_id'].unique(), test_size=0.2, random_state=42)
    df_train = df_output[df_output['file_id'].isin(train_ids)]
    df_test = df_output[df_output['file_id'].isin(test_ids)]

    train_timestep = int(train_pct / 100 * 101)
    pred_timestep = 101 - train_timestep
    print(f"\n====== Split: {split_name} | X (input): {train_timestep} AND Y (output): {pred_timestep} ======")

    # Pivot time series
    X_train = df_train.pivot(index='file_id', columns='timestep', values='CO2').values[:, :train_timestep]
    Y_train = df_train.pivot(index='file_id', columns='timestep', values='CO2').values[:, train_timestep:]
    X_test = df_test.pivot(index='file_id', columns='timestep', values='CO2').values[:, :train_timestep]
    Y_test = df_test.pivot(index='file_id', columns='timestep', values='CO2').values[:, train_timestep:]

    static_train = merged_df[merged_df['file_id'].isin(train_ids)].drop(columns=['file_id', 'cluster']).values
    static_test = merged_df[merged_df['file_id'].isin(test_ids)].drop(columns=['file_id', 'cluster']).values

    # Tensors
    X_train_tensor = torch.tensor(X_train[:, :, None], dtype=torch.float32)
    Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)
    static_train_tensor = torch.tensor(static_train, dtype=torch.float32)

    X_test_tensor = torch.tensor(X_test[:, :, None], dtype=torch.float32)
    Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)
    static_test_tensor = torch.tensor(static_test, dtype=torch.float32)

    print(f"Train X shape: {X_train_tensor.shape}, Train Y shape: {Y_train_tensor.shape}, Static: {static_train_tensor.shape}")
    print(f"Test  X shape: {X_test_tensor.shape}, Test  Y shape: {Y_test_tensor.shape}, Static: {static_test_tensor.shape}")

    # Dataset and loaders
    full_train_dataset = TensorDataset(X_train_tensor, static_train_tensor, Y_train_tensor)
    val_size = int(0.1 * len(full_train_dataset))
    train_size = len(full_train_dataset) - val_size
    train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))
    test_dataset = TensorDataset(X_test_tensor, static_test_tensor, Y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # ------------------ Model and Config ------------------
    lstm_config = {
        "static_dim": static_train.shape[1],
        "hidden_dim": 128,
        "num_layers": 2,
        "dropout": 0.0,
        "lr": 0.001,
        "batch_size": 64,
        "epochs": 500
    }

    model = LSTM_MIMO(
        input_len=train_timestep,
        output_len=pred_timestep,
        static_dim=lstm_config["static_dim"],
        hidden_dim=lstm_config["hidden_dim"],
        num_layers=lstm_config["num_layers"]
    )

    optimizer = optim.Adam(model.parameters(), lr=lstm_config["lr"])
    criterion = nn.MSELoss()

    # ------------------ Training ------------------
    best_val_loss = float('inf')
    best_model_path = f"drive/My Drive/DSSM-Models/LSTM_best_{split_name}.pt"
    print("Training started...")

    for epoch in range(1, lstm_config["epochs"] + 1):
        model.train()
        for X_batch, static_batch, Y_batch in train_loader:
            optimizer.zero_grad()
            preds = model(X_batch, static_batch)
            loss = criterion(preds, Y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        val_samples = 0
        with torch.no_grad():
            for X_batch, static_batch, Y_batch in val_loader:
                preds = model(X_batch, static_batch)
                batch_loss = criterion(preds, Y_batch).item()
                val_loss += batch_loss * X_batch.size(0)
                val_samples += X_batch.size(0)
        avg_val_loss = val_loss / val_samples

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), best_model_path)

        print(f"[Epoch {epoch}] Training Loss: {loss.item():.6f} | Val Loss: {avg_val_loss:.6f}")

    # ------------------ Evaluation ------------------
    model.load_state_dict(torch.load(best_model_path))
    model.eval()
    total_mse = 0.0
    total_samples = 0
    with torch.no_grad():
        for X_batch, static_batch, Y_batch in test_loader:
            outputs = model(X_batch, static_batch)
            batch_mse = criterion(outputs, Y_batch).item()
            total_mse += batch_mse * X_batch.size(0)
            total_samples += X_batch.size(0)

    avg_mse = total_mse / total_samples
    print(f"Final Test MSE for LSTM ({split_name}): {avg_mse:.6f}")
    LSTM_mse.append({'Split': split_name, 'Test_MSE': avg_mse})

# Save result as DataFrame
LSTM_mse = pd.DataFrame(LSTM_mse)


Train X shape: torch.Size([2162, 80, 1]), Train Y shape: torch.Size([2162, 21]), Static: torch.Size([2162, 14])
Test  X shape: torch.Size([541, 80, 1]), Test  Y shape: torch.Size([541, 21]), Static: torch.Size([541, 14])
Training started...
[Epoch 1] Training Loss: 0.208605 | Val Loss: 0.310658
[Epoch 2] Training Loss: 0.161890 | Val Loss: 0.187543
[Epoch 3] Training Loss: 0.212877 | Val Loss: 0.181633
[Epoch 4] Training Loss: 0.116815 | Val Loss: 0.185046
[Epoch 5] Training Loss: 0.114678 | Val Loss: 0.182446
[Epoch 6] Training Loss: 0.131637 | Val Loss: 0.183438
[Epoch 7] Training Loss: 0.138782 | Val Loss: 0.191557
[Epoch 8] Training Loss: 0.154229 | Val Loss: 0.187848
[Epoch 9] Training Loss: 0.168869 | Val Loss: 0.184001
[Epoch 10] Training Loss: 0.130706 | Val Loss: 0.184492
[Epoch 11] Training Loss: 0.128849 | Val Loss: 0.192801
[Epoch 12] Training Loss: 0.175763 | Val Loss: 0.193712
