# DATA PREPARATION

In [None]:
# --------------------- Imports ---------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from google.colab import drive

# --------------------- Matplotlib Setup ---------------------
# Global plotting defaults applied to every figure created after this point.

# Text sizes
mpl.rcParams['font.size'] = 14
mpl.rcParams['axes.titlesize'] = 15
mpl.rcParams['axes.labelsize'] = 12
mpl.rcParams['xtick.labelsize'] = 11
mpl.rcParams['ytick.labelsize'] = 11
mpl.rcParams['legend.fontsize'] = 11

# Resolution, both on screen and when saving
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300

# Let matplotlib adjust subplot spacing automatically
mpl.rcParams['figure.autolayout'] = True

# --------------------- Load Data ---------------------
# Mount Google Drive and read the full dataset into a single DataFrame.
print("Mounting Google Drive and loading dataset...")
drive.mount('/content/drive')
# The path is anchored at the mount point used above so the read does not depend
# on the notebook's current working directory still being /content.
total_capture_7k = pd.read_csv('/content/drive/My Drive/correlation_wide.csv')
print(f"Loaded dataset with shape: {total_capture_7k.shape}")

# --------------------- Identify Unique Static Parameter Sets ---------------------
# Columns whose first value per file_id is used as that file's static parameters.
static_cols = [
    'MikeSorghum', 'Quartz', 'Plagioclase', 'Apatite', 'Ilmenite',
    'Diopside_Mn', 'Diopside', 'Olivine', 'Alkali-feldspar',
    'Montmorillonite', 'Glass', 'temp', 'shift', 'year'
]

# One row per file_id: the first value of each static column plus the row count.
per_file = total_capture_7k.groupby('file_id')
file_lengths = per_file.size().reset_index(name="num_timesteps")
static_rows = per_file[static_cols].first().reset_index()
static_rows = pd.merge(static_rows, file_lengths, on='file_id')

# Keep only the first file_id encountered for each distinct static parameter set.
unique_static_rows = static_rows.drop_duplicates(subset=static_cols)
unique_file_ids = unique_static_rows['file_id'].tolist()

# --------------------- Extract Time Series Data ---------------------
keep_mask = total_capture_7k['file_id'].isin(unique_file_ids)
filtered_df = total_capture_7k[keep_mask].copy()

# Keep at most the first 101 rows of every file_id.
filtered_df = filtered_df.groupby('file_id').head(101).reset_index(drop=True)

# --------------------- Static Feature Table ---------------------
first_of_each = dict.fromkeys(static_cols, 'first')
Input_Link_Table = filtered_df.groupby('file_id').agg(first_of_each).reset_index()
print(f"Static feature table created: Input_Link_Table.shape = {Input_Link_Table.shape}")

# --------------------- Time Series Structuring ---------------------
# Build a dense (num_file_ids x max_timesteps) matrix of Total_CO2_capture values,
# one row per file_id. Files with fewer than max_timesteps rows stay zero-padded
# on the right because the matrix is initialised with zeros.
result = filtered_df[['Total_CO2_capture', 'year', 'file_id']]
file_ids = result['file_id'].unique()
num_file_ids = len(file_ids)
max_timesteps = 101
relevant_data = np.zeros((num_file_ids, max_timesteps))
# Row i of relevant_data belongs to file_id_order[i] (stored as float; callers cast to int).
file_id_order = np.zeros(num_file_ids)

# A single groupby pass replaces one full boolean-mask scan of the frame per file_id.
# sort=False yields groups in order of first appearance, which is the same order
# as result['file_id'].unique(), so row assignment is unchanged.
series_by_file = result.groupby('file_id', sort=False)['Total_CO2_capture']
for i, (file_id, series) in enumerate(series_by_file):
    # Guard slice: never write past the matrix width if a file has extra rows.
    file_data = series.values[:max_timesteps]
    relevant_data[i, :len(file_data)] = file_data
    file_id_order[i] = file_id
print(f"Time series matrix constructed: relevant_data.shape = {relevant_data.shape}")

# --------------------- Clustering ---------------------
# Standardise each timestep column, then group the series into 8 KMeans clusters.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(relevant_data)
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(normalized_data)
print("Performed KMeans clustering into 8 clusters")

# Compute boundary stats
# For every cluster: per-timestep min, median, mean and max of the standardised
# series, mapped back to the original scale. Result axes: (cluster, statistic, timestep).
summary_fns = (np.min, np.median, np.mean, np.max)
cluster_boundaries = []
for cluster_id in range(8):
    cluster_data = normalized_data[clusters == cluster_id]
    stacked_stats = np.stack([fn(cluster_data, axis=0) for fn in summary_fns])
    # One inverse_transform call handles all four statistic rows at once.
    cluster_boundaries.append(scaler.inverse_transform(stacked_stats))
cluster_boundaries = np.array(cluster_boundaries)
print(f"Cluster boundary stats calculated: cluster_boundaries.shape = {cluster_boundaries.shape}")

# --------------------- Merge Static Features with Clusters ---------------------
# Attach each file's cluster label to its static feature row (inner join on file_id).
Clustering_link_table = (
    pd.DataFrame({'file_id': file_id_order.astype(int), 'cluster': clusters})
    .sort_values(by='file_id')
    .reset_index(drop=True)
)
merged_df = pd.merge(Input_Link_Table, Clustering_link_table, on='file_id')
print(f"Final input features (static + cluster): merged_df.shape = {merged_df.shape}")

# --------------------- Create Output Time Series DataFrame ---------------------
# Long-format table with one row per (file_id, timestep). Built column-wise with
# NumPy instead of a Python-level nested loop over every cell; the row order
# (file-major, timestep-minor) matches the row-major layout of relevant_data.
df_output = pd.DataFrame({
    'file_id': np.repeat(file_id_order.astype(int), max_timesteps),
    'timestep': np.tile(np.arange(max_timesteps), len(file_id_order)),
    'CO2': relevant_data.ravel(),
}).sort_values(by=['file_id', 'timestep'])
print(f"Final output time series: df_output.shape = {df_output.shape}")

# --------------------- Summary ---------------------
print("Data Preparation Summary:")
print(f"Static Input Table: merged_df [{merged_df.shape[0]} rows × {merged_df.shape[1]} columns]")
print(f"Time Series Output: df_output [{df_output.shape[0]} rows × {df_output.shape[1]} columns]")
print(f"Cluster Boundaries: cluster_boundaries [{cluster_boundaries.shape}]")

Mounting Google Drive and loading dataset...
Mounted at /content/drive
Loaded dataset with shape: (1192157, 17)
Static feature table created: Input_Link_Table.shape = (2703, 15)
Time series matrix constructed: relevant_data.shape = (2703, 101)
Performed KMeans clustering into 8 clusters
Cluster boundary stats calculated: cluster_boundaries.shape = (8, 4, 101)
Final input features (static + cluster): merged_df.shape = (2703, 16)
Final output time series: df_output.shape = (273003, 3)
Data Preparation Summary:
Static Input Table: merged_df [2703 rows × 16 columns]
Time Series Output: df_output [273003 rows × 3 columns]
Cluster Boundaries: cluster_boundaries [(8, 4, 101)]


# MODELS

**Advanced DSSM model**

In [None]:
class AdvancedDSSMDeepState(nn.Module):
    """Sequence-to-vector model combining static features with a Conv1d + LSTM path.

    The static vector is passed through four fully connected layers down to 64
    features. The time series is passed through a single Conv1d layer, the
    64 static features are repeated at every timestep and concatenated to the
    convolution output, and the result is fed to an LSTM. The LSTM output at
    the last timestep is mapped through two linear layers to ``output_dim`` values.

    Args:
        input_dim: Accepted for interface compatibility; not used by any layer.
        static_dim: Number of static input features.
        hidden_dim: Conv1d output channels and LSTM hidden size.
        output_dim: Number of values produced per sample.
    """

    def __init__(self, input_dim, static_dim, hidden_dim, output_dim):
        super().__init__()

        # Static branch: static_dim -> 512 -> 256 -> 128 -> 64
        self.fc_static1 = nn.Linear(static_dim, 512)
        self.fc_static2 = nn.Linear(512, 256)
        self.fc_static3 = nn.Linear(256, 128)
        self.fc_static4 = nn.Linear(128, 64)

        # Time-series branch: single-channel Conv1d; padding=1 with kernel_size=3
        # keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=hidden_dim, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

        # Latent dynamics: LSTM over the concatenated [conv features, static features].
        self.lstm_state = nn.LSTM(hidden_dim + 64, hidden_dim, batch_first=True)

        # Observation head: last LSTM output -> hidden_dim -> output_dim
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, time_series_input, static_input):
        """Return predictions of shape [batch_size, output_dim].

        Args:
            time_series_input: Tensor of shape [batch_size, seq_len]; a channel
                axis is inserted automatically. Inputs with any other rank are
                passed to the convolution as given.
            static_input: Tensor of shape [batch_size, static_dim].
        """
        # Static branch: ReLU after each of the four linear layers -> [batch_size, 64]
        static_features = static_input
        for layer in (self.fc_static1, self.fc_static2, self.fc_static3, self.fc_static4):
            static_features = self.relu(layer(static_features))

        # Time-series branch
        series = time_series_input
        if series.dim() == 2:
            series = series.unsqueeze(1)  # [batch_size, 1, seq_len]
        conv_features = self.relu(self.conv1(series)).transpose(1, 2)  # [batch_size, seq_len, hidden_dim]

        # Repeat the static features along the time axis and join both branches.
        seq_len = conv_features.size(1)
        static_per_step = static_features.unsqueeze(1).expand(-1, seq_len, -1)  # [batch_size, seq_len, 64]
        lstm_input = torch.cat([conv_features, static_per_step], dim=2)

        # Latent state dynamics
        lstm_out, _ = self.lstm_state(lstm_input)  # [batch_size, seq_len, hidden_dim]

        # Observation model: only the final timestep's output is used.
        last_state = lstm_out[:, -1, :]
        return self.fc2(self.relu(self.fc1(last_state)))

def plot_boundary_cases_with_input(inputs, Boundary_case_actuals, Boundary_case_predicted, model_name, input_length):
    """Save one PDF figure per boundary case showing input, actual and predicted series.

    Rows 0, 1 and 2 of each argument are labelled "Best", "Average" and "Worst"
    respectively. The input segment is drawn over x = 0 .. input_length - 1 and
    the actual/predicted segments continue from x = input_length onwards.

    Args:
        inputs: Indexable of at least three input series, each of length ``input_length``.
        Boundary_case_actuals: 2-D array-like (must expose ``.shape``) of actual
            output series; its second dimension sets the output length.
        Boundary_case_predicted: Indexable of predicted output series, same layout.
        model_name: Used in the figure title and the output file name.
        input_length: Number of timesteps in each input series.
    """
    horizon = Boundary_case_actuals.shape[1]
    input_axis = range(input_length)
    forecast_axis = range(input_length, input_length + horizon)

    for idx, label in enumerate(("Best", "Average", "Worst")):
        plt.figure(figsize=(7.5, 3.2))

        # Observed input segment (X)
        plt.plot(input_axis, inputs[idx], color='black', alpha=0.5, label='Input')

        # Target segment (Y): ground truth against model prediction
        plt.plot(forecast_axis, Boundary_case_actuals[idx], color='blue', alpha=0.8, label='Actual')
        plt.plot(forecast_axis, Boundary_case_predicted[idx], color='red', alpha=0.8, label='Predicted')

        plt.xlabel("Time Steps")
        plt.ylabel("CO₂ Sequestration")
        plt.title(f"{label} Case – {model_name}")
        plt.legend()
        plt.tight_layout(pad=2.5)

        # Write the figure to Drive and release it so figures do not accumulate.
        plt.savefig(
            f"drive/My Drive/DSSM-Figures1/{model_name}_{label}.pdf",
            format='pdf',
            bbox_inches='tight',
        )
        plt.close()

**Advanced-DSSM-MSE-experiment (best model, saved model, hyperparameter listing, training status report)**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
from itertools import product

# Output path
drive_path = '/content/drive/MyDrive/DSSM-Figures'
os.makedirs(drive_path, exist_ok=True)

# Grid search hyperparameters
hidden_dims = [50, 101, 200]
batch_sizes = [32, 64, 128]
learning_rates = [0.001, 0.005, 0.01]
num_epochs = 200

# Result storage: one row per hyperparameter configuration
DSSM_Advanced_mse = pd.DataFrame(columns=['hidden_dim', 'batch_size', 'learning_rate', 'Test_MSE'])

# Data split setup (only one split used here).
# Each tuple is (percent of timesteps used as input, percent used as target);
# only the first value is consumed below.
splits = [(20, 80)]


def extract_X_Y(ids, pct):
    """Split the series of the given file_ids into an input prefix and a target suffix.

    Args:
        ids: file_id values to select from ``df_output``.
        pct: Percentage of the timesteps (from the start) that form the input X.

    Returns:
        Tuple ``(X, Y)`` of 2-D arrays with one row per file_id, rows ordered by
        ascending file_id. X holds the first ``int(pct / 100 * n_timesteps)``
        columns and Y holds the remaining ones.
    """
    df_subset = df_output[df_output['file_id'].isin(ids)]
    # sort_index makes the row order (ascending file_id) explicit.
    pivoted = df_subset.pivot(index='file_id', columns='timestep', values='CO2').sort_index().values
    split_idx = int(pct / 100 * pivoted.shape[1])
    return pivoted[:, :split_idx], pivoted[:, split_idx:]


def extract_static(ids):
    """Return the static feature matrix for the given file_ids, ordered by ascending file_id.

    The explicit sort keeps every static row aligned with the corresponding row
    returned by ``extract_X_Y`` instead of relying on ``merged_df`` already
    being sorted.
    """
    rows = merged_df[merged_df['file_id'].isin(ids)].sort_values(by='file_id')
    return rows.drop(columns=['file_id', 'cluster']).values


def run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over ``loader`` and return the sample-weighted mean loss.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is evaluated without gradients.
    Weighting each batch by its size makes the result independent of the batch
    size and of a smaller final batch.
    """
    training = optimizer is not None
    model.train(training)
    total_loss, n_samples = 0.0, 0
    with torch.set_grad_enabled(training):
        for X_batch, static_batch, Y_batch in loader:
            preds = model(X_batch, static_batch)
            loss = criterion(preds, Y_batch)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item() * X_batch.size(0)
            n_samples += X_batch.size(0)
    return total_loss / n_samples if n_samples else float('nan')


for train_pct, test_pct in splits:
    # 80/20 split into train+val / test by file_id, then 80/20 into train / val.
    file_ids = df_output['file_id'].unique()
    trainval_ids, test_ids = train_test_split(file_ids, test_size=0.2, random_state=42)
    train_ids, val_ids = train_test_split(trainval_ids, test_size=0.2, random_state=42)

    X_train, Y_train = extract_X_Y(train_ids, train_pct)
    X_val, Y_val = extract_X_Y(val_ids, train_pct)
    X_test, Y_test = extract_X_Y(test_ids, train_pct)

    static_train = extract_static(train_ids)
    static_val = extract_static(val_ids)
    static_test = extract_static(test_ids)

    # Convert to tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)
    static_train_tensor = torch.tensor(static_train, dtype=torch.float32)

    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
    Y_val_tensor = torch.tensor(Y_val, dtype=torch.float32)
    static_val_tensor = torch.tensor(static_val, dtype=torch.float32)

    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32)
    static_test_tensor = torch.tensor(static_test, dtype=torch.float32)

    # Grid search loop
    # NOTE(review): Test_MSE is computed for every configuration; picking the
    # final configuration by Test_MSE would make that number optimistic.
    for hidden_dim, batch_size, learning_rate in product(hidden_dims, batch_sizes, learning_rates):
        config_name = f"hd{hidden_dim}_bs{batch_size}_lr{learning_rate}"
        checkpoint_path = os.path.join(drive_path, f"best_model_{config_name}.pt")
        print(f"\n=== Training config: {config_name} ===")

        model = AdvancedDSSMDeepState(
            input_dim=X_train.shape[1],
            static_dim=static_train.shape[1],
            hidden_dim=hidden_dim,
            output_dim=Y_train.shape[1]
        )

        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.MSELoss()

        train_loader = DataLoader(TensorDataset(X_train_tensor, static_train_tensor, Y_train_tensor),
                                  batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(TensorDataset(X_val_tensor, static_val_tensor, Y_val_tensor),
                                batch_size=batch_size)

        best_val_loss = float('inf')
        best_model_state = None

        for epoch in range(num_epochs):
            # Epoch-level averages; the training figure is accumulated while
            # the weights are still being updated within the epoch.
            train_loss = run_epoch(model, train_loader, criterion, optimizer)
            val_loss = run_epoch(model, val_loader, criterion)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # state_dict() hands back the live parameter tensors, so clone
                # them to get a snapshot that later optimizer steps cannot change.
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
                torch.save(best_model_state, checkpoint_path)

            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        # Final test loss, evaluated on the best-validation weights of THIS run.
        # Restoring from memory avoids picking up a stale checkpoint file left
        # by an earlier run when validation never improved (e.g. NaN losses).
        if best_model_state is not None:
            model.load_state_dict(best_model_state)
        else:
            print(f"Warning: validation loss never improved for {config_name}; "
                  "evaluating the final-epoch weights.")
        model.eval()
        with torch.no_grad():
            test_preds = model(X_test_tensor, static_test_tensor)
            test_loss = criterion(test_preds, Y_test_tensor).item()

        DSSM_Advanced_mse.loc[len(DSSM_Advanced_mse)] = [hidden_dim, batch_size, learning_rate, test_loss]
        print(f"Test MSE for config {config_name}: {test_loss:.6f}")

# Save all results
DSSM_Advanced_mse.to_csv(os.path.join(drive_path, "DSSM_Advanced_gridsearch_results.csv"), index=False)
print("\n✅ All results saved to DSSM_Advanced_gridsearch_results.csv")


=== Training config: hd50_bs32_lr0.001 ===
Epoch 10: Train Loss = 0.002272, Val Loss = 0.025849
Epoch 20: Train Loss = 0.006224, Val Loss = 0.018834
Epoch 30: Train Loss = 0.001257, Val Loss = 0.017925
Epoch 40: Train Loss = 0.002396, Val Loss = 0.016167
Epoch 50: Train Loss = 0.004867, Val Loss = 0.014591
Epoch 60: Train Loss = 0.000927, Val Loss = 0.017997
Epoch 70: Train Loss = 0.001794, Val Loss = 0.012556
Epoch 80: Train Loss = 0.120569, Val Loss = 0.031388
Epoch 90: Train Loss = 0.002861, Val Loss = 0.014293
Epoch 100: Train Loss = 0.000354, Val Loss = 0.011990
Epoch 110: Train Loss = 0.000236, Val Loss = 0.006642
Epoch 120: Train Loss = 0.003129, Val Loss = 0.014372
Epoch 130: Train Loss = 0.001541, Val Loss = 0.007804
Epoch 140: Train Loss = 0.003383, Val Loss = 0.008697
Epoch 150: Train Loss = 0.001954, Val Loss = 0.006914
Epoch 160: Train Loss = 0.001556, Val Loss = 0.005949
Epoch 170: Train Loss = 0.001834, Val Loss = 0.006129
Epoch 180: Train Loss = 0.002474, Val Loss = 0.