# DATA PREPARATION

In [None]:
# --------------------- Imports ---------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from google.colab import drive

# --------------------- Matplotlib Setup ---------------------
# Global plotting defaults, applied to matplotlib's rcParams in one update.
_font_sizes = {
    'font.size': 14,
    'axes.titlesize': 15,
    'axes.labelsize': 12,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
    'legend.fontsize': 11,
}
_figure_output = {
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'figure.autolayout': True,
}
mpl.rcParams.update({**_font_sizes, **_figure_output})

# --------------------- Load Data ---------------------
# Mount Google Drive and read the dataset CSV into `total_capture_7k`.
print("Mounting Google Drive and loading dataset...")
drive.mount('/content/drive')
# Absolute path anchored at the mount point used above, so the read does not
# depend on the notebook's current working directory.
total_capture_7k = pd.read_csv('/content/drive/My Drive/correlation_wide.csv')
print(f"Loaded dataset with shape: {total_capture_7k.shape}")

# --------------------- Identify Unique Static Parameter Sets ---------------------
# Columns read once per file_id and used together as the de-duplication key.
static_cols = [
    'MikeSorghum',
    'Quartz',
    'Plagioclase',
    'Apatite',
    'Ilmenite',
    'Diopside_Mn',
    'Diopside',
    'Olivine',
    'Alkali-feldspar',
    'Montmorillonite',
    'Glass',
    'temp',
    'shift',
    'year',
]

# Row count per file_id, exposed as the column "num_timesteps".
file_lengths = (
    total_capture_7k
    .groupby('file_id')
    .size()
    .rename("num_timesteps")
    .reset_index()
)

# One row per file_id: the group's first value of every static column,
# joined with that file's row count.
static_rows = (
    total_capture_7k
    .groupby('file_id')[static_cols]
    .first()
    .reset_index()
    .merge(file_lengths, on='file_id')
)

# Keep only the first file_id seen for each distinct combination of static values.
unique_static_rows = static_rows[~static_rows.duplicated(subset=static_cols)]
unique_file_ids = unique_static_rows['file_id'].tolist()

# --------------------- Extract Time Series Data ---------------------
# Rows of the retained file_ids, limited to the first 101 rows of each file_id,
# with a fresh 0..N-1 index.
filtered_df = (
    total_capture_7k.loc[total_capture_7k['file_id'].isin(unique_file_ids)]
    .copy()
    .groupby('file_id')
    .head(101)
    .reset_index(drop=True)
)

# --------------------- Static Feature Table ---------------------
# One row per file_id holding the group's first value of each static column.
first_of_each = dict.fromkeys(static_cols, 'first')
Input_Link_Table = filtered_df.groupby('file_id').agg(first_of_each).reset_index()
print(f"Static feature table created: Input_Link_Table.shape = {Input_Link_Table.shape}")

# --------------------- Time Series Structuring ---------------------
# Build a dense (num_file_ids, max_timesteps) matrix of Total_CO2_capture,
# one row per file_id; series shorter than max_timesteps stay zero-padded.
result = filtered_df[['Total_CO2_capture', 'year', 'file_id']]
file_ids = result['file_id'].unique()
num_file_ids = len(file_ids)
max_timesteps = 101
relevant_data = np.zeros((num_file_ids, max_timesteps))
# Row i of relevant_data belongs to file_id_order[i]. The array is float
# because it comes from np.zeros; consumers cast it back with .astype(int).
file_id_order = np.zeros(num_file_ids)

# A single grouped pass replaces the per-file boolean scan of the whole frame.
# sort=False yields groups in order of first appearance, which is the same
# order as result['file_id'].unique() above.
grouped_capture = result.groupby('file_id', sort=False)['Total_CO2_capture']
for i, (file_id, capture_series) in enumerate(grouped_capture):
    file_data = capture_series.values
    relevant_data[i, :len(file_data)] = file_data
    file_id_order[i] = file_id
print(f"Time series matrix constructed: relevant_data.shape = {relevant_data.shape}")

# --------------------- Clustering ---------------------
# Standardise every timestep column, then assign each series to one of 8 clusters.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(relevant_data)
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(normalized_data)
print("Performed KMeans clustering into 8 clusters")

# Compute boundary stats
# Per cluster: column-wise min / median / mean / max of the standardised
# members, mapped back to the original scale with the fitted scaler.
cluster_boundaries = []
for cluster_id in range(kmeans.n_clusters):
    cluster_data = normalized_data[clusters == cluster_id]
    stacked_stats = np.vstack([
        cluster_data.min(axis=0),
        np.median(cluster_data, axis=0),
        cluster_data.mean(axis=0),
        cluster_data.max(axis=0),
    ])
    # One inverse_transform call handles all four statistic rows at once.
    min_v, median_v, mean_v, max_v = scaler.inverse_transform(stacked_stats)
    cluster_boundaries.append((min_v, median_v, mean_v, max_v))
cluster_boundaries = np.array(cluster_boundaries)
print(f"Cluster boundary stats calculated: cluster_boundaries.shape = {cluster_boundaries.shape}")

# --------------------- Merge Static Features with Clusters ---------------------
# file_id -> cluster label table, ordered by file_id, then joined onto the
# static feature table on 'file_id'.
Clustering_link_table = (
    pd.DataFrame({'file_id': file_id_order.astype(int), 'cluster': clusters})
    .sort_values(by='file_id')
    .reset_index(drop=True)
)
merged_df = Input_Link_Table.merge(Clustering_link_table, on='file_id')
print(f"Final input features (static + cluster): merged_df.shape = {merged_df.shape}")

# --------------------- Create Output Time Series DataFrame ---------------------
# Long format: one [file_id, timestep, CO2] row for every cell of relevant_data.
data = [
    [fid.astype(int), t, co2]
    for fid, series in zip(file_id_order, relevant_data)
    for t, co2 in enumerate(series)
]
df_output = pd.DataFrame(data, columns=['file_id', 'timestep', 'CO2'])
df_output = df_output.sort_values(by=['file_id', 'timestep'])
print(f"Final output time series: df_output.shape = {df_output.shape}")

# --------------------- Summary ---------------------
# Report the shapes of the three artifacts produced by this cell.
print("Data Preparation Summary:")
print(f"Static Input Table: merged_df [{merged_df.shape[0]} rows × {merged_df.shape[1]} columns]")
# Column count is read from the frame (as for merged_df) instead of being hard-coded.
print(f"Time Series Output: df_output [{df_output.shape[0]} rows × {df_output.shape[1]} columns]")
print(f"Cluster Boundaries: cluster_boundaries [{cluster_boundaries.shape}]")


Mounting Google Drive and loading dataset...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataset with shape: (1192157, 17)
Static feature table created: Input_Link_Table.shape = (2703, 15)
Time series matrix constructed: relevant_data.shape = (2703, 101)
Performed KMeans clustering into 8 clusters
Cluster boundary stats calculated: cluster_boundaries.shape = (8, 4, 101)
Final input features (static + cluster): merged_df.shape = (2703, 16)
Final output time series: df_output.shape = (273003, 3)
Data Preparation Summary:
Static Input Table: merged_df [2703 rows × 16 columns]
Time Series Output: df_output [273003 rows × 3 columns]
Cluster Boundaries: cluster_boundaries [(8, 4, 101)]


**NLinear**

In [None]:
class NLinear(nn.Module):
    """Linear forecaster applied to a window re-centred on its last value.

    The last input value is subtracted from the window, a single
    ``nn.Linear(seq_len, pred_len)`` is applied, and the same value is added
    back to the result.

    Args:
        seq_len: number of input timesteps (``in_features`` of the layer).
        pred_len: number of forecast timesteps (``out_features`` of the layer).
        individual: when true, the layer is kept inside a one-element
            ``nn.ModuleList`` instead of being stored directly.
    """

    def __init__(self, seq_len, pred_len, individual=False):
        super().__init__()
        self.seq_len, self.pred_len, self.individual = seq_len, pred_len, individual

        layer = nn.Linear(self.seq_len, self.pred_len)
        # Univariate case: the "individual" variant holds exactly one layer.
        self.Linear = nn.ModuleList([layer]) if self.individual else layer

    def forward(self, x):
        """Return the forecast with a trailing axis of size 1 appended.

        A 3-D input has its last axis squeezed first; the layer then acts on
        the remaining last axis, which must have length ``seq_len``.
        """
        if x.dim() == 3:
            x = x.squeeze(-1)

        # Detached so no gradient flows through the re-centring offset.
        anchor = x[:, -1:].detach()
        centred = x - anchor

        if self.individual:
            out = centred.new_zeros((centred.size(0), self.pred_len))
            out[:, :] = self.Linear[0](centred)
        else:
            out = self.Linear(centred)

        return (out + anchor).unsqueeze(-1)

**NLinear experiment**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os

# Model Definition
class NLinear(nn.Module):
    """Linear map from an input window to a forecast window.

    ``forward`` subtracts the window's last value, applies one
    ``nn.Linear(seq_len, pred_len)``, and adds that value back.

    Args:
        seq_len: length of the input window.
        pred_len: length of the forecast.
        individual: if true, the single layer is wrapped in an
            ``nn.ModuleList`` and accessed by index.
    """

    def __init__(self, seq_len, pred_len, individual=False):
        super().__init__()
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.individual = individual
        projection = nn.Linear(self.seq_len, self.pred_len)
        self.Linear = nn.ModuleList([projection]) if self.individual else projection

    def forward(self, x):
        """Forecast from ``x`` and return the result with a trailing unit axis."""
        # A 3-D input has its last axis squeezed before the layer is applied.
        window = x.squeeze(-1) if x.dim() == 3 else x
        # The offset is detached, so it takes no part in back-propagation.
        last_value = window[:, -1:].detach()
        shifted = window - last_value
        if not self.individual:
            forecast = self.Linear(shifted)
        else:
            forecast = torch.zeros(
                [shifted.size(0), self.pred_len],
                dtype=shifted.dtype,
                device=shifted.device,
            )
            forecast[:, :] = self.Linear[0](shifted)
        return (forecast + last_value).unsqueeze(-1)

# Hyperparameters and Output Storage
# Settings read by the experiment loop through dictionary keys.
nlinear_config = dict(
    epochs=500,
    batch_size=64,
    learning_rate=0.001,
    individual=False,
    model_name='NLinear',
)

# Empty results table with one column for the split label and one for its test MSE.
NLinear_mse = pd.DataFrame(columns=['Split', 'Test_MSE'])
# (first, second) percentage pairs; the two members of every pair sum to 100.
splits = [(first_pct, 100 - first_pct) for first_pct in (80, 60, 50, 40, 20, 10, 5, 3, 1)]
# Directory that receives the per-split model checkpoints.
drive_path = '/content/drive/MyDrive/DSSM-Figures'
# exist_ok=True makes re-running the cell harmless when the folder already exists.
os.makedirs(drive_path, exist_ok=True)

# Split file_ids
# The series-level train/val/test partition is seeded and does not depend on
# the input/output percentage, so it is computed once rather than per split.
file_ids = df_output['file_id'].unique()
trainval_ids, test_ids = train_test_split(file_ids, test_size=0.2, random_state=42)
train_ids, val_ids = train_test_split(trainval_ids, test_size=0.2, random_state=42)


def extract_X_Y(ids, pct):
    """Return the (input, target) arrays for the given file_ids.

    Args:
        ids: file_id values to select from ``df_output``.
        pct: percentage of each series' timesteps used as model input.

    Returns:
        Tuple ``(X, Y)`` of 2-D arrays with one row per file_id, taken from a
        file_id x timestep pivot of ``df_output``. ``X`` holds the leading
        ``int(pct / 100 * n_timesteps)`` columns and ``Y`` the remaining ones.
    """
    df_subset = df_output[df_output['file_id'].isin(ids)]
    pivoted = df_subset.pivot(index='file_id', columns='timestep', values='CO2').values
    # Series length is read from the data instead of assuming 101 timesteps.
    split_idx = int(pct / 100 * pivoted.shape[1])
    X = pivoted[:, :split_idx]
    Y = pivoted[:, split_idx:]
    return X, Y


# One experiment per (input %, output %) pair: train NLinear on the leading
# part of every series, keep the weights with the lowest validation loss, and
# record the test MSE of those weights in NLinear_mse.
for train_pct, test_pct in splits:
    split_name = f"{train_pct}_{test_pct}"
    checkpoint_path = os.path.join(drive_path, f"best_model_{split_name}.pt")
    print(f"\n==== Running Split: {split_name} ====")

    # The same input percentage is applied to all three subsets.
    X_train, Y_train = extract_X_Y(train_ids, train_pct)
    X_val, Y_val = extract_X_Y(val_ids, train_pct)
    X_test, Y_test = extract_X_Y(test_ids, train_pct)

    # Add a trailing axis of size 1; NLinear squeezes it again in forward().
    X_train_tensor = torch.tensor(X_train[:, :, None], dtype=torch.float32)
    Y_train_tensor = torch.tensor(Y_train[:, :, None], dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val[:, :, None], dtype=torch.float32)
    Y_val_tensor = torch.tensor(Y_val[:, :, None], dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test[:, :, None], dtype=torch.float32)
    Y_test_tensor = torch.tensor(Y_test[:, :, None], dtype=torch.float32)

    print(f"Split {split_name} — X (INPUT): {X_train.shape[1]}, Y (OUTPUT): {Y_train.shape[1]}")
    print(f"Train: {X_train_tensor.shape}, Val: {X_val_tensor.shape}, Test: {X_test_tensor.shape}")

    train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, Y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=nlinear_config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=nlinear_config['batch_size'])
    test_loader = DataLoader(test_dataset, batch_size=nlinear_config['batch_size'])

    model = NLinear(seq_len=X_train.shape[1], pred_len=Y_train.shape[1], individual=nlinear_config['individual'])
    optimizer = optim.Adam(model.parameters(), lr=nlinear_config['learning_rate'])
    criterion = nn.MSELoss()

    best_val_loss = float('inf')
    best_model_state = None

    print("Training started...")
    for epoch in range(nlinear_config['epochs']):
        model.train()
        train_loss_sum = 0.0
        train_samples = 0
        for X_batch, Y_batch in train_loader:
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, Y_batch)
            loss.backward()
            optimizer.step()
            # Accumulate a sample-weighted epoch loss so the log line reports
            # the whole epoch rather than only the final mini-batch.
            train_loss_sum += loss.item() * X_batch.size(0)
            train_samples += X_batch.size(0)
        train_loss = train_loss_sum / train_samples

        model.eval()
        val_loss_sum = 0.0
        val_samples = 0
        with torch.no_grad():
            for X_batch, Y_batch in val_loader:
                preds = model(X_batch)
                # Weight each batch by its size, as the test loop below does,
                # so a short final batch does not count as much as a full one.
                val_loss_sum += criterion(preds, Y_batch).item() * X_batch.size(0)
                val_samples += X_batch.size(0)
        val_loss = val_loss_sum / val_samples

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back the live parameter tensors; clone them so
            # later optimizer steps cannot overwrite the stored best weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            torch.save(best_model_state, checkpoint_path)

    # Restore from the in-memory snapshot: this cannot pick up a stale
    # checkpoint file left behind by an earlier run.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    else:
        print(f"Warning: no epoch improved the validation loss for split {split_name}; "
              "evaluating the final-epoch weights.")
    model.eval()

    # Sample-weighted mean of the per-batch MSE over the whole test set.
    total_mse = 0.0
    total_samples = 0
    with torch.no_grad():
        for X_batch, Y_batch in test_loader:
            outputs = model(X_batch)
            batch_mse = criterion(outputs, Y_batch).item()
            total_mse += batch_mse * X_batch.size(0)
            total_samples += X_batch.size(0)

    avg_mse = total_mse / total_samples
    print(f"Final Test MSE ({split_name}): {avg_mse:.6f}")
    NLinear_mse.loc[len(NLinear_mse)] = [split_name, avg_mse]


==== Running Split: 80_20 ====
Split 80_20 — X (INPUT): 80, Y (OUTPUT): 21
Train: torch.Size([1729, 80, 1]), Val: torch.Size([433, 80, 1]), Test: torch.Size([541, 80, 1])
Training started...
Epoch 10: Train Loss = 0.000252, Val Loss = 0.000861
Epoch 20: Train Loss = 0.000139, Val Loss = 0.000282
Epoch 30: Train Loss = 0.000080, Val Loss = 0.000136
Epoch 40: Train Loss = 0.000057, Val Loss = 0.000084
Epoch 50: Train Loss = 0.000020, Val Loss = 0.000071
Epoch 60: Train Loss = 0.000014, Val Loss = 0.000048
Epoch 70: Train Loss = 0.000070, Val Loss = 0.000042
Epoch 80: Train Loss = 0.000024, Val Loss = 0.000035
Epoch 90: Train Loss = 0.000076, Val Loss = 0.000053
Epoch 100: Train Loss = 0.000030, Val Loss = 0.000031
Epoch 110: Train Loss = 0.000014, Val Loss = 0.000035
Epoch 120: Train Loss = 0.000027, Val Loss = 0.000030
Epoch 130: Train Loss = 0.000064, Val Loss = 0.000019
Epoch 140: Train Loss = 0.000017, Val Loss = 0.000017
Epoch 150: Train Loss = 0.000003, Val Loss = 0.000017
Epoch 1