In [None]:
# Mount Google Drive at /content/drive. Later cells read the input CSV from
# this mount and copy result files back onto it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install conda-in-colab (restarts runtime once)
!pip install -q condacolab
import condacolab; condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
# Create a dedicated Python 3.9 conda environment named "mx19".
# Every later command targets it through `conda run -n mx19 ...`.
!conda create -y -n mx19 python=3.9

# Upgrade basic packaging tooling inside the env before installing anything else
!conda run -n mx19 python -m pip install --upgrade pip setuptools wheel

# Install CUDA runtime libraries inside the env (to satisfy mxnet-cu112)
!conda run -n mx19 conda install -y -c conda-forge cudatoolkit=11.2 cudnn=8.1

# Install pinned, compatible Python packages (NUMPY 1.23.5 is key).
# NOTE(review): presumably numpy is pinned because mxnet 1.9.1 breaks on newer
# NumPy releases - confirm before bumping any of these versions.
!conda run -n mx19 python -m pip install \
  "numpy==1.23.5" "pandas==1.5.3" "matplotlib==3.7.1" "ujson==5.7.0" \
  "mxnet-cu112==1.9.1" "gluonts==0.13.3" "pyarrow==10.0.1"

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - done
Solving environment: | / - done


    current version: 24.11.2
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /usr/local/envs/mx19

  added / updated specs:
    - python=3.9


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2025.8.3   |       hbd8a1cb_0         151 KB  conda-forge
    ld_impl_linux-64-2.44      |       h1423503_1         660 KB  conda-forge
    libexpat-2.7.1             |       hecca717_0          73 KB  conda-forge
    libffi-3.4.6               |       h2dba641_1          56 KB  conda-forge
    libgcc-15.1.0              |       h767d61c_4         805 KB  conda-forge
    libgcc-ng-15.1.0           |       h69a702a_4          29 

In [None]:
# Show the GPU and driver visible to the Colab runtime.
!nvidia-smi
# Import the pinned stack inside the mx19 env and print each version, plus the
# number of GPUs MXNet can see (0 means training will fall back to CPU).
!conda run -n mx19 python -c "import mxnet as mx, numpy, gluonts, pandas, sys; \
print('numpy', numpy.__version__); \
print('pandas', pandas.__version__); \
print('mxnet', mx.__version__); \
print('gluonts', gluonts.__version__); \
print('python', sys.version); \
print('num_gpus:', mx.context.num_gpus())"

Wed Aug 27 21:08:02 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile prep.py
# Data Prep (robust, NO clusters, no google.colab import)
import os
import pandas as pd

# Location of the input CSV, relative to /content (run_prep joins the two).
CSV_PATH = "drive/My Drive/C02 project/correlation_wide.csv"  # adjust if needed

def run_prep() -> None:
    """Turn the raw CSV into the two tables consumed by the DeepState script.

    Steps, in order:
      1. Check that Drive is mounted and that /content/<CSV_PATH> exists.
      2. Load the CSV and require the 'file_id' and 'Total_CO2_capture' columns.
      3. Make sure a 'timestep' column exists (rename an alias, or derive one).
      4. Keep only rows with timestep < 101.
      5. Drop file_ids whose static parameter values duplicate another file_id's.
      6. Write `merged_df` (one row of statics per file_id) and `df_output`
         (file_id, timestep, CO2) as Parquet and CSV, using relative paths,
         i.e. into the current working directory.
      7. Print a short summary of both tables.

    Raises:
        AssertionError: if /content/drive does not exist or a required column
            is missing.
        FileNotFoundError: if the CSV is not found at /content/<CSV_PATH>.
    """
    assert os.path.exists("/content/drive"), "Mount Drive first in a host cell: from google.colab import drive; drive.mount('/content/drive')"
    full_path = os.path.join("/content", CSV_PATH)
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Could not find CSV at: /content/{CSV_PATH}")

    df = pd.read_csv(full_path)
    print("Loaded:", df.shape, "Columns:", list(df.columns))

    # Required columns: the series identifier and the target variable.
    assert "file_id" in df.columns, "CSV must contain 'file_id'"
    assert "Total_CO2_capture" in df.columns, "CSV must contain 'Total_CO2_capture'"

    # Ensure 'timestep' exists: the first alias found (in this order) is renamed
    # to 'timestep'. The `else` branch of the loop runs only when none of the
    # aliases is present (the loop finished without `break`).
    alt = ["timestep","time_idx","step","TimeStep","time"]
    for name in alt:
        if name in df.columns:
            if name != "timestep":
                df = df.rename(columns={name:"timestep"})
            break
    else:
        # No time column at all: derive a 0-based row counter per file_id,
        # ordering rows by 'year' within each file when that column exists.
        sort_cols = ["file_id","year"] if "year" in df.columns else ["file_id"]
        df = df.sort_values(sort_cols).copy()
        df["timestep"] = df.groupby("file_id").cumcount()

    # Keep only timesteps 0..100 (at most 101 steps per series).
    df = (df.sort_values(["file_id","timestep"])
            .query("timestep < 101")
            .reset_index(drop=True))

    # Candidate static (per-series) feature columns; only those actually
    # present in the CSV are used.
    static_cols = [
        'MikeSorghum','Quartz','Plagioclase','Apatite','Ilmenite',
        'Diopside_Mn','Diopside','Olivine','Alkali-feldspar',
        'Montmorillonite','Glass','temp','shift','year'
    ]
    present = [c for c in static_cols if c in df.columns]

    # Dedupe to unique static parameter sets: take one static row per file_id,
    # drop rows whose static values repeat, and keep only the surviving file_ids.
    static_rows = df.groupby("file_id")[present].first().reset_index()
    unique_static_rows = static_rows.drop_duplicates(subset=present)
    keep_ids = set(unique_static_rows["file_id"].tolist())
    df = df[df["file_id"].isin(keep_ids)].copy()

    # merged_df: one row per file_id holding only the static columns.
    merged_df = (df[["file_id"] + present]
                   .drop_duplicates(subset=["file_id"])
                   .reset_index(drop=True))
    # Force these three columns to numeric; unparseable or missing values become 0.0.
    for c in ["temp","shift","year"]:
        if c in merged_df.columns:
            merged_df[c] = pd.to_numeric(merged_df[c], errors="coerce").fillna(0.0)

    # df_output: long-format target table (file_id, timestep, CO2).
    df_output = (df[["file_id","timestep","Total_CO2_capture"]]
                   .rename(columns={"Total_CO2_capture":"CO2"})
                   .sort_values(["file_id","timestep"])
                   .reset_index(drop=True))

    # Save each table twice: as Parquet and as CSV.
    merged_df.to_parquet("merged_df.parquet", index=False)
    df_output.to_parquet("df_output.parquet", index=False)
    merged_df.to_csv("merged_df.csv", index=False)
    df_output.to_csv("df_output.csv", index=False)

    # Summary: table shapes plus the distribution of the last timestep per
    # series (a quick check that every series has the same length).
    print("\nSummary:")
    print("merged_df:", merged_df.shape, "(statics only)")
    print("df_output:", df_output.shape, "(long CO2)")
    print("Per-series max timestep:")
    print(df_output.groupby("file_id")["timestep"].max().describe())

# Run the prep only when executed as a script, not when imported.
if __name__ == "__main__":
    run_prep()

Writing prep.py


In [None]:
%%writefile deepstate_run.py
import os, random
import pandas as pd
from typing import List
from gluonts.dataset.common import ListDataset
from gluonts.mx.trainer import Trainer
from gluonts.mx.model.deepstate import DeepStateEstimator
from gluonts.evaluation import Evaluator, make_evaluation_predictions
import mxnet as mx

# Run configuration shared by the functions in this script.
CFG = dict(
    freq="D",                   # frequency string for the datasets, the start Period and the estimator
    seed=42,                    # seed for set_seeds() and for the id shuffle in split_ids_py()
    learning_rate=1e-3,         # passed to the GluonTS Trainer
    num_layers=2,               # passed to DeepStateEstimator
    num_cells=40,               # passed to DeepStateEstimator
    dropout_rate=0.1,           # passed to DeepStateEstimator
    epoch_grid=[200],           # candidate epoch counts; the best one is chosen by validation MSE
    num_batches_per_epoch=100,  # passed to the GluonTS Trainer
    val_frac=0.10,              # share of the non-test ids held out for validation
    test_frac=0.20,             # share of all ids held out for testing
)

def set_seeds(seed=42):
    """Seed the random number generators this script relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and MXNet's RNG.
    NumPy was previously left unseeded, so any NumPy-based randomness in the
    pipeline differed from run to run.
    NOTE(review): GluonTS presumably draws its training windows from NumPy's
    global RNG - confirm against the installed version.

    Seeding NumPy and MXNet is best effort: a failure is reported on stdout
    instead of aborting the run (and instead of being silently ignored).

    Args:
        seed: Seed value applied to every generator.
    """
    # Local import: numpy is not among this script's module-level imports.
    import numpy as np

    random.seed(seed)

    # The lambdas defer attribute lookups so that a missing or broken backend
    # is caught by the try block below rather than failing here.
    backends = (
        ("numpy", lambda: np.random.seed(seed)),
        ("mxnet", lambda: mx.random.seed(seed)),
    )
    for backend_name, apply_seed in backends:
        try:
            apply_seed()
        except Exception as exc:
            print(f"WARNING: could not seed {backend_name} RNG: {exc!r}", flush=True)

# Seed once at import time, before any id splitting or training is triggered.
set_seeds(CFG["seed"])

def split_ids_py(file_ids: List[int], test_frac=0.20, val_frac=0.10, seed=42):
    """Shuffle the ids reproducibly and split them into train / val / test.

    The test set takes ``round(test_frac * n)`` ids from the front of the
    shuffled list. The validation set then takes ``round(val_frac * m)`` of
    the ``m`` remaining ids; everything left over is the training set.

    Args:
        file_ids: Series identifiers to split (the input is not modified).
        test_frac: Fraction of all ids assigned to the test set.
        val_frac: Fraction of the non-test ids assigned to validation.
        seed: Seed of the private ``random.Random`` used for shuffling.

    Returns:
        Tuple ``(train_ids, val_ids, test_ids)``.
    """
    shuffled = list(file_ids)
    random.Random(seed).shuffle(shuffled)

    test_count = round(test_frac * len(shuffled))
    test_ids, remainder = shuffled[:test_count], shuffled[test_count:]

    val_count = round(val_frac * len(remainder))
    return remainder[val_count:], remainder[:val_count], test_ids

def pivot_series(df_output: pd.DataFrame, ids: List[int]):
    """Return the CO2 series of the requested ids as plain Python lists.

    The long table is filtered to ``ids`` and pivoted to one row per file_id
    and one column per timestep, with rows sorted by file_id. Gaps inside a
    series are filled along the time axis (forward first, then backward), so
    no series is dropped because of missing values.

    Args:
        df_output: Long table with 'file_id', 'timestep' and 'CO2' columns.
        ids: file_ids to include.

    Returns:
        Tuple ``(mat, kept_ids)``: ``mat[i]`` is the value list of the series
        whose id is ``kept_ids[i]``.
    """
    selected = df_output.loc[df_output["file_id"].isin(ids)]
    wide = selected.pivot(index="file_id", columns="timestep", values="CO2").sort_index()
    # Fill along each row (axis=1 is the time axis after the pivot).
    wide = wide.ffill(axis=1).bfill(axis=1)
    return wide.to_numpy().tolist(), wide.index.tolist()

def get_statics_real(merged_df: pd.DataFrame, kept_ids: List[int]):
    """Look up the static feature row of every id, in the order given.

    Args:
        merged_df: Table with a 'file_id' column plus static feature columns.
        kept_ids: file_ids to fetch; the output rows follow this order.

    Returns:
        One list of feature values per id. A 'cluster' column, if present, is
        excluded, and missing values are replaced by 0.0.
    """
    by_id = merged_df.set_index("file_id")
    selected = by_id.loc[kept_ids]
    features = selected.drop(columns=["cluster"], errors="ignore").fillna(0.0)
    return features.to_numpy().tolist()

def build_listdataset_dynreal_from_statics_py(
    target_mat,          # list of sequences; each is length T
    static_real_mat,     # list of [F_static] floats
    start_period,
    freq: str
) -> ListDataset:
    """Build a ListDataset whose dynamic real features are the statics, tiled over time.

    Each static value of a series is repeated once per target step, so the
    'feat_dynamic_real' entry has one row per static feature and the same
    length as the target.

    Args:
        target_mat: One target sequence per series.
        static_real_mat: One list of static feature values per series, paired
            with ``target_mat`` by position.
        start_period: Start timestamp shared by every series.
        freq: Frequency string passed to ``ListDataset``.

    Returns:
        A ``ListDataset`` with 'target', 'start' and 'feat_dynamic_real' fields.
    """
    def _entry(series, statics):
        # Dynamic features must be exactly as long as the target they accompany.
        horizon = len(series)
        return {
            "target": [float(v) for v in series],
            "start": start_period,
            "feat_dynamic_real": [[float(value)] * horizon for value in statics],
        }

    entries = [_entry(y, s) for y, s in zip(target_mat, static_real_mat)]
    return ListDataset(entries, freq=freq)

def fit_deepstate(train_ds, X_len, Y_len, epochs):
    """Train a DeepState model on ``train_ds`` and return the fitted predictor.

    Args:
        train_ds: Training dataset carrying 'feat_dynamic_real'.
        X_len: Context length, used as the estimator's ``past_length``.
        Y_len: Forecast horizon, used as the estimator's ``prediction_length``.
        epochs: Number of training epochs.

    Returns:
        The predictor produced by ``DeepStateEstimator.train``.
    """
    # Use a GPU when MXNet can see one, otherwise fall back to the CPU.
    if mx.context.num_gpus() > 0:
        device = mx.gpu()
    else:
        device = mx.cpu()

    trainer = Trainer(
        ctx=device,
        epochs=epochs,
        learning_rate=CFG["learning_rate"],
        num_batches_per_epoch=CFG["num_batches_per_epoch"],
    )

    estimator = DeepStateEstimator(
        freq=CFG["freq"],
        prediction_length=Y_len,
        past_length=X_len,
        # Static categorical features are unused, but GluonTS 0.13 still
        # requires a cardinality argument.
        use_feat_static_cat=False,
        cardinality=[1],
        use_feat_dynamic_real=True,
        add_trend=False,
        num_layers=CFG["num_layers"],
        num_cells=CFG["num_cells"],
        dropout_rate=CFG["dropout_rate"],
        trainer=trainer,
    )
    return estimator.train(train_ds)

def mse_on_dataset(predictor, dataset):
    """Return the aggregate MSE of ``predictor`` on ``dataset``.

    Forecasts are drawn with 100 sample paths per series and scored with a
    GluonTS ``Evaluator``; only the aggregate "MSE" entry is returned.
    """
    forecast_iter, series_iter = make_evaluation_predictions(
        dataset=dataset,
        predictor=predictor,
        num_samples=100,
    )
    forecasts = list(forecast_iter)
    actuals = list(series_iter)

    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
    aggregate_metrics, _per_series = evaluator(actuals, forecasts)
    return float(aggregate_metrics["MSE"])

def run_deepstate_split_with_val(df_output, merged_df, train_ids, val_ids, test_ids, X_len, Y_len):
    """Train and evaluate DeepState for one (context, horizon) split.

    Training series are cut to their first X_len steps. Validation and test
    series keep X_len + Y_len steps so the last Y_len can be scored. One model
    is trained per entry of CFG["epoch_grid"]; the one with the lowest
    validation MSE is then scored on the test set.

    Args:
        df_output: Long table with 'file_id', 'timestep' and 'CO2' columns.
        merged_df: Static features, one row per file_id.
        train_ids, val_ids, test_ids: Disjoint file_id lists.
        X_len: Number of context steps.
        Y_len: Number of forecast steps.

    Returns:
        Dict with keys "best_epochs", "val_mse" and "test_mse".

    Raises:
        ValueError: if CFG["epoch_grid"] is empty, since no model could be selected.
    """
    import math  # local import: only needed for the NaN check below

    if not CFG["epoch_grid"]:
        raise ValueError("CFG['epoch_grid'] must contain at least one epoch count")

    use_T = X_len + Y_len
    start = pd.Period("2000-01-01", freq=CFG["freq"])

    def _make_dataset(ids, drop_horizon):
        """Build the dataset for one id subset, optionally cutting off the last Y_len steps."""
        series, kept = pivot_series(df_output, ids)
        series = [y[:use_T] for y in series]
        if drop_horizon:
            series = [y[:-Y_len] for y in series]
        return build_listdataset_dynreal_from_statics_py(
            target_mat=series,
            static_real_mat=get_statics_real(merged_df, kept),
            start_period=start, freq=CFG["freq"]
        )

    # Train: X; Val/Test: X+Y (for scoring last Y)
    train_ds = _make_dataset(train_ids, drop_horizon=True)
    val_ds = _make_dataset(val_ids, drop_horizon=False)
    test_ds = _make_dataset(test_ids, drop_horizon=False)

    print(f"  train series: {len(list(train_ds))}, val: {len(list(val_ds))}, test: {len(list(test_ds))}", flush=True)
    print(f"  X_len={X_len}, Y_len={Y_len}", flush=True)

    best_val, best_ep, best_pred = float("inf"), None, None
    for ep in CFG["epoch_grid"]:
        print(f"  -> training epochs={ep}", flush=True)
        pred = fit_deepstate(train_ds, X_len=X_len, Y_len=Y_len, epochs=ep)
        val_mse = mse_on_dataset(pred, val_ds)
        print(f"     VAL MSE @epochs={ep}: {val_mse:.6f}", flush=True)

        # A NaN or inf validation MSE never compares "less than", which used to
        # leave best_pred as None and crash the test evaluation. Always keep the
        # first candidate, and let a real number replace a NaN incumbent.
        replaces_nan = math.isnan(best_val) and not math.isnan(val_mse)
        if best_pred is None or replaces_nan or val_mse < best_val:
            best_val, best_ep, best_pred = val_mse, ep, pred

    test_mse = mse_on_dataset(best_pred, test_ds)
    print(f"  ** Best epochs={best_ep} | VAL MSE={best_val:.6f} | TEST MSE={test_mse:.6f}", flush=True)
    return {"best_epochs": best_ep, "val_mse": best_val, "test_mse": test_mse}

def main():
    """Run the DeepState experiment for every context/horizon split.

    Loads the prepared tables (Parquet when both files exist, otherwise CSV),
    splits the file_ids once, evaluates each (X_len, Y_len) pair and writes
    the collected metrics to DeepState_RESULTS1.csv.
    """
    # Parquet is preferred (faster, dtypes preserved); CSV is the fallback.
    parquet_files = ("merged_df.parquet", "df_output.parquet")
    if all(os.path.exists(path) for path in parquet_files):
        merged_df = pd.read_parquet(parquet_files[0])
        df_output = pd.read_parquet(parquet_files[1])
    else:
        merged_df = pd.read_csv("merged_df.csv")
        df_output = pd.read_csv("df_output.csv")

    # One split of the ids is shared by all context/horizon combinations.
    train_ids, val_ids, test_ids = split_ids_py(
        df_output["file_id"].unique().tolist(),
        test_frac=CFG["test_frac"],
        val_frac=CFG["val_frac"],
        seed=CFG["seed"],
    )

    rows = []
    for X_len, Y_len in [(80,21), (60,41), (40,61), (20,81), (10,91), (5,96)]:
        print(f"\n==== DeepState Split X={X_len} | Y={Y_len} ====", flush=True)
        metrics = run_deepstate_split_with_val(
            df_output, merged_df, train_ids, val_ids, test_ids, X_len, Y_len
        )
        rows.append({"split": f"{X_len}_{Y_len}", **metrics})

    out = pd.DataFrame(rows)
    print("\nDeepState Results:")
    print(out)
    out.to_csv("DeepState_RESULTS1.csv", index=False)


# Run the experiment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Writing deepstate_run.py


In [None]:
#!conda run -n mx19 python prep.py
#!conda run -n mx19 python -u deepstate_run.py

In [None]:
# 1) Run data prep inside the conda env.
# prep.py writes merged_df / df_output (Parquet + CSV) into the current working
# directory, which is where deepstate_run.py looks for them.
!conda run -n mx19 python -u prep.py


Loaded: (1192157, 17) Columns: ['Unnamed: 0', 'Total_CO2_capture', 'temp', 'shift', 'year', 'file_id', 'MikeSorghum', 'Quartz', 'Plagioclase', 'Apatite', 'Ilmenite', 'Diopside_Mn', 'Diopside', 'Olivine', 'Alkali-feldspar', 'Montmorillonite', 'Glass']

Summary:
merged_df: (2703, 15) (statics only)
df_output: (273003, 3) (long CO2)
Per-series max timestep:
count    2703.0
mean      100.0
std         0.0
min       100.0
25%       100.0
50%       100.0
75%       100.0
max       100.0
Name: timestep, dtype: float64



In [None]:
# 2) Run DeepState (stream logs live), then copy results to Drive

# Run with minimal buffering so you see prints as they happen.
!stdbuf -oL -eL conda run --no-capture-output -n mx19 python -u deepstate_run.py 2>&1 | tee deepstate_run.log

# Make destination folder in Drive
!mkdir -p "/content/drive/My Drive/C02 project/results"

# Copy outputs to Drive
!cp -f /content/DeepState_RESULTS.csv "/content/drive/My Drive/C02 project/results/DeepState_RESULTS.csv"
!cp -f /content/deepstate_run.log "/content/drive/My Drive/C02 project/results/deepstate_run.log"

# (optional) also back up the prepared data artifacts
!cp -f /content/merged_df.parquet "/content/drive/My Drive/C02 project/results/merged_df.parquet"
!cp -f /content/df_output.parquet "/content/drive/My Drive/C02 project/results/df_output.parquet"

# Show what landed in Drive
!ls -lh "/content/drive/My Drive/C02 project/results"


==== DeepState Split X=80 | Y=21 ====
  train series: 1946, val: 216, test: 541
  X_len=80, Y_len=21
  -> training epochs=200
  0%|          | 0/100 [00:00<?, ?it/s][21:08:23] ../src/imperative/./cached_op.h:254: Disabling fusion due to altered topological order of inputs.
100%|██████████| 100/100 [00:30<00:00,  3.23it/s, epoch=1/200, avg_epoch_loss=-1.3]
100%|██████████| 100/100 [00:28<00:00,  3.45it/s, epoch=2/200, avg_epoch_loss=-1.93]
100%|██████████| 100/100 [00:29<00:00,  3.43it/s, epoch=3/200, avg_epoch_loss=-2.24]
100%|██████████| 100/100 [00:29<00:00,  3.42it/s, epoch=4/200, avg_epoch_loss=-2.32]
100%|██████████| 100/100 [00:29<00:00,  3.41it/s, epoch=5/200, avg_epoch_loss=-2.44]
100%|██████████| 100/100 [00:28<00:00,  3.45it/s, epoch=6/200, avg_epoch_loss=-2.53]
100%|██████████| 100/100 [00:29<00:00,  3.45it/s, epoch=7/200, avg_epoch_loss=-2.61]
100%|██████████| 100/100 [00:29<00:00,  3.43it/s, epoch=8/200, avg_epoch_loss=-2.57]
100%|██████████| 100/100 [00:29<00:00,  3.45it