In [None]:
import os
import random

import numpy as np

# Single seed shared by every random number generator used in this notebook
SEED = 5000

# Python-level sources of randomness
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# PyTorch seeding (TensorFlow is skipped to avoid environment issues).
# Best effort: any failure here (torch missing or broken) is ignored on purpose.
try:
    import torch

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
        torch.cuda.manual_seed_all(SEED)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
except Exception:
    pass


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv
/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv
/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv


In [None]:
# Competition files: labelled training rows, unlabelled test rows, and the submission template
train = pd.read_csv(
    "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv"
)
test = pd.read_csv(
    "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv"
)
sub = pd.read_csv(
    "/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/sample_submission.csv"
)

In [None]:
pip install tabpfn

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, PowerTransformer, RobustScaler, QuantileTransformer
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd


# Load data
# train = pd.read_csv("/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/train.csv")
# test = pd.read_csv("/kaggle/input/eds-232-ocean-chemistry-prediction-for-calcofi/test.csv")

# Drop the stray "Unnamed: 12" column and the row id from the training frame.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# (or a CSV that lacks one of the columns) raises KeyError.
train = train.drop(columns=["Unnamed: 12", "id"], errors="ignore")

# Feature columns are those present in both train and test (the target "DIC" is excluded)
common_columns = train.drop(columns=["DIC"]).columns.intersection(test.columns)

# Build the model inputs from the shared columns; .copy() so later column
# assignments do not write into views of the original frames
X = train[common_columns].copy()
y = train["DIC"]
test = test[common_columns].copy()


# Toggle for feature engineering (set to False to disable)
# === Tuning point: feature engineering ON/OFF (handy for comparing its effect) ===
# True : use the extra features (N_tot, ratios, interactions, sin/cos, etc.)
# False: train on the raw common columns only
FE_ENABLED = False


def _engineer_features(frame):
    """Return a copy of ``frame`` with the deterministic engineered columns appended.

    The same recipe is applied to train and test. Ratios with a zero denominator
    become NaN, and any remaining +/-inf is replaced by NaN, so the imputer that
    runs later fills them.
    """
    out = frame.copy()
    # Totals and ratios
    out['N_tot'] = out['NO3uM'] + out['NO2uM'] + out['NH3uM']
    out['N_to_P'] = np.where(out['PO4uM'] == 0, np.nan, out['NO3uM'] / out['PO4uM'])
    out['Si_to_N'] = np.where(out['NO3uM'] == 0, np.nan, out['SiO3uM'] / out['NO3uM'])
    # Pairwise interactions
    out['Depth_Temp'] = out['R_Depth'] * out['R_TEMP']
    out['Sal_Temp'] = out['R_Sal'] * out['R_TEMP']
    # Trigonometric encodings of latitude / longitude (degrees -> radians)
    for source_col, tag in (('Lat_Dec', 'lat'), ('Lon_Dec', 'lon')):
        angle = np.radians(out[source_col])
        out['sin_' + tag] = np.sin(angle)
        out['cos_' + tag] = np.cos(angle)
    return out.replace([np.inf, -np.inf], np.nan)


if FE_ENABLED:
    X = _engineer_features(X)
    test = _engineer_features(test)

# ===== Tuning point: preprocessing / transform switches =====
# FEATURE_IMPUTER: strategy for filling missing values ('mean' or 'median')
# FEATURE_TRANSFORM: feature transform
#   - 'power'    : Yeo-Johnson + standardization (handles zero/negative values; a good first choice)
#   - 'standard' : standardization only (zero mean, unit variance)
#   - 'robust'   : scaling that is robust to outliers
#   - 'quantile' : quantile transform (output distribution chosen by QUANTILE_OUTPUT below)
#   - 'none'     : no transform
# TARGET_TRANSFORM: target transform ('none'|'log1p'|'standard')
#   - 'log1p'    : dampens scale/outliers; predictions are inverse-transformed with expm1
#   - 'standard' : normalize to zero mean, unit variance (predictions are restored with the stored mean/std)
# After changing any switch, re-run preprocessing -> training -> inference in that order
# Switches for feature/target transforms
FEATURE_IMPUTER = 'mean'  # 'mean' or 'median'
FEATURE_TRANSFORM = 'power'  # 'power'|'standard'|'robust'|'quantile'|'none'
QUANTILE_OUTPUT = 'normal'  # 'normal' or 'uniform'

TARGET_TRANSFORM = 'none'  # 'none'|'log1p'|'standard'

# ===== TabPFN stacking feature (optional) =====
# Build out-of-fold (OOF) predictions from the pretrained TabPFN and add them
# as a single auxiliary feature column
STACK_TABPFN = True  # set to False to disable
TABPFN_N_SPLITS = 5
TABPFN_ENSEMBLE = 16
TABPFN_BINS = 30  # when no Regressor is available, regression is approximated by classifying into bins

def add_tabpfn_stack_feature(X_df, y_series, test_df, n_splits=TABPFN_N_SPLITS):
    """Append an out-of-fold TabPFN prediction column (``tabpfn_pred``) to train and test.

    For each K-fold split a TabPFN model is fitted on the training part of the
    fold (after fold-local imputation) and predicts the held-out rows and the
    whole test set. The held-out predictions form the train-side column; the
    per-fold test predictions are averaged for the test-side column.

    A TabPFN regressor is used when the installed package exports one. If it is
    missing or fails, regression is approximated with a TabPFN classifier over
    quantile bins of the target (expected value of the bin midpoints).

    Args:
        X_df: training features (DataFrame).
        y_series: training target (Series or array-like).
        test_df: test features with the same columns as ``X_df``.
        n_splits: number of K-fold splits.

    Returns:
        ``(X_new, test_new, oof)`` where the frames are copies with the extra
        column and ``oof`` is the out-of-fold prediction array, or
        ``(X_df, test_df, None)`` unchanged when tabpfn cannot be imported.
    """
    try:
        import tabpfn
    except Exception:
        # Try a one-off install if needed (skip stacking when that fails too)
        try:
            import subprocess
            import sys
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'tabpfn'])
            import tabpfn
        except Exception as ie:
            print('TabPFN not available; skipping stacking. Error:', ie)
            return X_df, test_df, None

    # Look the two estimators up independently: importing both names in a single
    # statement would fail (and skip stacking altogether) whenever the package
    # lacks the regressor, which is exactly the case the classifier fallback is for.
    regressor_cls = getattr(tabpfn, 'TabPFNRegressor', None)
    classifier_cls = getattr(tabpfn, 'TabPFNClassifier', None)
    if regressor_cls is None and classifier_cls is None:
        print('TabPFN not available; skipping stacking. Error:',
              'tabpfn exports neither TabPFNRegressor nor TabPFNClassifier')
        return X_df, test_df, None

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def _build(estimator_cls):
        """Instantiate a TabPFN estimator with whichever keyword spelling it accepts."""
        try:
            return estimator_cls(N_ensemble_configurations=TABPFN_ENSEMBLE, seed=SEED, device=device)
        except TypeError:
            # NOTE(review): newer tabpfn releases appear to name these arguments
            # n_estimators / random_state -- confirm against the installed version.
            return estimator_cls(n_estimators=TABPFN_ENSEMBLE, random_state=SEED, device=device)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof = np.zeros(len(X_df), dtype=float)
    test_pred_folds = []
    X_np = X_df.values
    test_np = test_df.values
    y_np = y_series.values if hasattr(y_series, 'values') else np.asarray(y_series)

    for tr_idx, va_idx in kf.split(X_np):
        y_tr = y_np[tr_idx]
        # Fit the imputer on the training part of the fold only (prevents leakage)
        imp = SimpleImputer(strategy=FEATURE_IMPUTER)
        X_tr_imp = imp.fit_transform(X_np[tr_idx])
        X_va_imp = imp.transform(X_np[va_idx])
        test_imp_local = imp.transform(test_np)

        va_pred = None
        test_pred = None
        if regressor_cls is not None:
            try:
                reg = _build(regressor_cls)
                reg.fit(X_tr_imp, y_tr)
                va_pred = np.asarray(reg.predict(X_va_imp), dtype=float)
                test_pred = np.asarray(reg.predict(test_imp_local), dtype=float)
            except Exception as e:
                if classifier_cls is None:
                    raise
                print('TabPFN regressor failed; falling back to binned classifier. Error:', e)
                va_pred = None
                test_pred = None

        if va_pred is None or test_pred is None:
            # Approximate regression with class probabilities: expected value of
            # the quantile-bin midpoints.
            # NOTE(review): TabPFN classifiers may cap the number of classes;
            # TABPFN_BINS may need to be lowered -- confirm for the installed version.
            edges = np.quantile(y_tr, np.linspace(0.0, 1.0, TABPFN_BINS + 1))
            # Only the interior edges are needed for binning, so the outer edges
            # stay finite. Replacing them with +/-inf first would make the first
            # and last midpoints infinite and turn every prediction into inf/NaN.
            y_tr_binned = np.digitize(y_tr, edges[1:-1], right=True)
            mids_all = (edges[:-1] + edges[1:]) / 2.0
            clf = _build(classifier_cls)
            clf.fit(X_tr_imp, y_tr_binned)
            proba_va = clf.predict_proba(X_va_imp)
            proba_te = clf.predict_proba(test_imp_local)
            try:
                # Probability columns follow clf.classes_; empty bins have no column
                classes = clf.classes_.astype(int)
                mids_used = mids_all[classes]
            except Exception:
                mids_used = mids_all
            va_pred = (proba_va * mids_used).sum(axis=1)
            test_pred = (proba_te * mids_used).sum(axis=1)

        oof[va_idx] = va_pred
        test_pred_folds.append(test_pred)

    test_pred_mean = np.mean(np.column_stack(test_pred_folds), axis=1)
    X_new = X_df.copy()
    test_new = test_df.copy()
    X_new['tabpfn_pred'] = oof
    test_new['tabpfn_pred'] = test_pred_mean
    return X_new, test_new, oof

# Add the TabPFN feature before the train/validation split
if 'STACK_TABPFN' not in globals():
    STACK_TABPFN = True
if STACK_TABPFN:
    try:
        # Bind the OOF array to a real name: assigning to `_` would clobber
        # IPython's "last output" variable.
        X, test, tabpfn_oof = add_tabpfn_stack_feature(X, y, test)
    except Exception as e:
        print('TabPFN stacking failed; continuing without it. Error:', e)
    else:
        # The helper returns oof=None when it skipped stacking; only announce
        # the new column when it was actually added.
        if tabpfn_oof is not None:
            print('TabPFN stacked feature added: column tabpfn_pred')

def make_feature_transformer(name):
    """Build the unfitted feature transformer selected by ``name``.

    ``name`` is case-insensitive. Recognized values are 'standard', 'robust',
    'quantile' and 'power'; anything else (including None / 'none') returns
    None, meaning "apply no transform".
    """
    key = (name or 'none').lower()
    # Factories are wrapped in lambdas so a transformer is only built when selected
    factories = {
        'standard': lambda: StandardScaler(),
        'robust': lambda: RobustScaler(),
        'quantile': lambda: QuantileTransformer(output_distribution=QUANTILE_OUTPUT, random_state=SEED),
        'power': lambda: PowerTransformer(method='yeo-johnson', standardize=True),
    }
    factory = factories.get(key)
    if factory is None:
        return None
    return factory()

def fit_target_transform(y, name):
    """Fit the target transform ``name`` on ``y`` and return the transformed values.

    Returns:
        ``(values, params)`` where ``values`` is a float32 numpy array and
        ``params`` is a dict describing the transform ('name', plus 'mean' and
        'std' for the 'standard' transform). Unknown names act as 'none'.
    """
    mode = (name or 'none').lower()
    target = pd.Series(y).astype(float)

    if mode == 'log1p':
        transformed, params = np.log1p(target), {'name': 'log1p'}
    elif mode == 'standard':
        mu = float(target.mean())
        # Small epsilon keeps the division defined for a constant target
        sd = float(target.std() + 1e-8)
        transformed, params = (target - mu) / sd, {'name': 'standard', 'mean': mu, 'std': sd}
    else:
        transformed, params = target, {'name': 'none'}

    return transformed.astype(np.float32).values, params

def apply_target_transform(y, params):
    """Apply an already-fitted target transform (``params`` from ``fit_target_transform``) to ``y``.

    Returns a float32 numpy array. A missing or unknown ``params['name']``
    leaves the values unchanged apart from the float32 cast.
    """
    mode = (params.get('name') or 'none').lower()
    target = pd.Series(y).astype(float)

    if mode == 'log1p':
        target = np.log1p(target)
    elif mode == 'standard':
        # Reuse the statistics fitted on the training split
        target = (target - params['mean']) / params['std']

    return target.astype(np.float32).values

def inverse_target_transform(arr, params):
    """Map model outputs back to the original target scale.

    Undoes the transform described by ``params`` (as produced by
    ``fit_target_transform``): ``expm1`` for 'log1p', ``a * std + mean`` for
    'standard', and a plain float conversion otherwise.
    """
    mode = (params.get('name') or 'none').lower()
    values = np.asarray(arr, dtype=float)
    if mode == 'standard':
        return values * params['std'] + params['mean']
    if mode == 'log1p':
        return np.expm1(values)
    return values

# Hold out the validation rows first so every preprocessing step below is
# fitted on training rows only (no leakage)
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Missing-value imputation: statistics come from the training split
imputer = SimpleImputer(strategy=FEATURE_IMPUTER)
X_train_imp = imputer.fit_transform(X_train_raw)
X_val_imp = imputer.transform(X_val_raw)
test_imp = imputer.transform(test)

# Feature transform: fitted on the training split, then reused for validation/test.
# A None transformer means "pass the imputed values through unchanged".
_ft = make_feature_transformer(FEATURE_TRANSFORM)
if _ft is not None:
    X_train = _ft.fit_transform(X_train_imp)
    X_val = _ft.transform(X_val_imp)
    test_scaled = _ft.transform(test_imp)
else:
    X_train, X_val, test_scaled = X_train_imp, X_val_imp, test_imp

# Target transform: parameters are fitted on the training targets and applied to validation
y_train_t, TARGET_PARAMS_SINGLE = fit_target_transform(y_train, TARGET_TRANSFORM)
y_val_t = apply_target_transform(y_val, TARGET_PARAMS_SINGLE)
# Re-attach the original row index to the transformed targets
y_train_proc = pd.Series(y_train_t, index=y_train.index)
y_val_proc = pd.Series(y_val_t, index=y_val.index)


In [None]:
from torch.utils.data import Dataset, DataLoader

class OceanChemistryDataset(Dataset):
    """In-memory dataset pairing a feature matrix with its regression targets as float32 tensors."""

    def __init__(self, X, y):
        # y may be a pandas Series (use its underlying array) or already array-like
        targets = getattr(y, 'values', y)
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows of X)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

train_dataset = OceanChemistryDataset(X_train, y_train_proc)
val_dataset = OceanChemistryDataset(X_val, y_val_proc)

# === Tuning point: batch settings for training ===
# - batch_size: compare values around 32/64/128/256 (larger is more stable and
#   faster, but generalization should be checked with CV) -> in practice 1 worked best
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)
# Validation order does not matter, so it is not shuffled
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=1)


In [None]:
# ===== MLP hyperparameters (the main knobs to tune) =====
# DROPOUT_P: 0.1-0.3 recommended (0 disables dropout)
DROPOUT_P = 0  # Dropout probability (0.1-0.3 recommended)
# ACTIVATION_NAME: 'ReLU'|'GELU'|'SiLU'|'Tanh'|'LeakyReLU'
ACTIVATION_NAME = 'Tanh'  # Options: ReLU, GELU, SiLU, Tanh, LeakyReLU

def make_activation(name):
    """Return a new activation module for ``name`` (case-insensitive).

    Supported names: relu, gelu, silu/swish, tanh, leakyrelu/lrelu (negative
    slope 0.01). An empty, non-string or unrecognized name yields ``nn.ReLU``.
    """
    try:
        key = (name or 'ReLU').lower()
    except Exception:
        # Non-string names have no .lower(); use the default activation
        key = 'relu'

    # Lambdas defer attribute lookup and construction until a name is chosen
    factories = {
        'relu': lambda: nn.ReLU(),
        'gelu': lambda: nn.GELU(),
        'silu': lambda: nn.SiLU(),
        'swish': lambda: nn.SiLU(),
        'tanh': lambda: nn.Tanh(),
        'leakyrelu': lambda: nn.LeakyReLU(0.01),
        'lrelu': lambda: nn.LeakyReLU(0.01),
    }
    factory = factories.get(key)
    if factory is None:
        return nn.ReLU()
    return factory()

class MLPModel(nn.Module):
    """Single-hidden-layer MLP regressor: Linear -> activation -> Dropout -> Linear(1)."""

    def __init__(self, input_size, dropout_p=DROPOUT_P, activation_name=ACTIVATION_NAME):
        super().__init__()
        # Hidden-layer width (currently 1024; worth comparing values from 64 to 1024)
        hidden_units = 1024
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.act1 = make_activation(activation_name)
        self.drop1 = nn.Dropout(dropout_p)
        # Output layer for regression: one value per sample
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return predictions of shape (batch, 1) for inputs of shape (batch, input_size)."""
        hidden = self.drop1(self.act1(self.fc1(x)))
        return self.fc3(hidden)


# Initialize the model; its input width is the number of preprocessed feature columns
n_input_features = X_train.shape[1]
model = MLPModel(
    input_size=n_input_features,
    dropout_p=DROPOUT_P,
    activation_name=ACTIVATION_NAME,
)

# Show (n_train_rows, n_features) as a sanity check
print(X_train.shape)


(1163, 15)


In [None]:
WEIGHT_DECAY = 1e-4  # 1e-4 to 1e-3 recommended
EARLY_STOPPING_PATIENCE = 200  # epochs with no improvement before stop
EARLY_STOPPING_MIN_DELTA = 1e-4  # minimum improvement to reset patience

# Model selection toggle: 'BP' (PyTorch backprop), 'ELM', 'RBF'
MODEL_NAME = 'BP'

# ELM hyperparameters
ELM_HIDDEN = 512
ELM_REG = 1e-2  # ridge regularization
ELM_ACTIVATION = 'relu'  # relu|tanh|sigmoid

# RBF hyperparameters
RBF_UNITS = 100
RBF_REG = 1e-2
RBF_SIGMA_SCALE = 1.0  # scale factor for sigma derived from centers

import torch.optim as optim
# ===== Optimization / loss / early stopping (main tuning points) =====
# OPTIMIZER_NAME: 'Adam'|'AdamW'|'SGD'|'RMSprop'|'Adagrad'
#   - when using SGD, set e.g. OPTIMIZER_PARAMS={'momentum':0.9,'nesterov':True}
# WEIGHT_DECAY: 1e-4 to 1e-3 recommended (L2 regularization; too large weakens learning)
# LOSS_NAME: 'SmoothL1'|'MSE'|'L1'|'Huber' (Huber takes delta, SmoothL1 takes beta, both via LOSS_PARAMS)
# EARLY_STOPPING_PATIENCE/MIN_DELTA: early-stopping criteria
# The learning rate lr is passed as an argument to make_optimizer; ReduceLROnPlateau decays it automatically

# Optimizer toggle
OPTIMIZER_NAME = 'AdamW'  # Options: 'Adam','AdamW','SGD','RMSprop','Adagrad'
OPTIMIZER_PARAMS = {}  # e.g., {'momentum':0.9} for SGD

def make_optimizer(name, params, **kwargs):
    """Create a torch optimizer over ``params`` selected by ``name`` (case-insensitive).

    Keyword arguments read: ``lr`` (default 1e-3) and ``weight_decay`` (default
    0.0) for every optimizer; ``momentum`` and ``nesterov`` for SGD (defaults
    0.9 / False); ``momentum`` and ``alpha`` for RMSprop (defaults 0.0 / 0.99).
    Other keyword arguments are ignored. An empty, non-string or unrecognized
    name yields Adam.
    """
    try:
        key = (name or 'Adam').lower()
    except Exception:
        key = 'adam'

    # Settings shared by all supported optimizers
    shared = {'lr': kwargs.get('lr', 1e-3), 'weight_decay': kwargs.get('weight_decay', 0.0)}

    def _sgd():
        return optim.SGD(params, momentum=kwargs.get('momentum', 0.9),
                         nesterov=kwargs.get('nesterov', False), **shared)

    def _rmsprop():
        return optim.RMSprop(params, momentum=kwargs.get('momentum', 0.0),
                             alpha=kwargs.get('alpha', 0.99), **shared)

    builders = {
        'adamw': lambda: optim.AdamW(params, **shared),
        'sgd': _sgd,
        'rmsprop': _rmsprop,
        'adagrad': lambda: optim.Adagrad(params, **shared),
    }
    build = builders.get(key)
    if build is None:
        return optim.Adam(params, **shared)
    return build()



# Loss function and optimizer

# Name of the training loss; resolved case-insensitively by make_loss
LOSS_NAME = 'SmoothL1'  # Options: 'SmoothL1', 'MSE', 'L1', 'Huber'
# Extra keyword arguments forwarded to make_loss together with LOSS_NAME
LOSS_PARAMS = {}  # e.g., {'beta': 1.0} for SmoothL1 or {'delta': 1.0} for Huber

def make_loss(name, **kwargs):
    """Create the loss module selected by ``name`` (case-insensitive).

    'mse'/'mseloss' -> MSELoss, 'l1'/'mae'/'l1loss' -> L1Loss,
    'huber'/'huberloss' -> HuberLoss(delta=kwargs['delta'], default 1.0).
    Everything else, including an empty or non-string name, gives
    SmoothL1Loss(beta=kwargs['beta'], default 1.0). If the constructor rejects
    the keyword with TypeError, a default SmoothL1Loss is returned instead.
    """
    try:
        key = (name or 'SmoothL1').lower()
    except Exception:
        key = 'smoothl1'

    if key in ('mse', 'mseloss'):
        return nn.MSELoss()

    if key in ('l1', 'mae', 'l1loss'):
        return nn.L1Loss()

    if key in ('huber', 'huberloss'):
        huber_delta = kwargs.get('delta', 1.0)
        try:
            loss = nn.HuberLoss(delta=huber_delta)
        except TypeError:
            loss = nn.SmoothL1Loss()
        return loss

    # Default: SmoothL1
    smooth_beta = kwargs.get('beta', 1.0)
    try:
        loss = nn.SmoothL1Loss(beta=smooth_beta)
    except TypeError:
        loss = nn.SmoothL1Loss()
    return loss


# Loss used for both training and validation
criterion = make_loss(LOSS_NAME, **LOSS_PARAMS)

# Optimizer over all model parameters; OPTIMIZER_PARAMS supplies optimizer-specific extras
optimizer = make_optimizer(
    OPTIMIZER_NAME,
    model.parameters(),
    lr=0.001,
    weight_decay=WEIGHT_DECAY,
    **OPTIMIZER_PARAMS,
)

# LR scheduler: halve the learning rate after 50 epochs without improvement
# of the monitored value, never going below 1e-6
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=50,
    min_lr=1e-6,
)


def _np_activation(name, X):
    key = (name or 'relu').lower()
    if key == 'relu':
        return (X > 0) * X
    if key == 'tanh':
        return np.tanh(X)
    if key in ('sigmoid','logistic'):
        return 1.0 / (1.0 + np.exp(-X))
    return (X > 0) * X

def _ridge_solve(H, y, reg):
    # Solve (H^T H + reg I) w = H^T y
    HtH = H.T @ H
    n = HtH.shape[0]
    A = HtH + reg * np.eye(n)
    b = H.T @ y
    return np.linalg.solve(A, b)

def fit_elm(X, y, hidden=ELM_HIDDEN, reg=ELM_REG, act=ELM_ACTIVATION, seed=42):
    """Fit an extreme learning machine: random hidden layer plus ridge-solved output weights.

    The input weights and biases are drawn from a standard normal with a
    ``RandomState(seed)`` generator and are not trained; only the output
    weights ``beta`` are solved for.

    Returns:
        dict with keys 'W' (input weights), 'b' (biases), 'beta' (output
        weights) and 'act' (activation name), as consumed by ``predict_elm``.
    """
    rng = np.random.RandomState(seed)
    n_features = X.shape[1]
    # Draw order matters for reproducibility: weights first, then biases
    input_weights = rng.normal(scale=1.0, size=(n_features, hidden))
    hidden_bias = rng.normal(scale=1.0, size=(hidden,))
    hidden_out = _np_activation(act, X @ input_weights + hidden_bias)
    output_weights = _ridge_solve(hidden_out, y.astype(float), reg)
    return {'W': input_weights, 'b': hidden_bias, 'beta': output_weights, 'act': act}

def predict_elm(model_dict, X):
    """Predict with a model dict from ``fit_elm``: hidden activations of ``X`` times 'beta'."""
    pre_activation = X @ model_dict['W'] + model_dict['b']
    hidden_out = _np_activation(model_dict['act'], pre_activation)
    return hidden_out @ model_dict['beta']

def _rbf_design(X, centers, gamma):
    # Compute squared Euclidean distances efficiently
    X2 = np.sum(X*X, axis=1, keepdims=True)
    C2 = np.sum(centers*centers, axis=1)[None, :]
    dist2 = X2 + C2 - 2.0 * (X @ centers.T)
    return np.exp(-gamma * dist2)

def fit_rbf(X, y, units=RBF_UNITS, reg=RBF_REG, sigma_scale=RBF_SIGMA_SCALE, seed=42):
    """Fit an RBF network: K-means centers, one shared Gaussian width, ridge-solved weights.

    The width ``sigma`` is the median distance from each center to its nearest
    distinct center, multiplied by ``sigma_scale`` and floored at 1e-6. It falls
    back to ``1.0 * sigma_scale`` when no positive center distance exists.

    Args:
        X: training features, shape (n_samples, n_features).
        y: training targets; must support ``.astype(float)``.
        units: number of RBF centers (K-means clusters).
        reg: ridge regularization strength for the output weights.
        sigma_scale: multiplier applied to the estimated width.
        seed: random state for K-means.

    Returns:
        dict with keys 'centers', 'gamma' and 'w', as consumed by ``predict_rbf``.
    """
    from sklearn.cluster import KMeans

    centers = KMeans(n_clusters=units, random_state=seed, n_init=10).fit(X).cluster_centers_

    # Pairwise distances between centers
    pair = np.linalg.norm(centers[:, None, :] - centers[None, :, :], axis=2)
    # Nearest strictly-positive distance per center; zeros (self and duplicate
    # centers) are masked out, and rows with no positive distance are dropped
    nearest = np.where(pair > 0, pair, np.inf).min(axis=1)
    nearest = nearest[np.isfinite(nearest)]

    sigma = (np.median(nearest) if nearest.size > 0 else 1.0) * sigma_scale
    sigma = max(sigma, 1e-6)
    gamma = 1.0 / (2.0 * sigma * sigma)

    Phi = _rbf_design(X, centers, gamma)
    w = _ridge_solve(Phi, y.astype(float), reg)
    return {'centers': centers, 'gamma': gamma, 'w': w}

def predict_rbf(model_dict, X):
    """Predict with a model dict from ``fit_rbf``: RBF design matrix of ``X`` times 'w'."""
    design = _rbf_design(X, model_dict['centers'], model_dict['gamma'])
    return design @ model_dict['w']


# Training function

def train_model(model, train_loader, val_loader, epochs=5000, scheduler=None,
                optimizer=None, criterion=None, patience=None, min_delta=None):
    """Train ``model`` with mini-batch backprop, optional LR scheduling and early stopping.

    After training, the weights with the best validation loss seen are
    restored into ``model`` (when a validation loader with batches was given).

    Args:
        model: torch module mapping a feature batch to predictions.
        train_loader: sized iterable of ``(X_batch, y_batch)`` training batches.
        val_loader: sized iterable of ``(X_batch, y_batch)`` validation batches.
        epochs: maximum number of epochs.
        scheduler: optional scheduler stepped once per epoch with the mean
            validation loss (e.g. ReduceLROnPlateau).
        optimizer: optimizer to step; defaults to the notebook-level ``optimizer``.
        criterion: loss module; defaults to the notebook-level ``criterion``.
        patience: epochs without improvement before stopping; defaults to
            ``EARLY_STOPPING_PATIENCE``.
        min_delta: minimum decrease of the validation loss that counts as an
            improvement; defaults to ``EARLY_STOPPING_MIN_DELTA``.

    Returns:
        None. ``model`` is updated in place.
    """
    import copy

    # Fall back to the notebook-level objects when nothing is passed explicitly
    # (the parameters shadow the globals of the same name, hence globals()).
    opt = optimizer if optimizer is not None else globals()['optimizer']
    loss_fn = criterion if criterion is not None else globals()['criterion']
    patience = EARLY_STOPPING_PATIENCE if patience is None else patience
    min_delta = EARLY_STOPPING_MIN_DELTA if min_delta is None else min_delta

    n_train_batches = len(train_loader)
    n_val_batches = len(val_loader)

    best_val = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            opt.zero_grad()
            outputs = model(X_batch)
            # Match the prediction shape to the target. A bare squeeze() also drops
            # the batch dimension for single-sample batches, leaving a 0-d
            # prediction against a 1-d target (shape-mismatch broadcast warning).
            loss = loss_fn(outputs.reshape(y_batch.shape), y_batch)
            loss.backward()
            opt.step()
            running_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                val_loss += loss_fn(outputs.reshape(y_batch.shape), y_batch).item()

        val_loss_avg = val_loss / n_val_batches if n_val_batches > 0 else val_loss
        if scheduler is not None:
            scheduler.step(val_loss_avg)

        # Early stopping (only if we have validation batches)
        if n_val_batches > 0:
            if best_val - val_loss_avg > min_delta:
                best_val = val_loss_avg
                best_state = copy.deepcopy(model.state_dict())
                epochs_no_improve = 0
            else:
                epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Early stopping at epoch {epoch+1}; best val: {best_val:.6f}")
                break

        if epoch % 100 == 0:
            current_lr = opt.param_groups[0]['lr']
            # max(..., 1) avoids a ZeroDivisionError for an empty training loader
            print(f"Epoch {epoch+1}/{epochs}, LR: {current_lr:.2e}, Train Loss: {running_loss/max(n_train_batches, 1)}, Validation Loss: {val_loss_avg}")

    # Restore the best weights (covers both early stopping and running out of epochs)
    if best_state is not None:
        model.load_state_dict(best_state)

# Train based on selected model

if MODEL_NAME.upper() == 'BP':
    train_model(model, train_loader, val_loader, epochs=5000, scheduler=scheduler)
elif MODEL_NAME.upper() == 'ELM':
    print('Training ELM...')
    # Fit on the transformed target (y_train_proc), like the BP model: the
    # prediction cell inverse-transforms every model's output, so fitting on
    # the raw y_train would be wrong whenever TARGET_TRANSFORM is not 'none'.
    ELM_MODEL = fit_elm(X_train, y_train_proc, hidden=ELM_HIDDEN, reg=ELM_REG, act=ELM_ACTIVATION, seed=SEED)
elif MODEL_NAME.upper() == 'RBF':
    print('Training RBF...')
    # Same reasoning as for ELM: train in the transformed target space
    RBF_MODEL = fit_rbf(X_train, y_train_proc, units=RBF_UNITS, reg=RBF_REG, sigma_scale=RBF_SIGMA_SCALE, seed=SEED)
else:
    raise ValueError(f'Unknown MODEL_NAME: {MODEL_NAME}')


Epoch 1/5000, Train Loss: 4633887.947368421, Validation Loss: 4612593.2
Epoch 101/5000, Train Loss: 15084.370579769737, Validation Loss: 18933.8005859375
Epoch 201/5000, Train Loss: 2547.9099185341283, Validation Loss: 3594.0954345703126
Epoch 301/5000, Train Loss: 716.5324530350534, Validation Loss: 948.5962158203125
Epoch 401/5000, Train Loss: 400.37651142321135, Validation Loss: 575.134521484375
Epoch 501/5000, Train Loss: 284.9509711014597, Validation Loss: 413.65491943359376
Epoch 601/5000, Train Loss: 157.120555676912, Validation Loss: 215.35331115722656
Epoch 701/5000, Train Loss: 115.85318153782895, Validation Loss: 136.43557891845703
Epoch 801/5000, Train Loss: 78.71800462823165, Validation Loss: 102.47501983642579
Epoch 901/5000, Train Loss: 79.39923437018143, Validation Loss: 84.50692291259766
Epoch 1001/5000, Train Loss: 61.57915757831774, Validation Loss: 90.80420532226563
Epoch 1101/5000, Train Loss: 59.011202460841126, Validation Loss: 79.94369735717774
Epoch 1201/5000, 

In [None]:
# Predict according to MODEL_NAME
if MODEL_NAME.upper() == 'BP':
    # Convert the test set into a torch tensor
    test_tensor = torch.tensor(test_scaled, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        # reshape(-1) rather than squeeze(): a one-row test set must stay 1-D,
        # otherwise len(predictions) below fails on a 0-d array
        predictions = model(test_tensor).reshape(-1).cpu().numpy()
elif MODEL_NAME.upper() == 'ELM':
    predictions = np.asarray(predict_elm(ELM_MODEL, test_scaled)).reshape(-1)
elif MODEL_NAME.upper() == 'RBF':
    predictions = np.asarray(predict_rbf(RBF_MODEL, test_scaled)).reshape(-1)
else:
    raise ValueError(f'Unknown MODEL_NAME: {MODEL_NAME}')

# Inverse target transform if applied (single split).
# Still best effort, but no longer silent: if this fails the submission would
# contain values in the transformed target space, so say so loudly.
try:
    predictions = inverse_target_transform(predictions, TARGET_PARAMS_SINGLE)
except Exception as exc:
    print(f'WARNING: inverse target transform failed ({exc!r}); '
          'predictions are left in the transformed target space.')

# Prepare submission
# NOTE(review): ids are assumed to start at 1455 and follow the test row order -- confirm against the test file.
submission = pd.DataFrame({"id": range(1455, 1455 + len(predictions)), "DIC": predictions})
submission.to_csv("submission.csv", index=False)
