# LSTM (baseline)

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

# 1. Read the training and test tables from disk
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# 2. Inputs are every training column except the identifiers and the target
non_feature_cols = ('id', 'Row#', 'yield')
features = [col for col in train_df.columns if col not in non_feature_cols]
X_train, y_train = train_df[features].values, train_df['yield'].values
X_test, test_ids = test_df[features].values, test_df['id'].values

# 3. Standardise each column with statistics learned from the training rows,
#    then apply the very same transform to the test rows
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 4. Float tensors laid out as (N, seq_len=F, input_size=1), i.e. each feature
#    column becomes one "time step" for an LSTM built with batch_first=True
X_train_t = torch.from_numpy(X_train).float().unsqueeze(2)
y_train_t = torch.from_numpy(y_train).float()[:, None]
X_test_t = torch.from_numpy(X_test).float().unsqueeze(2)

# 5. Shuffled mini-batches of 64 training samples
train_ds = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)

# 6. Define an LSTM regressor
class RNNRegressor(nn.Module):
    """Unidirectional LSTM followed by a linear head that emits one value per sample.

    Args:
        seq_len: Number of time steps per sample. Accepted for the caller's
            convenience; none of the layers built here depend on it.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, seq_len, hidden_size=64, num_layers=1):
        super().__init__()
        # One scalar is fed per time step, hence input_size=1.
        lstm_kwargs = dict(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=False,
        )
        self.lstm = nn.LSTM(**lstm_kwargs)
        # Projects the final hidden state down to the single regression output.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        # nn.LSTM returns (output, (h_n, c_n)); only h_n is needed here.
        final_hidden = self.lstm(x)[1][0]      # (num_layers, batch, hidden_size)
        top_layer = final_hidden[-1]           # (batch, hidden_size)
        return self.fc(top_layer)              # (batch, 1)

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = RNNRegressor(seq_len=len(features))
model  = model.to(device)

# 7. Objective: mean absolute error, minimised with Adam
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 8. Fit for a fixed number of epochs, reporting the training MAE every tenth epoch
num_epochs = 500
for epoch in range(1, num_epochs+1):
    model.train()
    total_mae = 0.0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss  = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        # criterion yields the batch mean, so weight it by the batch size
        # to obtain a per-sample average over the whole epoch below.
        total_mae += xb.size(0) * loss.item()
    epoch_mae = total_mae / len(train_loader.dataset)
    if not epoch % 10:
        print(f'Epoch {epoch:2d}/{num_epochs} — MAE: {epoch_mae:.4f}')

# 9. Predict the whole test set in a single forward pass (no gradients needed)
model.eval()
with torch.no_grad():
    X_test_t = X_test_t.to(device)
    test_output = model(X_test_t)            # (N_test, 1)
    y_pred = test_output.cpu().numpy().flatten()

# 10. Write the id / prediction pairs to disk
submission = pd.DataFrame({'id': test_ids, 'yield': y_pred})
submission.to_csv('submission.csv', index=False)
print("Done → predictions saved to submission.csv")


Epoch 10/500 — MAE: 5861.5300


KeyboardInterrupt: 

# RNN

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler

# 1. Load & split data
df = pd.read_csv('train.csv')
features = [c for c in df.columns if c not in ('id','Row#','yield')]
X = df[features].values
y = df['yield'].values

# Keep the fitted scaler in a variable so the identical transform can be
# applied to other rows later on.
# NOTE(review): the scaler is fit on all rows before the split, so validation
# rows contribute to the mean/variance used to standardise the training inputs.
scaler = StandardScaler()
dataset = TensorDataset(
    torch.from_numpy(scaler.fit_transform(X)).float().unsqueeze(-1),  # (N, F, 1)
    torch.from_numpy(y).float().unsqueeze(1)                          # (N, 1)
)

# 80/20 train/val split. A dedicated, seeded generator pins which rows land in
# the validation set, so the reported validation MAE is comparable between
# re-runs of this cell instead of depending on the global RNG state.
SPLIT_SEED = 42
n_val = int(len(dataset)*0.2)
n_tr  = len(dataset) - n_val
train_ds, val_ds = random_split(
    dataset, [n_tr, n_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED)
)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)

# 2. Define a beefed‑up LSTM regressor
class RNNRegressor(nn.Module):
    """Stacked (optionally bidirectional) LSTM with a small MLP head.

    Args:
        seq_len: Number of time steps per sample. Accepted for the caller's
            convenience; none of the layers built here depend on it.
        hidden_size: Width of each LSTM direction's hidden state.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM also reads the sequence backwards.
        dropout: Dropout between LSTM layers; forced to 0 for a single layer.
    """

    def __init__(self, seq_len, hidden_size=128, num_layers=2, bidirectional=True, dropout=0.2):
        super().__init__()
        if bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        # Inter-layer dropout is only meaningful when layers are stacked.
        inter_layer_dropout = dropout if num_layers>1 else 0.0
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=inter_layer_dropout,
        )
        # Head: concatenated top-layer states -> 64 -> 1
        head_in = hidden_size * self.num_directions
        self.fc1 = nn.Linear(head_in, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.drop = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        _, (h_n, _) = self.lstm(x)
        # h_n is ordered layer by layer, so the top layer's states (one per
        # direction) are its trailing `num_directions` entries.
        top_states = h_n[-self.num_directions:]              # (directions, batch, hidden)
        # Place the directions side by side for every sample.
        summary = torch.cat(top_states.unbind(0), dim=1)     # (batch, directions * hidden)
        hidden = torch.relu(self.bn1(self.fc1(summary)))
        return self.fc2(self.drop(hidden))

# 3. Build the model on the GPU when CUDA is usable, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNRegressor(seq_len=len(features))
model = model.to(device)

# 4. MAE objective, Adam with a little weight decay, and a scheduler that
#    halves the learning rate after 5 epochs without improvement in the
#    metric passed to scheduler.step(...)
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# the installed version still accepts it.
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=5e-4,
    weight_decay=1e-5,
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
    verbose=True,
)

# 5. Training + validation loops
def run_epoch(loader, train=True):
    """Run one full pass over `loader` and return the per-sample mean loss.

    Relies on the module-level `model`, `criterion`, `optimizer` and `device`.

    Args:
        loader: DataLoader yielding `(inputs, targets)` batches; its
            `.dataset` length is used as the averaging denominator.
        train: When true, the model is put in training mode and every batch
            triggers a clipped optimizer step. When false, the model is put
            in eval mode and no parameters are updated.

    Returns:
        Sum of (batch loss * batch size) divided by `len(loader.dataset)`.
    """
    train = bool(train)
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    # Autograd is only needed when we are going to call backward(); disabling
    # it for evaluation avoids building a graph for every validation batch.
    with torch.set_grad_enabled(train):
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            if train:
                optimizer.zero_grad()
            preds = model(xb)
            loss  = criterion(preds, yb)
            if train:
                loss.backward()
                # Cap the global gradient norm before stepping.
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            # criterion returns a batch mean; weight it by the batch size so
            # a smaller final batch does not skew the epoch average.
            total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

# Alternate a training pass and a validation pass per epoch. The validation
# MAE drives both the LR scheduler and the "best so far" checkpoint.
num_epochs = 500
best_val = float('inf')
for epoch in range(1, num_epochs+1):
    train_mae = run_epoch(train_loader, train=True)
    val_mae   = run_epoch(val_loader,   train=False)
    scheduler.step(val_mae)

    improved = val_mae < best_val
    if improved:
        # Overwrite the checkpoint whenever validation MAE reaches a new low.
        best_val = val_mae
        torch.save(model.state_dict(), 'best_rnn.pth')

    if not epoch % 5:
        print(f"Epoch {epoch:2d}/{num_epochs} — "
              f"Train MAE: {train_mae:.2f}  |  Val MAE: {val_mae:.2f}")

print(f"▶ Best validation MAE: {best_val:.2f}")




Epoch  5/500 — Train MAE: 5983.29  |  Val MAE: 5998.16
Epoch 10/500 — Train MAE: 5919.96  |  Val MAE: 5926.67
Epoch 15/500 — Train MAE: 5825.46  |  Val MAE: 5827.65
Epoch 20/500 — Train MAE: 5703.88  |  Val MAE: 5708.78
Epoch 25/500 — Train MAE: 5553.51  |  Val MAE: 5554.03
Epoch 30/500 — Train MAE: 5371.27  |  Val MAE: 5367.82
Epoch 35/500 — Train MAE: 5165.43  |  Val MAE: 5137.62
Epoch 40/500 — Train MAE: 4922.79  |  Val MAE: 4890.68
Epoch 45/500 — Train MAE: 4646.46  |  Val MAE: 4618.26
Epoch 50/500 — Train MAE: 4346.60  |  Val MAE: 4323.14
Epoch 55/500 — Train MAE: 4023.11  |  Val MAE: 3984.50
Epoch 60/500 — Train MAE: 3675.74  |  Val MAE: 3625.70
Epoch 65/500 — Train MAE: 3302.23  |  Val MAE: 3229.27
Epoch 70/500 — Train MAE: 2888.48  |  Val MAE: 2848.80
Epoch 75/500 — Train MAE: 2469.20  |  Val MAE: 2311.55
Epoch 80/500 — Train MAE: 2038.49  |  Val MAE: 2038.21
Epoch 85/500 — Train MAE: 1651.91  |  Val MAE: 1579.85
Epoch 90/500 — Train MAE: 1308.31  |  Val MAE: 1205.12
Epoch 95/5

# LSTM

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler

# Load & preprocess data
df = pd.read_csv('train.csv')
features = [c for c in df.columns if c not in ('id', 'Row#', 'yield')]
X = df[features].values
y = df['yield'].values

# NOTE(review): the scaler is fit on all rows before the split, so validation
# rows contribute to the mean/variance used to standardise the training inputs.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Prepare PyTorch datasets: inputs as (N, F, 1), targets as (N, 1)
data = TensorDataset(
    torch.from_numpy(X_scaled).float().unsqueeze(-1),
    torch.from_numpy(y).float().unsqueeze(1)
)
train_size = int(0.8 * len(data))
val_size = len(data) - train_size
# A dedicated, seeded generator pins which rows land in the validation set, so
# the reported validation MAE is comparable between re-runs of this cell
# instead of depending on the global RNG state.
SPLIT_SEED = 42
train_ds, val_ds = random_split(
    data, [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED)
)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# Define LSTM regressor
class LSTMRegressor(nn.Module):
    """Stacked (optionally bidirectional) LSTM with a small MLP head.

    Args:
        seq_len: Number of time steps per sample. Accepted for the caller's
            convenience; none of the layers built here depend on it.
        hidden_size: Width of each LSTM direction's hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers; forced to 0 for a single layer.
        bidirectional: Whether the LSTM also reads the sequence backwards.
    """

    def __init__(self, seq_len, hidden_size=128, num_layers=2, dropout=0.2, bidirectional=True):
        super().__init__()
        if bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        # Inter-layer dropout is only meaningful when layers are stacked.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional,
            batch_first=True,
        )
        # Head: concatenated top-layer states -> 64 -> 1
        head_in = hidden_size * self.num_directions
        self.fc1 = nn.Linear(head_in, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        _, (h_n, _) = self.lstm(x)
        # h_n is ordered layer by layer, so the top layer's states (one per
        # direction) are its trailing `num_directions` entries.
        top_states = h_n[-self.num_directions:]              # (directions, batch, hidden)
        # Place the directions side by side for every sample.
        summary = torch.cat(top_states.unbind(0), dim=1)     # (batch, directions * hidden)
        hidden = torch.relu(self.bn1(self.fc1(summary)))
        return self.fc2(self.dropout(hidden))

# Setup: model on GPU when CUDA is usable, MAE objective, Adam, and a scheduler
# that halves the LR after 5 epochs without validation improvement.
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# the installed version still accepts it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMRegressor(seq_len=len(features))
model = model.to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
    verbose=True,
)

# Training & validation loop
best_val_mae = float('inf')
for epoch in range(1, 61):
    # --- optimisation pass over the training split ---
    model.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        # Cap the global gradient norm before stepping.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # criterion returns a batch mean; weight it by the batch size.
        train_loss += xb.size(0) * loss.item()
    train_mae = train_loss / len(train_loader.dataset)

    # --- gradient-free pass over the validation split ---
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += xb.size(0) * loss.item()
    val_mae = val_loss / len(val_loader.dataset)

    scheduler.step(val_mae)
    improved = val_mae < best_val_mae
    if improved:
        # Overwrite the checkpoint whenever validation MAE reaches a new low.
        best_val_mae = val_mae
        torch.save(model.state_dict(), 'best_lstm.pth')

    if not epoch % 5:
        print(f'Epoch {epoch:2d}/60 — Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}')

print(f'Best Validation MAE: {best_val_mae:.2f}')




Epoch  5/60 — Train MAE: 5985.89 | Val MAE: 6003.07
Epoch 10/60 — Train MAE: 5929.43 | Val MAE: 5949.12
Epoch 15/60 — Train MAE: 5850.60 | Val MAE: 5859.79
Epoch 20/60 — Train MAE: 5742.09 | Val MAE: 5753.28
Epoch 25/60 — Train MAE: 5610.13 | Val MAE: 5622.58
Epoch 30/60 — Train MAE: 5447.29 | Val MAE: 5450.65
Epoch 35/60 — Train MAE: 5277.47 | Val MAE: 5296.85
Epoch 40/60 — Train MAE: 5056.13 | Val MAE: 5056.69
Epoch 45/60 — Train MAE: 4835.32 | Val MAE: 4838.67
Epoch 50/60 — Train MAE: 4575.81 | Val MAE: 4590.76
Epoch 55/60 — Train MAE: 4325.42 | Val MAE: 4347.83
Epoch 60/60 — Train MAE: 3950.30 | Val MAE: 3908.50
Best Validation MAE: 3908.50


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler

# 1. Load & preprocess data
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

features = [c for c in df.columns if c not in ('id', 'Row#', 'yield')]
X = df[features].values
y = df['yield'].values
X_test = test_df[features].values
test_ids = test_df['id'].values

# Fit the scaler on the training table and reuse it unchanged for the test table.
# NOTE(review): it is fit before the train/val split, so validation rows
# contribute to the mean/variance used to standardise the training inputs.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# 2. Prepare PyTorch datasets: inputs as (N, F, 1), targets as (N, 1)
data = TensorDataset(
    torch.from_numpy(X).float().unsqueeze(-1),
    torch.from_numpy(y).float().unsqueeze(1)
)
train_size = int(0.8 * len(data))
val_size   = len(data) - train_size
# A dedicated, seeded generator pins which rows land in the validation set, so
# the reported validation MAE is comparable between re-runs of this cell
# instead of depending on the global RNG state.
SPLIT_SEED = 42
train_ds, val_ds = random_split(
    data, [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED)
)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
# Test batches carry inputs only and keep file order (no shuffling), so
# predictions line up with test_ids.
test_loader  = DataLoader(
    TensorDataset(torch.from_numpy(X_test).float().unsqueeze(-1)),
    batch_size=64
)

# 3. Positional encoding
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence of embeddings.

    Even embedding channels receive sine terms and odd channels cosine terms,
    with wavelengths forming a geometric progression controlled by 10000.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-np.log(10000.0) / d_model))
        angles = position * div_term              # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine channel fewer than sine
        # channels, so trim the angle table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Stored as a buffer: follows .to(device) and is saved in the
        # state_dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))   # (1, max_len, d_model)

    def forward(self, x):
        """Return `x` plus the encodings of its first `x.size(1)` positions.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).
        """
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

# 4. Transformer regressor
class TransformerRegressor(nn.Module):
    """Transformer encoder over per-feature tokens, mean-pooled into one regression value.

    Args:
        seq_len: Number of tokens per sample; used as the positional
            encoding's `max_len`.
        d_model: Token embedding width.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_ff: Width of each encoder layer's feed-forward sublayer.
        dropout: Dropout used inside the encoder layers.
    """

    def __init__(self, seq_len, d_model=64, nhead=8,
                 num_layers=2, dim_ff=128, dropout=0.1):
        super().__init__()
        # Lift each scalar token to d_model channels, then tag it with its position.
        self.embed = nn.Linear(1, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len=seq_len)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=num_layers,
        )

        # Head applied to the pooled sequence representation: d_model -> 64 -> 1
        head_layers = [
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        tokens = self.pos_enc(self.embed(x))     # (batch, seq_len, d_model)
        encoded = self.encoder(tokens)           # (batch, seq_len, d_model)
        pooled = encoded.mean(dim=1)             # average over the sequence axis
        return self.regressor(pooled)            # (batch, 1)

# 5. Setup: model on GPU when CUDA is usable, MAE objective, Adam, and a
#    scheduler that halves the LR after 5 epochs without validation improvement
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# the installed version still accepts it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerRegressor(seq_len=len(features)).to(device)

criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, verbose=True
)

# 6. Training & validation loops
# The epoch count lives in one place so the loop bound and the progress line
# cannot drift apart.
num_epochs = 100
best_val_mae = float('inf')
for epoch in range(1, num_epochs + 1):
    # Training
    model.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        # Cap the global gradient norm before stepping.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # criterion returns a batch mean; weight it by the batch size.
        train_loss += loss.item() * xb.size(0)
    train_mae = train_loss / len(train_loader.dataset)

    # Validation (eval mode, no gradients)
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            val_loss += criterion(preds, yb).item() * xb.size(0)
    val_mae = val_loss / len(val_loader.dataset)

    scheduler.step(val_mae)

    # Overwrite the checkpoint whenever validation MAE reaches a new low.
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        torch.save(model.state_dict(), 'best_transformer.pth')

    if epoch % 5 == 0:
        print(f'Epoch {epoch:3d}/{num_epochs} — Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}')

print(f"▶ Best Validation MAE: {best_val_mae:.2f}")

# 7. Inference on test set with the best checkpoint
# map_location lets the checkpoint load onto whichever device is in use now,
# even if it was written from a different one.
model.load_state_dict(torch.load('best_transformer.pth', map_location=device))
model.eval()
preds = []
with torch.no_grad():
    for xb in test_loader:
        # The test TensorDataset yields 1-tuples: (inputs,)
        xb = xb[0].to(device)
        preds.append(model(xb).cpu().numpy())
preds = np.vstack(preds).flatten()

# 8. Save submission
pd.DataFrame({'id': test_ids, 'yield': preds}) \
  .to_csv('submission.csv', index=False)





Epoch  5/60 — Train MAE: 5756.14 | Val MAE: 5691.66
Epoch 10/60 — Train MAE: 4612.38 | Val MAE: 4429.66
Epoch 15/60 — Train MAE: 1998.56 | Val MAE: 1679.00
Epoch 20/60 — Train MAE: 551.95 | Val MAE: 317.83
Epoch 25/60 — Train MAE: 508.10 | Val MAE: 273.05
Epoch 30/60 — Train MAE: 493.30 | Val MAE: 288.25
Epoch 35/60 — Train MAE: 492.51 | Val MAE: 259.87
Epoch 40/60 — Train MAE: 498.96 | Val MAE: 276.74
Epoch 45/60 — Train MAE: 495.81 | Val MAE: 261.67
Epoch 50/60 — Train MAE: 492.73 | Val MAE: 264.46
Epoch 55/60 — Train MAE: 491.26 | Val MAE: 259.08
Epoch 60/60 — Train MAE: 493.00 | Val MAE: 264.31
Epoch 65/60 — Train MAE: 485.79 | Val MAE: 257.98
Epoch 70/60 — Train MAE: 484.93 | Val MAE: 260.12
Epoch 75/60 — Train MAE: 491.37 | Val MAE: 258.15
Epoch 80/60 — Train MAE: 488.20 | Val MAE: 259.00
Epoch 85/60 — Train MAE: 485.15 | Val MAE: 258.47
Epoch 90/60 — Train MAE: 490.70 | Val MAE: 258.88
Epoch 95/60 — Train MAE: 485.94 | Val MAE: 259.55
Epoch 100/60 — Train MAE: 485.24 | Val MAE: 

# Hybrid

In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler

# 1. Load & preprocess data
train_df = pd.read_csv('train.csv')
test_df  = pd.read_csv('test.csv')
features = [c for c in train_df.columns if c not in ('id','Row#','yield')]

X = train_df[features].values
y = train_df['yield'].values
X_test = test_df[features].values
test_ids = test_df['id'].values

# Fit the scaler on the training table and reuse it unchanged for the test table.
# NOTE(review): it is fit before the train/val split, so validation rows
# contribute to the mean/variance used to standardise the training inputs.
scaler = StandardScaler()
X      = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# 2. Create PyTorch datasets and loaders: inputs as (N, F, 1), targets as (N, 1)
data = TensorDataset(torch.from_numpy(X).float().unsqueeze(-1),
                     torch.from_numpy(y).float().unsqueeze(1))
train_size = int(0.8 * len(data))
val_size   = len(data) - train_size
# A dedicated, seeded generator pins which rows land in the validation set, so
# the reported validation MAE is comparable between re-runs of this cell
# instead of depending on the global RNG state.
SPLIT_SEED = 42
train_ds, val_ds = random_split(
    data, [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED)
)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
# Test batches carry inputs only and keep file order (no shuffling), so
# predictions line up with test_ids.
test_loader  = DataLoader(TensorDataset(torch.from_numpy(X_test).float().unsqueeze(-1)),
                          batch_size=64)

# 3. Define a CNN + LSTM hybrid regressor
class HybridRegressor(nn.Module):
    """Two Conv1d/MaxPool stages feeding an LSTM, followed by an MLP head.

    Args:
        seq_len: Number of time steps per sample. Accepted for the caller's
            convenience; none of the layers built here depend on it.
        cnn_channels: Output channel counts of the first and second convolution.
        lstm_hidden: Width of each LSTM direction's hidden state.
        lstm_layers: Number of stacked LSTM layers.
        bidir: Whether the LSTM also reads the sequence backwards.
    """

    def __init__(self, seq_len, cnn_channels=(16,32),
                 lstm_hidden=64, lstm_layers=2, bidir=True):
        super().__init__()
        first_ch, second_ch = cnn_channels[0], cnn_channels[1]

        # Convolutional front end: each stage keeps the length (padding=1 with
        # kernel 3) and then halves it with max pooling.
        conv_stages = [
            nn.Conv1d(1, first_ch, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(first_ch, second_ch, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        ]
        self.cnn = nn.Sequential(*conv_stages)

        # Recurrent stage consuming the CNN channels as per-step features.
        # Inter-layer dropout only applies when layers are stacked.
        self.lstm = nn.LSTM(
            input_size=second_ch,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidir,
            dropout=0.2 if lstm_layers>1 else 0.0,
        )
        self.bidir = bidir

        # Head: (possibly concatenated) final hidden state -> 64 -> 1
        head_in = lstm_hidden * 2 if bidir else lstm_hidden
        self.mlp = nn.Sequential(
            nn.Linear(head_in, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Map a (batch, seq_len, 1) tensor to a (batch, 1) prediction."""
        # Conv1d wants channels before length: (batch, 1, seq_len)
        conv_out = self.cnn(x.permute(0, 2, 1))
        # Back to (batch, steps, channels) for the batch_first LSTM
        steps = conv_out.permute(0, 2, 1)
        _, (h_n, _) = self.lstm(steps)
        # h_n is ordered layer by layer; its last entries belong to the top layer.
        if not self.bidir:
            return self.mlp(h_n[-1])
        # Top layer, forward then backward direction, side by side per sample.
        top_both = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.mlp(top_both)

# 4. Setup training components: model on GPU when CUDA is usable, MAE
#    objective, Adam, and a scheduler that halves the LR after 5 epochs
#    without validation improvement
# NOTE(review): `verbose` is deprecated in recent PyTorch releases — confirm
# the installed version still accepts it.
device   = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model    = HybridRegressor(seq_len=len(features)).to(device)
criterion= nn.L1Loss()
optimizer= torch.optim.Adam(model.parameters(), lr=5.1e-4, weight_decay=0.5e-5)
scheduler= torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=0.5, patience=5, verbose=True)

# 5. Training & validation loop
# The epoch count lives in one place so the loop bound and the progress line
# cannot drift apart.
# NOTE(review): 99 keeps the existing run length; bump it if 100 was intended.
num_epochs = 99
best_val = float('inf')
for epoch in range(1, num_epochs + 1):
    # Train
    model.train()
    total_train = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss  = criterion(preds, yb)
        loss.backward()
        # Cap the global gradient norm before stepping.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # criterion returns a batch mean; weight it by the batch size.
        total_train += loss.item() * xb.size(0)
    train_mae = total_train / len(train_loader.dataset)

    # Validate (eval mode, no gradients)
    model.eval()
    total_val = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            total_val += criterion(preds, yb).item() * xb.size(0)
    val_mae = total_val / len(val_loader.dataset)

    scheduler.step(val_mae)
    # Overwrite the checkpoint whenever validation MAE reaches a new low.
    if val_mae < best_val:
        best_val = val_mae
        torch.save(model.state_dict(), 'best_hybrid.pth')

    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d}/{num_epochs} - Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}")

print(f"Best Validation MAE: {best_val:.2f}")

# 6. Inference on test set with the best checkpoint
# map_location lets the checkpoint load onto whichever device is in use now,
# even if it was written from a different one.
model.load_state_dict(torch.load('best_hybrid.pth', map_location=device))
model.eval()
preds = []
with torch.no_grad():
    for xb in test_loader:
        # The test TensorDataset yields 1-tuples: (inputs,)
        xb = xb[0].to(device)
        preds.append(model(xb).cpu().numpy())
preds = np.vstack(preds).flatten()

# 7. Save submission
pd.DataFrame({'id': test_ids, 'yield': preds}).to_csv('submission.csv', index=False)
print("Done → saved hybrid model predictions to submission.csv")




Epoch  5/201 - Train MAE: 5364.33 | Val MAE: 5258.50
Epoch 10/201 - Train MAE: 2463.26 | Val MAE: 2088.70
Epoch 15/201 - Train MAE: 674.75 | Val MAE: 404.18
Epoch 20/201 - Train MAE: 603.79 | Val MAE: 292.61
Epoch 25/201 - Train MAE: 594.34 | Val MAE: 308.82
Epoch 30/201 - Train MAE: 587.33 | Val MAE: 303.68
Epoch 35/201 - Train MAE: 598.77 | Val MAE: 282.88
Epoch 40/201 - Train MAE: 578.86 | Val MAE: 270.41
Epoch 45/201 - Train MAE: 585.12 | Val MAE: 280.59
Epoch 50/201 - Train MAE: 581.04 | Val MAE: 277.02
Epoch 55/201 - Train MAE: 584.10 | Val MAE: 277.08
Epoch 60/201 - Train MAE: 564.65 | Val MAE: 267.08
Epoch 65/201 - Train MAE: 566.26 | Val MAE: 286.65
Epoch 70/201 - Train MAE: 561.76 | Val MAE: 272.93
Epoch 75/201 - Train MAE: 569.68 | Val MAE: 268.95
Epoch 80/201 - Train MAE: 558.34 | Val MAE: 269.91
Epoch 85/201 - Train MAE: 556.47 | Val MAE: 270.50
Epoch 90/201 - Train MAE: 562.83 | Val MAE: 264.56
Epoch 95/201 - Train MAE: 566.45 | Val MAE: 264.71
Best Validation MAE: 263.05