# Import modules

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler

# CNN

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# 1. Load and prepare data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

features = [c for c in train_df.columns if c not in ('id', 'Row#', 'yield')]
X = train_df[features].values
y = train_df['yield'].values
X_test = test_df[features].values
test_ids = test_df['id'].values

# 2. Scale data
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# 3. Train-val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Convert to PyTorch tensors - reshape for CNN (batch, channels, length)
X_train_t = torch.from_numpy(X_train).float().unsqueeze(1)  # Add channel dimension
y_train_t = torch.from_numpy(y_train).float().unsqueeze(1)
X_val_t = torch.from_numpy(X_val).float().unsqueeze(1)
y_val_t = torch.from_numpy(y_val).float().unsqueeze(1)
X_test_t = torch.from_numpy(X_test).float().unsqueeze(1)

# 5. DataLoader
train_ds = TensorDataset(X_train_t, y_train_t)
val_ds = TensorDataset(X_val_t, y_val_t)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=128)

# 6. CNN Model
class CNNRegressor(nn.Module):
    def __init__(self, input_length, num_channels=64):
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv1d(1, num_channels, kernel_size=5, padding=2),
            nn.BatchNorm1d(num_channels),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(num_channels, num_channels*2, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_channels*2),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(num_channels*2, num_channels*4, kernel_size=3, padding=1),
            nn.BatchNorm1d(num_channels*4),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )
        self.fc = nn.Sequential(
            nn.Linear(num_channels*4, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = x.view(x.size(0), -1)  # Flatten
        return self.fc(x)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNNRegressor(input_length=len(features)).to(device)

# 7. Loss and optimizer
criterion = nn.L1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

# 8. Training with early stopping
num_epochs = 200
best_mae = float('inf')
patience = 15
patience_counter = 0

for epoch in range(1, num_epochs+1):
    # Training
    model.train()
    train_mae = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_mae += loss.item() * xb.size(0)
    
    # Validation
    model.eval()
    val_mae = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            val_mae += criterion(preds, yb).item() * xb.size(0)
    
    train_mae /= len(train_loader.dataset)
    val_mae /= len(val_loader.dataset)
    scheduler.step(val_mae)

    if epoch % 10 == 0:
        print(f'Epoch {epoch:3d}/{num_epochs} - Train MAE: {train_mae:.4f} - Val MAE: {val_mae:.4f}')

# 9. Test prediction
model.eval()
with torch.no_grad():
    X_test_t = X_test_t.to(device)
    y_pred = model(X_test_t).cpu().numpy().flatten()

# 10. Save submission
submission = pd.DataFrame({
    'id': test_ids,
    'yield': y_pred
})
submission.to_csv('CNN_prediction.csv', index=False)
print(f"Done → CNN predictions saved.")

Epoch  10/200 - Train MAE: 513.9194 - Val MAE: 287.4022
Epoch  20/200 - Train MAE: 490.1140 - Val MAE: 284.1623
Epoch  30/200 - Train MAE: 485.4239 - Val MAE: 285.0510
Epoch  40/200 - Train MAE: 477.6255 - Val MAE: 263.8176
Epoch  50/200 - Train MAE: 481.2920 - Val MAE: 261.5328
Epoch  60/200 - Train MAE: 473.4218 - Val MAE: 274.1691
Epoch  70/200 - Train MAE: 477.3768 - Val MAE: 261.4722
Epoch  80/200 - Train MAE: 479.1592 - Val MAE: 262.4379
Epoch  90/200 - Train MAE: 471.6675 - Val MAE: 260.6419
Epoch 100/200 - Train MAE: 473.7440 - Val MAE: 260.6624
Epoch 110/200 - Train MAE: 470.4218 - Val MAE: 259.9990
Epoch 120/200 - Train MAE: 472.3900 - Val MAE: 260.8253
Epoch 130/200 - Train MAE: 476.7619 - Val MAE: 263.0355
Epoch 140/200 - Train MAE: 474.2400 - Val MAE: 262.6856
Epoch 150/200 - Train MAE: 485.3300 - Val MAE: 260.3950
Epoch 160/200 - Train MAE: 478.0532 - Val MAE: 262.3427
Epoch 170/200 - Train MAE: 480.0227 - Val MAE: 260.6604
Epoch 180/200 - Train MAE: 468.0858 - Val MAE: 2

# RNN

In [None]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Feature selection
features = [c for c in train_df.columns if c not in ('id','Row#','yield')]

# Prepare training data
X_train = train_df[features].values
y_train = train_df['yield'].values

# Standardize using only training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Prepare training dataset and loaders
train_dataset = TensorDataset(
    torch.from_numpy(X_train_scaled).float().unsqueeze(-1),
    torch.from_numpy(y_train).float().unsqueeze(1)
)
n_val = int(len(train_dataset) * 0.2)
n_tr = len(train_dataset) - n_val
train_ds, val_ds = random_split(train_dataset, [n_tr, n_val])
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# Define model
class RNNRegressor(nn.Module):
    def __init__(self, seq_len, hidden_size=128, num_layers=2, bidirectional=True, dropout=0.2):
        super().__init__()
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.GRU(
            input_size=1,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0
        )
        self.fc1 = nn.Linear(hidden_size * self.num_directions, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.drop = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        last = h_n.view(self.lstm.num_layers, self.num_directions, x.size(0), self.lstm.hidden_size)[-1]
        last = last.transpose(0, 1).contiguous().view(x.size(0), -1)
        x = torch.relu(self.bn1(self.fc1(last)))
        x = self.drop(x)
        return self.fc2(x)

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNRegressor(seq_len=len(features)).to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

def run_epoch(loader, train=True):
    model.train() if train else model.eval()
    total_loss = 0.0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        if train:
            optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        if train:
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

# Training loop
num_epochs = 500
best_val = float('inf')
for epoch in range(1, num_epochs+1):
    train_mae = run_epoch(train_loader, train=True)
    val_mae = run_epoch(val_loader, train=False)
    scheduler.step(val_mae)
    if val_mae < best_val:
        best_val = val_mae
        torch.save(model.state_dict(), 'best_rnn.pth')
    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d}/{num_epochs} — Train MAE: {train_mae:.2f}  |  Val MAE: {val_mae:.2f}")
print(f"▶ Best validation MAE: {best_val:.2f}")

# --- Inference on test set ---
model.load_state_dict(torch.load('best_rnn.pth'))

X_test = test_df[features].values
X_test_scaled = scaler.transform(X_test)
test_tensor = torch.from_numpy(X_test_scaled).float().unsqueeze(-1).to(device)

model.eval()
with torch.no_grad():
    test_preds = model(test_tensor).cpu().numpy().squeeze()

# Save predictions
submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})
submission.to_csv('RNN_prediction.csv', index=False)



Epoch  5/500 — Train MAE: 5986.10  |  Val MAE: 6004.67
Epoch 10/500 — Train MAE: 5927.44  |  Val MAE: 5940.97
Epoch 15/500 — Train MAE: 5832.06  |  Val MAE: 5847.16
Epoch 20/500 — Train MAE: 5702.14  |  Val MAE: 5711.24
Epoch 25/500 — Train MAE: 5540.48  |  Val MAE: 5546.87
Epoch 30/500 — Train MAE: 5342.49  |  Val MAE: 5347.53
Epoch 35/500 — Train MAE: 5129.99  |  Val MAE: 5140.02
Epoch 40/500 — Train MAE: 4872.22  |  Val MAE: 4870.95
Epoch 45/500 — Train MAE: 4599.44  |  Val MAE: 4616.84
Epoch 50/500 — Train MAE: 4311.65  |  Val MAE: 4316.42
Epoch 55/500 — Train MAE: 3973.09  |  Val MAE: 3955.39
Epoch 60/500 — Train MAE: 3639.88  |  Val MAE: 3659.67
Epoch 65/500 — Train MAE: 3241.05  |  Val MAE: 3289.30
Epoch 70/500 — Train MAE: 2875.15  |  Val MAE: 2819.38
Epoch 75/500 — Train MAE: 2456.98  |  Val MAE: 2404.19
Epoch 80/500 — Train MAE: 2041.62  |  Val MAE: 1957.86
Epoch 85/500 — Train MAE: 1660.85  |  Val MAE: 1643.17
Epoch 90/500 — Train MAE: 1273.92  |  Val MAE: 1273.75
Epoch 95/5

# LSTM

In [10]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Select features
features = [c for c in train_df.columns if c not in ('id', 'Row#', 'yield')]
X_train = train_df[features].values
y_train = train_df['yield'].values

# Fit scaler only on training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Prepare PyTorch datasets
train_dataset = TensorDataset(
    torch.from_numpy(X_train_scaled).float().unsqueeze(-1),
    torch.from_numpy(y_train).float().unsqueeze(1)
)

# Train-validation split
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_ds, val_ds = random_split(train_dataset, [train_size, val_size])
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# Define LSTM Regressor
class LSTMRegressor(nn.Module):
    def __init__(self, seq_len, hidden_size=128, num_layers=2, dropout=0.2, bidirectional=True):
        super().__init__()
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(
            input_size=1, hidden_size=hidden_size,
            num_layers=num_layers, dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=bidirectional, batch_first=True
        )
        self.fc1 = nn.Linear(hidden_size * self.num_directions, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        h_last = h_n.view(self.lstm.num_layers, self.num_directions, x.size(0), self.lstm.hidden_size)[-1]
        h_last = h_last.transpose(0, 1).reshape(x.size(0), -1)
        x = torch.relu(self.bn1(self.fc1(h_last)))
        x = self.dropout(x)
        return self.fc2(x)

# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMRegressor(seq_len=len(features)).to(device)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)

# Training loop
best_val_mae = float('inf')
for epoch in range(1, 501):
    model.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_mae = train_loss / len(train_loader.dataset)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            val_loss += criterion(preds, yb).item() * xb.size(0)
    val_mae = val_loss / len(val_loader.dataset)

    scheduler.step(val_mae)
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        torch.save(model.state_dict(), 'best_lstm.pth')

    if epoch % 5 == 0:
        print(f'Epoch {epoch:2d}/500 — Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}')

print(f'Best Validation MAE: {best_val_mae:.2f}')

# --- Inference on test.csv ---
model.load_state_dict(torch.load('best_lstm.pth'))
model.eval()

X_test = test_df[features].values
X_test_scaled = scaler.transform(X_test)
X_test_tensor = torch.from_numpy(X_test_scaled).float().unsqueeze(-1).to(device)

with torch.no_grad():
    test_preds = model(X_test_tensor).cpu().numpy().squeeze()

# Save predictions
submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})
submission.to_csv('LSTM_prediction.csv', index=False)



Epoch  5/500 — Train MAE: 5994.45 | Val MAE: 5963.30
Epoch 10/500 — Train MAE: 5932.53 | Val MAE: 5896.19
Epoch 15/500 — Train MAE: 5836.83 | Val MAE: 5801.42
Epoch 20/500 — Train MAE: 5707.04 | Val MAE: 5668.41
Epoch 25/500 — Train MAE: 5545.16 | Val MAE: 5503.42
Epoch 30/500 — Train MAE: 5355.18 | Val MAE: 5301.85
Epoch 35/500 — Train MAE: 5133.67 | Val MAE: 5081.63
Epoch 40/500 — Train MAE: 4882.31 | Val MAE: 4862.46
Epoch 45/500 — Train MAE: 4684.94 | Val MAE: 4644.23
Epoch 50/500 — Train MAE: 4418.45 | Val MAE: 4392.07
Epoch 55/500 — Train MAE: 4069.29 | Val MAE: 4015.41
Epoch 60/500 — Train MAE: 3757.99 | Val MAE: 3703.87
Epoch 65/500 — Train MAE: 3349.78 | Val MAE: 3319.20
Epoch 70/500 — Train MAE: 3007.07 | Val MAE: 2956.64
Epoch 75/500 — Train MAE: 2593.40 | Val MAE: 2514.20
Epoch 80/500 — Train MAE: 2203.90 | Val MAE: 2123.79
Epoch 85/500 — Train MAE: 1734.51 | Val MAE: 1659.52
Epoch 90/500 — Train MAE: 1368.05 | Val MAE: 1319.12
Epoch 95/500 — Train MAE: 1083.41 | Val MAE: 1

# Transformer

In [11]:
df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

features = [c for c in df.columns if c not in ('id', 'Row#', 'yield')]
X = df[features].values
y = df['yield'].values
X_test = test_df[features].values
test_ids = test_df['id'].values

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# 2. Prepare PyTorch datasets
data = TensorDataset(
    torch.from_numpy(X).float().unsqueeze(-1),
    torch.from_numpy(y).float().unsqueeze(1)
)
train_size = int(0.8 * len(data))
val_size   = len(data) - train_size
train_ds, val_ds = random_split(data, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
test_loader  = DataLoader(
    TensorDataset(torch.from_numpy(X_test).float().unsqueeze(-1)),
    batch_size=64
)

# 3. Positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len]

# 4. Transformer regressor
class TransformerRegressor(nn.Module):
    def __init__(self, seq_len, d_model=64, nhead=8,
                 num_layers=2, dim_ff=128, dropout=0.1):
        super().__init__()
        self.embed = nn.Linear(1, d_model)
        self.pos_enc = PositionalEncoding(d_model, max_len=seq_len)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_ff,
            dropout=dropout,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer,
                                             num_layers=num_layers)
        
        self.regressor = nn.Sequential(
            nn.Linear(d_model, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        x = self.embed(x)             # (batch, seq_len, d_model)
        x = self.pos_enc(x)
        x = self.encoder(x)           # (batch, seq_len, d_model)
        x = x.mean(dim=1)             # Global average pooling
        return self.regressor(x)

# 5. Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerRegressor(seq_len=len(features)).to(device)

criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5, verbose=True
)

# 6. Training & validation loops
best_val_mae = float('inf')
for epoch in range(1, 501):  # 100 epochs
    # Training
    model.train()
    train_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item() * xb.size(0)
    train_mae = train_loss / len(train_loader.dataset)
    
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            val_loss += criterion(preds, yb).item() * xb.size(0)
    val_mae = val_loss / len(val_loader.dataset)
    
    scheduler.step(val_mae)
    
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        torch.save(model.state_dict(), 'best_transformer.pth')
    
    if epoch % 5 == 0:
        print(f'Epoch {epoch:3d}/500 — Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}')

print(f"▶ Best Validation MAE: {best_val_mae:.2f}")

# 7. Inference on test set
model.load_state_dict(torch.load('best_transformer.pth'))
model.eval()
preds = []
with torch.no_grad():
    for xb in test_loader:
        xb = xb[0].to(device)
        preds.append(model(xb).cpu().numpy())
preds = np.vstack(preds).flatten()

# 8. Save submission
submission = pd.DataFrame({'id': test_ids, 'yield': preds})
submission.to_csv('Transformer_prediction.csv', index=False)



Epoch   5/500 — Train MAE: 5749.81 | Val MAE: 5700.79
Epoch  10/500 — Train MAE: 4560.94 | Val MAE: 4375.72
Epoch  15/500 — Train MAE: 1898.07 | Val MAE: 1578.66
Epoch  20/500 — Train MAE: 543.71 | Val MAE: 345.77
Epoch  25/500 — Train MAE: 500.65 | Val MAE: 282.29
Epoch  30/500 — Train MAE: 487.36 | Val MAE: 280.92
Epoch  35/500 — Train MAE: 489.57 | Val MAE: 278.56
Epoch  40/500 — Train MAE: 477.47 | Val MAE: 288.34
Epoch  45/500 — Train MAE: 476.31 | Val MAE: 270.45
Epoch  50/500 — Train MAE: 468.04 | Val MAE: 270.17
Epoch  55/500 — Train MAE: 474.57 | Val MAE: 268.49
Epoch  60/500 — Train MAE: 473.38 | Val MAE: 264.54
Epoch  65/500 — Train MAE: 462.77 | Val MAE: 269.48
Epoch  70/500 — Train MAE: 466.37 | Val MAE: 269.74
Epoch  75/500 — Train MAE: 461.49 | Val MAE: 268.75
Epoch  80/500 — Train MAE: 467.85 | Val MAE: 266.98
Epoch  85/500 — Train MAE: 466.92 | Val MAE: 266.44
Epoch  90/500 — Train MAE: 464.27 | Val MAE: 266.40
Epoch  95/500 — Train MAE: 470.77 | Val MAE: 265.69
Epoch 

# Hybrid

In [12]:
train_df = pd.read_csv('data/train.csv')
test_df  = pd.read_csv('data/test.csv')
features = [c for c in train_df.columns if c not in ('id','Row#','yield')]

X = train_df[features].values
y = train_df['yield'].values
X_test = test_df[features].values
test_ids = test_df['id'].values

scaler = StandardScaler()
X      = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

# 2. Create PyTorch datasets and loaders
data = TensorDataset(torch.from_numpy(X).float().unsqueeze(-1),
                     torch.from_numpy(y).float().unsqueeze(1))
train_size = int(0.8 * len(data))
val_size   = len(data) - train_size
train_ds, val_ds = random_split(data, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64)
test_loader  = DataLoader(TensorDataset(torch.from_numpy(X_test).float().unsqueeze(-1)),
                          batch_size=64)

# 3. Define a CNN + LSTM hybrid regressor
class HybridRegressor(nn.Module):
    def __init__(self, seq_len, cnn_channels=(16,32), 
                 lstm_hidden=64, lstm_layers=2, bidir=True):
        super().__init__()
        # CNN branch
        self.cnn = nn.Sequential(
            nn.Conv1d(1, cnn_channels[0], kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(cnn_channels[0], cnn_channels[1], kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )
        # LSTM branch (on CNN features)
        self.lstm = nn.LSTM(
            input_size=cnn_channels[1],
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=bidir,
            dropout=0.2 if lstm_layers>1 else 0.0
        )
        self.bidir = bidir
        # Final MLP
        mlp_in = lstm_hidden * (2 if bidir else 1)
        self.mlp = nn.Sequential(
            nn.Linear(mlp_in, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # x: (batch, seq_len, 1) -> CNN expects (batch, 1, seq_len)
        x = x.transpose(1,2)
        x = self.cnn(x)
        # x: (batch, channels, seq_len/4) -> back to (batch, seq_len/4, channels)
        x = x.transpose(1,2)
        # LSTM
        _, (h_n, _) = self.lstm(x)
        # pick last layer hidden: shape (num_layers * directions, batch, hidden)
        if self.bidir:
            # get last forward & backward hidden
            h_last = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            h_last = h_n[-1]
        # final regressor
        return self.mlp(h_last)

# 4. Setup training components
device   = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model    = HybridRegressor(seq_len=len(features)).to(device)
criterion= nn.L1Loss()
optimizer= torch.optim.Adam(model.parameters(), lr=5.1e-4, weight_decay=0.5e-5)
scheduler= torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=0.5, patience=5, verbose=True)

# 5. Training & validation loop
best_val = float('inf')
for epoch in range(1, 100):
    # Train
    model.train()
    total_train = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss  = criterion(preds, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_train += loss.item() * xb.size(0)
    train_mae = total_train / len(train_loader.dataset)

    # Validate
    model.eval()
    total_val = 0.0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb)
            total_val += criterion(preds, yb).item() * xb.size(0)
    val_mae = total_val / len(val_loader.dataset)

    scheduler.step(val_mae)
    if val_mae < best_val:
        best_val = val_mae
        torch.save(model.state_dict(), 'best_hybrid.pth')

    if epoch % 5 == 0:
        print(f"Epoch {epoch:2d}/201 - Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}")

print(f"Best Validation MAE: {best_val:.2f}")

# 6. Inference on test set
model.load_state_dict(torch.load('best_hybrid.pth'))
model.eval()
preds = []
with torch.no_grad():
    for xb in test_loader:
        xb = xb[0].to(device)
        preds.append(model(xb).cpu().numpy())
preds = np.vstack(preds).flatten()

# 7. Save submission
pd.DataFrame({'id': test_ids, 'yield': preds}).to_csv('Hybrid_prediction.csv', index=False)



Epoch  5/201 - Train MAE: 5446.97 | Val MAE: 5342.18
Epoch 10/201 - Train MAE: 2927.53 | Val MAE: 2582.31
Epoch 15/201 - Train MAE: 780.40 | Val MAE: 477.29
Epoch 20/201 - Train MAE: 641.29 | Val MAE: 336.35
Epoch 25/201 - Train MAE: 632.57 | Val MAE: 301.21
Epoch 30/201 - Train MAE: 629.88 | Val MAE: 283.80
Epoch 35/201 - Train MAE: 617.93 | Val MAE: 265.28
Epoch 40/201 - Train MAE: 617.97 | Val MAE: 271.09
Epoch 45/201 - Train MAE: 620.80 | Val MAE: 267.66
Epoch 50/201 - Train MAE: 619.95 | Val MAE: 276.18
Epoch 55/201 - Train MAE: 610.00 | Val MAE: 266.24
Epoch 60/201 - Train MAE: 616.07 | Val MAE: 268.45
Epoch 65/201 - Train MAE: 604.84 | Val MAE: 268.92
Epoch 70/201 - Train MAE: 618.20 | Val MAE: 270.47
Epoch 75/201 - Train MAE: 611.60 | Val MAE: 269.29
Epoch 80/201 - Train MAE: 615.94 | Val MAE: 269.13
Epoch 85/201 - Train MAE: 609.41 | Val MAE: 269.14
Epoch 90/201 - Train MAE: 613.43 | Val MAE: 269.28
Epoch 95/201 - Train MAE: 614.83 | Val MAE: 270.35
Best Validation MAE: 265.28