In [27]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [6]:
df = pd.read_csv('./Data/all_crops_combined_2.csv')
df.shape

(25464, 14)

In [7]:
df.head()

Unnamed: 0,commodity_desc,reference_period_desc,year,state_ansi,state_name,county_ansi,county_name,asd_code,asd_desc,domain_desc,source_desc,agg_level_desc,"PRODUCTION, MEASURED IN BU","YIELD, MEASURED IN BU / ACRE"
0,WHEAT,YEAR,2017,5,ARKANSAS,3,ASHLEY,90,SOUTHEAST,TOTAL,SURVEY,COUNTY,76000.0,63.3
1,WHEAT,YEAR,2017,5,ARKANSAS,37,CROSS,60,EAST CENTRAL,TOTAL,SURVEY,COUNTY,282000.0,65.6
2,WHEAT,YEAR,2017,5,ARKANSAS,69,JEFFERSON,90,SOUTHEAST,TOTAL,SURVEY,COUNTY,240000.0,44.4
3,WHEAT,YEAR,2017,5,ARKANSAS,77,LEE,60,EAST CENTRAL,TOTAL,SURVEY,COUNTY,225000.0,47.9
4,WHEAT,YEAR,2017,5,ARKANSAS,79,LINCOLN,90,SOUTHEAST,TOTAL,SURVEY,COUNTY,206000.0,45.8


In [None]:
unique_commodity_values = df['asd_code'].unique()
unique_commodity_values

array([90, 60, 30, 50, 51, 70, 20, 10, 80, 40, 11, 96, 12, 22, 81, 21, 97,
       82, 91, 52])

In [22]:
df2 = df.drop(columns=['reference_period_desc', 'state_name', 'county_name', 'asd_desc', 'domain_desc', 'source_desc', 'agg_level_desc'])

In [23]:
df2.head()

Unnamed: 0,commodity_desc,year,state_ansi,county_ansi,asd_code,"PRODUCTION, MEASURED IN BU","YIELD, MEASURED IN BU / ACRE"
0,WHEAT,2017,5,3,90,76000.0,63.3
1,WHEAT,2017,5,37,60,282000.0,65.6
2,WHEAT,2017,5,69,90,240000.0,44.4
3,WHEAT,2017,5,77,60,225000.0,47.9
4,WHEAT,2017,5,79,90,206000.0,45.8


In [24]:
df3 = pd.get_dummies(df2, columns=['commodity_desc'])
df3.head()

Unnamed: 0,year,state_ansi,county_ansi,asd_code,"PRODUCTION, MEASURED IN BU","YIELD, MEASURED IN BU / ACRE",commodity_desc_CORN,commodity_desc_SOYBEANS,commodity_desc_WHEAT
0,2017,5,3,90,76000.0,63.3,False,False,True
1,2017,5,37,60,282000.0,65.6,False,False,True
2,2017,5,69,90,240000.0,44.4,False,False,True
3,2017,5,77,60,225000.0,47.9,False,False,True
4,2017,5,79,90,206000.0,45.8,False,False,True


In [26]:
df3.to_csv('./Data/all_crops_combined_cleaned.csv')

## Data preprocessing - division into categorical / continious

In [28]:
target_col = "YIELD, MEASURED IN BU / ACRE"

# Drop index-like column if present
drop_cols = [target_col]
if "Unnamed: 0" in df3.columns:
    drop_cols.append("Unnamed: 0")

X = df3.drop(columns=drop_cols)
y = df3[target_col].values.astype(np.float32)

# ======================
# 2. Split into cat / cont
# ======================
# Treat ints + bools as categorical, floats as continuous
cat_cols = [c for c in X.columns if str(X[c].dtype) in ("int64", "bool")]
cont_cols = [c for c in X.columns if c not in cat_cols]

X_cat = X[cat_cols].copy()
X_cont = X[cont_cols].copy()

# Encode categorical cols as 0..n-1
for c in cat_cols:
    X_cat[c] = X_cat[c].astype("category").cat.codes

In [29]:
print(f"Categorical columns: {cat_cols}"
      f"\nContinuous columns: {cont_cols}")

Categorical columns: ['year', 'state_ansi', 'county_ansi', 'asd_code', 'commodity_desc_CORN', 'commodity_desc_SOYBEANS', 'commodity_desc_WHEAT']
Continuous columns: ['PRODUCTION, MEASURED IN BU']


In [30]:
# Scale continuous features
scaler = StandardScaler()
if len(cont_cols) > 0:
    X_cont_scaled = scaler.fit_transform(X_cont.values.astype(np.float32))
else:
    X_cont_scaled = np.zeros((len(df3), 0), dtype=np.float32)

X_cat_vals = X_cat.values.astype(np.int64)
X_cont_vals = X_cont_scaled.astype(np.float32)

# Cardinalities for each categorical feature (for embeddings)
cat_cardinalities = [int(X_cat[c].nunique()) for c in cat_cols]

print("Categorical columns:", cat_cols)
print("Continuous columns:", cont_cols)
print("Cat cardinalities:", cat_cardinalities)

Categorical columns: ['year', 'state_ansi', 'county_ansi', 'asd_code', 'commodity_desc_CORN', 'commodity_desc_SOYBEANS', 'commodity_desc_WHEAT']
Continuous columns: ['PRODUCTION, MEASURED IN BU']
Cat cardinalities: [7, 38, 206, 20, 2, 2, 2]


## Train / Test data

In [31]:
X_train_cat, X_test_cat, X_train_cont, X_test_cont, y_train, y_test = train_test_split(
    X_cat_vals,
    X_cont_vals,
    y,
    test_size=0.2,
    random_state=42,
)

# ======================
# 4. PyTorch Dataset / DataLoader
# ======================
class TabularDataset(Dataset):
    def __init__(self, x_cont, x_cat, y):
        self.x_cont = x_cont
        self.x_cat = x_cat
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        x_cont = self.x_cont[idx]
        x_cat = self.x_cat[idx]
        y = self.y[idx]
        return (
            torch.tensor(x_cont, dtype=torch.float32),
            torch.tensor(x_cat, dtype=torch.long),
            torch.tensor(y, dtype=torch.float32),
        )

batch_size = 256

train_ds = TabularDataset(X_train_cont, X_train_cat, y_train)
test_ds  = TabularDataset(X_test_cont,  X_test_cat,  y_test)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False)

## Transformer class initialization

In [32]:
# ======================
# 5. FT-Transformer model
#    (features as tokens)
# ======================
class FTTransformer(nn.Module):
    def __init__(
        self,
        num_cont,
        cat_cardinalities,
        d_model=32,
        n_heads=4,
        n_layers=3,
        dropout=0.1,
    ):
        super().__init__()
        self.num_cont = num_cont
        self.num_cat = len(cat_cardinalities)
        self.d_model = d_model

        # Continuous feature embeddings: one vector per feature
        if num_cont > 0:
            self.cont_emb = nn.Parameter(torch.randn(num_cont, d_model))
        else:
            self.register_parameter("cont_emb", None)

        # Categorical embeddings: one embedding table per feature
        self.cat_embeds = nn.ModuleList(
            [nn.Embedding(card, d_model) for card in cat_cardinalities]
        )

        n_tokens = self.num_cont + self.num_cat

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.norm = nn.LayerNorm(d_model)

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, x_cont, x_cat):
        tokens = []

        if self.num_cont > 0:
            # x_cont: (B, num_cont)
            # cont_emb: (num_cont, d_model)
            cont_tok = x_cont.unsqueeze(-1) * self.cont_emb.unsqueeze(0)  # (B, num_cont, d_model)
            tokens.append(cont_tok)

        if self.num_cat > 0:
            cat_tok_list = []
            for i, emb in enumerate(self.cat_embeds):
                cat_tok_list.append(emb(x_cat[:, i]))  # (B, d_model)
            cat_tok = torch.stack(cat_tok_list, dim=1)  # (B, num_cat, d_model)
            tokens.append(cat_tok)

        x = torch.cat(tokens, dim=1)  # (B, n_tokens, d_model)
        x = self.encoder(x)
        x = self.norm(x)
        x = x.mean(dim=1)  # mean pooling over tokens
        out = self.head(x).squeeze(-1)  # (B,)
        return out

## Model training

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FTTransformer(
    num_cont=X_cont_vals.shape[1],
    cat_cardinalities=cat_cardinalities,
    d_model=64,
    n_heads=4,
    n_layers=3,
    dropout=0.1,
).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

epochs = 20

for epoch in range(1, epochs + 1):
    model.train()
    running_loss = 0.0
    for x_cont_batch, x_cat_batch, y_batch in train_loader:
        x_cont_batch = x_cont_batch.to(device)
        x_cat_batch = x_cat_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        preds = model(x_cont_batch, x_cat_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * len(y_batch)

    epoch_loss = running_loss / len(train_ds)
    print(f"Epoch {epoch:02d}/{epochs} - Train MSE: {epoch_loss:.4f}")

Epoch 01/20 - Train MSE: 10420.8352
Epoch 02/20 - Train MSE: 4009.0553
Epoch 03/20 - Train MSE: 801.5726
Epoch 04/20 - Train MSE: 426.4412
Epoch 05/20 - Train MSE: 339.2671
Epoch 06/20 - Train MSE: 301.1262
Epoch 07/20 - Train MSE: 270.2439
Epoch 08/20 - Train MSE: 250.6798
Epoch 09/20 - Train MSE: 239.1180
Epoch 10/20 - Train MSE: 230.3146
Epoch 11/20 - Train MSE: 215.5777
Epoch 12/20 - Train MSE: 210.0133
Epoch 13/20 - Train MSE: 198.8358
Epoch 14/20 - Train MSE: 195.5610
Epoch 15/20 - Train MSE: 188.6199
Epoch 16/20 - Train MSE: 183.2355
Epoch 17/20 - Train MSE: 182.3424
Epoch 18/20 - Train MSE: 175.6579
Epoch 19/20 - Train MSE: 172.9424
Epoch 20/20 - Train MSE: 165.8613


## Evaluation results

In [34]:
# ======================
# 7. Evaluate: R², RMSE, MAE
# ======================
model.eval()
y_pred_list = []

with torch.no_grad():
    for x_cont_batch, x_cat_batch, y_batch in test_loader:
        x_cont_batch = x_cont_batch.to(device)
        x_cat_batch = x_cat_batch.to(device)

        preds = model(x_cont_batch, x_cat_batch)
        y_pred_list.append(preds.cpu().numpy())

y_pred = np.concatenate(y_pred_list)
y_true = y_test

r2 = r2_score(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)

print("\n=== Evaluation on test set ===")
print(f"R²   : {r2:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"MAE  : {mae:.4f}")


=== Evaluation on test set ===
R²   : 0.9535
RMSE : 12.5746
MAE  : 8.7096


