MTL with kmeans clutering and predict the nearest cluster for a sample and then run that specific model

Expt 1 - No clustering

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


In [None]:

# Load and inspect dataset
df1 = pd.read_csv("../../data/tr_data.csv")
df2 = pd.read_csv("../../data/te_data.csv")
df = pd.concat([df1,df2],axis=0)
df.head()


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# === Load train and test data separately ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")    # Replace with your actual path

# === Drop irrelevant columns ===
drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature','longitute','latitude','env']
target_class_col = 'Specie'
target_reg_col = 'Productivity (y)'

train_df.drop(columns=drop_cols, errors='ignore', inplace=True)
test_df.drop(columns=drop_cols, errors='ignore', inplace=True)

# === Encode species labels ===
label_encoder = LabelEncoder()
train_df['species_encoded'] = label_encoder.fit_transform(train_df[target_class_col])
test_df['species_encoded'] = label_encoder.transform(test_df[target_class_col])

# === Correlation Analysis on Train Set ===
# Compute correlation matrix
# Select only numeric columns for correlation
numeric_train_df = train_df.select_dtypes(include=[np.number])

# Compute correlation matrix
corr_matrix = numeric_train_df.corr()

# Find features most correlated with Productivity (y)
correlation_threshold = 0.3  # you can adjust
strong_corr_features = corr_matrix[target_reg_col].abs()
selected_features = strong_corr_features[strong_corr_features > correlation_threshold].index.tolist()

# Remove target columns themselves
selected_features = [f for f in selected_features if f not in [target_reg_col, target_class_col, 'species_encoded']]

print(f"Selected Features after Correlation Analysis: {selected_features}")


# === Prepare Feature and Target Arrays ===

# Train Features and Targets
X_train = train_df[selected_features].fillna(0).values
y_species_train = train_df['species_encoded'].values
y_prod_train = train_df[target_reg_col].values

# Test Features and Targets
X_test = test_df[selected_features].fillna(0).values
y_species_test = test_df['species_encoded'].values
y_prod_test = test_df[target_reg_col].values

# === Standardize Features based on Train ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Now you have:
# - X_train_scaled, y_species_train, y_prod_train
# - X_test_scaled, y_species_test, y_prod_test


In [None]:

# PyTorch dataset
class PineDataset(Dataset):
    def __init__(self, X, y_species, y_productivity):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y_species = torch.tensor(y_species, dtype=torch.long)
        self.y_productivity = torch.tensor(y_productivity, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y_species[idx], self.y_productivity[idx]

train_dataset = PineDataset(X_train, y_species_train, y_prod_train)
test_dataset = PineDataset(X_test, y_species_test, y_prod_test)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512)


In [None]:
class SpeciesProductivityModel(nn.Module):
    def __init__(self, input_dim, num_classes, use_uncertainty=True,alpha =1):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.classifier = nn.Linear(64, num_classes)
        self.regressor = nn.Linear(64, 1)
        
        self.alpha = alpha
        # For uncertainty weighting
        self.use_uncertainty = use_uncertainty
        if use_uncertainty:
            self.log_sigma_cls = nn.Parameter(torch.tensor(0.0))
            self.log_sigma_reg = nn.Parameter(torch.tensor(0.0))

    def forward(self, x):
        x = self.shared(x)
        class_logits = self.classifier(x)
        regression = self.regressor(x).squeeze(1)
        return class_logits, regression


In [None]:
def uncertainty_weighted_loss(loss_cls, loss_reg, model):
    sigma_c = model.log_sigma_cls
    sigma_r = model.log_sigma_reg

    loss = (1 / (2 * torch.exp(sigma_c) ** 2)) * loss_cls + sigma_c
    loss += (1 / (2 * torch.exp(sigma_r) ** 2)) * loss_reg + sigma_r
    return loss

In [None]:
input_dim = X_train.shape[1]
num_classes = len(np.unique(y_species_train))
model = SpeciesProductivityModel(input_dim, num_classes)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_class = nn.CrossEntropyLoss()
loss_regression = nn.MSELoss()

In [None]:

for epoch in range(100):
    model.train()
    total_loss=0
    for x_batch, y_cls, y_reg in train_loader:
        logits, preds = model(x_batch)
        loss_cls = loss_class(logits, y_cls)
        loss_reg = loss_regression(preds, y_reg)
      
        loss = uncertainty_weighted_loss(loss_cls, loss_reg, model)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")


In [None]:

# Evaluation
model.eval()
all_preds_cls, all_true_cls = [], []
all_preds_reg, all_true_reg = [], []

with torch.no_grad():
    for x_batch, y_cls, y_reg in test_loader:
        logits, preds = model(x_batch)
        y_pred_cls = torch.argmax(logits, dim=1)
        all_preds_cls.extend(y_pred_cls.numpy())
        all_true_cls.extend(y_cls.numpy())
        all_preds_reg.extend(preds.numpy())
        all_true_reg.extend(y_reg.numpy())

accuracy = accuracy_score(all_true_cls, all_preds_cls)
rmse = np.sqrt(mean_squared_error(all_true_reg, all_preds_reg))
r2 = r2_score(all_true_reg, all_preds_reg)

print(f"✅ Species Classification Accuracy: {accuracy:.4f}")
print(f"✅ Productivity RMSE: {rmse:.4f}")
print(f"✅ Productivity R² Score: {r2:.4f}")


In [None]:
topk = 3
all_top3_preds = []
all_true_species = []
all_pred_productivity = []
all_true_productivity = []

model.eval()
with torch.no_grad():
    for x_batch, y_cls, y_reg in test_loader:
        logits, pred_reg = model(x_batch)

        # Get top-3 class predictions
        top3 = torch.topk(logits, k=topk, dim=1).indices  # (batch_size, 3)

        all_top3_preds.extend(top3.numpy())
        all_true_species.extend(y_cls.numpy())
        all_pred_productivity.extend(pred_reg.numpy())
        all_true_productivity.extend(y_reg.numpy())

# ✅ Compute Top-3 Accuracy
correct_top3 = 0
for true, top3 in zip(all_true_species, all_top3_preds):
    if true in top3:
        correct_top3 += 1
top3_accuracy = correct_top3 / len(all_true_species)
print(f"✅ Top-3 Species Accuracy: {top3_accuracy:.4f}")


In [None]:
Expt 2 - Clustering

In [1]:
from scipy.stats import mode

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score, r2_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# === Load and clean data ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")

drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature','longitute','latitude']
train_df = train_df.drop(columns=drop_cols, errors='ignore')
test_df = test_df.drop(columns=drop_cols, errors='ignore')

# Encode species
label_encoder = LabelEncoder()
train_df['species_encoded'] = label_encoder.fit_transform(train_df['Specie'])
test_df['species_encoded'] = label_encoder.transform(test_df['Specie'])

# Keep numeric and clean
train_df = train_df.select_dtypes(include=[np.number]).dropna()
test_df = test_df.select_dtypes(include=[np.number]).dropna()

X_train = train_df.drop(columns=['Productivity (y)', 'species_encoded'])
y_train_cls = train_df['species_encoded']
y_train_reg = train_df['Productivity (y)']

X_test = test_df.drop(columns=['Productivity (y)', 'species_encoded'])
y_test_cls = test_df['species_encoded']
y_test_reg = test_df['Productivity (y)']

# === Standardize ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Cluster with KMeans ===
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
train_clusters = kmeans.fit_predict(X_train_scaled)

# Centroid classifier for test-time assignment
centroid_classifier = NearestCentroid()
centroid_classifier.fit(X_train_scaled, train_clusters)

# === Torch dataset ===
class PineDataset(Dataset):
    def __init__(self, X, y_cls, y_reg):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y_cls = torch.tensor(y_cls.values, dtype=torch.long)
        self.y_reg = torch.tensor(y_reg.values, dtype=torch.float32)

    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y_cls[idx], self.y_reg[idx]



In [2]:
class BetterMTLNet(nn.Module):
    def __init__(self, input_dim, num_classes, alpha=1.0):
        super().__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.classifier = nn.Linear(128, num_classes)
        self.regressor = nn.Linear(128, 1)

        # Uncertainty weighting params (log sigmas)
        self.log_sigma_cls = nn.Parameter(torch.tensor(0.0))
        self.log_sigma_reg = nn.Parameter(torch.tensor(0.0))
        self.alpha = alpha

    def forward(self, x):
        shared = self.shared(x)
        class_logits = self.classifier(shared)
        reg_output = self.regressor(shared).squeeze(1)
        return class_logits, reg_output


In [4]:
def souncertainty_weighted_loss(loss_cls, loss_reg, model):
    sigma_cls = model.log_sigma_cls
    sigma_reg = model.log_sigma_reg
    alpha = model.alpha

    loss = (0.5 * loss_cls / torch.exp(2 * sigma_cls)) + alpha * sigma_cls
    loss += (0.5 * loss_reg / torch.exp(2 * sigma_reg)) + alpha * sigma_reg
    return loss


In [6]:
cluster_models = {}
input_dim = X_train.shape[1]
num_classes = y_train_cls.nunique()

for cluster_id in np.unique(train_clusters):
    idx = train_clusters == cluster_id
    X_c = X_train[idx]
    y_c_cls = y_train_cls[idx]
    y_c_reg = y_train_reg[idx]

    model = BetterMTLNet(input_dim, num_classes)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_cls = nn.CrossEntropyLoss()
    loss_reg = nn.MSELoss()

    dataset = PineDataset(X_c, y_c_cls, y_c_reg)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(10):
        for xb, yb_cls, yb_reg in loader:
            logits, reg_out = model(xb)
            l_cls = loss_cls(logits, yb_cls)
            l_reg = loss_reg(reg_out, yb_reg)
            loss = souncertainty_weighted_loss(l_cls, l_reg, model)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cluster_models[cluster_id] = model

# === Predict test samples using nearest cluster ===
y_cls_preds = []
y_reg_preds = []

for i in range(len(X_test)):
    x_raw = X_test.iloc[i:i+1]
    x_tensor = torch.tensor(x_raw.values, dtype=torch.float32)
    cluster_id = centroid_classifier.predict([X_test_scaled[i]])[0]

    model = cluster_models[cluster_id]
    model.eval()
    with torch.no_grad():
        logits, reg_out = model(x_tensor)
        pred_cls = torch.argmax(logits, dim=1).item()
        pred_reg = reg_out.item()

    y_cls_preds.append(pred_cls)
    y_reg_preds.append(pred_reg)

# === Evaluation ===
acc = accuracy_score(y_test_cls, y_cls_preds)
r2 = r2_score(y_test_reg, y_reg_preds)

print(f"✅ KMeans better MTL Accuracy (Species): {acc:.4f}")
print(f"✅ KMeans better MTL R² (Productivity): {r2:.4f}")

✅ KMeans better MTL Accuracy (Species): 0.7737
✅ KMeans better MTL R² (Productivity): 0.7476


CNN + MTL code

Expt 1 - Without clustering

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# === Load and preprocess ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")

drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature','latitude','longitute','env']
train_df = train_df.drop(columns=drop_cols, errors='ignore').dropna()
test_df = test_df.drop(columns=drop_cols, errors='ignore').dropna()

# Label encode species
le = LabelEncoder()
train_df['species_encoded'] = le.fit_transform(train_df['Specie'])
test_df['species_encoded'] = le.transform(test_df['Specie'])

# Features and targets
X_train = train_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])
y_cls_train = train_df['species_encoded']
y_reg_train = train_df['Productivity (y)']

X_test = test_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])
y_cls_test = test_df['species_encoded']
y_reg_test = test_df['Productivity (y)']

# Standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === PyTorch Dataset ===
class TabularDataset(Dataset):
    def __init__(self, X, y_cls, y_reg):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y_cls = torch.tensor(y_cls.values, dtype=torch.long)
        self.y_reg = torch.tensor(y_reg.values, dtype=torch.float32)

    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y_cls[idx], self.y_reg[idx]

train_loader = DataLoader(TabularDataset(X_train_scaled, y_cls_train, y_reg_train), batch_size=64, shuffle=True)
test_loader = DataLoader(TabularDataset(X_test_scaled, y_cls_test, y_reg_test), batch_size=1, shuffle=False)

# === CNN-based MTL Model ===
class CNNMTLNetWithSOUW(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )
        self.shared = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.classifier = nn.Linear(64, num_classes)
        self.regressor = nn.Linear(64, 1)
        self.log_sigma_cls = nn.Parameter(torch.tensor(0.0))
        self.log_sigma_reg = nn.Parameter(torch.tensor(0.0))
        self.alpha = 1.0

    def forward(self, x):
        x = x.unsqueeze(1)  # shape [B, 1, F]
        x = self.conv(x).squeeze(2)  # shape [B, 32]
        x = self.shared(x)
        return self.classifier(x), self.regressor(x).squeeze(1)

# === SOUW Loss Function ===
def souncertainty_weighted_loss(loss_cls, loss_reg, model):
    sigma_cls = model.log_sigma_cls
    sigma_reg = model.log_sigma_reg
    alpha = model.alpha
    return (
        0.5 * loss_cls / torch.exp(2 * sigma_cls) + alpha * sigma_cls +
        0.5 * loss_reg / torch.exp(2 * sigma_reg) + alpha * sigma_reg
    )

# === Training ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNMTLNetWithSOUW(input_dim=X_train.shape[1], num_classes=y_cls_train.nunique()).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_cls_fn = nn.CrossEntropyLoss()
loss_reg_fn = nn.MSELoss()

for epoch in range(20):
    model.train()
    total_loss = 0
    for xb, yb_cls, yb_reg in train_loader:
        xb, yb_cls, yb_reg = xb.to(device), yb_cls.to(device), yb_reg.to(device)
        logits, reg_out = model(xb)
        loss_cls = loss_cls_fn(logits, yb_cls)
        loss_reg = loss_reg_fn(reg_out, yb_reg)
        loss = souncertainty_weighted_loss(loss_cls, loss_reg, model)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

# === Evaluation ===
model.eval()
y_cls_preds, y_reg_preds = [], []

with torch.no_grad():
    for xb, _, _ in test_loader:
        xb = xb.to(device)
        logits, reg_out = model(xb)
        y_cls_preds.append(torch.argmax(logits, dim=1).item())
        y_reg_preds.append(reg_out.item())

print("✅ Accuracy (Species):", accuracy_score(y_cls_test, y_cls_preds))
print("✅ R² (Productivity):", r2_score(y_reg_test, y_reg_preds))


Epoch 1 | Loss: 10747.1958
Epoch 2 | Loss: 4053.5753
Epoch 3 | Loss: 3356.1904
Epoch 4 | Loss: 3077.4677
Epoch 5 | Loss: 2916.1092
Epoch 6 | Loss: 2849.1620
Epoch 7 | Loss: 2783.2814
Epoch 8 | Loss: 2729.4028
Epoch 9 | Loss: 2669.1367
Epoch 10 | Loss: 2633.7491
Epoch 11 | Loss: 2587.9395
Epoch 12 | Loss: 2546.1974
Epoch 13 | Loss: 2519.1892
Epoch 14 | Loss: 2486.9600
Epoch 15 | Loss: 2459.4866
Epoch 16 | Loss: 2422.2980
Epoch 17 | Loss: 2392.1825
Epoch 18 | Loss: 2364.0668
Epoch 19 | Loss: 2343.7454
Epoch 20 | Loss: 2320.1114
✅ Accuracy (Species): 0.6427745664739885
✅ R² (Productivity): 0.7100981956200969


Expt 2 - Using Clustering

In [8]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
from sklearn.neighbors import NearestCentroid
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

# === Load train/test ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")

drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature','latitude','longitute','env']
train_df = train_df.drop(columns=drop_cols, errors='ignore').dropna()
test_df = test_df.drop(columns=drop_cols, errors='ignore').dropna()

# Encode species
le = LabelEncoder()
train_df['species_encoded'] = le.fit_transform(train_df['Specie'])
test_df['species_encoded'] = le.transform(test_df['Specie'])

# Split features and targets
X_train_env = train_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])  # environmental vars
X_test_env = test_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])

y_cls_train = train_df['species_encoded']
y_reg_train = train_df['Productivity (y)']
y_cls_test = test_df['species_encoded']
y_reg_test = test_df['Productivity (y)']

# === Normalize environmental features ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_env)
X_test_scaled = scaler.transform(X_test_env)

# === Clustering ===
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
train_clusters = kmeans.fit_predict(X_train_scaled)
centroids = kmeans.cluster_centers_

# === Cluster-wise data storage ===
cluster_models = {}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset
class TabularDataset(Dataset):
    def __init__(self, X, y_cls, y_reg):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y_cls = torch.tensor(y_cls.values, dtype=torch.long)
        self.y_reg = torch.tensor(y_reg.values, dtype=torch.float32)

    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y_cls[idx], self.y_reg[idx]

# CNN-MTL model
class CNNMTLNetWithSOUW(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1)
        )
        self.shared = nn.Sequential(
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.classifier = nn.Linear(64, num_classes)
        self.regressor = nn.Linear(64, 1)
        self.log_sigma_cls = nn.Parameter(torch.tensor(0.0))
        self.log_sigma_reg = nn.Parameter(torch.tensor(0.0))
        self.alpha = 1.0

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.conv(x).squeeze(2)
        x = self.shared(x)
        return self.classifier(x), self.regressor(x).squeeze(1)

def souncertainty_weighted_loss(loss_cls, loss_reg, model):
    return (
        0.5 * loss_cls / torch.exp(2 * model.log_sigma_cls) + model.alpha * model.log_sigma_cls +
        0.5 * loss_reg / torch.exp(2 * model.log_sigma_reg) + model.alpha * model.log_sigma_reg
    )

# === Train one model per cluster ===
for cluster_id in range(n_clusters):
    indices = np.where(train_clusters == cluster_id)[0]
    X_c = X_train_scaled[indices]
    y_cls_c = y_cls_train.iloc[indices]
    y_reg_c = y_reg_train.iloc[indices]

    dataset = TabularDataset(X_c, y_cls_c, y_reg_c)
    loader = DataLoader(dataset, batch_size=64, shuffle=True)

    model = CNNMTLNetWithSOUW(input_dim=X_c.shape[1], num_classes=y_cls_train.nunique()).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_cls_fn = nn.CrossEntropyLoss()
    loss_reg_fn = nn.MSELoss()

    for epoch in range(15):
        model.train()
        for xb, yb_cls, yb_reg in loader:
            xb, yb_cls, yb_reg = xb.to(device), yb_cls.to(device), yb_reg.to(device)
            logits, reg_out = model(xb)
            loss = souncertainty_weighted_loss(loss_cls_fn(logits, yb_cls), loss_reg_fn(reg_out, yb_reg), model)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cluster_models[cluster_id] = model

# === Predict for test data using nearest cluster ===
from sklearn.metrics.pairwise import euclidean_distances

y_cls_preds, y_reg_preds = [], []

for i in range(len(X_test_scaled)):
    x_raw = torch.tensor(X_test_scaled[i], dtype=torch.float32).unsqueeze(0).to(device)
    distances = euclidean_distances(X_test_scaled[i].reshape(1, -1), centroids)
    cluster_id = np.argmin(distances)

    model = cluster_models[cluster_id]
    model.eval()
    with torch.no_grad():
        logits, reg_out = model(x_raw)
        y_cls_preds.append(torch.argmax(logits, dim=1).item())
        y_reg_preds.append(reg_out.item())

# === Evaluate ===
acc = accuracy_score(y_cls_test, y_cls_preds)
r2 = r2_score(y_reg_test, y_reg_preds)

print(f"✅ Cluster-CNN MTL Accuracy (Species): {acc:.4f}")
print(f"✅ Cluster-CNN MTL R² (Productivity): {r2:.4f}")


✅ Cluster-CNN MTL Accuracy (Species): 0.6401
✅ Cluster-CNN MTL R² (Productivity): 0.6201


Attention - simple 

Expt 1 - Clustering

In [16]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics.pairwise import euclidean_distances
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

# === Load CSVs ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")

drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature','latitude','longitute','env']
train_df.drop(columns=drop_cols, errors='ignore', inplace=True)
test_df.drop(columns=drop_cols, errors='ignore', inplace=True)

# Drop NaNs
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Encode species
le = LabelEncoder()
train_df['species_encoded'] = le.fit_transform(train_df['Specie'])
test_df['species_encoded'] = le.transform(test_df['Specie'])

# Separate features and targets
X_train = train_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])
X_test = test_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])

y_cls_train = train_df['species_encoded']
y_reg_train = train_df['Productivity (y)']
y_cls_test = test_df['species_encoded']
y_reg_test = test_df['Productivity (y)']

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Clustering ===
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
train_clusters = kmeans.fit_predict(X_train_scaled)
centroids = kmeans.cluster_centers_

# === Dataset ===
class TabularDataset(Dataset):
    def __init__(self, X, y_cls, y_reg):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y_cls = torch.tensor(y_cls.values, dtype=torch.long)
        self.y_reg = torch.tensor(y_reg.values, dtype=torch.float32)

    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y_cls[idx], self.y_reg[idx]

# === Model ===
class AttentionPooling(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.attn = nn.Linear(dim, 1)

    def forward(self, x):
        weights = self.attn(x)  # (B, T, 1)
        weights = torch.softmax(weights, dim=1)
        pooled = torch.sum(weights * x, dim=1)  # (B, D)
        return pooled

class MLPTransformerMultiTask(nn.Module):
    def __init__(self, input_dim=41, mlp_hidden_dim=128, transformer_dim=128,
                 num_layers=2, num_loops=4, num_species=10, use_attention_pooling=True):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, transformer_dim),
            nn.ReLU()
        )
        encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.num_loops = num_loops
        self.pool = AttentionPooling(transformer_dim) if use_attention_pooling else lambda x: x.mean(dim=1)

        self.reg_head = nn.Sequential(
            nn.Linear(transformer_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.cls_head = nn.Sequential(
            nn.Linear(transformer_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_species)
        )

    def forward(self, x):
        B = x.size(0)
        x_embed = self.mlp(x)  # (B, D)
        x_seq = x_embed.unsqueeze(1).repeat(1, self.num_loops, 1)  # (B, T, D)
        x_transformed = self.transformer(x_seq)  # (B, T, D)
        x_pooled = self.pool(x_transformed)  # (B, D)
        reg_out = self.reg_head(x_pooled).squeeze(1)
        cls_logits = self.cls_head(x_pooled)
        return cls_logits, reg_out

# === Training one model per cluster ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cluster_models = {}

for cluster_id in range(n_clusters):
    indices = np.where(train_clusters == cluster_id)[0]
    X_cluster = X_train_scaled[indices]
    y_cluster_cls = y_cls_train.iloc[indices]
    y_cluster_reg = y_reg_train.iloc[indices]

    train_loader = DataLoader(TabularDataset(X_cluster, y_cluster_cls, y_cluster_reg), batch_size=64, shuffle=True)

    model = MLPTransformerMultiTask(
        input_dim=X_cluster.shape[1],
        num_species=y_cls_train.nunique(),
        use_attention_pooling=True
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion_cls = nn.CrossEntropyLoss()
    criterion_reg = nn.MSELoss()

    for epoch in range(15):
        model.train()
        for xb, yb_cls, yb_reg in train_loader:
            xb, yb_cls, yb_reg = xb.to(device), yb_cls.to(device), yb_reg.to(device)
            logits, reg_out = model(xb)
            loss = criterion_cls(logits, yb_cls) + criterion_reg(reg_out, yb_reg)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cluster_models[cluster_id] = model

# === Predict test samples using nearest cluster model ===
y_cls_preds = []
y_reg_preds = []

for i in range(len(X_test_scaled)):
    x = torch.tensor(X_test_scaled[i], dtype=torch.float32).unsqueeze(0).to(device)
    distances = euclidean_distances(X_test_scaled[i].reshape(1, -1), centroids)
    nearest_cluster = np.argmin(distances)

    model = cluster_models[nearest_cluster]
    model.eval()
    with torch.no_grad():
        logits, reg_out = model(x)
        pred_cls = torch.argmax(logits, dim=1).item()
        pred_reg = reg_out.item()

    y_cls_preds.append(pred_cls)
    y_reg_preds.append(pred_reg)

# === Evaluate ===
acc = accuracy_score(y_cls_test, y_cls_preds)
r2 = r2_score(y_reg_test, y_reg_preds)

print(f"✅ Accuracy (Species): {acc:.4f}")
print(f"✅ R² Score (Productivity): {r2:.4f}")


✅ Accuracy (Species): 0.7760
✅ R² Score (Productivity): 0.7731


Expt 2 - no clustering

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, r2_score
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

# === Load CSVs ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")

drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature', 'latitude', 'longitute', 'env']
train_df.drop(columns=drop_cols, errors='ignore', inplace=True)
test_df.drop(columns=drop_cols, errors='ignore', inplace=True)

# Drop NaNs
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Encode species
le = LabelEncoder()
train_df['species_encoded'] = le.fit_transform(train_df['Specie'])
test_df['species_encoded'] = le.transform(test_df['Specie'])

# Separate features and targets
X_train = train_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])
X_test = test_df.drop(columns=['Specie', 'species_encoded', 'Productivity (y)'])

y_cls_train = train_df['species_encoded']
y_reg_train = train_df['Productivity (y)']
y_cls_test = test_df['species_encoded']
y_reg_test = test_df['Productivity (y)']

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Dataset ===
class TabularDataset(Dataset):
    def __init__(self, X, y_cls, y_reg):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y_cls = torch.tensor(y_cls.values, dtype=torch.long)
        self.y_reg = torch.tensor(y_reg.values, dtype=torch.float32)

    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y_cls[idx], self.y_reg[idx]

train_loader = DataLoader(TabularDataset(X_train_scaled, y_cls_train, y_reg_train), batch_size=64, shuffle=True)
test_loader = DataLoader(TabularDataset(X_test_scaled, y_cls_test, y_reg_test), batch_size=64, shuffle=False)

# === Model ===
class AttentionPooling(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.attn = nn.Linear(dim, 1)

    def forward(self, x):
        weights = self.attn(x)  # (B, T, 1)
        weights = torch.softmax(weights, dim=1)
        pooled = torch.sum(weights * x, dim=1)  # (B, D)
        return pooled

class MLPTransformerMultiTask(nn.Module):
    def __init__(self, input_dim=41, mlp_hidden_dim=128, transformer_dim=128,
                 num_layers=2, num_loops=4, num_species=10, use_attention_pooling=True):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, transformer_dim),
            nn.ReLU()
        )
        encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=4, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.num_loops = num_loops
        self.pool = AttentionPooling(transformer_dim) if use_attention_pooling else lambda x: x.mean(dim=1)

        self.reg_head = nn.Sequential(
            nn.Linear(transformer_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        self.cls_head = nn.Sequential(
            nn.Linear(transformer_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_species)
        )

    def forward(self, x):
        B = x.size(0)
        x_embed = self.mlp(x)  # (B, D)
        x_seq = x_embed.unsqueeze(1).repeat(1, self.num_loops, 1)  # (B, T, D)
        x_transformed = self.transformer(x_seq)  # (B, T, D)
        x_pooled = self.pool(x_transformed)  # (B, D)
        reg_out = self.reg_head(x_pooled).squeeze(1)
        cls_logits = self.cls_head(x_pooled)
        return cls_logits, reg_out

# === Train single model ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLPTransformerMultiTask(
    input_dim=X_train.shape[1],
    num_species=y_cls_train.nunique(),
    use_attention_pooling=True
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion_cls = nn.CrossEntropyLoss()
criterion_reg = nn.MSELoss()

for epoch in range(15):
    model.train()
    for xb, yb_cls, yb_reg in train_loader:
        xb, yb_cls, yb_reg = xb.to(device), yb_cls.to(device), yb_reg.to(device)
        logits, reg_out = model(xb)
        loss = criterion_cls(logits, yb_cls) + criterion_reg(reg_out, yb_reg)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# === Evaluate ===
model.eval()
y_cls_preds = []
y_reg_preds = []

with torch.no_grad():
    for xb, _, _ in test_loader:
        xb = xb.to(device)
        logits, reg_out = model(xb)
        pred_cls = torch.argmax(logits, dim=1)
        y_cls_preds.extend(pred_cls.cpu().numpy())
        y_reg_preds.extend(reg_out.cpu().numpy())

# === Final Metrics ===
acc = accuracy_score(y_cls_test, y_cls_preds)
r2 = r2_score(y_reg_test, y_reg_preds)

print(f"✅ Accuracy (Species): {acc:.4f}")
print(f"✅ R² Score (Productivity): {r2:.4f}")




✅ Accuracy (Species): 0.7792
✅ R² Score (Productivity): 0.7465
