In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import xgboost as xgb

# === Load train and test data ===
train_df = pd.read_csv("../../data/tr_data.csv")
test_df = pd.read_csv("../../data/te_data.csv")

# === Drop unnecessary columns ===
drop_cols = ['Unnamed: 0', 'TestId', 'date_initial', 'date_final', 'Feature', 'env', 'latitude', 'longitute']
train_df = train_df.drop(columns=drop_cols, errors='ignore')
test_df = test_df.drop(columns=drop_cols, errors='ignore')

# === Encode Species ===
label_encoder = LabelEncoder()
train_df['species_encoded'] = label_encoder.fit_transform(train_df['Specie'])
test_df['species_encoded'] = label_encoder.transform(test_df['Specie'])

# === Drop non-numeric columns and missing values ===
train_df = train_df.select_dtypes(include=[np.number]).dropna()
test_df = test_df.select_dtypes(include=[np.number]).dropna()

# === Features and Targets ===
X_train = train_df.drop(columns=['Productivity (y)'])
y_train = train_df['Productivity (y)']

X_test = test_df.drop(columns=['Productivity (y)'])
y_test = test_df['Productivity (y)']

# === Standardize features ===
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# === Define clustering methods (ONLY KMeans and GMM) ===
clustering_methods = {
    'KMeans': KMeans(n_clusters=8, random_state=42),
    'GMM': GaussianMixture(n_components=8, random_state=42)
}

# === Define regression models ===
models = {
    'XGBoost': xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'MLP': MLPRegressor(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
}

# === Store results ===
results = []

# === Loop through clustering methods and models ===
for cluster_name, cluster_algo in clustering_methods.items():
    # Fit clustering on training data
    train_clusters = cluster_algo.fit_predict(X_train_scaled)
    unique_clusters = np.unique(train_clusters)
    
    # For each model
    for model_name, model_prototype in models.items():
        cluster_models = {}

        # Train a model for each cluster
        for cluster_id in unique_clusters:
            indices = np.where(train_clusters == cluster_id)[0]
            X_cluster = X_train.iloc[indices]
            y_cluster = y_train.iloc[indices]

            model = model_prototype.__class__(**model_prototype.get_params())  # clone
            model.fit(X_cluster, y_cluster)
            cluster_models[cluster_id] = model
        
        # Predict on test set
        test_predictions = []
        for i in range(len(X_test)):
            x_test_scaled = X_test_scaled[i].reshape(1, -1)
            cluster_id = cluster_algo.predict(x_test_scaled)[0]

            pred = cluster_models[cluster_id].predict(X_test.iloc[i:i+1])[0]
            test_predictions.append(pred)

        # Evaluate
        mse = mean_squared_error(y_test, test_predictions)
        mae = mean_absolute_error(y_test, test_predictions)
        r2 = r2_score(y_test, test_predictions)

        results.append({
            'Clustering': cluster_name,
            'Model': model_name,
            'MSE': mse,
            'MAE': mae,
            'R2': r2
        })

# === Display results ===
results_df = pd.DataFrame(results)
print(results_df)


  Clustering         Model       MSE       MAE        R2
0     KMeans       XGBoost  1.457232  0.879935  0.878101
1     KMeans  RandomForest  1.485478  0.890422  0.875739
2     KMeans           MLP  5.526616  1.712809  0.537695
3        GMM       XGBoost  1.460071  0.880399  0.877864
4        GMM  RandomForest  1.485998  0.890684  0.875695
5        GMM           MLP  6.327234  1.905190  0.470722
