In [None]:
import math

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

In [None]:
# Load datasets
df1 = pd.read_csv("climate_change_impact_on_agriculture_2024.csv")
df2 = pd.read_csv("farmer_advisor_dataset.csv")

# Report the raw shapes before any column is added
print("Dataset 1 shape:", df1.shape)
print("Dataset 2 shape:", df2.shape)

# Use the last column of each dataset as the target.
# NOTE(review): this assumes the last column really is the yield in both files — confirm.
# The source column names are captured *before* "target" is appended, because the
# copy below leaves the original column in the frame; it must be excluded from the
# features afterwards or the models are trained on the label itself.
target_sources = {df1.columns[-1], df2.columns[-1]}
df1["target"] = df1.iloc[:, -1]
df2["target"] = df2.iloc[:, -1]

# Concatenate datasets (columns are aligned by name; a column missing from one
# dataset is filled with NaN for that dataset's rows)
df = pd.concat([df1, df2], axis=0, ignore_index=True, sort=False)

# Separate features and target. Besides "target" itself, drop the columns it was
# copied from so the label does not leak into X. If a source column name also
# exists as an ordinary feature in the other dataset it is dropped for all rows.
leaky_cols = list(target_sources | {"target"})
X = df.drop(columns=leaky_cols)
y = df["target"]

# Identify categorical and numeric columns
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

# Preprocessor: impute + scale numeric, impute + one-hot encode categorical.
# Imputation is needed because the concatenation above introduces NaN wherever
# the two datasets do not share a column, and most estimators below reject NaN.
numeric_steps = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])
categorical_steps = Pipeline(steps=[
    # A dedicated "missing" category keeps "value absent" distinguishable
    ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encode", OneHotEncoder(handle_unknown="ignore")),
])
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols)
    ]
)

# Define models to compare (fixed seeds on the stochastic ones for reproducibility)
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor()
}

# RMSE metric
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return math.sqrt(mse)

# Custom scorer built from rmse(); greater_is_better=False makes scikit-learn
# negate the value so that "higher is better" still holds. The loop below uses
# the built-in "neg_root_mean_squared_error" string instead, so this object is
# not referenced in this cell; it is left defined as before.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Cross-validation setup: 5 shuffled folds with a fixed seed so every model is
# evaluated on the same splits
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Both metrics are computed in a single cross-validation pass per model, so each
# pipeline is fitted once per fold instead of once per fold *per metric*.
scoring = {"rmse": "neg_root_mean_squared_error", "r2": "r2"}

results = []
for name, model in models.items():
    pipe = Pipeline(steps=[("pre", preprocessor), ("model", model)])
    cv_out = cross_validate(pipe, X, y, cv=cv, scoring=scoring)
    rmse_scores = -cv_out["test_rmse"]  # scorer reports negated RMSE
    r2_scores = cv_out["test_r2"]
    results.append({
        "model": name,
        "rmse_mean": np.mean(rmse_scores),
        "rmse_std": np.std(rmse_scores),
        "r2_mean": np.mean(r2_scores),
        "r2_std": np.std(r2_scores)
    })

# Results DataFrame, best (lowest mean RMSE) first
res_df = pd.DataFrame(results).sort_values("rmse_mean").reset_index(drop=True)
print("\nCross-validated results (5-fold CV):\n")
print(res_df)

# Best model = first row after sorting by mean RMSE
best_model = res_df.iloc[0]["model"]
print(f"\nBest model: {best_model}")