In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing   import OneHotEncoder
from sklearn.compose         import ColumnTransformer
from sklearn.pipeline        import Pipeline
from sklearn.ensemble        import GradientBoostingRegressor
from sklearn.multioutput     import MultiOutputRegressor
from sklearn.metrics         import (
    r2_score,
    mean_squared_error,
    mean_absolute_error
)

# ─── Your binning functions ──────────────────────────────────────────────
def bin_temperature(temp):
    """Map a temperature reading onto one of five ordinal labels.

    Upper bounds are inclusive: ``<= 10`` -> 'very_low', ``<= 20`` -> 'low',
    ``<= 30`` -> 'moderate', ``<= 35`` -> 'high'; everything else is
    'very_high'.  A NaN fails every ``<=`` test and therefore also ends up
    as 'very_high'.
    """
    upper_bounds = ((10, 'very_low'), (20, 'low'), (30, 'moderate'), (35, 'high'))
    for bound, label in upper_bounds:
        if temp <= bound:
            return label
    return 'very_high'

def bin_humidity(h):
    """Map a humidity reading onto one of five ordinal labels.

    Upper bounds are inclusive: ``<= 30`` -> 'very_low', ``<= 50`` -> 'low',
    ``<= 70`` -> 'moderate', ``<= 85`` -> 'high'; everything else is
    'very_high'.  A NaN fails every ``<=`` test and therefore also ends up
    as 'very_high'.
    """
    upper_bounds = ((30, 'very_low'), (50, 'low'), (70, 'moderate'), (85, 'high'))
    for bound, label in upper_bounds:
        if h <= bound:
            return label
    return 'very_high'

def bin_ph(ph):
    """Map a soil pH value onto one of five ordinal labels.

    Upper bounds are inclusive: ``<= 5.5`` -> 'very_low', ``<= 6.5`` -> 'low',
    ``<= 7.5`` -> 'moderate', ``<= 8.5`` -> 'high'; everything else is
    'very_high'.  A NaN fails every ``<=`` test and therefore also ends up
    as 'very_high'.
    """
    upper_bounds = ((5.5, 'very_low'), (6.5, 'low'), (7.5, 'moderate'), (8.5, 'high'))
    for bound, label in upper_bounds:
        if ph <= bound:
            return label
    return 'very_high'

def bin_soil_quality(sq):
    """Map a soil-quality score onto one of five ordinal labels.

    Upper bounds are inclusive: ``<= 2`` -> 'very_low', ``<= 4`` -> 'low',
    ``<= 6`` -> 'moderate', ``<= 8`` -> 'high'; everything else is
    'very_high'.  A NaN fails every ``<=`` test and therefore also ends up
    as 'very_high'.
    """
    upper_bounds = ((2, 'very_low'), (4, 'low'), (6, 'moderate'), (8, 'high'))
    for bound, label in upper_bounds:
        if sq <= bound:
            return label
    return 'very_high'
# ─────────────────────────────────────────────────────────────────────────

# 1. Load and preprocess
df = pd.read_csv("crop_yield_dataset.csv")

# Keep only rows with a strictly positive yield.  The .copy() makes the
# filtered frame independent, so the column assignments below do not write
# into a slice of the frame returned by read_csv.
has_positive_yield = df["Crop_Yield"] > 0
df = df.loc[has_positive_yield].copy()

# Derive a categorical "<column>_Bin" feature from each raw measurement.
binners = {
    "Temperature":  bin_temperature,
    "Humidity":     bin_humidity,
    "Soil_pH":      bin_ph,
    "Soil_Quality": bin_soil_quality,
}
for source_col, to_bin in binners.items():
    df[source_col + "_Bin"] = df[source_col].apply(to_bin)

# One-hot encode the crop type right here: Crop_Type is replaced by one
# Crop_Type_<name> indicator column per crop.
df = pd.get_dummies(df, columns=['Crop_Type'])

# 2. Define features & targets
# Model inputs: soil type, the four binned measurements, and the columns
# that are used as plain numeric values.
features = [
    'Soil_Type',
    'Temperature_Bin', 'Humidity_Bin', 'Soil_pH_Bin', 'Soil_Quality_Bin',
    'Wind_Speed',
    'N', 'P', 'K',
]

# Targets: every column whose name carries the Crop_Type_ prefix.
target_cols = list(
    filter(lambda name: name.startswith('Crop_Type_'), df.columns)
)

X = df.loc[:, features]
y = df.loc[:, target_cols]

# 3. Build pipeline (unchanged model)
cat_feats = [
    'Soil_Type',
    'Temperature_Bin', 'Humidity_Bin', 'Soil_pH_Bin', 'Soil_Quality_Bin',
]
num_feats = ['Wind_Speed', 'N', 'P', 'K']

# Categorical columns are one-hot encoded (handle_unknown='ignore' means a
# category not seen during fit is encoded as all zeros instead of raising);
# numeric columns are forwarded untouched.
pre = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_feats),
        ('num', 'passthrough', num_feats),
    ]
)

# MultiOutputRegressor fits one gradient-boosting regressor per target column.
pipe = Pipeline(
    steps=[
        ('prep', pre),
        ('predict', MultiOutputRegressor(
            estimator=GradientBoostingRegressor(
                n_estimators=100,
                learning_rate=0.1,
                random_state=42,
            )
        )),
    ]
)

# 4. Train/test split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.2,
)

# 5. Fit on the training rows, then 6. predict the held-out rows.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# 7. Compute metrics
# — Overall (average across outputs)
# RMSE is taken as the square root of the MSE rather than via
# `mean_squared_error(..., squared=False)`: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call raises TypeError on
# current releases.  Rooting each output's MSE first and then averaging
# reproduces the figure `squared=False` used to report for multi-output data.
r2_overall   = r2_score(y_test, y_pred, multioutput='uniform_average')
rmse_overall = (
    mean_squared_error(y_test, y_pred, multioutput='raw_values') ** 0.5
).mean()
mae_overall  = mean_absolute_error(y_test, y_pred)

print(f"Overall R²:  {r2_overall:.3f}")
print(f"Overall RMSE: {rmse_overall:.3f}")
print(f"Overall MAE:  {mae_overall:.3f}\n")

# — Per-crop-type: column `idx` of y_pred lines up with target_cols[idx].
for idx, col in enumerate(target_cols):
    y_true_col = y_test.iloc[:, idx]
    y_pred_col = y_pred[:, idx]
    r2   = r2_score(y_true_col, y_pred_col)
    rmse = mean_squared_error(y_true_col, y_pred_col) ** 0.5
    mae  = mean_absolute_error(y_true_col, y_pred_col)
    print(f"{col:20s} R²={r2:.3f}, RMSE={rmse:.3f}, MAE={mae:.3f}")

# 8. (Optional) 5-fold cross-val R² on the training set
# Only the training rows are used, so the held-out test rows stay untouched.
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"\n5-fold CV R²: {cv_mean:.3f} ± {cv_std:.3f}")




Overall R²:  0.874
Overall RMSE: 0.067
Overall MAE:  0.036

Crop_Type_Barley     R²=0.898, RMSE=0.096, MAE=0.059
Crop_Type_Corn       R²=0.998, RMSE=0.015, MAE=0.010
Crop_Type_Cotton     R²=0.996, RMSE=0.018, MAE=0.012
Crop_Type_Potato     R²=0.427, RMSE=0.229, MAE=0.112
Crop_Type_Rice       R²=0.998, RMSE=0.014, MAE=0.009
Crop_Type_Soybean    R²=1.000, RMSE=0.002, MAE=0.001
Crop_Type_Sugarcane  R²=0.990, RMSE=0.029, MAE=0.016
Crop_Type_Sunflower  R²=0.987, RMSE=0.034, MAE=0.021
Crop_Type_Tomato     R²=0.999, RMSE=0.007, MAE=0.004
Crop_Type_Wheat      R²=0.447, RMSE=0.229, MAE=0.112

5-fold CV R²: 0.873 ± 0.001


In [10]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# 1. After your regression predictions:
#    y_pred = pipe.predict(X_test)

# 2. Convert one-hot → label index
#    The predicted class is the target column with the largest regression
#    output; the true class is the column holding the 1.
y_true_labels = np.argmax(y_test.values, axis=1)
y_pred_labels = np.argmax(y_pred,      axis=1)

# 3. Get readable class names (strip the prefix)
class_names = [c.replace('Crop_Type_', '') for c in target_cols]

# Pin the label set to every target column.  Without it scikit-learn infers
# the labels from the values present in y_true/y_pred, so a crop missing from
# both would shrink the matrix and make classification_report fail because
# `target_names` no longer matches the number of labels.
label_ids = list(range(len(target_cols)))

# 4. Compute & print the big confusion matrix
#    Rows are true classes, columns are predicted classes, both in
#    `target_cols` order.
cm = confusion_matrix(y_true_labels, y_pred_labels, labels=label_ids)
print("Overall confusion matrix:")
print(cm)

# 5. (Optional) A classification report for precision/recall per class
print("\nClassification report:")
print(classification_report(
    y_true_labels,
    y_pred_labels,
    labels=label_ids,
    target_names=class_names
))



Overall confusion matrix:
[[507   0   0   0   0   0   0   0   0   0]
 [  0 510   0   0   0   0   0   0   0   0]
 [  0   0 507   0   0   0   0   0   0   0]
 [  0   0   0 316   0   0   0   0   0 201]
 [  0   0   0   0 499   0   0   0   0   0]
 [  0   0   0   0   0 534   0   0   0   0]
 [  0   0   0   0   0   0 467   0   0   0]
 [  0   0   0   0   0   0   0 515   0   0]
 [  0   0   0   0   0   0   0   0 504   0]
 [  0   0   0 326   0   0   0   0   0 213]]

Classification report:
              precision    recall  f1-score   support

      Barley       1.00      1.00      1.00       507
        Corn       1.00      1.00      1.00       510
      Cotton       1.00      1.00      1.00       507
      Potato       0.49      0.61      0.55       517
        Rice       1.00      1.00      1.00       499
     Soybean       1.00      1.00      1.00       534
   Sugarcane       1.00      1.00      1.00       467
   Sunflower       1.00      1.00      1.00       515
      Tomato       1.00      1.0