In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor


# Apply seaborn's "whitegrid" theme to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

In [6]:
# Read the master dataset (path is relative to the notebook's working directory).
dataset_path = 'datasets/global_master_dataset_fixed.csv'
df = pd.read_csv(dataset_path)

In [7]:
# Trim the target: keep rows whose yield is strictly positive and no larger than
# the 99th percentile. Rows with a missing yield fail both comparisons and drop out.
# NOTE: this reassigns `df`, so re-running the cell trims the already-trimmed frame.
yield_99 = df['Yield'].quantile(0.99)
is_positive = df['Yield'].gt(0)
within_cap = df['Yield'].le(yield_99)
df = df.loc[within_cap & is_positive].copy()

# Min-max scale the year over the rows that survived the trim.
first_year = df['Year'].min()
last_year = df['Year'].max()
df['Year_Normalized'] = (df['Year'] - first_year) / (last_year - first_year)

In [8]:
# Preview the first five rows of the trimmed frame.
df.head(n=5)

Unnamed: 0,Country,Country_Code,Crop,Crop_Code,Year,Unit,Yield,Yield_scaled,Crop_Category,Fertilizer_Nutrient nitrogen N (total),Fertilizer_Nutrient phosphate P2O5 (total),Fertilizer_Nutrient potash K2O (total),Pesticide_Total_Tonnes,ISO3,Agricultural Land (%),GDP (USD),Rainfall (mm),Temperature (C),Region,Year_Normalized
0,Afghanistan,2,"Almonds, in shell",221,2015,kg/ha,1652.1,-0.361153,Nuts,185062.41,21777.45,123.86,,AFG,58.123668,19134220000.0,406.96,13.66,"Middle East, North Africa, Afghanistan & Pakistan",0.0
1,Afghanistan,2,"Almonds, in shell",221,2016,kg/ha,1685.9,-0.359706,Nuts,193574.75,18286.84,175.46,,AFG,58.123668,18116570000.0,373.35,14.19,"Middle East, North Africa, Afghanistan & Pakistan",0.125
2,Afghanistan,2,"Almonds, in shell",221,2017,kg/ha,1378.8,-0.37285,Nuts,332639.63,41850.37,70.41,,AFG,58.123668,18753460000.0,297.65,13.82,"Middle East, North Africa, Afghanistan & Pakistan",0.25
3,Afghanistan,2,"Almonds, in shell",221,2018,kg/ha,1716.1,-0.358414,Nuts,241259.6,29265.51,37.63,,AFG,58.276988,18053220000.0,272.3,14.25,"Middle East, North Africa, Afghanistan & Pakistan",0.375
4,Afghanistan,2,"Almonds, in shell",221,2019,kg/ha,1308.3,-0.375868,Nuts,242111.35,52221.72,204.04,,AFG,58.276988,18799440000.0,394.8,13.58,"Middle East, North Africa, Afghanistan & Pakistan",0.5


In [9]:
# Row count per crop category.
category_counts = df['Crop_Category'].value_counts()

# Smallest number of rows a category must have to be modelled on its own.
min_samples = 3000  # Increased threshold for better models

# Labels of the categories that meet the threshold, in value_counts order.
has_enough_rows = category_counts.ge(min_samples)
valid_categories = category_counts.loc[has_enough_rows].index.tolist()

In [10]:
# List every retained category together with its row count.
for cat, count in category_counts.loc[valid_categories].items():
    print(f"  {cat}: {count:,} samples")

  Other: 50,475 samples
  Vegetables: 18,028 samples
  Fruits: 16,978 samples
  Cereals: 12,109 samples
  Legumes: 11,880 samples
  Industrial Crops: 5,971 samples
  Root Crops: 4,748 samples
  Oil Crops: 4,301 samples


In [11]:
# Keep only rows whose category met the sample threshold; work on a copy so that
# later column assignments do not write back into `df`.
in_valid_category = df['Crop_Category'].isin(valid_categories)
df_filtered = df.loc[in_valid_category].copy()

In [12]:
# Numeric predictors fed to the numeric branch of the preprocessing pipeline.
numerical_features = [
    # time
    'Year_Normalized',
    # climate
    'Rainfall (mm)',
    'Temperature (C)',
    # economy / land use
    'GDP (USD)',
    'Agricultural Land (%)',
    # agricultural inputs
    'Pesticide_Total_Tonnes',
    'Fertilizer_Nutrient nitrogen N (total)',
    'Fertilizer_Nutrient phosphate P2O5 (total)',
    'Fertilizer_Nutrient potash K2O (total)',
]

# String-valued predictors fed to the one-hot branch of the preprocessing pipeline.
categorical_features = [
    'Country',
    'Region',
    'Crop',
]

Crop Category: Other
Samples: 50,475
Unique crops: 69
Training Random Forest...
Results:
  RMSE: 3,813.84 kg/ha
  MAE:  1,591.55 kg/ha
  R²:   0.8688
Crop Category: Vegetables
Samples: 18,028
Unique crops: 20
Training Random Forest...
Results:
  RMSE: 5,103.77 kg/ha
  MAE:  2,775.58 kg/ha
  R²:   0.8740
Crop Category: Fruits
Samples: 16,978
Unique crops: 24
Training Random Forest...
Results:
  RMSE: 5,033.94 kg/ha
  MAE:  2,674.51 kg/ha
  R²:   0.8325
Crop Category: Cereals
Samples: 12,109
Unique crops: 17
Training Random Forest...
Results:
  RMSE: 1,840.42 kg/ha
  MAE:  538.81 kg/ha
  R²:   0.7881
Crop Category: Legumes
Samples: 11,880
Unique crops: 19
Training Random Forest...
Results:
  RMSE: 1,466.71 kg/ha
  MAE:  556.55 kg/ha
  R²:   0.8555
Crop Category: Industrial Crops
Samples: 5,971
Unique crops: 14
Training Random Forest...
Results:
  RMSE: 2,664.66 kg/ha
  MAE:  1,048.12 kg/ha
  R²:   0.9816
Crop Category: Root Crops
Samples: 4,748
Unique crops: 6
Training Random Forest...
Results:
  RMSE: 3,369.31 kg/ha
  MAE:  1,895.53 kg/ha
  R²:   0.8954
Crop Category: Oil Crops
Samples: 4,301
Unique crops: 8
Training Random Forest...
Results:
  RMSE: 1,484.91 kg/ha
  MAE:  421.44 kg/ha
  R²:   0.8989


In [17]:
# Pool the held-out targets and predictions stored for every category in
# `category_models`, then score the pooled set once.
category_outputs = [category_models[cat] for cat in valid_categories]
all_y_test = np.concatenate([output['y_test'] for output in category_outputs])
all_y_pred = np.concatenate([output['y_pred'] for output in category_outputs])

pooled_mse = mean_squared_error(all_y_test, all_y_pred)
overall_rmse = np.sqrt(pooled_mse)
overall_mae = mean_absolute_error(all_y_test, all_y_pred)
overall_r2 = r2_score(all_y_test, all_y_pred)

print("\nOverall Metrics (Crop-Specific Models with Crop Feature):")
print(f"   RMSE: {overall_rmse:,.2f} kg/ha")
print(f"   MAE:  {overall_mae:,.2f} kg/ha")
print(f"   R²:   {overall_r2:.4f}")


Overall Metrics (Crop-Specific Models with Crop Feature):
   RMSE: 3,810.67 kg/ha
   MAE:  1,655.10 kg/ha
   R²:   0.9043


In [12]:
# Per-category metric records as a table, highest R² first.
results_df = pd.DataFrame(all_results).sort_values(by='R2', ascending=False)
results_df

Unnamed: 0,Crop_Category,Samples,Unique_Crops,RMSE,MAE,R2
5,Industrial Crops,5971,14,2659.492375,1045.535176,0.981706
7,Oil Crops,4301,8,1493.575671,421.297151,0.897766
6,Root Crops,4748,6,3381.446392,1897.847905,0.894654
1,Vegetables,18028,20,5107.338462,2775.742163,0.873834
0,Other,50475,69,3813.247827,1592.280504,0.8688
4,Legumes,11880,19,1461.380865,555.934492,0.856547
2,Fruits,16978,24,5038.736781,2674.997354,0.832179
3,Cereals,12109,17,1841.113162,540.825876,0.78797


Crop Category: Other
Samples: 50,475
Unique crops: 69
Training Random Forest...
Results:
  RMSE: 3,813.84 kg/ha
  MAE:  1,591.55 kg/ha
  R²:   0.8688
Crop Category: Vegetables
Samples: 18,028
Unique crops: 20
Training Random Forest...
Results:
  RMSE: 5,103.77 kg/ha
  MAE:  2,775.58 kg/ha
  R²:   0.8740
Crop Category: Fruits
Samples: 16,978
Unique crops: 24
Training Random Forest...
Results:
  RMSE: 5,033.94 kg/ha
  MAE:  2,674.51 kg/ha
  R²:   0.8325
Crop Category: Cereals
Samples: 12,109
Unique crops: 17
Training Random Forest...
Results:
  RMSE: 1,840.42 kg/ha
  MAE:  538.81 kg/ha
  R²:   0.7881
Crop Category: Legumes
Samples: 11,880
Unique crops: 19
Training Random Forest...
Results:
  RMSE: 1,466.71 kg/ha
  MAE:  556.55 kg/ha
  R²:   0.8555
Crop Category: Industrial Crops
Samples: 5,971
Unique crops: 14
Training Random Forest...
Results:
  RMSE: 2,664.66 kg/ha
  MAE:  1,048.12 kg/ha
  R²:   0.9816
Crop Category: Root Crops
Samples: 4,748
Unique crops: 6
Training Random Forest...
Results:
  RMSE: 3,369.31 kg/ha
  MAE:  1,895.53 kg/ha
  R²:   0.8954
Crop Category: Oil Crops
Samples: 4,301
Unique crops: 8
Training Random Forest...
Results:
  RMSE: 1,484.91 kg/ha
  MAE:  421.44 kg/ha
  R²:   0.8989


## Linear Regression

In [31]:
# Baseline: one LinearRegression pipeline per crop category, fitted on
# log1p(Yield) and scored after mapping predictions back with expm1.
print("\n\n================ BASELINE MODEL: LINEAR REGRESSION ================\n")

all_results_lr = []       # one metrics record per category
category_models_lr = {}   # category -> fitted pipeline + held-out targets/predictions

feature_columns = numerical_features + categorical_features

for category in valid_categories:
    print(f"Crop Category: {category}")

    # Rows of this category only (copied so the new column stays local).
    df_cat = df_filtered.loc[df_filtered['Crop_Category'] == category].copy()
    sample_count = len(df_cat)
    crop_count = df_cat['Crop'].nunique()
    print(f"Samples: {sample_count:,}")
    print(f"Unique crops: {crop_count}")

    # The model is trained on the log-transformed target.
    df_cat['Log_Yield'] = np.log1p(df_cat['Yield'])

    X = df_cat[feature_columns]
    y = df_cat['Log_Yield']

    # 80/20 random hold-out with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

    # Preprocessing (same recipe as the Random Forest cell).
    # Numeric columns: median-impute, then standardise.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical columns: fill gaps with 'Unknown', then one-hot encode with at
    # most 50 output columns per feature; levels unseen at fit time are ignored.
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
        ('onehot', OneHotEncoder(max_categories=50, handle_unknown='ignore', sparse_output=False)),
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

    # Fit preprocessing + regressor as a single estimator.
    print("Training Linear Regression...")
    lr_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])
    lr_pipeline.fit(X_train, y_train)

    # Predict in log space, then undo the log1p on both predictions and targets.
    y_pred_log = lr_pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_orig = np.expm1(y_test)

    # Score on the original yield scale.
    mae = mean_absolute_error(y_test_orig, y_pred)
    r2 = r2_score(y_test_orig, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))

    print(
        "Results:",
        f"  RMSE: {rmse:,.2f} kg/ha",
        f"  MAE:  {mae:,.2f} kg/ha",
        f"  R²:   {r2:.4f}",
        sep="\n",
    )

    all_results_lr.append(dict(
        Crop_Category=category,
        Samples=sample_count,
        Unique_Crops=crop_count,
        RMSE=rmse,
        MAE=mae,
        R2=r2,
    ))

    category_models_lr[category] = dict(
        model=lr_pipeline,
        y_test=y_test_orig,
        y_pred=y_pred,
    )





Crop Category: Other
Samples: 50,475
Unique crops: 69
Training Linear Regression...
Results:
  RMSE: 5,987.08 kg/ha
  MAE:  2,704.28 kg/ha
  R²:   0.6766
Crop Category: Vegetables
Samples: 18,028
Unique crops: 20
Training Linear Regression...
Results:
  RMSE: 11,420.12 kg/ha
  MAE:  7,650.22 kg/ha
  R²:   0.3692
Crop Category: Fruits
Samples: 16,978
Unique crops: 24
Training Linear Regression...
Results:
  RMSE: 10,200.30 kg/ha
  MAE:  6,452.27 kg/ha
  R²:   0.3123
Crop Category: Cereals
Samples: 12,109
Unique crops: 17
Training Linear Regression...
Results:
  RMSE: 3,460.11 kg/ha
  MAE:  1,436.47 kg/ha
  R²:   0.2511
Crop Category: Legumes
Samples: 11,880
Unique crops: 19
Training Linear Regression...
Results:
  RMSE: 2,936.47 kg/ha
  MAE:  1,353.83 kg/ha
  R²:   0.4208
Crop Category: Industrial Crops
Samples: 5,971
Unique crops: 14
Training Linear Regression...
Results:
  RMSE: 9,240.28 kg/ha
  MAE:  4,179.95 kg/ha
  R²:   0.7792
Crop Category: Root Crops
Samples: 4,748
Unique cro

## Decision Tree Regressor

In [33]:
# Baseline: one DecisionTreeRegressor pipeline per crop category, fitted on
# log1p(Yield) and scored after mapping predictions back with expm1.
print("\n\n================ BASELINE MODEL: DECISION TREE ================\n")

all_results_dt = []       # one metrics record per category
category_models_dt = {}   # category -> fitted pipeline + held-out targets/predictions

feature_columns = numerical_features + categorical_features

for category in valid_categories:
    print(f"Crop Category: {category}")

    # Rows of this category only (copied so the new column stays local).
    df_cat = df_filtered.loc[df_filtered['Crop_Category'] == category].copy()
    sample_count = len(df_cat)
    crop_count = df_cat['Crop'].nunique()
    print(f"Samples: {sample_count:,}")
    print(f"Unique crops: {crop_count}")

    # The model is trained on the log-transformed target.
    df_cat['Log_Yield'] = np.log1p(df_cat['Yield'])

    X = df_cat[feature_columns]
    y = df_cat['Log_Yield']

    # 80/20 random hold-out with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

    # Preprocessing (same recipe as the Random Forest cell).
    # Numeric columns: median-impute, then standardise.
    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Categorical columns: fill gaps with 'Unknown', then one-hot encode with at
    # most 50 output columns per feature; levels unseen at fit time are ignored.
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
        ('onehot', OneHotEncoder(max_categories=50, handle_unknown='ignore', sparse_output=False)),
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

    # Depth- and leaf-size-limited tree with a fixed seed.
    print("Training Decision Tree...")
    tree_regressor = DecisionTreeRegressor(
        random_state=42,
        max_depth=12,
        min_samples_leaf=10,
    )
    dt_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', tree_regressor),
    ])
    dt_pipeline.fit(X_train, y_train)

    # Predict in log space, then undo the log1p on both predictions and targets.
    y_pred_log = dt_pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_orig = np.expm1(y_test)

    # Score on the original yield scale.
    mae = mean_absolute_error(y_test_orig, y_pred)
    r2 = r2_score(y_test_orig, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))

    print(
        "Results:",
        f"  RMSE: {rmse:,.2f} kg/ha",
        f"  MAE:  {mae:,.2f} kg/ha",
        f"  R²:   {r2:.4f}",
        sep="\n",
    )

    all_results_dt.append(dict(
        Crop_Category=category,
        Samples=sample_count,
        Unique_Crops=crop_count,
        RMSE=rmse,
        MAE=mae,
        R2=r2,
    ))

    category_models_dt[category] = dict(
        model=dt_pipeline,
        y_test=y_test_orig,
        y_pred=y_pred,
    )





Crop Category: Other
Samples: 50,475
Unique crops: 69
Training Decision Tree...
Results:
  RMSE: 6,815.84 kg/ha
  MAE:  3,476.57 kg/ha
  R²:   0.5808
Crop Category: Vegetables
Samples: 18,028
Unique crops: 20
Training Decision Tree...
Results:
  RMSE: 10,523.55 kg/ha
  MAE:  6,805.15 kg/ha
  R²:   0.4644
Crop Category: Fruits
Samples: 16,978
Unique crops: 24
Training Decision Tree...
Results:
  RMSE: 8,770.05 kg/ha
  MAE:  5,523.76 kg/ha
  R²:   0.4916
Crop Category: Cereals
Samples: 12,109
Unique crops: 17
Training Decision Tree...
Results:
  RMSE: 2,647.28 kg/ha
  MAE:  1,141.95 kg/ha
  R²:   0.5616
Crop Category: Legumes
Samples: 11,880
Unique crops: 19
Training Decision Tree...
Results:
  RMSE: 2,314.14 kg/ha
  MAE:  1,005.12 kg/ha
  R²:   0.6403
Crop Category: Industrial Crops
Samples: 5,971
Unique crops: 14
Training Decision Tree...
Results:
  RMSE: 4,517.09 kg/ha
  MAE:  1,912.55 kg/ha
  R²:   0.9472
Crop Category: Root Crops
Samples: 4,748
Unique crops: 6
Training Decision T

## Random Forest Regressor

In [35]:
# Main model: one RandomForestRegressor pipeline per crop category, fitted on
# log1p(Yield) and scored after mapping predictions back with expm1.
all_results = []          # one metrics record per category
category_models_RF = {}   # category -> fitted pipeline + held-out targets/predictions

# The pooled-metrics cell reads these results under the name `category_models`.
# Bind that name to the same dict so both spellings see the models trained here.
# (Previously the loop wrote to `category_models`, which this cell never created,
# and left `category_models_RF` empty for the model-comparison cell.)
category_models = category_models_RF

for category in valid_categories:
    print(f"Crop Category: {category}")

    # Rows of this category only (copied so the new column stays local).
    df_cat = df_filtered[df_filtered['Crop_Category'] == category].copy()
    n_unique_crops = df_cat['Crop'].nunique()
    print(f"Samples: {len(df_cat):,}")
    print(f"Unique crops: {n_unique_crops}")

    # The model is trained on the log-transformed target.
    df_cat['Log_Yield'] = np.log1p(df_cat['Yield'])

    X = df_cat[numerical_features + categorical_features]
    y = df_cat['Log_Yield']

    # 80/20 random hold-out with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Numeric columns: median-impute, then standardise.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Categorical columns: fill gaps with 'Unknown', then one-hot encode with at
    # most 50 output columns per feature; levels unseen at fit time are ignored.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, max_categories=50))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])

    # Fit preprocessing + forest as a single estimator.
    print("Training Random Forest...")

    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(
            n_estimators=200,
            max_depth=25,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        ))
    ])

    rf_pipeline.fit(X_train, y_train)

    # Predict in log space, then undo the log1p on both predictions and targets.
    y_pred_log = rf_pipeline.predict(X_test)
    y_pred = np.expm1(y_pred_log)
    y_test_orig = np.expm1(y_test)

    # Score on the original yield scale.
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))
    mae = mean_absolute_error(y_test_orig, y_pred)
    r2 = r2_score(y_test_orig, y_pred)

    print("Results:")
    print(f"  RMSE: {rmse:,.2f} kg/ha")
    print(f"  MAE:  {mae:,.2f} kg/ha")
    print(f"  R²:   {r2:.4f}")

    all_results.append({
        'Crop_Category': category,
        'Samples': len(df_cat),
        'Unique_Crops': n_unique_crops,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2
    })

    # Store under the dict initialised above (aliased as `category_models`).
    category_models_RF[category] = {
        'model': rf_pipeline,
        'y_test': y_test_orig,
        'y_pred': y_pred
    }

Crop Category: Other
Samples: 50,475
Unique crops: 69
Training Random Forest...
Results:
  RMSE: 3,813.25 kg/ha
  MAE:  1,592.28 kg/ha
  R²:   0.8688
Crop Category: Vegetables
Samples: 18,028
Unique crops: 20
Training Random Forest...
Results:
  RMSE: 5,107.34 kg/ha
  MAE:  2,775.74 kg/ha
  R²:   0.8738
Crop Category: Fruits
Samples: 16,978
Unique crops: 24
Training Random Forest...
Results:
  RMSE: 5,038.74 kg/ha
  MAE:  2,675.00 kg/ha
  R²:   0.8322
Crop Category: Cereals
Samples: 12,109
Unique crops: 17
Training Random Forest...
Results:
  RMSE: 1,841.11 kg/ha
  MAE:  540.83 kg/ha
  R²:   0.7880
Crop Category: Legumes
Samples: 11,880
Unique crops: 19
Training Random Forest...
Results:
  RMSE: 1,461.38 kg/ha
  MAE:  555.93 kg/ha
  R²:   0.8565
Crop Category: Industrial Crops
Samples: 5,971
Unique crops: 14
Training Random Forest...
Results:
  RMSE: 2,659.49 kg/ha
  MAE:  1,045.54 kg/ha
  R²:   0.9817
Crop Category: Root Crops
Samples: 4,748
Unique crops: 6
Training Random Forest...
R

## Overall Performance Comparison

In [39]:


import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_overall_performance(model_dict, model_name):
    """Pool every category's held-out results and score them as a single set.

    Parameters
    ----------
    model_dict : dict
        Maps a category label to a record that holds at least the keys
        ``"y_test"`` (true values) and ``"y_pred"`` (predictions), each a
        one-dimensional sequence of equal length.
    model_name : str
        Label stored under ``"Model"`` in the returned record.

    Returns
    -------
    dict
        ``{"Model": model_name, "RMSE": ..., "MAE": ..., "R2": ...}`` computed
        over the concatenation of all categories.

    Raises
    ------
    ValueError
        If ``model_dict`` is empty, i.e. the training cell that should have
        filled it was not run or stored its results under a different name.
    """
    # Fail early with an actionable message instead of passing zero-length
    # arrays on to the metric functions.
    if not model_dict:
        raise ValueError(
            f"No per-category results to pool for {model_name!r}: the model "
            "dictionary is empty. Run the training cell that fills it first."
        )

    records = list(model_dict.values())
    all_y_true = np.concatenate([np.asarray(record["y_test"]) for record in records])
    all_y_pred = np.concatenate([np.asarray(record["y_pred"]) for record in records])

    rmse = np.sqrt(mean_squared_error(all_y_true, all_y_pred))
    mae = mean_absolute_error(all_y_true, all_y_pred)
    r2 = r2_score(all_y_true, all_y_pred)

    return {
        "Model": model_name,
        "RMSE": rmse,
        "MAE": mae,
        "R2": r2
    }




# Score each model family on its pooled hold-out predictions, in display order.
model_families = [
    (category_models_RF, "Random Forest"),
    (category_models_dt, "Decision Tree"),
    (category_models_lr, "Linear Regression"),
]
overall_results = [
    compute_overall_performance(models, label)
    for models, label in model_families
]

df_overall = pd.DataFrame(overall_results)

print("\n================ OVERALL MODEL PERFORMANCE ================\n")
print(df_overall)




               Model         RMSE          MAE        R2
0      Random Forest  3810.667812  1655.097244  0.904288
1      Decision Tree  7039.661576  3622.825321  0.673360
2  Linear Regression  7534.820315  3804.807650  0.625793
