In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# Gradient Boosting Libraries
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

## Models For MinMax Scaled Data

In [3]:
# Read the crop dataset that was scaled with MinMaxScaler
# (the file name suggests it is also one-hot encoded), then preview it.
df1 = pd.read_csv("Crop_Mod_OneH_MinMax.csv")
df1

Unnamed: 0,soil_ph,temp,humidity,wind_speed,n,p,k,crop_yield,soil_quality,crop_type_Barley,...,crop_type_Tomato,crop_type_Wheat,soil_type_Clay,soil_type_Loamy,soil_type_Peaty,soil_type_Saline,soil_type_Sandy,year,month,day
0,0.4,0.402160,0.996494,0.495300,0.847826,0.833333,0.696970,104.871310,0.874403,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2014,1,1
1,0.5,0.390129,1.000000,0.251013,0.108696,0.111111,0.090909,58.939796,0.355631,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2014,1,1
2,0.0,0.244445,1.000000,0.458278,0.097826,0.250000,0.348485,32.970413,0.145392,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2014,1,1
3,0.5,0.193090,1.000000,0.568686,0.217391,0.222222,0.272727,29.356115,0.427304,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2014,1,1
4,0.3,0.332976,1.000000,0.479032,0.326087,0.250000,0.393939,22.221375,0.475085,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2014,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25490,0.3,0.064709,1.000000,0.747856,0.326087,0.388889,0.393939,13.069169,0.498976,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2023,12,31
25491,0.3,0.381596,1.000000,0.291693,0.456522,0.527778,0.636364,73.323885,0.589761,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2023,12,31
25492,0.0,0.442798,0.928758,0.503711,0.576087,0.500000,0.454545,39.226521,0.239590,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2023,12,31
25493,0.0,0.043176,1.000000,0.593603,0.336957,0.250000,0.136364,6.067881,0.156314,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2023,12,31


In [4]:
# Separate the predictors from the `crop_yield` target and hold out 20% of
# the rows as a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns="crop_yield"),
    df1["crop_yield"],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate regressors to compare, keyed by the display name used in the
# results tables. Every estimator gets random_state=42 so runs are repeatable.
# The boosting models share n_estimators=200 and learning_rate=0.1.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    # Library logging is silenced for all three gradient-boosting packages so
    # the training cells do not flood the notebook with info messages.
    "XGBoost": XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42, verbosity=0),
    # verbose=-1 suppresses the "[LightGBM] [Info] ..." lines, matching the
    # quiet settings already applied to XGBoost and CatBoost.
    "LightGBM": LGBMRegressor(n_estimators=200, learning_rate=0.1, random_state=42, verbose=-1),
    "CatBoost": CatBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42, verbose=0),
    "AdaBoost": AdaBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
}

In [6]:
# Fit each candidate on the MinMax-scaled training split, score it on the
# held-out split, and collect one metrics record per model in `res`.
res = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Test-set metrics; RMSE is the square root of the mean squared error.
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    res.append(dict(zip(("Model", "RMSE", "MAE", "R2 Score"), (name, rmse, mae, r2))))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 955
[LightGBM] [Info] Number of data points in the train set: 20396, number of used features: 26
[LightGBM] [Info] Start training from score 38.449331


In [7]:
# Tabulate the per-model metrics gathered on the MinMax-scaled data
pd.DataFrame(data=res)

Unnamed: 0,Model,RMSE,MAE,R2 Score
0,Linear Regression,14.657593,11.187409,0.574734
1,Decision Tree,6.513339,4.558377,0.916026
2,Random Forest,4.790668,3.31367,0.954572
3,XGBoost,4.712465,3.304719,0.956043
4,LightGBM,4.688084,3.27983,0.956496
5,CatBoost,4.633138,3.255912,0.95751
6,AdaBoost,14.814775,12.749749,0.565564


## Models For Standard Scaled Data

In [8]:
# Read the crop dataset that was scaled with StandardScaler
# (the file name suggests it is also one-hot encoded), then preview it.
df2 = pd.read_csv("Crop_Mod_OneH_Stand.csv")
df2

Unnamed: 0,soil_ph,temp,humidity,wind_speed,n,p,k,crop_yield,soil_quality,crop_type_Barley,...,crop_type_Tomato,crop_type_Wheat,soil_type_Clay,soil_type_Loamy,soil_type_Peaty,soil_type_Saline,soil_type_Sandy,year,month,day
0,-0.122964,-0.260859,0.802313,-0.469878,1.649420,1.473463,0.936043,104.871310,1.647104,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2014,1,1
1,0.182886,-0.315545,0.816279,-2.437751,-1.468169,-1.475844,-1.409067,58.939796,-0.141463,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2014,1,1
2,-1.346363,-0.977727,0.816279,-0.768113,-1.514016,-0.908669,-0.412395,32.970413,-0.866303,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2014,1,1
3,0.182886,-1.211154,0.816279,0.121288,-1.009700,-1.022104,-0.705534,29.356115,0.105642,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2014,1,1
4,-0.428814,-0.575326,0.816279,-0.600925,-0.551231,-0.908669,-0.236512,22.221375,0.270378,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2014,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25490,-0.428814,-1.794691,0.816279,1.564607,-0.551231,-0.341495,-0.236512,13.069169,0.352746,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2023,12,31
25491,-0.428814,-0.354332,0.816279,-2.110050,-0.001068,0.225680,0.701532,73.323885,0.665746,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2023,12,31
25492,-1.346363,-0.076148,0.532502,-0.402119,0.503248,0.112245,-0.002001,39.226521,-0.541537,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2023,12,31
25493,-1.346363,-1.892562,0.816279,0.322011,-0.505384,-0.908669,-1.233184,6.067881,-0.828649,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2023,12,31


In [9]:
# Hold out 20% of the Standard-scaled rows as a test set, with `crop_yield`
# as the target. NOTE: this rebinds X_train / X_test / y_train / y_test, so
# the splits created from df1 are no longer available after this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    df2.drop(columns="crop_yield"),
    df2["crop_yield"],
    test_size=0.2,
    random_state=42,
)

In [10]:
# Repeat the comparison on the Standard-scaled split: the estimator objects
# held in `models` are fitted again on the current X_train / y_train, and one
# metrics record per model is collected in `res2`.
res2 = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Test-set metrics; RMSE is the square root of the mean squared error.
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))

    res2.append(dict(zip(("Model", "RMSE", "MAE", "R2 Score"), (name, rmse, mae, r2))))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 960
[LightGBM] [Info] Number of data points in the train set: 20396, number of used features: 26
[LightGBM] [Info] Start training from score 38.449331


In [11]:
# Tabulate the per-model metrics gathered on the Standard-scaled data
pd.DataFrame(data=res2)

Unnamed: 0,Model,RMSE,MAE,R2 Score
0,Linear Regression,14.657593,11.187409,0.574734
1,Decision Tree,6.510664,4.553918,0.916095
2,Random Forest,4.793166,3.315274,0.954524
3,XGBoost,4.727562,3.314253,0.955761
4,LightGBM,4.709001,3.292935,0.956107
5,CatBoost,4.633138,3.255912,0.95751
6,AdaBoost,14.815508,12.749952,0.565521


In [12]:
# Show the MinMax-scaled metrics again so they sit next to the
# Standard-scaled table for comparison
pd.DataFrame(data=res)

Unnamed: 0,Model,RMSE,MAE,R2 Score
0,Linear Regression,14.657593,11.187409,0.574734
1,Decision Tree,6.513339,4.558377,0.916026
2,Random Forest,4.790668,3.31367,0.954572
3,XGBoost,4.712465,3.304719,0.956043
4,LightGBM,4.688084,3.27983,0.956496
5,CatBoost,4.633138,3.255912,0.95751
6,AdaBoost,14.814775,12.749749,0.565564
