In [1]:
import pandas as pd
df1=pd.read_csv("yeild_dataset.csv")

# `Fine-Tuned XGBoost Regression for Crop Yield Prediction`

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

a = df1.drop(columns=['yeild', 'Production','Soil pH','Soil Type'], axis=1) 
b = df1['yeild']  


# step1Identify Numerical & Categorical Columns
num_features = a.select_dtypes(exclude="object").columns
cat_features = a.select_dtypes(include="object").columns

# Step2 Create Column Transformer for Preprocessing
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)


#Step3 Split the Data (Avoid Data Leakage)
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=42)


#Step4 Apply Preprocessing (Fit Only on Training Data)
a_train = preprocessor.fit_transform(a_train)
a_test = preprocessor.transform(a_test)


# Step 5 train xgBoost with hyperparamters
xgb = XGBRegressor(
    random_state=42, 
    tree_method='gpu_hist', 
    n_estimators=300,           
    learning_rate=0.05,         
    max_depth=8,                
    subsample=0.8,              
    colsample_bytree=0.8,       
    reg_alpha=0.3,              
    reg_lambda=0.8              
)

# Train the Model
xgb.fit(a_train, b_train)

# predictions & evaluation
y_train_pred = xgb.predict(a_train)
y_test_pred = xgb.predict(a_test)

# training metrics
train_mae = mean_absolute_error(b_train, y_train_pred)
train_mse = mean_squared_error(b_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(b_train, y_train_pred)
# test metrics
test_mae = mean_absolute_error(b_test, y_test_pred)
test_mse = mean_squared_error(b_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(b_test, y_test_pred)

# final output
print("model performance on training Data")
print(f"Train R² Score: {train_r2}")
print(f"Train MAE: {train_mae}")
print(f"Train MSE: {train_mse}")
print(f"Train RMSE: {train_rmse}\n")
print("model performance on test Data")
print(f"Test R² Score: {test_r2}")
print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


model performance on training Data
Train R² Score: 0.9897809535924578
Train MAE: 6.276916410238808
Train MSE: 6083.879010159382
Train RMSE: 77.99922442024268

model performance on test Data
Test R² Score: 0.9591218610121297
Test MAE: 9.674200238811087
Test MSE: 32910.83043797658
Test RMSE: 181.4134240842628


# `Hyperparameter-Tuned XGBoost Regression for Crop Yield Prediction With RandomizedSearch`

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

a = df1.drop(columns=['yeild', 'Production'], axis=1)
 # Target variable
b = df1['yeild'] 

# Step 1 identify numerical & categorical columns
num_features = a.select_dtypes(exclude="object").columns
cat_features = a.select_dtypes(include="object").columns

# Step 2: create column transformer for preprocessing
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)

# Step3 split the data (avoid data leakage)
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=42)

# Step 4 apply preprocessing (fit only on training data)
a_train = preprocessor.fit_transform(a_train)
a_test = preprocessor.transform(a_test)

# step 5 define xgBoost regressor
xgb = XGBRegressor(tree_method='gpu_hist', objective='reg:squarederror', random_state=42)

# Step 6 define hyperparameter grid for tuning**
param_grid = {
    'n_estimators': [200, 300, 500, 800],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [4, 6, 7, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.85, 1.0],
    'reg_alpha': [0.1, 0.2, 0.3, 0.5],
    'reg_lambda': [0.5, 0.7, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# step 7 hyperparameter tuning using randomizedsearchv
xgb_grid = RandomizedSearchCV(
    xgb, param_grid, cv=5, scoring='r2', n_iter=30, n_jobs=-1, verbose=1, random_state=42
)
xgb_grid.fit(a_train, b_train)

# Step 8 Train the Best Model
best_model = xgb_grid.best_estimator_
best_model.fit(a_train, b_train)

# Step 9 predictions & evaluation
y_train_pred = best_model.predict(a_train)
y_test_pred = best_model.predict(a_test)

# training metrics
train_mae = mean_absolute_error(b_train, y_train_pred)
train_mse = mean_squared_error(b_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(b_train, y_train_pred)
# test metrics
test_mae = mean_absolute_error(b_test, y_test_pred)
test_mse = mean_squared_error(b_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(b_test, y_test_pred)
# Step10 Final Output
print(" best hyperparameters found:", xgb_grid.best_params_)
print("\n Model Performance on Training Data:")
print(f"Train R² Score: {train_r2}")
print(f"Train MAE: {train_mae}")
print(f"Train MSE: {train_mse}")
print(f"Train RMSE: {train_rmse}\n")
print("model performance on test data:")
print(f"Test R² Score: {test_r2}")
print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")




Fitting 5 folds for each of 30 candidates, totalling 150 fits



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


 best hyperparameters found: {'subsample': 0.8, 'reg_lambda': 0.7, 'reg_alpha': 0.5, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.8}

 Model Performance on Training Data:
Train R² Score: 0.9780648383862353
Train MAE: 8.710967162325291
Train MSE: 13059.033495331125
Train RMSE: 114.27612828290572

model performance on test data:
Test R² Score: 0.9612977378857608
Test MAE: 11.448073280404685
Test MSE: 31159.040444228645
Test RMSE: 176.51923533776323


# `Hyperparameter-Tuned XGBoost Regression for Crop Yield Prediction With RandomizedSearch`

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

a = df1.drop(columns=['yeild', 'Production','Soil pH','Soil Type'], axis=1)
 # Target variable
b = df1['yeild'] 

# Step 1 identify numerical & categorical columns
num_features = a.select_dtypes(exclude="object").columns
cat_features = a.select_dtypes(include="object").columns

# Step 2: create column transformer for preprocessing
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)

# Step3 split the data (avoid data leakage)
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=42)

# Step 4 apply preprocessing (fit only on training data)
a_train = preprocessor.fit_transform(a_train)
a_test = preprocessor.transform(a_test)

# step 5 define xgBoost regressor
xgb = XGBRegressor(tree_method='gpu_hist', objective='reg:squarederror', random_state=42)

# Step 6 define hyperparameter grid for tuning**
param_grid = {
    'n_estimators': [200, 300, 500, 800],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [4, 6, 7, 10],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.85, 1.0],
    'reg_alpha': [0.1, 0.2, 0.3, 0.5],
    'reg_lambda': [0.5, 0.7, 0.8, 1.0],
    'min_child_weight': [1, 3, 5]
}

# step 7 hyperparameter tuning using randomizedsearchv
xgb_grid = RandomizedSearchCV(
    xgb, param_grid, cv=5, scoring='r2', n_iter=30, n_jobs=-1, verbose=1, random_state=42
)
xgb_grid.fit(a_train, b_train)

# Step 8 Train the Best Model
best_model = xgb_grid.best_estimator_
best_model.fit(a_train, b_train)

# Step 9 predictions & evaluation
y_train_pred = best_model.predict(a_train)
y_test_pred = best_model.predict(a_test)

# training metrics
train_mae = mean_absolute_error(b_train, y_train_pred)
train_mse = mean_squared_error(b_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(b_train, y_train_pred)
# test metrics
test_mae = mean_absolute_error(b_test, y_test_pred)
test_mse = mean_squared_error(b_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(b_test, y_test_pred)
# Step10 Final Output
print(" best hyperparameters found:", xgb_grid.best_params_)
print("\n Model Performance on Training Data:")
print(f"Train R² Score: {train_r2}")
print(f"Train MAE: {train_mae}")
print(f"Train MSE: {train_mse}")
print(f"Train RMSE: {train_rmse}\n")
print("model performance on test data:")
print(f"Test R² Score: {test_r2}")
print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")




Fitting 5 folds for each of 30 candidates, totalling 150 fits



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


 best hyperparameters found: {'subsample': 0.8, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 0.8}

 Model Performance on Training Data:
Train R² Score: 0.9849841862987546
Train MAE: 8.287516381375122
Train MSE: 8939.620210555639
Train RMSE: 94.54956483535838

model performance on test data:
Test R² Score: 0.9599103353794801
Test MAE: 11.721002978185059
Test MSE: 32276.032796717507
Test RMSE: 179.65531663916187
