After performing a model comparison using scikit-learn, we selected XGBoost as the most suitable algorithm for our needs. To implement the algorithm and perform hyperparameter tuning, we will be using the XGBoost library instead of scikit-learn, as it provides faster processing capabilities.

In [2]:
from xgboost import XGBRegressor
import joblib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import warnings
# Suppress all warnings globally so cell output stays uncluttered.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")
# Set pandas to display all columns (None removes the column limit)
pd.set_option("display.max.columns", None)

In [3]:
# Import data; the first CSV column is used as the DataFrame index.
dataset = pd.read_csv('data_preprocessing/yield_df.csv', encoding='utf-8', index_col=[0])
# Give the raw columns human-readable names (with units) used throughout the notebook.
dataset = dataset.rename(columns={'hg/ha_yield': 'Crop Yield (hg/ha)', 'Item': 'Crop', 'average_rain_fall_mm_per_year': 'Rainfall (mm/year)', 'pesticides_tonnes': 'Pesticides (tonnes)', 'avg_temp': 'Temperature (Celsius)'})

# Handle categorical variables: one-hot encode "Crop" and "Area".
# handle_unknown='ignore' makes the fitted encoder emit all-zero columns for
# categories it has not seen instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(dataset[["Crop", 'Area']])
# Build the encoded frame on the SAME index as `dataset`. pd.concat(axis=1)
# aligns rows by index label, so a default 0..n-1 index here would misalign
# (and introduce NaN rows) whenever the CSV index is not exactly 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(["Crop", 'Area']),
    index=dataset.index,
)
df = pd.concat([dataset, encoded_df], axis=1)

# Create features and labels (and remove categorical variables).
# The original string columns are dropped because their one-hot columns replace them.
# NOTE(review): 'Rainfall (mm/year)' is excluded from the features as well —
# presumably a deliberate feature-selection choice; confirm.
features = df.drop(['Crop Yield (hg/ha)', "Crop", 'Area', 'Rainfall (mm/year)'], axis=1)
labels = df['Crop Yield (hg/ha)']

# Create training (80%) and test (20%) sets; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [4]:
# Fit an XGBoost regressor on the training split.
# The hyperparameter values below were recorded with an `rgs__` prefix on each
# key (rgs__gamma, rgs__learning_rate, rgs__max_depth, rgs__min_child_weight,
# rgs__n_estimators, rgs__reg_alpha, rgs__reg_lambda) — presumably the output
# of an earlier hyperparameter search; the prefix is dropped here.
tuned_params = {
    "gamma": 0.5,
    "learning_rate": 0.2,
    "max_depth": 12,
    "min_child_weight": 1,
    "n_estimators": 300,
    "reg_alpha": 0.5,
    "reg_lambda": 0.5,
}
model = XGBRegressor(**tuned_params)

model.fit(X_train, y_train)


In [5]:
# Persist the fitted regressor to disk with joblib.
# joblib.dump returns the list of file names it wrote, which the notebook
# shows as the cell output.
model_path = 'xgb_regressor.joblib'
joblib.dump(model, model_path)


['xgb_regressor.joblib']

In [6]:
# Dump the trained trees to a JSON file.
# Booster.dump_model writes a plain-text dump unless dump_format is given, so
# the format must be requested explicitly for the .json file to hold valid JSON.
# NOTE(review): this is a dump of the tree structure for inspection; if the
# file is meant to be reloaded as a model, model.save_model(...) is the API
# intended for that — confirm which one downstream consumers expect.
model.get_booster().dump_model("xgb_trained_model.json", dump_format="json")


In [None]:
#import json
# json.dump(model_json, open("xgb_trained_model.json", "w"))