In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import joblib
import time

In [2]:
# Load data
df = pd.read_csv("crop_yield.csv")

In [None]:
# Feature engineering
# The following features cannot be created as the required source columns are missing in the dataset:
# - 'Days_to_Harvest' for 'rainfall_per_day'
# - 'Irrigation_Used' for 'fert_irrig_score'
# - 'Temperature_Celsius' for 'temp_rain_product'
# Only available columns from df.columns output: ['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield']
# We will proceed without these specific engineered features for now.

In [3]:
# Encode categories
cat_cols = ['Crop', 'Season', 'State'] # 'Region', 'Soil_Type', 'Weather_Condition' are not present in df
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le

In [5]:
# Features & target
features = [
    'Crop_Year', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide',
    'Crop_encoded', 'Season_encoded', 'State_encoded'
]
X = df[features]
y = df['Yield'] # The target column is 'Yield', not 'Yield_tons_per_hectare'

In [6]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:

# Hyperparameter tuning
param_dist = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'subsample': [0.6, 0.8, 1.0],
    'min_samples_split': [2, 5, 10],
}
gb = GradientBoostingRegressor(random_state=42)
search = RandomizedSearchCV(gb, param_distributions=param_dist, n_iter=20, cv=3, scoring='r2', n_jobs=-1, verbose=1, random_state=42)
search.fit(X_train, y_train)
best_model = search.best_estimator_

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [8]:
# Train and evaluate
start_time = time.time()
best_model.fit(X_train, y_train)
train_time = time.time() - start_time

y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

In [9]:
# Metrics
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)*100
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)*100
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()

In [10]:
# Save files
joblib.dump(best_model, 'gb_model.pkl')
joblib.dump(label_encoders, 'gb_label_encoders.pkl')
joblib.dump(X.columns.tolist(), 'gb_feature_names.pkl')
joblib.dump(search.best_params_, 'gb_best_params.pkl')


['gb_best_params.pkl']

In [11]:
# Print results
print(f"\nModel trained in {train_time:.2f} seconds")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R²:  {test_r2:.4f}")
print(f"Train MAE: {train_mae:.4f}")
print(f"Test MAE:  {test_mae:.4f}")
print(f"Train RMSE: {train_rmse:.4f}")
print(f"Test RMSE:  {test_rmse:.4f}")
print(f"Train MAPE: {train_mape:.2f}%")
print(f"Test MAPE:  {test_mape:.2f}%")
print(f"CV R²: {cv_mean:.4f} ± {cv_std:.4f}")



Model trained in 15.59 seconds
Train R²: 0.9999
Test R²:  0.9177
Train MAE: 1.9621
Test MAE:  9.1025
Train RMSE: 8.2146
Test RMSE:  256.7560
Train MAPE: 4884797019485059.00%
Test MAPE:  3555451987840904.50%
CV R²: 0.9482 ± 0.0684


In [12]:
# To download in Google Colab
from google.colab import files
files.download('gb_model.pkl')
files.download('gb_label_encoders.pkl')
files.download('gb_feature_names.pkl')
files.download('gb_best_params.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Download Model Artifacts

This cell will download the `PKL` files that contain your trained Gradient Boosting Regressor model, the label encoders used for categorical features, the list of feature names used for training, and the best hyperparameters found during `RandomizedSearchCV`.

In [None]:
from google.colab import files

files.download('gb_model.pkl')
files.download('gb_label_encoders.pkl')
files.download('gb_feature_names.pkl')
files.download('gb_best_params.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Combine all model artifacts into a single PKL file

This cell loads the individually saved model, label encoders, feature names, and best hyperparameters, combines them into a dictionary, and then saves this dictionary as a single `.pkl` file. This creates a convenient bundle for your entire model setup.

In [None]:
import joblib

# Load the individual artifacts
model = joblib.load('gb_model.pkl')
label_encoders = joblib.load('gb_label_encoders.pkl')
feature_names = joblib.load('gb_feature_names.pkl')
best_params = joblib.load('gb_best_params.pkl')

# Combine them into a single dictionary
complete_model_artifacts = {
    'model': model,
    'label_encoders': label_encoders,
    'feature_names': feature_names,
    'best_params': best_params
}

# Save the combined artifacts to a single .pkl file
joblib.dump(complete_model_artifacts, 'complete_gb_model_artifacts.pkl')

print("All model artifacts saved to 'complete_gb_model_artifacts.pkl'")

All model artifacts saved to 'complete_gb_model_artifacts.pkl'


### Download the combined model artifact

This cell will download the newly created `complete_gb_model_artifacts.pkl` file, which contains all your model components.

In [None]:
from google.colab import files
files.download('complete_gb_model_artifacts.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Download Model Artifacts

This cell will download the `PKL` files that contain your trained Gradient Boosting Regressor model, the label encoders used for categorical features, the list of feature names used for training, and the best hyperparameters found during `RandomizedSearchCV`.

In [None]:
from google.colab import files

files.download('gb_model.pkl')
files.download('gb_label_encoders.pkl')
files.download('gb_feature_names.pkl')
files.download('gb_best_params.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>