In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import joblib
import time

In [2]:
# Load dataset
df = pd.read_csv("crop_yield.csv")

In [None]:
# Feature engineering

In [3]:
# Encode categorical features
categorical_cols = ['Crop', 'Season', 'State']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col])
    label_encoders[col] = le

In [4]:
# Feature engineering
df['production_per_area'] = df['Production'] / df['Area']
df['rainfall_fertilizer_interaction'] = df['Annual_Rainfall'] * df['Fertilizer']

In [5]:
# Prepare feature matrix and target vector
features = [
    'Crop_Year', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide',
    'production_per_area', 'rainfall_fertilizer_interaction',
    'Crop_encoded', 'Season_encoded', 'State_encoded'
]
X = df[features]
y = df['Yield']

In [6]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=None
)

In [7]:
# Define parameter distributions for RandomizedSearch
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize Random Forest Regressor
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

In [8]:
# Initialize Random Forest Regressor
rf = RandomForestRegressor(random_state=42, n_jobs=-1)


In [9]:
# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,  # Number of parameter settings sampled
    cv=3,
    scoring='r2',
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [10]:
# Fit randomized search
start_time = time.time()
random_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

print(f"RandomizedSearchCV took {elapsed_time:.2f} seconds")
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best CV R2 score: {random_search.best_score_:.4f}")


Fitting 3 folds for each of 30 candidates, totalling 90 fits


27 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.12/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.12/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

RandomizedSearchCV took 577.85 seconds
Best parameters found: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'bootstrap': False}
Best CV R2 score: 0.9831


In [11]:
# Use the best estimator for prediction
best_rf = random_search.best_estimator_
best_rf.fit(X_train, y_train)

In [12]:
# Predictions and evaluation
y_train_pred = best_rf.predict(X_train)
y_test_pred = best_rf.predict(X_test)

def print_metrics(dataset_name, y_true, y_pred):
    print(f"\n{dataset_name} data evaluation:")
    print(f" R^2 score: {r2_score(y_true, y_pred):.4f}")
    print(f" MAE: {mean_absolute_error(y_true, y_pred):.4f} tons/ha")
    print(f" RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f} tons/ha")
    print(f" MAPE: {mean_absolute_percentage_error(y_true, y_pred)*100:.2f}%")

print_metrics("Training", y_train, y_train_pred)
print_metrics("Testing", y_test, y_test_pred)


Training data evaluation:
 R^2 score: 1.0000
 MAE: 0.0005 tons/ha
 RMSE: 0.0011 tons/ha
 MAPE: 23588786061.80%

Testing data evaluation:
 R^2 score: 0.9922
 MAE: 5.2252 tons/ha
 RMSE: 79.2842 tons/ha
 MAPE: 384805677990310.12%


In [13]:
# Cross-validation performance
cv_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
print(f"\nCross-validation R2: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")



Cross-validation R2: 0.9837 ± 0.0215


In [14]:
# Save artifacts for later use
joblib.dump(best_rf, 'random_forest_model.pkl')
joblib.dump(label_encoders, 'rf_label_encoders.pkl')
joblib.dump(features, 'rf_feature_names.pkl')
joblib.dump(random_search.best_params_, 'rf_best_params.pkl')

print("\nModel and preprocessing artifacts saved successfully.")



Model and preprocessing artifacts saved successfully.


In [15]:
model_artifacts = {
    'model': best_rf,
    'label_encoders': label_encoders,
    'features': features,
    'best_params': random_search.best_params_
}

joblib.dump(model_artifacts, 'crop_yield_random_forest_model_artifacts.pkl')

print("Consolidated model artifacts saved to 'crop_yield_random_forest_model_artifacts.pkl'.")

Consolidated model artifacts saved to 'crop_yield_random_forest_model_artifacts.pkl'.


In [16]:
from google.colab import files
files.download('crop_yield_random_forest_model_artifacts.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### How to Download the Notebook as a `.ipynb` File:

To download this entire Colab notebook (`.ipynb` file) which includes all your code, markdown, outputs, and references to your saved model artifacts, please follow these steps:

1.  Go to `File` in the Colab menu bar at the top.
2.  Select `Download`.
3.  Choose `Download .ipynb`.

Your browser will then download the notebook file. You can typically rename the file during the download process or afterward to include the model's name, for example: `crop_yield_prediction_random_forest_model.ipynb`.

In [17]:
 from google.colab import files
 files.download('random_forest_model.pkl')
 files.download('rf_label_encoders.pkl')
 files.download('rf_feature_names.pkl')
 files.download('rf_best_params.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(df.columns)

Index(['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production',
       'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield'],
      dtype='object')
