In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import pandas as pd
import numpy as np

In [2]:
# Load crop yield dataset
df=pd.read_csv('crop_yield_dataset.csv')
df

Unnamed: 0,Date,Crop_Type,Soil_Type,Soil_pH,Temperature,Humidity,Wind_Speed,N,P,K,Crop_Yield,Soil_Quality
0,2014-01-01,Wheat,Peaty,5.50,9.440599,80.000000,10.956707,60.5,45.0,31.5,0.000000,22.833333
1,2014-01-01,Corn,Loamy,6.50,20.052576,79.947424,8.591577,84.0,66.0,50.0,104.871310,66.666667
2,2014-01-01,Rice,Peaty,5.50,12.143099,80.000000,7.227751,71.5,54.0,38.5,0.000000,27.333333
3,2014-01-01,Barley,Sandy,6.75,19.751848,80.000000,2.682683,50.0,40.0,30.0,58.939796,35.000000
4,2014-01-01,Soybean,Peaty,5.50,16.110395,80.000000,7.696070,49.5,45.0,38.5,32.970413,22.166667
...,...,...,...,...,...,...,...,...,...,...,...,...
36515,2023-12-31,Cotton,Clay,6.25,19.538555,80.000000,3.666664,66.0,55.0,48.0,73.323885,49.291667
36516,2023-12-31,Sugarcane,Peaty,5.50,21.068336,78.931664,8.795036,71.5,54.0,42.0,39.226521,27.916667
36517,2023-12-31,Tomato,Sandy,6.75,6.030148,80.000000,9.409497,50.0,36.0,30.0,0.000000,33.833333
36518,2023-12-31,Potato,Peaty,5.50,11.079561,80.000000,10.969366,60.5,45.0,31.5,6.067881,22.833333


# Custom Transformer to Extract Date Features 

In [3]:

from sklearn.base import BaseEstimator, TransformerMixin
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, date_column):
        self.date_column = date_column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        X[self.date_column] = pd.to_datetime(X[self.date_column], errors="coerce")
        X["year"] = X[self.date_column].dt.year
        X["month"] = X[self.date_column].dt.month
        X["day"] = X[self.date_column].dt.day
        X = X.drop(columns=[self.date_column])
        return X

In [4]:
# Split data into features (X) and target (y)
X=df.drop(columns=['Crop_Yield'])

In [5]:
y=df['Crop_Yield']

In [6]:
df.columns

Index(['Date', 'Crop_Type', 'Soil_Type', 'Soil_pH', 'Temperature', 'Humidity',
       'Wind_Speed', 'N', 'P', 'K', 'Crop_Yield', 'Soil_Quality'],
      dtype='object')

In [7]:
# Define numeric and categorical features
num_features=['Soil_pH', 'Temperature', 'Humidity','Wind_Speed', 'N', 'P', 'K', 'Soil_Quality']
cat_features=['Crop_Type', 'Soil_Type']

In [8]:
# Numeric data transformation (scaling)
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

# Categorical data transformation (one-hot encoding)
categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Combine date, numeric, and categorical transformations
preprocessor = Pipeline(steps=[
    ("date_features", DateFeatureExtractor(date_column="Date")),
    ("column_transform", ColumnTransformer(transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ]))
])

In [9]:
# Build full pipeline with preprocessing + model
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", CatBoostRegressor(n_estimators=200, learning_rate=0.1, random_state=42, verbose=0))
])

In [10]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train model
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('date_features', ...), ('column_transform', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,date_column,'Date'

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [12]:
# Evaluate model performance
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)
r2 = r2_score(y_test, preds)
print(rmse,mae,r2)

3.82547465692719 2.382279837645315 0.9780539244785578


# Save and Load Model 

In [13]:

import joblib
joblib.dump(model, "RandomReg.pkl")

['RandomReg.pkl']

In [14]:
# Load saved model
loaded_model = joblib.load("RandomReg.pkl")

In [15]:
df.head()

Unnamed: 0,Date,Crop_Type,Soil_Type,Soil_pH,Temperature,Humidity,Wind_Speed,N,P,K,Crop_Yield,Soil_Quality
0,2014-01-01,Wheat,Peaty,5.5,9.440599,80.0,10.956707,60.5,45.0,31.5,0.0,22.833333
1,2014-01-01,Corn,Loamy,6.5,20.052576,79.947424,8.591577,84.0,66.0,50.0,104.87131,66.666667
2,2014-01-01,Rice,Peaty,5.5,12.143099,80.0,7.227751,71.5,54.0,38.5,0.0,27.333333
3,2014-01-01,Barley,Sandy,6.75,19.751848,80.0,2.682683,50.0,40.0,30.0,58.939796,35.0
4,2014-01-01,Soybean,Peaty,5.5,16.110395,80.0,7.69607,49.5,45.0,38.5,32.970413,22.166667


In [16]:
df.describe()

Unnamed: 0,Soil_pH,Temperature,Humidity,Wind_Speed,N,P,K,Crop_Yield,Soil_Quality
count,36520.0,36520.0,36520.0,36520.0,36520.0,36520.0,36520.0,36520.0,36520.0
mean,6.602731,23.813996,74.256624,10.020153,66.011035,53.014006,42.01825,26.87848,37.516632
std,0.816973,8.920519,6.767587,2.99831,10.886721,8.812884,8.53781,25.740936,17.703171
min,5.5,-3.540176,45.851089,-3.388906,45.0,36.0,27.0,0.0,13.291667
25%,6.25,17.168542,69.745252,7.985872,58.5,45.0,35.0,0.0,22.5
50%,6.5,22.902987,77.097013,10.000299,65.0,54.0,42.0,23.366344,35.583333
75%,6.75,30.254748,80.0,12.038546,71.5,60.0,49.5,46.415729,49.291667
max,8.0,54.148911,80.0,22.606078,91.0,72.0,60.0,136.711982,74.333333


In [17]:
dict(X_test.iloc[100])

{'Date': '2019-12-17',
 'Crop_Type': 'Sunflower',
 'Soil_Type': 'Clay',
 'Soil_pH': np.float64(6.25),
 'Temperature': np.float64(17.083684217884688),
 'Humidity': np.float64(80.0),
 'Wind_Speed': np.float64(13.344424180031458),
 'N': np.float64(60.0),
 'P': np.float64(55.00000000000001),
 'K': np.float64(44.0),
 'Soil_Quality': np.float64(46.375)}

In [18]:
y_test.iloc[100]

np.float64(56.52371428015335)

In [19]:
df.columns

Index(['Date', 'Crop_Type', 'Soil_Type', 'Soil_pH', 'Temperature', 'Humidity',
       'Wind_Speed', 'N', 'P', 'K', 'Crop_Yield', 'Soil_Quality'],
      dtype='object')

# Predict on New Data

In [20]:
data=pd.DataFrame({'Date': ['2019-12-17'],
 'Crop_Type': 'Sunflower',
 'Soil_Type': 'Clay',
 'Soil_pH': np.float64(6.25),
 'Temperature': np.float64(17.083684217884688),
 'Humidity': np.float64(80.0),
 'Wind_Speed': np.float64(13.344424180031458),
 'N': np.float64(60.0),
 'P': np.float64(55.00000000000001),
 'K': np.float64(44.0),
 'Soil_Quality': np.float64(46.375)})

In [21]:
prediction=loaded_model.predict(data)
prediction

array([52.58210151])