In [83]:
import pandas as pd

In [84]:
# Load the prepared dataset from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer='dataset/final_df.csv')

In [85]:
# Remove the 'Unnamed: 0' column (pandas' placeholder name for a CSV column
# without a header; presumably a saved index -- confirm against the export).
# Reassigning with errors='ignore' keeps this cell safe to re-run: the former
# in-place drop raised KeyError on a second execution because the column was
# already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [86]:
# Number of rows currently in the frame.
len(df)

999804

In [87]:
# Discard every row whose target column 'Yield (Kg per ha)' is missing.
# NOTE: this mutates `df` in place, so the unfiltered frame is only
# recoverable by re-reading the CSV.
df.dropna(subset=['Yield (Kg per ha)'], inplace=True)


In [88]:
# Display the (rows, columns) of the frame at this point in the notebook.
df.shape

(792948, 11)

In [89]:
# Per-column count of missing values.
df.isna().sum()

Unnamed: 0.1         0
Year                 0
Dist Name            0
Crop                 0
Area (1000 ha)       0
Yield (Kg per ha)    0
temp_avg             0
temp_max             0
temp_min             0
humidity             0
rainfall             0
dtype: int64

In [90]:
# Remove the second placeholder column, 'Unnamed: 0.1', so it is not used as a
# model feature. Reassigning with errors='ignore' makes the cell idempotent:
# the former in-place drop raised KeyError when the cell was executed twice.
df = df.drop(columns=['Unnamed: 0.1'], errors='ignore')

In [91]:
# Target: the yield column. Features: every other column, in original order.
y = df['Yield (Kg per ha)']
X = df.drop(columns=['Yield (Kg per ha)'])

In [92]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [93]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [94]:
# Define the categorical column list here before displaying it. Previously the
# name was only assigned in a later cell, so this cell raised NameError on a
# fresh kernel (Restart & Run All) and only worked thanks to leftover state.
cat_features = X.select_dtypes(include="object").columns.tolist()
cat_features

['Dist Name', 'Crop']

In [95]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score

In [96]:
# Preview the training-split target values.
y_train

360961    259.26
471369      0.00
576822    824.82
652241      0.00
738992      0.00
           ...  
362606    500.00
469266    737.28
131932      0.00
843535      0.00
121958      0.00
Name: Yield (Kg per ha), Length: 555063, dtype: float64

In [97]:
# Partition the feature columns by dtype: object columns are handled as
# categorical, everything else as numeric.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

# Categories not seen during fit are encoded as -1 instead of raising.
ord_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
numeric_transformer = StandardScaler()

# Output layout: ordinal-encoded categorical columns first, then the
# standardised numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("Ord Encoder", ord_encoder, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)


In [98]:
# Fit the encoder/scaler on the training split only, then reuse the fitted
# transformers on the test split so no test statistics leak into training.
X_train_processed = preprocessor.fit_transform(X=X_train)
X_test_processed = preprocessor.transform(X=X_test)

In [99]:
# Inspect the transformed test matrix produced by the fitted preprocessor.
X_test_processed

array([[10.        ,  9.        , -0.62241448, ..., -1.61234903,
        -0.50127493, -0.33152578],
       [23.        ,  3.        , -1.38481905, ..., -0.26405021,
        -1.38680086, -0.33152578],
       [ 5.        , 12.        ,  1.28359695, ...,  0.93429274,
         1.52034099,  1.56177916],
       ...,
       [ 9.        ,  8.        , -0.05061105, ...,  0.65394047,
         1.67982973,  4.83668501],
       [ 1.        ,  3.        , -0.62241448, ...,  1.01644248,
         0.73059382, -0.18854965],
       [15.        , 20.        , -1.00361676, ..., -1.34242848,
         0.53775356, -0.33152578]], shape=(237885, 9))

In [100]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained and reported; every model uses the same seed (42).
models = {}

models["Decision Tree"] = DecisionTreeRegressor(
    max_depth=10,
    random_state=42,
)
models["Random Forest"] = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
models["Gradient Boosting"] = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
models["XGBoost"] = XGBRegressor(
    n_estimators=200,
    max_depth=8,
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)
models["LightGBM"] = LGBMRegressor(
    n_estimators=200,
    max_depth=-1,
    n_jobs=-1,
    random_state=42,
)
models["CatBoost"] = CatBoostRegressor(
    iterations=200,
    depth=8,
    learning_rate=0.1,
    task_type="CPU",
    verbose=0,
    random_state=42,
)


In [101]:
import time
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np


def _regression_metrics(y_true, y_pred):
    """Return ``(MAE, RMSE, R2)`` for one set of predictions.

    RMSE is computed as the square root of scikit-learn's mean squared error.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2


# One record per model, so scores can be compared or ranked after the loop
# instead of being read back from the printed log.
model_results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    model.fit(X_train_processed, y_train)
    end = time.perf_counter()

    y_pred_train = model.predict(X_train_processed)
    y_pred_test = model.predict(X_test_processed)

    # Same three metrics on both splits; a large train/test gap would point
    # to overfitting.
    mae_train, rmse_train, r2_train = _regression_metrics(y_train, y_pred_train)
    mae_test, rmse_test, r2_test = _regression_metrics(y_test, y_pred_test)

    model_results.append({
        "model": name,
        "train_time_sec": end - start,
        "mae_train": mae_train,
        "rmse_train": rmse_train,
        "r2_train": r2_train,
        "mae_test": mae_test,
        "rmse_test": rmse_test,
        "r2_test": r2_test,
    })

    print(f"⏱ Training time: {end - start:.2f} sec")
    print('For Train Data')
    print(f"MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}, R2: {r2_train:.4f}")
    print('For Test Data')
    print(f"MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}, R2: {r2_test:.4f}")



Training Decision Tree...
⏱ Training time: 3.07 sec
For Train Data
MAE: 156.51, RMSE: 268.02, R2: 0.9567
For Test Data
MAE: 157.03, RMSE: 268.82, R2: 0.9569

Training Random Forest...
⏱ Training time: 39.21 sec
For Train Data
MAE: 73.50, RMSE: 149.16, R2: 0.9866
For Test Data
MAE: 74.31, RMSE: 150.71, R2: 0.9865

Training Gradient Boosting...
⏱ Training time: 128.01 sec
For Train Data
MAE: 164.18, RMSE: 263.04, R2: 0.9583
For Test Data
MAE: 164.78, RMSE: 264.29, R2: 0.9584

Training XGBoost...
⏱ Training time: 1.68 sec
For Train Data
MAE: 28.69, RMSE: 46.13, R2: 0.9987
For Test Data
MAE: 29.78, RMSE: 48.09, R2: 0.9986

Training LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1611
[LightGBM] [Info] Number of data points in the train set: 555063, number of used features: 



⏱ Training time: 1.00 sec
For Train Data
MAE: 125.03, RMSE: 200.26, R2: 0.9758
For Test Data
MAE: 125.63, RMSE: 201.62, R2: 0.9758

Training CatBoost...
⏱ Training time: 5.19 sec
For Train Data
MAE: 147.04, RMSE: 238.82, R2: 0.9656
For Test Data
MAE: 147.92, RMSE: 240.19, R2: 0.9656


In [102]:
# Show the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Year,Dist Name,Crop,Area (1000 ha),Yield (Kg per ha),temp_avg,temp_max,temp_min,humidity,rainfall
0,2001,agra,BARLEY,14.96,3042.84,31.53,36.87,26.3,61.56,10.62
1,2012,agra,BARLEY,8.6,3255.06,20.03,29.62,9.75,21.47,0.0
2,2003,agra,BARLEY,14.91,2903.17,17.95,28.42,10.82,36.26,0.0
3,2015,agra,BARLEY,6.76,2716.28,26.08,30.98,20.73,56.5,1.4
4,2008,agra,BARLEY,10.34,3052.19,31.2,35.15,27.94,66.64,5.19


In [103]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from xgboost import XGBRegressor


from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# Single source of truth for the artifact location (used by dump and load).
MODEL_PATH = "yield_prediction_model.pkl"

# Explicit feature lists: the pipeline selects columns by name, so inputs at
# prediction time must provide exactly these columns.
cat_features = ["Dist Name", "Crop"]
num_features = ["Year", "Area (1000 ha)",
                "temp_avg", "temp_max", "temp_min", "humidity", "rainfall"]

numeric_transformer = StandardScaler()
# Categories not seen during fit are encoded as -1 instead of raising.
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features)
    ]
)

# Preprocessing and model are bundled so the saved artifact accepts raw
# DataFrames and applies the same fitted transformations at inference time.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBRegressor(n_estimators=500, learning_rate=0.05,
                           max_depth=6, random_state=42))
])

pipeline.fit(X_train, y_train)

joblib.dump(pipeline, MODEL_PATH)

# Reload the artifact that was just written to confirm it round-trips.
# joblib.load unpickles arbitrary objects: only load files you trust.
pipeline = joblib.load(MODEL_PATH)
y_pred = pipeline.predict(X_test)

# These hyperparameters differ from the XGBoost configuration benchmarked
# earlier, so the persisted model needs its own held-out scores; previously
# y_pred was computed and then discarded.
mae_final = mean_absolute_error(y_test, y_pred)
rmse_final = root_mean_squared_error(y_test, y_pred)
r2_final = r2_score(y_test, y_pred)
print('Saved pipeline - Test Data')
print(f"MAE: {mae_final:.2f}, RMSE: {rmse_final:.2f}, R2: {r2_final:.4f}")


In [116]:
# One-row example input. The keys must match the feature columns the pipeline
# was fitted on (two categorical, seven numeric).
# "Production (1000 tons)" was removed: it is not a column of the training
# frame, so the pipeline silently ignored it, and listing it wrongly suggested
# the model consumes a quantity derived from the yield itself.
user_input = {
    "Dist Name": ["saharanpur"],
    "Crop": ["BARLEY"],
    "Year": [2023],
    "Area (1000 ha)": [14],
    "temp_avg": [28],
    "temp_max": [31],
    "temp_min": [25],
    "humidity": [60],
    "rainfall": [97],
}

input_df = pd.DataFrame(user_input)
predicted_yield = pipeline.predict(input_df)
print(f"Predicted Yield (Kg per ha): {predicted_yield[0]:.2f}")


Predicted Yield (Kg per ha): 2531.45
