In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder,StandardScaler,OneHotEncoder
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from xgboost import XGBRegressor

In [40]:
# Load the historical crop-yield table from the working directory and print
# its schema (column names, non-null counts, dtypes) as a quick sanity check.
data = pd.read_csv(filepath_or_buffer=r"Custom_Crops_yield_Historical_Dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50765 entries, 0 to 50764
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Dist Code                  50765 non-null  int64  
 1   Year                       50765 non-null  int64  
 2   State Code                 50765 non-null  int64  
 3   State Name                 50765 non-null  object 
 4   Dist Name                  50765 non-null  object 
 5   Crop                       50765 non-null  object 
 6   Area_ha                    50765 non-null  float64
 7   Yield_kg_per_ha            50765 non-null  float64
 8   N_req_kg_per_ha            50765 non-null  float64
 9   P_req_kg_per_ha            50765 non-null  float64
 10  K_req_kg_per_ha            50765 non-null  float64
 11  Total_N_kg                 50765 non-null  float64
 12  Total_P_kg                 50765 non-null  float64
 13  Total_K_kg                 50765 non-null  flo

In [41]:
# Work on a copy of the raw table without the numeric state/district codes
# and the three Total_* nutrient columns; `data` itself is left untouched.
dataset = data.drop(
    columns=["State Code", "Dist Code", "Total_N_kg", "Total_P_kg", "Total_K_kg"]
)

In [42]:
# Normalise the column labels: trim surrounding whitespace, lower-case, and
# replace spaces with underscores (e.g. "State Name" -> "state_name").
dataset.columns = [
    label.strip().lower().replace(" ", "_") for label in dataset.columns
]

In [43]:
# Separate the regression target (yield per hectare) from the feature columns.
y = dataset["yield_kg_per_ha"]
x = dataset.drop(columns=["yield_kg_per_ha"])

In [44]:
# Columns that get one-hot encoded; every remaining feature column is
# treated as numeric and standardised.
cat_cols = ["state_name", "dist_name", "crop"]
# The loop variable is deliberately not called `x`, so it cannot be confused
# with the feature frame `x` it iterates over.
# NOTE(review): n_req/p_req/k_req_kg_per_ha remain among the numeric features.
# If those requirement columns are derived from the yield they leak the
# target, which would explain the ~0.9997 R2 of plain linear regression in
# the results table — confirm with the data source.
num_cols = [col for col in x.columns if col not in cat_cols]

In [45]:
encoder = OneHotEncoder(handle_unknown="ignore",sparse_output=False)
encoded_cat = pd.DataFrame(
    encoder.fit_transform(x[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=x.index
)

In [46]:
scaler = StandardScaler()
scaled_num = pd.DataFrame(
    scaler.fit_transform(x[num_cols]),
    columns=num_cols,
    index=x.index
)

In [47]:
x_processed = pd.concat([encoded_cat,scaled_num],axis=1)

In [48]:
x_train,x_test,y_train,y_test= train_test_split(x_processed,y,test_size=0.2,random_state=47)

In [53]:
# Candidate regressors, keyed by the label shown in the results table.
# Insertion order is the evaluation order; every seeded estimator uses 47.
models = dict(
    [
        ("Linear Regression", LinearRegression()),
        (
            "Random Forest",
            RandomForestRegressor(n_estimators=200, random_state=47),
        ),
        (
            "XGBoost",
            XGBRegressor(
                n_estimators=500,
                learning_rate=0.05,
                max_depth=8,
                random_state=47,
            ),
        ),
        (
            "Bagging (Linear Regression)",
            BaggingRegressor(
                estimator=LinearRegression(),
                n_estimators=100,
                random_state=47,
            ),
        ),
        (
            "Bagging (Decision Tree)",
            BaggingRegressor(
                estimator=DecisionTreeRegressor(max_depth=None, random_state=47),
                n_estimators=100,
                random_state=47,
                verbose=0,
            ),
        ),
    ]
)

In [None]:
# Evaluate every candidate twice: once on the held-out test split and once
# with 5-fold cross-validation on the training split. One row of metrics is
# collected per model label.
results = {}
for name, model in models.items():
    # Hold-out evaluation: fit on the training split, predict the test split.
    y_pred = model.fit(x_train, y_train).predict(x_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # report the error in the target's own units

    # cross_val_score fits fresh clones of the estimator on each fold.
    cv_scores = cross_val_score(model, x_train, y_train, cv=5, scoring="r2")

    results[name] = {
        "R2": round(r2, 4),
        "RMSE": round(rmse, 2),
        "CV_R2_Mean": round(cv_scores.mean(), 4),
        "CV_R2_Std": round(cv_scores.std(), 4),
    }

# Models as rows, metrics as columns.
results_df = pd.DataFrame(results).transpose()
print(results_df)

                                 R2    RMSE  CV_R2_Mean  CV_R2_Std
Linear Regression            0.9997   16.83      0.9996     0.0002
Random Forest                0.9684  175.65      0.9571     0.0847
XGBoost                      0.9358  250.37      0.9521     0.0844
Bagging (Linear Regression)  0.9997   17.23      0.9996     0.0002
Bagging (Decision Tree)      0.9698  171.65      0.9568     0.0849
