In [5]:
import pandas as pd
df1=pd.read_csv("data/yeild_dataset.csv")

In [6]:
df1

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production,annual_rainfall,yeild,Soil Type,Soil pH
0,ANI,NICOBARS,2000,Whole Year,Dry ginger,36.0,100.0,2763.2,3,Mountain,7.489453
1,ANI,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0,2763.2,0,Mountain,6.217064
2,ANI,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0,2763.2,2,Mountain,6.456269
3,ANI,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0,2763.2,1,Laterite,6.683922
4,ANI,NICOBARS,2000,Whole Year,Banana,176.0,641.0,2763.2,4,Loamy,6.497723
...,...,...,...,...,...,...,...,...,...,...,...
241221,WB,PURULIA,2014,Kharif,Other Kharif pulses,79.0,39.0,1792.0,0,Alluvial,6.756389
241222,WB,PURULIA,2014,Rabi,Wheat,1622.0,3663.0,1792.0,2,Alluvial,6.422356
241223,WB,PURULIA,2014,Kharif,Sannhamp,171.0,727.0,1792.0,4,Peaty,7.269616
241224,WB,PURULIA,2014,Kharif,Mesta,159.0,2065.0,1792.0,13,Laterite,7.138014


In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# 🚀 Load dataset (assuming df1 is already defined)
a = df1.drop(columns=['yeild', 'Production'], axis=1)  # Features
b = df1['yeild']  # Target variable

# **Step 1: Identify Numerical & Categorical Columns**
num_features = a.select_dtypes(exclude="object").columns
cat_features = a.select_dtypes(include="object").columns

# 🚀 **Step 2: Create Column Transformer for Preprocessing**
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),        
    ]
)

# 🚀 **Step 3: Split the Data (Avoid Data Leakage)**
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.2, random_state=42)

# 🚀 **Step 4: Apply Preprocessing (Fit Only on Training Data)**
a_train = preprocessor.fit_transform(a_train)
a_test = preprocessor.transform(a_test)

# 🚀 **Step 5: Train XGBoost with GPU acceleration**
xgb = XGBRegressor(
    random_state=42, 
    tree_method='gpu_hist', 
    n_estimators=300,           # Increased from 200 → 300
    learning_rate=0.05,         # Keeping it same
    max_depth=8,                # Reduced from 10 → 8
    subsample=0.8,              # Keeping it same
    colsample_bytree=0.8,       # Increased from 0.7 → 0.8
    reg_alpha=0.3,              # Reduced from 0.5 → 0.3
    reg_lambda=0.8              # Reduced from 1.0 → 0.8
)

# 🚀 Train the Model
xgb.fit(a_train, b_train)

# 🚀 Predictions & Evaluation
y_train_pred = xgb.predict(a_train)
y_test_pred = xgb.predict(a_test)

# 🚀 Training Metrics
train_mae = mean_absolute_error(b_train, y_train_pred)
train_mse = mean_squared_error(b_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
train_r2 = r2_score(b_train, y_train_pred)

# 🚀 Test Metrics
test_mae = mean_absolute_error(b_test, y_test_pred)
test_mse = mean_squared_error(b_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(b_test, y_test_pred)

# 🚀 Final Output
print("🚀 Model Performance on Training Data:")
print(f"Train R² Score: {train_r2}")
print(f"Train MAE: {train_mae}")
print(f"Train MSE: {train_mse}")
print(f"Train RMSE: {train_rmse}\n")

print("🚀 Model Performance on Test Data:")
print(f"Test R² Score: {test_r2}")
print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test RMSE: {test_rmse}")



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




🚀 Model Performance on Training Data:
Train R² Score: 0.9904527068138123
Train MAE: 6.197098255157471
Train MSE: 5683.95947265625
Train RMSE: 75.39203852301813

🚀 Model Performance on Test Data:
Test R² Score: 0.956855297088623
Test MAE: 10.287812232971191
Test MSE: 34735.63671875
Test RMSE: 186.37498952045578


In [11]:
pred_df=pd.DataFrame({'Actual Value':b_test,'Predicted Value':y_test_pred,'Difference':b_test-y_test_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
29464,2,1.489742,0.510258
53130,0,0.771618,-0.771618
14893,2,-0.671841,2.671841
147439,0,1.159634,-1.159634
196468,8,11.688482,-3.688482
...,...,...,...
179343,0,0.635174,-0.635174
186472,0,-1.841515,1.841515
150805,7,8.327303,-1.327303
131417,0,0.796402,-0.796402


In [12]:
num_features

Index(['Crop_Year', 'Area', 'annual_rainfall', 'Soil pH'], dtype='object')

In [13]:
cat_features

Index(['State_Name', 'District_Name', 'Season', 'Crop', 'Soil Type'], dtype='object')