In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [13]:
# 📌 Step 1: Imports
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import joblib

# 📌 Step 2: Load dataset
df = pd.read_csv("/content/drive/My Drive/Sih_cleaned.csv")

# Drop State_Name since only one state
df = df.drop(columns=["State_Name", "year"])  # Year removed as per your choice

# 📌 Step 3: Define features and target
target = "Yield_qha"
X = df.drop(columns=[target])
y = df[target]

# 📌 Step 4: One-hot encode categorical features
X = pd.get_dummies(X, columns=["District_Name", "Crop", "Season"], drop_first=True)

# 📌 Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 📌 Step 6: Train LightGBM model
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": -1
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=50)]
)

# 📌 Step 7: Predictions
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# 📌 Step 8: Evaluation
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"✅ RMSE: {rmse:.3f}")
print(f"✅ MAE: {mae:.3f}")
print(f"✅ R²: {r2:.3f}")

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[508]	training's rmse: 0.0791679	valid_1's rmse: 0.242524
✅ RMSE: 0.243
✅ MAE: 0.181
✅ R²: 0.763


In [15]:
# 📌 Step 9: Save model + feature schema
joblib.dump(model, "/content/drive/My Drive/yield_lightgbm.pkl")
joblib.dump(X_train.columns.tolist(), "/content/drive/My Drive/feature_names.pkl")
print("✅ Model and feature schema saved!")


✅ Model and feature schema saved!


In [19]:
import pandas as pd
import joblib

# 🔹 Load trained model + feature schema
model = joblib.load("/content/drive/My Drive/yield_lightgbm.pkl")
feature_names = joblib.load("/content/drive/My Drive/feature_names.pkl")

def build_feature_vector(district, crop, season, soil, weather):
    # Start with all zeros
    input_data = pd.DataFrame([[0]*len(feature_names)], columns=feature_names)

    # One-hot categorical features
    for col in [f"District_{district}", f"Crop_{crop}", f"Season_{season}"]:
        if col in input_data.columns:
            input_data[col] = 1

    # Soil features
    if "soil_ph" in input_data.columns:
        input_data["soil_ph"] = soil["pH"]
    if "soil_nitrogen" in input_data.columns:
        input_data["soil_nitrogen"] = soil["N"]
    if "soil_phosphorus" in input_data.columns:
        input_data["soil_phosphorus"] = soil["P"]
    if "soil_potassium" in input_data.columns:
        input_data["soil_potassium"] = soil["K"]
    if "soil_organic_carbon" in input_data.columns:
        input_data["soil_organic_carbon"] = soil.get("OC", 0)

    # Weather features
    if "rainfall_total" in input_data.columns:
        input_data["rainfall_total"] = weather["rainfall"]
    if "avg_temp" in input_data.columns:
        input_data["avg_temp"] = weather["temperature"]

    return input_data

# 🔹 Example farmer input
soil_data = {"pH": 6.4, "N": 40, "P": 28, "K": 220, "OC": 0.75}
weather_data = {"rainfall": 640, "temperature": 27}

fv = build_feature_vector("Nalgonda", "Rice", "Kharif", soil_data, weather_data)

# 🔹 Predict
prediction = model.predict(fv)[0]

print("🌾 Predicted Yield (quintal/ha):", round(prediction, 2))


🌾 Predicted Yield (quintal/ha): 2.2
