In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


In [None]:
# Read the mock dataset CSV into the working DataFrame.
csv_path = "/content/odisha_mock_data_final.csv"
df = pd.read_csv(csv_path)

In [None]:
# Derive a numeric 'soil_ph' from the textual 'ph_range' column by averaging the
# two bounds found in each value, then remove the text column.
if 'ph_range' in df.columns:
    # Each bound may be an integer or a decimal ("6-7.5", "5.5 - 6.5", "6 to 7").
    # The previous pattern required a decimal point in BOTH bounds, so any range
    # with an integer bound silently became NaN. astype(str) keeps the .str
    # accessor usable even if the column was read as non-string (e.g. all NaN);
    # values that do not contain two numbers still yield NaN.
    ph_bounds = (
        df['ph_range']
        .astype(str)
        .str.extract(r'(\d+(?:\.\d+)?)\D+(\d+(?:\.\d+)?)')
        .astype(float)
    )
    df['soil_ph'] = ph_bounds.mean(axis=1)
    df = df.drop(columns=['ph_range'])

In [None]:
# Replace each monsoon date column that is present with a '<name>_doy' column
# holding the day of the year; values that fail to parse are coerced to NaT.
for date_col in ('typical_monsoon_onset', 'typical_monsoon_end'):
    if date_col not in df.columns:
        continue
    parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col + '_doy'] = parsed_dates.dt.dayofyear
    df = df.drop(columns=[date_col])

In [None]:
# Impute missing values: numeric columns receive their column mean, and
# whatever is still missing afterwards is set to 0.
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means).fillna(0)

In [None]:
# One-hot encode each categorical column that is present, dropping the first
# level of every encoded column.
cat_cols = ['soil_type', 'district', 'season', 'crop']
for cat_col in cat_cols:
    if cat_col not in df.columns:
        continue
    df = pd.get_dummies(df, columns=[cat_col], drop_first=True)



In [None]:
# Separate the regression target from the feature matrix.
target = 'estimated_yield_kg_per_ha'
y = df[target]
X = df.drop(columns=[target])

# Feature column names, in matrix order.
features_col = list(X.columns)

In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split performed in the next cell.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# -------- Initial XGBoost to get feature importance --------
xgb_base = XGBRegressor(tree_method='hist', random_state=42)
xgb_base.fit(X_train, y_train)

feature_names = X.columns
importance = xgb_base.feature_importances_
print(f"Feature Importance:{feature_names}")
print(f"Importance:{importance}")

# Pair each feature with its importance and rank from highest to lowest.
fi_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importance})
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Collect the features whose baseline importance falls below the cut-off.
is_low_importance = fi_df['importance'] < 0.0000003
low_importance_features = fi_df.loc[is_low_importance, 'feature'].tolist()
print(f"Dropping {len(low_importance_features)} low importance features.")

In [None]:
# Remove the low-importance features (and the target) from the feature matrix.
X_reduced = df.drop(columns=low_importance_features + [target])

# Fit the scaler on the TRAINING rows only, so that the mean/variance of the
# held-out test rows do not leak into preprocessing. The row partition produced
# by train_test_split depends only on the number of rows, test_size and
# random_state, so these indices match the split applied to the scaled frame
# in the next cell (test_size=0.2, random_state=42).
train_rows, _ = train_test_split(
    np.arange(len(X_reduced)), test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(X_reduced.iloc[train_rows])

# Apply the train-fitted transform to every row.
X_reduced_scaled = scaler.transform(X_reduced)
X_reduced_scaled_df = pd.DataFrame(X_reduced_scaled, columns=X_reduced.columns)

In [None]:
# Split the reduced, scaled features 80/20 with a fixed seed.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reduced_scaled_df,
    y,
    test_size=0.2,
    random_state=42,
)

# Column order of the reduced training matrix.
features_col_r = list(X_train_r.columns)

# Candidate hyper-parameter values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[4, 6, 8],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

In [None]:
# Base estimator; the values that also appear in param_grid are overridden by
# the grid search for each candidate.
xgb_base_params = dict(
    random_state=42,
    tree_method='hist',
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb = XGBRegressor(**xgb_base_params)

# 3-fold cross-validated search scored by (negated) mean absolute error.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_r, y_train_r)

print(f"Best Params: {grid_search.best_params_}")
best_xgb = grid_search.best_estimator_

In [None]:
# Score the tuned XGBoost model on the held-out rows.
y_pred_xgb = best_xgb.predict(X_test_r)

rmse = np.sqrt(mean_squared_error(y_test_r, y_pred_xgb))
mae = mean_absolute_error(y_test_r, y_pred_xgb)

# Absolute percentage error per row, then its mean (MAPE).
relative_errors_xgb = (y_test_r - y_pred_xgb) / y_test_r
percentage_errors = np.abs(relative_errors_xgb) * 100
mape = np.mean(percentage_errors)

print(f"XGBoost Performance:\nMAE: {mae:.2f}\nRMSE: {rmse:.2f}\nMAPE: {mape:.2f}%")

In [None]:
# Explain the tuned XGBoost model's test-set predictions with SHAP.
explainer = shap.Explainer(best_xgb)
shap_values = explainer(X_test_r)

# Test features as a DataFrame labelled with the reduced feature names.
shap_columns = X_reduced.columns
X_test_df = pd.DataFrame(X_test_r, columns=shap_columns)
shap.summary_plot(shap_values, features=X_test_df, feature_names=shap_columns)

In [None]:
# For LSTM, reshape input to 3D (samples, timesteps=1, features)

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# dropout are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Reuse the existing train/test partition rather than re-splitting. The previous
# version overwrote X_train_r / X_test_r with ndarrays, which changed what the
# earlier XGBoost/SHAP cells saw if they were re-run afterwards. np.asarray
# accepts either a DataFrame or an ndarray, so this cell is safe to re-run.
X_train_arr = np.asarray(X_train_r)
X_test_arr = np.asarray(X_test_r)
n_features = X_train_arr.shape[1]

X_train_lstm = X_train_arr.reshape((X_train_arr.shape[0], 1, n_features))
X_test_lstm = X_test_arr.reshape((X_test_arr.shape[0], 1, n_features))

# Declare the input shape with an explicit Input layer; passing `input_shape`
# to the first layer triggers a warning in Keras 3.
lstm_model = Sequential([
    tf.keras.Input(shape=(1, n_features)),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1),
])
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_absolute_error')

# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)


In [None]:
# Train the LSTM, holding out 20% of the training rows for validation and
# letting the early-stopping callback end training.
fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)
history = lstm_model.fit(X_train_lstm, y_train_r, **fit_options)

In [None]:
# Score the LSTM on the held-out rows.
y_pred_lstm = lstm_model.predict(X_test_lstm).flatten()

rmse_lstm = np.sqrt(mean_squared_error(y_test_r, y_pred_lstm))
mae_lstm = mean_absolute_error(y_test_r, y_pred_lstm)

# Absolute percentage error per row, then its mean (MAPE).
relative_errors_lstm = (y_test_r - y_pred_lstm) / y_test_r
percentage_errors = np.abs(relative_errors_lstm) * 100
mape = np.mean(percentage_errors)

print(f"LSTM Performance:\nMAE: {mae_lstm:.2f}\nRMSE: {rmse_lstm:.2f}\nMAPE: {mape:.2f}%")

In [None]:
# Plot training and validation loss per epoch.
fig, ax = plt.subplots(figsize=(10, 5))
for history_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('LSTM Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('MAE Loss')
ax.legend()
plt.show()

In [None]:
# Build a single raw (unscaled) mock feature row. Every feature not listed
# below stays at 0 — adapt as needed for a meaningful scenario.
mock_input = np.zeros((1, X_reduced.shape[1]))

# Example values for a few well-known features. soil_ph previously received the
# NDVI placeholder 0.5, which is not a plausible soil pH; it now gets 6.5.
mock_values = {
    'year': 2025,
    'ndvi.early': 0.5,
    'ndvi.mid': 0.5,
    'ndvi.late': 0.5,
    'soil_ph': 6.5,
}
for feature, value in mock_values.items():
    if feature in X_reduced.columns:
        mock_input[0, X_reduced.columns.get_loc(feature)] = value

# The scaler was fitted on a DataFrame, so hand it a frame with the same column
# names; a bare ndarray triggers sklearn's "X does not have valid feature
# names" warning.
mock_input_scaled = scaler.transform(
    pd.DataFrame(mock_input, columns=X_reduced.columns)
)

In [None]:
# Predict the mock row with the tuned XGBoost model.
mock_pred_xgb = best_xgb.predict(mock_input_scaled)
print(f"XGBoost predicted yield (kg/ha): {mock_pred_xgb[0]:.2f}")

# The LSTM expects (samples, timesteps=1, features).
n_mock_features = mock_input_scaled.shape[1]
mock_input_lstm = mock_input_scaled.reshape((1, 1, n_mock_features))
mock_pred_lstm = lstm_model.predict(mock_input_lstm).flatten()[0]
print(f"LSTM predicted yield (kg/ha): {mock_pred_lstm:.2f}")

In [None]:
import joblib

In [None]:
# Persist the trained artifacts to the working directory with joblib.
joblib.dump(best_xgb, 'xgb_model.joblib')
# NOTE(review): this pickles the Keras model through joblib. Keras documents
# `model.save(...)` as its native save path — confirm that this file reloads
# correctly in the environment that will serve predictions.
joblib.dump(lstm_model, 'lstm_model.joblib')
# The scaler object fitted on the reduced feature set.
joblib.dump(scaler, 'scaler.joblib')
# Column order of the reduced training matrix (taken from X_train_r.columns).
joblib.dump(features_col_r, 'feature_columns_r.joblib')

In [None]:
from google.colab import files
# Download each saved artifact through the Colab file helper.
for artifact_name in (
    "xgb_model.joblib",
    "lstm_model.joblib",
    "scaler.joblib",
    "feature_columns_r.joblib",
):
    files.download(artifact_name)

In [None]:
DISTRICT_DEFAULTS = {
    "angul": {"annual_avg_rainfall_mm": 1450, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C": 26,
              "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 77,
              "ndvi.early": 0.51, "ndvi.mid": 0.65, "ndvi.late": 0.60,
              "organic_carbon": 0.75, "nitrogen_kg_per_ha": 65, "phosphorus_kg_per_ha": 32,
              "potassium_kg_per_ha": 42},
    "balangir": {"annual_avg_rainfall_mm": 1300, "monsoon_rainfall_mm": 1050, "avg_annual_temp_C": 27,
                 "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 72,
                 "ndvi.early": 0.48, "ndvi.mid": 0.62, "ndvi.late": 0.58,
                 "organic_carbon": 0.70, "nitrogen_kg_per_ha": 60, "phosphorus_kg_per_ha": 30,
                 "potassium_kg_per_ha": 40},
    "balasore": {"annual_avg_rainfall_mm": 1500, "monsoon_rainfall_mm": 1200, "avg_annual_temp_C": 27,
                 "avg_max_temp_C": 35, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 78,
                 "ndvi.early": 0.50, "ndvi.mid": 0.64, "ndvi.late": 0.61,
                 "organic_carbon": 0.72, "nitrogen_kg_per_ha": 62, "phosphorus_kg_per_ha": 31,
                 "potassium_kg_per_ha": 43},
    "bargarh": {"annual_avg_rainfall_mm": 1250, "monsoon_rainfall_mm": 1000, "avg_annual_temp_C": 26,
                "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 70,
                "ndvi.early": 0.47, "ndvi.mid": 0.61, "ndvi.late": 0.57,
                "organic_carbon": 0.68, "nitrogen_kg_per_ha": 58, "phosphorus_kg_per_ha": 29,
                "potassium_kg_per_ha": 39},
    "bhadrak": {"annual_avg_rainfall_mm": 1400, "monsoon_rainfall_mm": 1100, "avg_annual_temp_C": 27,
                "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 74,
                "ndvi.early": 0.49, "ndvi.mid": 0.63, "ndvi.late": 0.59,
                "organic_carbon": 0.71, "nitrogen_kg_per_ha": 61, "phosphorus_kg_per_ha": 30,
                "potassium_kg_per_ha": 41},
    "boudh": {"annual_avg_rainfall_mm": 1200, "monsoon_rainfall_mm": 950, "avg_annual_temp_C": 25,
              "avg_max_temp_C": 32, "avg_min_temp_C": 20, "avg_relative_humidity_pct": 68,
              "ndvi.early": 0.46, "ndvi.mid": 0.60, "ndvi.late": 0.55,
              "organic_carbon": 0.67, "nitrogen_kg_per_ha": 57, "phosphorus_kg_per_ha": 28,
              "potassium_kg_per_ha": 38},
    "cuttack": {"annual_avg_rainfall_mm": 1450, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C": 27,
                "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 76,
                "ndvi.early": 0.51, "ndvi.mid": 0.65, "ndvi.late": 0.61,
                "organic_carbon": 0.74, "nitrogen_kg_per_ha": 64, "phosphorus_kg_per_ha": 32,
                "potassium_kg_per_ha": 42},
    "deogarh": {"annual_avg_rainfall_mm": 1300, "monsoon_rainfall_mm": 1050, "avg_annual_temp_C": 26,
                "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 71,
                "ndvi.early": 0.48, "ndvi.mid": 0.62, "ndvi.late": 0.58,
                "organic_carbon": 0.70, "nitrogen_kg_per_ha": 60, "phosphorus_kg_per_ha": 30,
                "potassium_kg_per_ha": 40},
    "dhenkanal": {"annual_avg_rainfall_mm": 1350, "monsoon_rainfall_mm": 1100, "avg_annual_temp_C": 26,
                  "avg_max_temp_C": 34, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 73,
                  "ndvi.early": 0.49, "ndvi.mid": 0.63, "ndvi.late": 0.59,
                  "organic_carbon": 0.71, "nitrogen_kg_per_ha": 61, "phosphorus_kg_per_ha": 31,
                  "potassium_kg_per_ha": 41},
    "gajapati": {"annual_avg_rainfall_mm": 1250, "monsoon_rainfall_mm": 1000, "avg_annual_temp_C": 25,
                 "avg_max_temp_C": 32, "avg_min_temp_C": 20, "avg_relative_humidity_pct": 69,
                 "ndvi.early": 0.46, "ndvi.mid": 0.60, "ndvi.late": 0.56,
                 "organic_carbon": 0.68, "nitrogen_kg_per_ha": 58, "phosphorus_kg_per_ha": 29,
                 "potassium_kg_per_ha": 39},
    "ganjam": {"annual_avg_rainfall_mm": 1400, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C": 27,
               "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 75,
               "ndvi.early": 0.50, "ndvi.mid": 0.64, "ndvi.late": 0.61,
               "organic_carbon": 0.73, "nitrogen_kg_per_ha": 63, "phosphorus_kg_per_ha": 32,
               "potassium_kg_per_ha": 42},
    "jagatsinghpur": {"annual_avg_rainfall_mm": 1450, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C": 27,
                      "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 76,
                      "ndvi.early": 0.51, "ndvi.mid": 0.65, "ndvi.late": 0.62,
                      "organic_carbon": 0.74, "nitrogen_kg_per_ha": 64, "phosphorus_kg_per_ha": 32,
                      "potassium_kg_per_ha": 42},
    "jajpur": {"annual_avg_rainfall_mm": 1400, "monsoon_rainfall_mm": 1100, "avg_annual_temp_C": 27,
               "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 74,
               "ndvi.early": 0.50, "ndvi.mid": 0.64, "ndvi.late": 0.60,
               "organic_carbon": 0.72, "nitrogen_kg_per_ha": 62, "phosphorus_kg_per_ha": 31,
               "potassium_kg_per_ha": 41},
    "kalahandi": {"annual_avg_rainfall_mm": 1200, "monsoon_rainfall_mm": 950, "avg_annual_temp_C": 25,
                  "avg_max_temp_C": 32, "avg_min_temp_C": 20, "avg_relative_humidity_pct": 68,
                  "ndvi.early": 0.46, "ndvi.mid": 0.60, "ndvi.late": 0.55,
                  "organic_carbon": 0.67, "nitrogen_kg_per_ha": 57, "phosphorus_kg_per_ha": 28,
                  "potassium_kg_per_ha": 38},
    "kandhamal": {"annual_avg_rainfall_mm": 1350, "monsoon_rainfall_mm": 1100, "avg_annual_temp_C": 26,
                  "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 72,
                  "ndvi.early": 0.48, "ndvi.mid": 0.62, "ndvi.late": 0.59,
                  "organic_carbon": 0.70, "nitrogen_kg_per_ha": 60, "phosphorus_kg_per_ha": 30,
                  "potassium_kg_per_ha": 40},
    "kendrapara": {"annual_avg_rainfall_mm": 1450, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C": 27,
                   "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 76,
                   "ndvi.early": 0.51, "ndvi.mid": 0.65, "ndvi.late": 0.62,
                   "organic_carbon": 0.74, "nitrogen_kg_per_ha": 64, "phosphorus_kg_per_ha": 32,
                   "potassium_kg_per_ha": 42},
    "kendujhar": {"annual_avg_rainfall_mm": 1300, "monsoon_rainfall_mm": 1050, "avg_annual_temp_C": 26,
                  "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 71,
                  "ndvi.early": 0.48, "ndvi.mid": 0.62, "ndvi.late": 0.58,
                  "organic_carbon": 0.70, "nitrogen_kg_per_ha": 60, "phosphorus_kg_per_ha": 30,
                  "potassium_kg_per_ha": 40},
    "khordha": {"annual_avg_rainfall_mm": 1400, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C": 27,
                "avg_max_temp_C": 34, "avg_min_temp_C": 22, "avg_relative_humidity_pct": 75,
                "ndvi.early": 0.50, "ndvi.mid": 0.64, "ndvi.late": 0.61,
                "organic_carbon": 0.73, "nitrogen_kg_per_ha": 63, "phosphorus_kg_per_ha": 32,
                "potassium_kg_per_ha": 42},
    "koraput": {"annual_avg_rainfall_mm": 1250, "monsoon_rainfall_mm": 1000, "avg_annual_temp_C": 25,
                "avg_max_temp_C": 32, "avg_min_temp_C": 20, "avg_relative_humidity_pct": 69,
                "ndvi.early": 0.46, "ndvi.mid": 0.60, "ndvi.late": 0.56,
                "organic_carbon": 0.68, "nitrogen_kg_per_ha": 58, "phosphorus_kg_per_ha": 29,
                "potassium_kg_per_ha": 39},
    "malkangiri": {"annual_avg_rainfall_mm": 1200, "monsoon_rainfall_mm": 950, "avg_annual_temp_C": 25,
                   "avg_max_temp_C": 32, "avg_min_temp_C": 20, "avg_relative_humidity_pct": 68,
                   "ndvi.early": 0.45, "ndvi.mid": 0.59, "ndvi.late": 0.55,
                   "organic_carbon": 0.66, "nitrogen_kg_per_ha": 56, "phosphorus_kg_per_ha": 28,
                   "potassium_kg_per_ha": 38},
    "mayurbhanj": {"annual_avg_rainfall_mm": 1300, "monsoon_rainfall_mm": 1050, "avg_annual_temp_C": 26,
                   "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 72,
                   "ndvi.early": 0.48, "ndvi.mid": 0.62, "ndvi.late": 0.58,
                   "organic_carbon": 0.70, "nitrogen_kg_per_ha": 60, "phosphorus_kg_per_ha": 30,
                   "potassium_kg_per_ha": 40},
    "nayagarh": {"annual_avg_rainfall_mm": 1350, "monsoon_rainfall_mm": 1100, "avg_annual_temp_C": 26,
                 "avg_max_temp_C": 33, "avg_min_temp_C": 21, "avg_relative_humidity_pct": 73,
                 "ndvi.early": 0.49, "ndvi.mid": 0.63, "ndvi.late": 0.59,
                 "organic_carbon": 0.71, "nitrogen_kg_per_ha": 61, "phosphorus_kg_per_ha": 31,
                 "potassium_kg_per_ha": 41},
    "nuapada": {"annual_avg_rainfall_mm": 1200, "monsoon_rainfall_mm": 950, "avg_annual_temp_C": 25,
                "avg_max_temp_C": 32, "avg_min_temp_C": 20, "avg_relative_humidity_pct": 68,
                "ndvi.early": 0.46, "ndvi.mid": 0.60, "ndvi.late": 0.55,
                "organic_carbon": 0.67, "nitrogen_kg_per_ha": 57, "phosphorus_kg_per_ha": 28,
                "potassium_kg_per_ha": 38},
    "puri": {"annual_avg_rainfall_mm": 1450, "monsoon_rainfall_mm": 1150, "avg_annual_temp_C":
