In [None]:
import pandas as pd
import numpy as np
import time, json, joblib, warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

try:
    import lightgbm as lgb
except ImportError:
    raise ImportError("Please install lightgbm: pip install lightgbm")

# Notebook banner: the title framed above and below by a 70-character rule.
print("=" * 70, "⚡ LIGHTGBM REGRESSOR — FINAL, FAST, AND STABLE", "=" * 70, sep="\n")


⚡ LIGHTGBM REGRESSOR — FINAL, FAST, AND STABLE


In [None]:
# 1. Read the CSV. If the default parser rejects the file, retry with the
#    Python engine and skip the malformed rows instead of failing.
try:
    df = pd.read_csv("crop_yield.csv")
except pd.errors.ParserError:
    df = pd.read_csv("crop_yield.csv", on_bad_lines='skip', engine='python')
print(f"Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")

# 2. Pick the target column: the first well-known name present in the frame,
#    otherwise fall back to the last numeric column.
possible_targets = ["Yield", "Yield_tons_per_hectare", "yield", "Crop_Yield", "Production"]
target_col = None
for candidate in possible_targets:
    if candidate in df.columns:
        target_col = candidate
        break

if target_col is not None:
    print(f"Target detected: {target_col}")
else:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        raise ValueError("No numeric target column found! Please specify target column.")
    target_col = numeric_cols[-1]
    print(f"No standard target found — using '{target_col}'")

Dataset loaded: 303,102 rows × 10 columns
Target detected: Yield_tons_per_hectare


In [None]:
# 3. Label-encode every object/bool feature column (the target is never encoded).
#    Each fitted encoder is kept in `encoders`, keyed by the ORIGINAL column name.
categoricals = [
    col
    for col in df.select_dtypes(include=["object", "bool"]).columns
    if col != target_col
]
encoders = {}
for col in categoricals:
    encoders[col] = LabelEncoder()
    # Values are cast to str first, so booleans and missing values are encoded as text labels.
    df[f"{col}_encoded"] = encoders[col].fit_transform(df[col].astype(str))
# Only the "<name>_encoded" columns remain; the raw categorical columns are removed.
df = df.drop(columns=categoricals)
print(f"Categorical columns encoded: {len(encoders)}")

Categorical columns encoded: 6


In [None]:
# 4. Separate the target from the feature matrix
y = df[target_col]
X = df.drop(columns=[target_col])
print(f"Prepared {X.shape[1]} features with target range [{y.min():.2f}, {y.max():.2f}]")

# 5. Hold out 20% of the rows for testing (shuffled, fixed seed)
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Train: {len(X_train):,} | Test: {len(X_test):,}")

Prepared 9 features with target range [-0.84, 9.73]
Train: 242,481 | Test: 60,621


In [None]:
# 6. Detect device: GPU if available else CPU.
# Set use_gpu = False to force CPU mode without probing.
use_gpu = True
device = "cpu"
if use_gpu:
    try:
        # Probe the GPU backend by fitting a tiny model on the first 50 training rows.
        gpu_probe = lgb.LGBMRegressor(device_type="gpu", n_estimators=5, verbosity=-1)
        gpu_probe.fit(X_train.iloc[:50], y_train.iloc[:50])
    except Exception as exc:
        # Catch Exception rather than using a bare except, so Ctrl-C / kernel
        # interrupts still propagate and the failure reason is visible.
        print(f"GPU probe failed ({type(exc).__name__}: {exc})")
    else:
        device = "gpu"

if device == "gpu":
    print("GPU detected and enabled")
else:
    print("Using CPU mode")

GPU detected and enabled


In [None]:
# 7. Hyperparameter tuning: small randomized search over a reduced grid.
model = lgb.LGBMRegressor(random_state=42, device=device)

# Reduced parameter grid for faster tuning
param_grid = {
    "num_leaves": [31, 63],              # fewer options
    "learning_rate": [0.05, 0.1],        # narrowed range
    "n_estimators": [100, 200],          # fewer trees
    "max_depth": [7, 10],                # smaller depths
    "feature_fraction": [0.8, 1.0],
    "bagging_fraction": [0.8, 1.0],
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without this entry the bagging_fraction choices above have no effect.
    "bagging_freq": [1],
    "lambda_l1": [0],
    "lambda_l2": [0, 1]
}

search = RandomizedSearchCV(
    model, param_distributions=param_grid,
    n_iter=8,                            # fewer iterations
    cv=2,                                # fewer CV folds
    scoring="r2",
    random_state=42,
    n_jobs=-1,
    verbose=0
)

# No early stopping during the search: an eval_set built from (X_train, y_train)
# contains the rows each CV split holds out, so stopping on it would leak the
# validation fold into every fit. The number of trees is tuned through
# "n_estimators" in the grid instead.
t0 = time.time()
search.fit(X_train, y_train)
tune_time = time.time() - t0

print(f"✅ Tuning completed in {tune_time:.1f} seconds | Best R²: {search.best_score_:.4f}")
print(f"Best params: {search.best_params_}")

✅ Tuning completed in 86.5 seconds | Best R²: 0.9126
Best params: {'num_leaves': 31, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.05, 'lambda_l2': 0, 'lambda_l1': 0, 'feature_fraction': 1.0, 'bagging_fraction': 1.0}


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Re-load dataset for pipeline construction (to get original, un-encoded columns).
# Use the same bad-line fallback as the first load of this file, so both frames
# are parsed under the same rules and this cell cannot crash on a file the
# earlier cell accepted.
try:
    df_pipeline = pd.read_csv("crop_yield.csv")
except pd.errors.ParserError:
    df_pipeline = pd.read_csv("crop_yield.csv", on_bad_lines='skip', engine='python')

# Ensure target column is identified correctly in this fresh df:
# first well-known name present, otherwise the last numeric column.
possible_targets = ["Yield", "Yield_tons_per_hectare", "yield", "Crop_Yield", "Production"]
target_col_pipeline = next((c for c in possible_targets if c in df_pipeline.columns), None)
if target_col_pipeline is None:
    numeric_cols_pipeline = df_pipeline.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols_pipeline:
        target_col_pipeline = numeric_cols_pipeline[-1]
    else:
        raise ValueError("No numeric target column found! Please specify target column.")
else:
    print(f"Target detected for pipeline: {target_col_pipeline}")

# Define features for the pipeline based on the original dataframe
all_features_pipeline = [col for col in df_pipeline.columns if col != target_col_pipeline]
numeric_features_pipeline = df_pipeline[all_features_pipeline].select_dtypes(include=np.number).columns.tolist()
categorical_features_pipeline = df_pipeline[all_features_pipeline].select_dtypes(include=['object', 'bool']).columns.tolist()

X_pipeline = df_pipeline.drop(columns=[target_col_pipeline])
y_pipeline = df_pipeline[target_col_pipeline]

# Split data for pipeline (using original features); same test_size and seed
# as the split used for tuning.
X_train_pipeline, X_test_pipeline, y_train_pipeline, y_test_pipeline = train_test_split(
    X_pipeline, y_pipeline, test_size=0.2, random_state=42, shuffle=True
)

print(f"Pipeline features (numeric): {numeric_features_pipeline}")
print(f"Pipeline features (categorical): {categorical_features_pipeline}")
print(f"Train set for pipeline: {len(X_train_pipeline):,} | Test set for pipeline: {len(X_test_pipeline):,}")

Target detected for pipeline: Yield_tons_per_hectare
Pipeline features (numeric): ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']
Pipeline features (categorical): ['Region', 'Soil_Type', 'Crop', 'Fertilizer_Used', 'Irrigation_Used', 'Weather_Condition']
Train set for pipeline: 269,420 | Test set for pipeline: 67,356


Now, we will define the `ColumnTransformer` for preprocessing and integrate it with the LightGBM Regressor into a `Pipeline`. We'll use the best parameters found during the previous `RandomizedSearchCV`.

In [None]:
# Create the preprocessing pipeline using ColumnTransformer:
# numeric columns are standardized, categorical columns are one-hot encoded.
# handle_unknown='ignore' keeps predict() from raising on categories that were
# not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_pipeline),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_pipeline)
    ],
    remainder='passthrough' # Keep other columns if any, or 'drop' if you want to explicitly exclude them
)

# Get best parameters from previous RandomizedSearchCV
best_lgbm_params = search.best_params_

# Create the full pipeline with preprocessing and LightGBM model.
# random_state=42 matches the seed of the model that was tuned, so the refit is
# seeded the same way as the search that selected these parameters. It is
# merged as a default, so a random_state inside best_lgbm_params would still win.
pipeline_lgbm = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(**{"random_state": 42, **best_lgbm_params}))
])

print("Fitting the new LightGBM pipeline with ColumnTransformer...")
# Fit the entire pipeline on the original (unscaled, unencoded) training data.
# No eval_set/early stopping here: Pipeline.fit only forwards step-prefixed
# parameters (e.g. regressor__eval_set), and an eval_set passed that way is
# not run through the preprocessor, so it would have to be transformed by hand.
pipeline_lgbm.fit(X_train_pipeline, y_train_pipeline)
print("Pipeline fitting complete.")

# Evaluate the new pipeline
y_pred_train_pipeline = pipeline_lgbm.predict(X_train_pipeline)
y_pred_pipeline = pipeline_lgbm.predict(X_test_pipeline)

train_r2_pipeline = r2_score(y_train_pipeline, y_pred_train_pipeline)
test_r2_pipeline = r2_score(y_test_pipeline, y_pred_pipeline)
mae_pipeline = mean_absolute_error(y_test_pipeline, y_pred_pipeline)
rmse_pipeline = np.sqrt(mean_squared_error(y_test_pipeline, y_pred_pipeline))
# NOTE(review): MAPE divides by the true value; the target range printed earlier
# in this notebook includes values at or below zero, so treat this figure with
# caution and prefer MAE/RMSE for comparisons.
mape_pipeline = mean_absolute_percentage_error(y_test_pipeline, y_pred_pipeline) * 100
gap_pipeline = train_r2_pipeline - test_r2_pipeline

print("\n======================================================================")
print("FINAL LIGHTGBM PIPELINE RESULTS")
print("======================================================================")
print(f"Train R² (Pipeline): {train_r2_pipeline:.4f}")
print(f"Test  R² (Pipeline): {test_r2_pipeline:.4f} ⭐")
print(f"MAE (Pipeline): {mae_pipeline:.4f}")
print(f"RMSE (Pipeline): {rmse_pipeline:.4f}")
print(f"MAPE (Pipeline): {mape_pipeline:.2f}%")
print(f"Overfit Gap (Pipeline): {gap_pipeline:.4f} → {'Excellent' if gap_pipeline < 0.03 else 'Good' if gap_pipeline < 0.06 else 'Moderate'}")

Fitting the new LightGBM pipeline with ColumnTransformer...
Pipeline fitting complete.

FINAL LIGHTGBM PIPELINE RESULTS
Train R² (Pipeline): 0.9140
Test  R² (Pipeline): 0.9123 ⭐
MAE (Pipeline): 0.4004
RMSE (Pipeline): 0.5023
MAPE (Pipeline): 13.03%
Overfit Gap (Pipeline): 0.0018 → Excellent


Finally, we save the complete `pipeline_lgbm` object to a `.pkl` file. This file will contain all preprocessing steps and the trained LightGBM model, ready for future predictions on raw, unengineered input data.

In [None]:
# Save the complete pipeline (preprocessing + model) to a .pkl file
joblib.dump(pipeline_lgbm, 'lightgbm_full_prediction_pipeline.pkl')
print("\nFull LightGBM prediction pipeline saved to 'lightgbm_full_prediction_pipeline.pkl'")

# Trigger a browser download only when running inside Google Colab.
# Outside Colab the google.colab package is not installed, so the import is
# guarded to keep "Run All" working; the file stays on local disk.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab — skipping browser download.")
else:
    files.download('lightgbm_full_prediction_pipeline.pkl')


Full LightGBM prediction pipeline saved to 'lightgbm_full_prediction_pipeline.pkl'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# 8. Final training with early stopping
final_model = search.best_estimator_

# Early stopping must not watch the test set: that would let the test data pick
# the number of trees and bias every test metric reported afterwards.
# Carve a validation slice out of the training data instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, shuffle=True
)

t1 = time.time()
final_model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(25, verbose=False)]
)
train_time = time.time() - t1
print(f"Training completed in {train_time:.2f}s")


Training completed in 3.06s


In [None]:
# 9. Evaluation
y_pred = final_model.predict(X_test)

# Filter out NaN values from y_test and corresponding y_pred.
# y_pred is a positional NumPy array, so the mask is converted to a plain
# boolean array rather than indexing it with an index-labelled Series.
non_nan_indices = y_test.notna()
test_mask = non_nan_indices.to_numpy()
y_test_filtered = y_test[test_mask]
y_pred_filtered = y_pred[test_mask]

# Apply the same NaN rule on the training side so both R² values are computed
# consistently.
train_mask = y_train.notna().to_numpy()
train_r2 = r2_score(y_train[train_mask], final_model.predict(X_train)[train_mask])
test_r2 = r2_score(y_test_filtered, y_pred_filtered)
mae = mean_absolute_error(y_test_filtered, y_pred_filtered)
rmse = np.sqrt(mean_squared_error(y_test_filtered, y_pred_filtered))
# NOTE(review): MAPE divides by the true value; the target range printed earlier
# in this notebook includes values at or below zero, so interpret it with care.
mape = mean_absolute_percentage_error(y_test_filtered, y_pred_filtered)*100
# Positive gap = the model scores better on the data it was trained on.
gap = train_r2 - test_r2

print("="*70)
print("FINAL LIGHTGBM RESULTS")
print("="*70)
print(f"Train R²: {train_r2:.4f}")
print(f"Test  R²: {test_r2:.4f} ⭐")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"Overfit Gap: {gap:.4f} → {'Excellent' if gap < 0.03 else 'Good' if gap < 0.06 else 'Moderate'}")

FINAL LIGHTGBM RESULTS
Train R²: 0.9138
Test  R²: 0.9125 ⭐
MAE: 0.4003
RMSE: 0.5023
MAPE: 13.18%
Overfit Gap: 0.0014 → Excellent


In [None]:
# Feature importance: pair each feature column with the fitted model's
# importance value, ranked from most to least important.
feat_imp = pd.DataFrame(
    {"Feature": X.columns, "Importance": final_model.feature_importances_}
).sort_values("Importance", ascending=False)

feat_imp.to_csv("lightgbm_feature_importance.csv", index=False)
print("Feature importance saved to lightgbm_feature_importance.csv")

print("Top 5 features:")
for rank, row in enumerate(feat_imp.head(5).itertuples(index=False), start=1):
    print(f"{rank}. {row.Feature:<25} {row.Importance:.0f}")

Feature importance saved to lightgbm_feature_importance.csv
Top 5 features:
1. Rainfall_mm               1510
2. Temperature_Celsius       1283
3. Irrigation_Used_encoded   643
4. Days_to_Harvest           370
5. Fertilizer_Used_encoded   331


In [None]:
# Bundle the fitted model with the label encoders needed to prepare raw input
# for it, plus the feature-importance table, into a single artifact.
pipeline_components = {
    "model": final_model,
    "encoders": encoders,
    "feature_importance": feat_imp
}
joblib.dump(pipeline_components, "lightgbm_full_pipeline.pkl")
print("Full pipeline (model, encoders, feature importance) saved to lightgbm_full_pipeline.pkl")

# Research summary
summary = {
    "Model": "LightGBM Regressor",
    "Device": device,
    "Train_R2": train_r2,
    "Test_R2": test_r2,
    "MAE": mae,
    "RMSE": rmse,
    "MAPE": mape,
    "Gap": gap,
    "Best_Params": search.best_params_,
    "Train_Time_s": round(train_time, 2),
    "Tune_Time_s": round(tune_time, 2),
    "Total_Time_min": round((train_time + tune_time) / 60, 2)
}
# `json` is already imported in the notebook's first cell. The context manager
# guarantees the file is flushed and closed even if serialization fails.
with open("lightgbm_summary.json", "w") as summary_file:
    json.dump(summary, summary_file, indent=4)
print("Summary saved to lightgbm_summary.json")

# Trigger a browser download only when running inside Google Colab.
# Outside Colab the google.colab package is not installed, so the import is
# guarded to keep "Run All" working; the files stay on local disk.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab — skipping browser download.")
else:
    files.download('lightgbm_full_pipeline.pkl')
    # files.download('lightgbm_summary.json')

Full pipeline (model, encoders, feature importance) saved to lightgbm_full_pipeline.pkl
Summary saved to lightgbm_summary.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>