# 🌾 Improved NDVI Classification Pipeline

This notebook enhances the original pipeline to improve classification 
- Better feature engineering
- LightGBM model
- Cross-validation
- Feature importance selection

In [13]:
# 🌱 Advanced NDVI Classifier

# 📦 Import required libraries
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# 📂 Load and inspect dataset
df = pd.read_csv("hacktrain.csv")
df = df.drop(columns=["Unnamed: 0", "ID"])
ndvi_cols = [col for col in df.columns if col.endswith("_N")]
X_raw = df[ndvi_cols]
y = df["class"]
print("NDVI columns:", len(ndvi_cols), "| Target distribution:\n", y.value_counts())

# 🗓️ Generate Day-Of-Year array
def col_to_doy(col):
    return datetime.strptime(col.split('_')[0], '%Y%m%d').timetuple().tm_yday
doy_array = np.array([col_to_doy(col) for col in ndvi_cols])

# 🛠️ Feature Engineering Function
def extract_features(row):
    vals = row.values.astype(float)
    safe_vals = np.nan_to_num(vals, nan=np.nanmedian(vals))

    features = {
        'mean': np.nanmean(vals),
        'std': np.nanstd(vals),
        'min': np.nanmin(vals),
        'max': np.nanmax(vals),
        'range': np.nanmax(vals) - np.nanmin(vals),
        'median': np.nanmedian(vals),
        'q1': np.nanpercentile(vals, 25),
        'q3': np.nanpercentile(vals, 75),
        'iqr': np.nanpercentile(vals, 75) - np.nanpercentile(vals, 25),
        'count_valid': np.sum(~np.isnan(vals)),
        'slope': np.polyfit(doy_array, safe_vals, 1)[0],
        'peak_idx': np.argmax(safe_vals),
        'trough_idx': np.argmin(safe_vals),
    }

    # Fourier Transform Features
    fft = np.fft.fft(safe_vals)
    for i in range(1, 4):
        features[f'fft_{i}'] = np.abs(fft[i])

    return pd.Series(features)

# ➕ Apply Feature Engineering
X = X_raw.apply(extract_features, axis=1)

# ⚖️ Balance Classes Using SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# ⚙️ Build & Evaluate LightGBM Classifier
model = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.02,
    num_leaves=64,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_res, y_res, cv=cv, scoring='accuracy')

# 📊 Results
print("Cross-validation scores:", scores)
print("Mean CV Accuracy:", np.round(np.mean(scores), 4))

# ✅ Train final model
model.fit(X_res, y_res)

# 🔮 Predict on test set
# Replace with your actual test CSV if needed
test_df = pd.read_csv("hacktest.csv")  # <- make sure you have this file
test_ndvi = test_df[ndvi_cols]
X_test = test_ndvi.apply(extract_features, axis=1)
test_preds = model.predict(X_test)

# 📝 Save output to CSV
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'class': test_preds
})
submission.to_csv("submission.csv", index=False)
print("✅ Predictions saved to submission.csv")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Cross-validation scores: [0.98701123 0.98741713 0.98768773 0.98836423 0.98714479]
Mean CV Accuracy: 0.9875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013739 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 36954, number of used features: 16
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
[LightGBM] [Info] Start training from score -1.791759
✅ Predictions saved to submission.csv
