# Milestone 2 — Crop Yield: ML Pipelines + Time-Series Analysis
**Author:** Arup Kanungo  
**Branch:** `intern-ArupKanungo`  
**Notebook filename (suggested):** `milestone2_agri_yield.ipynb`

This notebook:
- compares pipelines (OneHot+Standard + RF vs LeaveOneOut+MinMax + RF),
- evaluates several regressors,
- builds a final sklearn `Pipeline`,
- prepares a daily time-series, runs stationarity tests (ADF, KPSS), ACF/PACF,
- fits ARIMA baseline and runs `auto_arima`,
- saves outputs and provides git commands to push to the project repo.

In [None]:
# Run this cell once in Colab
# Installs the encoder, boosting, time-series and persistence packages with pip
# (-q keeps pip's output quiet). lightgbm and catboost are installed here but
# are not imported by the cells shown in this notebook.
%pip install -q category_encoders pmdarima xgboost lightgbm catboost statsmodels joblib
print("Dependencies installed (or were already present).")

## 1) Imports & Settings

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# optional boosters
try:
    from xgboost import XGBRegressor
except Exception:
    XGBRegressor = None

import category_encoders as ce

# time-series
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima

# persistence
import joblib

# Plot defaults: seaborn "whitegrid" theme first, then a 10x5 default figure size.
sns.set(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 5)})

# Input locations (Colab filesystem).
PREPROCESSED_CSV = "/content/preprocessed_crop_data.csv"   # preferred input; change if needed
RAW_CSV = "/content/crop_yield_dataset.csv"                # used only when the file above is absent

# Output folder for every CSV / model artefact written by this notebook;
# created up front so later cells can write into it directly.
OUTPUT_DIR = "/content/milestone2_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)


## 2) Load dataset (preprocessed if available; otherwise raw)

In [None]:
# Prefer the preprocessed CSV; fall back to the raw CSV when it does not exist.
use_preprocessed = os.path.exists(PREPROCESSED_CSV)
if use_preprocessed:
    print("Loading preprocessed CSV:", PREPROCESSED_CSV)
else:
    print("Preprocessed not found. Loading raw CSV:", RAW_CSV)
df = pd.read_csv(PREPROCESSED_CSV if use_preprocessed else RAW_CSV)

# Quick look at what was loaded: shape, first rows, column names.
print("Initial shape:", df.shape)
display(df.head(3))
print(df.columns.tolist())

# Every later cell relies on the target column, so stop here if it is missing.
if 'Crop_Yield' not in df.columns:
    raise RuntimeError("Expected column 'Crop_Yield' missing. Fix CSV path or column name.")

## 3) Data cleaning: drop zero-yield rows and inspect missing values

In [None]:
# Drop rows where Crop_Yield == 0
before = len(df)
df = df[df['Crop_Yield'] != 0].reset_index(drop=True)
print(f"Dropped {before - len(df)} rows with zero yield. Remaining rows: {len(df)}")

# `NaN != 0` evaluates to True, so rows with a missing target pass the filter
# above. A missing target cannot be used to fit or score a regressor, so those
# rows are removed here as well (reported separately from the zero-yield rows).
missing_target = int(df['Crop_Yield'].isna().sum())
if missing_target:
    df = df.dropna(subset=['Crop_Yield']).reset_index(drop=True)
    print(f"Dropped {missing_target} rows with missing yield. Remaining rows: {len(df)}")

# Quick dtypes and missing summary
print("\nDtypes:")
print(df.dtypes)
print("\nMissing values per column:")
print(df.isnull().sum())

# Auto-detect numeric and categorical (object-dtype) columns. Nothing is
# imputed in this cell; missing feature values are filled later, when the
# modelling DataFrame is built.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print("\nNumeric cols (auto):", num_cols)
print("Categorical cols (auto):", cat_cols)

## 4) Prepare ML dataset: select categorical & numerical features

In [None]:
# Preferred feature names; edit these when the CSV uses different headers.
expected_num = ['Soil_pH', 'Temperature', 'Humidity', 'Wind_Speed', 'N', 'P', 'K', 'Soil_Quality']
expected_cat = [col for col in ('Crop_Type', 'Soil_Type') if col in df.columns]

# The preferred numeric list is used only when every name is present;
# otherwise fall back to the auto-detected numeric columns minus the target.
all_expected_present = all(col in df.columns for col in expected_num)
numerical_features = (
    expected_num
    if all_expected_present
    else [col for col in num_cols if col != 'Crop_Yield']
)
categorical_features = expected_cat

print("Numerical features used:", numerical_features)
print("Categorical features used:", categorical_features)

# Modelling frame: an independent copy of df without the Date column (if any).
ml_df = df.drop(columns=['Date']) if 'Date' in df.columns else df.copy()

# Impute: numeric features with their column mean, categoricals with "Unknown".
numeric_means = ml_df[numerical_features].mean()
ml_df[numerical_features] = ml_df[numerical_features].fillna(numeric_means)
for cat_col in categorical_features:
    ml_df[cat_col] = ml_df[cat_col].fillna("Unknown")

print("ML DF shape:", ml_df.shape)
display(ml_df.head(2))

## 5) Train/Test split (holdout) — 80/20

In [None]:
# Target vector, and feature matrix made of every remaining column.
y = ml_df['Crop_Yield']
X = ml_df.drop(columns=['Crop_Yield'])

# Seeded 80/20 holdout split so results are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
print("X_train:", X_train.shape, "X_test:", X_test.shape)

## 6) Helper function — evaluate models (CV + holdout metrics)

In [None]:
from sklearn.model_selection import KFold

def evaluate_model(name, model, Xtr, Xte, ytr, yte, cv=5):
    """Cross-validate `model` on the training split, then score it on the holdout.

    Parameters
    ----------
    name : str
        Label used in the printed summary line and in the returned dict.
    model : estimator or Pipeline
        Object exposing ``fit`` / ``predict``; it is fitted in place on
        ``(Xtr, ytr)``.
    Xtr, ytr : training features / target.
    Xte, yte : holdout features / target.
    cv : int, default 5
        Number of shuffled K-fold splits (seeded with 42).

    Returns
    -------
    dict
        Keys ``name``, ``cv_rmse`` (mean of per-fold RMSE, NaN if
        cross-validation raised), ``test_rmse``, ``mae``, ``r2``, ``preds``
        (holdout predictions) and ``model`` (the fitted estimator).
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    try:
        scores = cross_val_score(model, Xtr, ytr, scoring='neg_mean_squared_error', cv=kf, n_jobs=-1)
        # scores are negated MSE per fold -> flip the sign, take the root, average.
        cv_rmse = np.mean(np.sqrt(-scores))
    except Exception as exc:
        # Best-effort: a CV failure must not abort the holdout evaluation, but
        # the reason is reported. print() is used rather than warnings.warn
        # because this notebook silences all warnings globally.
        print(f"[{name}] cross-validation failed ({type(exc).__name__}: {exc}); CV_RMSE reported as NaN")
        cv_rmse = np.nan
    model.fit(Xtr, ytr)
    preds = model.predict(Xte)
    r2 = r2_score(yte, preds)
    mae = mean_absolute_error(yte, preds)
    rmse = np.sqrt(mean_squared_error(yte, preds))
    print(f"{name:30s} | CV_RMSE: {cv_rmse:8.4f} | Test RMSE: {rmse:8.4f} | MAE: {mae:8.4f} | R2: {r2:8.4f}")
    return {'name': name, 'cv_rmse': cv_rmse, 'test_rmse': rmse, 'mae': mae, 'r2': r2, 'preds': preds, 'model': model}

## 7) Build two pipelines to test
- **Pipeline A**: OneHotEncoder (categorical) + StandardScaler (numerical) + RandomForest
- **Pipeline B**: LeaveOneOutEncoder (categorical) + MinMaxScaler (numerical) + RandomForest

In [None]:
# Pipeline A: OneHot + Standard + RF
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so `sparse=False` raises TypeError on current
# releases. Try the new keyword first and fall back for older installs.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

preproc_A = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', onehot_encoder, categorical_features)
], remainder='drop')  # columns outside the two feature lists are discarded

pipe_A = Pipeline([
    ('preproc', preproc_A),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

# Pipeline B: LeaveOneOut + MinMax + RF
preproc_B = ColumnTransformer(transformers=[
    ('num', MinMaxScaler(), numerical_features),
    ('cat', ce.LeaveOneOutEncoder(cols=categorical_features, sigma=0.1), categorical_features)
], remainder='drop')

pipe_B = Pipeline([
    ('preproc', preproc_B),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

print("Pipelines defined.")

## 8) Evaluate the two pipelines (mentor suggested & alternative)

In [None]:
# Score both candidate pipelines on the same train/holdout split.
print("\nEvaluating Pipeline A (OneHot + Standard + RF):")
resA = evaluate_model("OneHot+Std+RF", pipe_A, X_train, X_test, y_train, y_test)

print("\nEvaluating Pipeline B (LOO + MinMax + RF):")
resB = evaluate_model("LOO+MinMax+RF", pipe_B, X_train, X_test, y_train, y_test)

results = [resA, resB]

# One row per pipeline, keeping only the scalar metrics (no predictions/models).
summary_metrics = ['cv_rmse', 'test_rmse', 'mae', 'r2']
comp_rows = [
    {'pipeline': r['name'], **{metric: r[metric] for metric in summary_metrics}}
    for r in results
]
comp_df = pd.DataFrame(comp_rows)
display(comp_df)

comparison_csv = os.path.join(OUTPUT_DIR, "pipeline_comparison_initial.csv")
comp_df.to_csv(comparison_csv, index=False)

## 9) Visualize pipeline comparison (Test RMSE & R²)

In [None]:
# Two side-by-side bar charts: one panel per holdout metric.
panels = [
    ('test_rmse', 'Test RMSE by Pipeline'),
    ('r2', 'R² by Pipeline'),
]

plt.figure(figsize=(10, 4))
for panel_no, (metric, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    sns.barplot(data=comp_df, x='pipeline', y=metric)
    plt.title(panel_title)
    plt.xticks(rotation=30)  # tilt the pipeline names

plt.tight_layout()
plt.show()

## 10) Sweep several regressors using Pipeline A (OneHot + Standard)
We try: LinearRegression, Ridge, Lasso, DecisionTree, RandomForest, (XGBoost optional)

In [None]:
# Every candidate regressor is paired with Pipeline A's preprocessing.
base_preproc = preproc_A

models_to_try = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=0.01),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
)
# XGBoost joins the sweep only when its import succeeded.
if XGBRegressor is not None:
    models_to_try.update(XGBoost=XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1))

ml_results = []
for name, estimator in models_to_try.items():
    pipeline = Pipeline(steps=[('preproc', base_preproc), ('model', estimator)])
    res = evaluate_model(name, pipeline, X_train, X_test, y_train, y_test)
    ml_results.append(res)

# Comparison table, best (lowest) holdout RMSE first.
# Note: this rebinds `ml_df`, which until now held the modelling dataset.
comparison_metrics = ['cv_rmse', 'test_rmse', 'mae', 'r2']
ml_rows = [
    {'model': r['name'], **{metric: r[metric] for metric in comparison_metrics}}
    for r in ml_results
]
ml_df = pd.DataFrame(ml_rows).sort_values('test_rmse')
display(ml_df)

sweep_csv = os.path.join(OUTPUT_DIR, "ml_model_comparison.csv")
ml_df.to_csv(sweep_csv, index=False)

## 11) Choose best model by test RMSE, show predictions, and save pipeline

In [None]:
# The comparison table is sorted ascending by test_rmse, so row 0 is the winner.
best_row = ml_df.iloc[0]
best_name = best_row['model']
print("Best model by test RMSE:", best_name)

# Rebuild a pipeline around the winning estimator and refit it on the training split.
best_estimator = models_to_try[best_name]
best_pipeline = Pipeline(steps=[('preproc', base_preproc), ('model', best_estimator)])
best_pipeline.fit(X_train, y_train)
y_pred = best_pipeline.predict(X_test)

# Holdout metrics for the refitted pipeline.
holdout_mse = mean_squared_error(y_test, y_pred)
metrics = {
    'r2': r2_score(y_test, y_pred),
    'mae': mean_absolute_error(y_test, y_pred),
    'rmse': np.sqrt(holdout_mse),
}
print("Test metrics (best):", metrics)

# Holdout features alongside the true and predicted yield.
pred_df = X_test.assign(y_true=y_test.values, y_pred=y_pred)
pred_path = os.path.join(OUTPUT_DIR, "best_pipeline_predictions.csv")
pred_df.to_csv(pred_path, index=False)
print("Saved predictions to:", pred_path)

# Persist the fitted pipeline (preprocessing + model); spaces in the model
# name become underscores in the file name.
safe_name = best_name.replace(' ', '_')
pipeline_path = os.path.join(OUTPUT_DIR, f"best_pipeline_{safe_name}.joblib")
joblib.dump(best_pipeline, pipeline_path)
print("Saved best pipeline to:", pipeline_path)

## 12) Time-series: prepare daily series and test stationarity (ADF, KPSS), plot ACF/PACF

In [None]:
if 'Date' in df.columns:
    # Index by parsed date; rows whose Date cannot be parsed are dropped.
    ts = df.copy()
    ts['Date'] = pd.to_datetime(ts['Date'], errors='coerce')
    ts = ts.dropna(subset=['Date']).set_index('Date')

    # Daily mean of the numeric columns only. Without numeric_only=True,
    # pandas >= 2.0 raises TypeError when asked to average object columns
    # such as the categorical features.
    ts_daily = ts.resample('D').mean(numeric_only=True)
    ts_path = os.path.join(OUTPUT_DIR, "preprocessed_timeseries.csv")
    ts_daily.to_csv(ts_path)
    print("Saved daily timeseries to:", ts_path)
    display(ts_daily.head())

    # Choose series (days without any observation are dropped)
    series = ts_daily['Crop_Yield'].dropna()
    plt.figure(figsize=(12,4))
    plt.plot(series, color='darkgreen')
    plt.title('Daily Crop_Yield')
    plt.show()

    # ADF
    print("\nAugmented Dickey-Fuller test:")
    adf_res = adfuller(series)
    print("ADF stat:", adf_res[0], "p-value:", adf_res[1])
    for k,v in adf_res[4].items():  # critical values keyed by significance level
        print(k, v)
    print("=> stationary if p < 0.05")

    # KPSS
    print("\nKPSS test:")
    try:
        kpss_res = kpss(series, regression='c', nlags='auto')
        print("KPSS stat:", kpss_res[0], "p-value:", kpss_res[1])
        for k,v in kpss_res[3].items():  # critical values keyed by significance level
            print(k, v)
    except Exception as e:
        print("KPSS test error:", e)
    print("=> KPSS stationary if p > 0.05")

    # ACF / PACF
    # statsmodels only computes partial autocorrelations for lags below half
    # the sample size, so cap the 30-lag default for short series.
    max_lags = min(30, max(1, len(series) // 2 - 1))
    fig, axes = plt.subplots(1,2,figsize=(12,4))
    plot_acf(series, lags=max_lags, ax=axes[0])
    plot_pacf(series, lags=max_lags, ax=axes[1])
    axes[0].set_title('ACF')
    axes[1].set_title('PACF')
    plt.show()
else:
    print("Date column not present — skipping time-series steps.")

## 13) ARIMA baseline & auto_arima (optional)

In [None]:
if 'Date' in df.columns:
    # Baseline: fixed-order ARIMA(1,0,1) on the daily yield series. Any error
    # (fit or plot) is reported and the cell carries on.
    try:
        print("Fitting ARIMA(1,0,1)...")
        arima_order = (1, 0, 1)
        arima_model = ARIMA(series, order=arima_order)
        arima_fit = arima_model.fit()
        print(arima_fit.summary())

        # Actual series with the in-sample fitted values drawn over it in red.
        plt.figure(figsize=(12, 4))
        plt.plot(series, label='Actual')
        plt.plot(arima_fit.fittedvalues, color='r', label='Fitted')
        plt.legend()
        plt.show()
    except Exception as e:
        print("ARIMA failed:", e)

    # Automatic order search (non-seasonal, p and q capped at 5); can be slow.
    auto_arima_options = dict(
        seasonal=False,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        max_p=5,
        max_q=5,
    )
    try:
        print("Running auto_arima (may take time)...")
        auto_res = auto_arima(series, **auto_arima_options)
        print(auto_res.summary())
    except Exception as e:
        print("auto_arima failed or took long:", e)

## 14) Save important outputs (CSV + pipeline files)

In [None]:
# Already saved earlier; list output folder
print("Files in", OUTPUT_DIR)
print(os.listdir(OUTPUT_DIR))

# Save the sorted model comparison if it exists.
# `ml_df` first names the modelling dataset and is only rebound to the
# comparison table by the regressor sweep. Checking for the 'test_rmse' column
# avoids writing the raw dataset under the comparison file name when the sweep
# cell has not been run, and keeps the confirmation message truthful.
if 'ml_df' in globals() and 'test_rmse' in ml_df.columns:
    ml_df.to_csv(os.path.join(OUTPUT_DIR, "ml_model_comparison_sorted.csv"), index=False)
    print("Saved model comparisons.")
else:
    print("Model comparison table not available - run the regressor sweep cell first.")

## 15) Git: push notebook to GitHub branch `intern-ArupKanungo`
**Important:** For pushing from Colab you need a GitHub Personal Access Token (PAT) with repo permissions.
1. Create a PAT in your GitHub account (Settings → Developer settings → Personal access tokens → repo scope).
2. Run the cell below and paste the token when prompted. The token is used only in this session.

In [None]:
# --- CONFIGURE THESE ---
GIT_REPO_URL = "https://github.com/springboardmentor789r/AgriYield.git"
BRANCH_NAME = "intern-ArupKanungo"
NOTEBOOK_FILENAME = "milestone2_agri_yield.ipynb"  # ensure your Colab notebook is named this before running

# 1) Ask for PAT securely
from getpass import getpass
token = getpass("Enter your GitHub Personal Access Token (will not echo): ")

# 2) Clone the repo (or pull if exists) into /content/repo
REPO_DIR = "/content/AgriYield"
if os.path.exists(REPO_DIR):
    print("Repo directory exists — pulling latest changes.")
    %cd {REPO_DIR}
    !git pull
    %cd /content
else:
    # clone with token in URL (private repo access)
    clone_url = GIT_REPO_URL.replace("https://", f"https://{token}@")
    print("Cloning:", GIT_REPO_URL)
    !git clone {clone_url} {REPO_DIR}
    # remove token from history (just in case)
    %cd /content

# 3) Create branch and switch to it
%cd {REPO_DIR}
!git checkout -b {BRANCH_NAME}

# 4) Copy this notebook file into the repo folder
import shutil
src_nb_path = f"/content/{NOTEBOOK_FILENAME}"
dest_nb_path = f"{REPO_DIR}/{NOTEBOOK_FILENAME}"
if os.path.exists(src_nb_path):
    shutil.copy(src_nb_path, dest_nb_path)
    print("Copied notebook to repo:", dest_nb_path)
else:
    print("Notebook file not found at", src_nb_path)
    print("Make sure your notebook is saved with name:", NOTEBOOK_FILENAME)

# 5) Stage, commit, push
!git add -A
!git commit -m "Milestone 2: pipelines + timeseries - Arup Kanungo" || true
push_url = GIT_REPO_URL.replace("https://", f"https://{token}@")
!git push {push_url} {BRANCH_NAME}

print("Done. If push succeeded, visit the repo to create a Pull Request.")

## Final Notes & Next Steps
- If push fails due to permission or token issues, double-check PAT scopes (repo) and branch name.
- After pushing, open GitHub → repo → "Compare & pull request" to create PR.
- To keep a local copy of this notebook, download it as an `.ipynb` file (in Colab: File → Download → Download .ipynb).