# Data Preprocessing
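This notebook loads the raw agricultural production CSV, cleans it (trims text fields, normalizes column names, coerces `production` and `area` to numeric, and fills missing values), adds a production-per-area column (`prod_per_hectare`), and saves the result as a processed CSV.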

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path

# Input and output locations, relative to the notebook's working directory.
RAW_PATH = Path("../data/agri_production.csv")  # drop the raw CSV at this location
OUT_DIR = Path("../data/processed")
OUT_PATH = OUT_DIR.joinpath("processed.csv")

# Create the output folder (and any missing parents) up front; an existing
# folder is not an error.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Load the raw data and print its shape and column labels as a sanity check.
df = pd.read_csv(RAW_PATH)
print("Rows, Cols:", df.shape)
print("Columns:", df.columns.to_list())

# Trim surrounding whitespace in every object-dtype (text) column.
# Non-missing values are cast to str and stripped, exactly as before.
# Missing values are deliberately left untouched: casting the whole column
# with astype(str) would turn NaN into the literal string "nan", and such
# cells would then no longer count as missing for the dropna(how="all") and
# fillna("Unknown") steps that follow.
for c in df.select_dtypes(include=["object"]).columns:
    # Series.where keeps the stripped text where the original cell is
    # present and falls back to the original (missing) value elsewhere.
    df[c] = df[c].astype(str).str.strip().where(df[c].notna(), df[c])

# Raw header -> normalized lowercase column name.
rename_map = {
    "State_Name": "state",
    "Crop": "crop",
    "Season": "season",
    "Area": "area",
    "Production": "production",
    "Cost": "cost",
    "Year": "year",
}
# Only headers actually present in the frame are renamed; entries in the
# mapping that do not match any column are skipped (errors="ignore").
df = df.rename(columns=rename_map, errors="ignore")

# Force the measurement columns to a numeric dtype when they exist.
# errors="coerce" turns any value that cannot be parsed into NaN instead of
# raising.
for measure in ("production", "area"):
    if measure in df.columns:
        df[measure] = pd.to_numeric(df[measure], errors="coerce")

# Discard rows in which every single column is missing.
df = df.dropna(how="all")

# Fill the remaining gaps column by column: numeric columns receive their
# own median, every other column receives the placeholder "Unknown".
for col_name in df.columns:
    series = df[col_name]
    fill_value = series.median() if pd.api.types.is_numeric_dtype(series) else "Unknown"
    df[col_name] = series.fillna(fill_value)

# Derived feature: production divided by area, computed only when both
# source columns are available. Rows whose area is not strictly positive
# get 0.0 instead of the quotient.
if "production" in df.columns and "area" in df.columns:
    has_positive_area = df["area"] > 0
    ratio = df["production"].div(df["area"])
    df["prod_per_hectare"] = np.where(has_positive_area, ratio, 0.0)

# Write the cleaned frame without the index column and report the location.
df.to_csv(OUT_PATH, index=False)
print("Saved processed CSV to", OUT_PATH)
