In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import pickle  # Use Python's native pickle library
import json    # For saving the column list

# --- Configuration ---
# File locations used by the training pipeline below. All three are relative
# paths, so they resolve against the current working directory.
DATA_FILE = 'data/spoilage_data.csv'  # input CSV loaded in step 1
MODEL_FILENAME = 'spoilage_model.pkl'      # Use .pkl extension for pickle files
COLUMNS_FILENAME = 'spoilage_columns.json'  # JSON list of the encoded training columns

def run_training_pipeline():
    """
    Build and persist the final deployment model for spoilage prediction.

    Steps:
    1. Load the CSV at DATA_FILE.
    2. Engineer two features from 'avg_temp' and 'transit_hours'.
    3. Drop the id/target columns and one-hot encode 'sku_id'.
    4. Fit a RandomForestClassifier on the FULL dataset (no hold-out).
    5. Pickle the model to MODEL_FILENAME and write the ordered list of
       training columns to COLUMNS_FILENAME as JSON.

    Returns:
        None. If DATA_FILE does not exist, an error is printed and the
        function returns early without writing any files.

    Raises:
        KeyError: If the loaded data lacks any column the pipeline needs.
    """

    # --- Step 1: Load Data ---
    print("--- Step 1: Loading Data ---")
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        # DATA_FILE is a relative path, so it is looked up from the current
        # working directory -- not from the directory holding this script.
        print(
            f"Error: {DATA_FILE} not found. Please ensure the path is correct "
            "relative to the current working directory."
        )
        return
    print(f"Successfully loaded {DATA_FILE} with {len(df)} rows.")

    # Fail fast, with one clear message, if the schema is not what the rest
    # of the pipeline expects. KeyError is the same exception type the
    # column lookups below would raise on their own.
    required_columns = [
        'shipment_id', 'sku_id', 'avg_temp', 'transit_hours', 'spoilage_flag',
    ]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{DATA_FILE} is missing required column(s): {missing_columns}"
        )

    # --- Step 2: Feature Engineering ---
    print("\n--- Step 2: Performing Feature Engineering ---")
    df['temp_x_hours'] = df['avg_temp'] * df['transit_hours']
    df['temp_squared'] = df['avg_temp']**2
    print("New features created: ['temp_x_hours', 'temp_squared']")

    # --- Step 3: Data Preprocessing ---
    print("\n--- Step 3: Preprocessing Data ---")
    # 'shipment_id' is an identifier and 'spoilage_flag' is the target, so
    # neither belongs in the feature matrix.
    X = df.drop(columns=['shipment_id', 'spoilage_flag'])
    y = df['spoilage_flag']
    X_encoded = pd.get_dummies(X, columns=['sku_id'], prefix='sku')
    print("Categorical feature 'sku_id' has been one-hot encoded.")

    # --- Step 4: Train Model on FULL Data ---
    # We train on 100% of the data to make the final model as robust as possible.
    # The train/test split was only for getting the performance metrics.
    print("\n--- Step 4: Training Final Model on FULL Dataset ---")
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        random_state=42,  # fixed seed so retraining is reproducible
        n_jobs=-1,        # use all available CPU cores
    )
    model.fit(X_encoded, y)
    print("Final model training complete.")

    # --- Step 5: Save Model and Supporting Files ---
    print("\n--- Step 5: Saving Model and Column List for Deployment ---")

    # Save the trained model object using pickle with a compatible protocol.
    # NOTE: only unpickle this file from a trusted source, and load it with a
    # scikit-learn version compatible with the one used for training.
    with open(MODEL_FILENAME, 'wb') as f:
        pickle.dump(model, f, protocol=4)
    print(f"  -> Model saved to '{MODEL_FILENAME}'")

    # Save the list of column names as a JSON file. The serving code needs
    # this to rebuild the exact one-hot column layout (names and order) that
    # the model was trained on.
    model_columns = X_encoded.columns.tolist()
    with open(COLUMNS_FILENAME, 'w', encoding='utf-8') as f:
        json.dump(model_columns, f)
    print(f"  -> Model columns saved to '{COLUMNS_FILENAME}'")

    print("\n✅ Pipeline execution finished successfully!")


if __name__ == "__main__":
    # Run the pipeline only when this file is executed directly, so that
    # importing it does not trigger training or write any files.
    run_training_pipeline()

--- Step 1: Loading Data ---
Successfully loaded data/spoilage_data.csv with 500 rows.

--- Step 2: Performing Feature Engineering ---
New features created: ['temp_x_hours', 'temp_squared']

--- Step 3: Preprocessing Data ---
Categorical feature 'sku_id' has been one-hot encoded.

--- Step 4: Training Final Model on FULL Dataset ---
Final model training complete.

--- Step 5: Saving Model and Column List for Deployment ---
  -> Model saved to 'spoilage_model.pkl'
  -> Model columns saved to 'spoilage_columns.json'

✅ Pipeline execution finished successfully!
