In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib
import json

# --- Configuration ---
# Input CSV read by run_training_pipeline(). This is a relative path, so it is
# resolved against the current working directory.
DATA_FILE = 'data/spoilage_data.csv'
# Output path for the fitted model, written with joblib.dump().
MODEL_FILENAME = 'spoilage_model.joblib'
# Output path for the JSON list of feature column names, in training order.
COLUMNS_FILENAME = 'model_columns.json'

def _build_model():
    """Return an unfitted RandomForestClassifier with the pipeline's fixed hyperparameters."""
    return RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        random_state=42,  # fixed seed so repeated runs produce the same forest
        n_jobs=-1,        # use every available CPU core
    )


def _evaluate_on_holdout(X_encoded, y):
    """Print hold-out metrics from a throwaway model fit on an 80/20 stratified split.

    The model fitted here is discarded; it exists only to estimate how the
    chosen hyperparameters generalize before the final model is trained on
    every row.

    NOTE(review): ``predict_proba(...)[:, 1]`` and ``roc_auc_score`` assume a
    binary ``spoilage_flag`` with both classes present -- confirm against the
    data before relying on these numbers.
    """
    print("\n--- Step 4: Hold-out Evaluation (80/20 stratified split) ---")
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y,
        test_size=0.2,     # 20% of rows are held out for scoring
        random_state=42,   # reproducible split
        stratify=y,        # keep the class ratio similar in both partitions
    )
    print(f"Data split into {len(X_train)} training rows and {len(X_test)} testing rows.")

    eval_model = _build_model()
    eval_model.fit(X_train, y_train)

    y_pred = eval_model.predict(X_test)
    # Column 1 holds the probability of the second class in eval_model.classes_.
    y_pred_proba = eval_model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"  -> Accuracy on Test Set:      {accuracy:.4f}")
    print(f"  -> ROC-AUC Score on Test Set: {roc_auc:.4f}")

    print("\n--- Top 10 Feature Importances ---")
    importances = pd.DataFrame({
        'feature': X_encoded.columns,
        'importance': eval_model.feature_importances_,
    }).sort_values('importance', ascending=False)
    print(importances.head(10))


def run_training_pipeline(*, evaluate: bool = False) -> None:
    """Train the spoilage classifier on the full dataset and save it for deployment.

    Steps, numbered to match the progress messages printed to stdout:

    1. Load ``DATA_FILE`` into a DataFrame.
    2. Add the engineered features ``temp_x_hours`` and ``temp_squared``.
    3. Drop ``shipment_id`` and the target, then one-hot encode ``sku_id``.
    4. (Only when ``evaluate`` is true) fit a throwaway model on an 80/20
       stratified split and print accuracy, ROC-AUC and feature importances.
    5. Fit the final RandomForestClassifier on ALL rows.
    7. Save the model to ``MODEL_FILENAME`` and the ordered feature column
       names to ``COLUMNS_FILENAME``.

    There is no step 6: the final model is fit on every row, so no unseen
    data is left to score it on. Use ``evaluate=True`` for a hold-out estimate.

    Args:
        evaluate: When true, run the optional hold-out evaluation (step 4)
            before training the final model. Defaults to false, which
            reproduces the original train-and-save behavior exactly.

    Returns:
        None. If ``DATA_FILE`` does not exist, an error is printed and the
        function returns early without training or writing any files.

    Raises:
        KeyError: If the loaded data lacks a column the pipeline requires.
    """
    # --- Step 1: Load Data ---
    print("--- Step 1: Loading Data ---")
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        # DATA_FILE is a relative path, so the usual cause is running from a
        # different working directory than the one containing the data folder.
        print(
            f"Error: {DATA_FILE} not found. Please make sure the path is "
            "correct relative to the current working directory."
        )
        return
    print(f"Successfully loaded {DATA_FILE} with {len(df)} rows.")

    # Fail early, naming every absent column, instead of surfacing a bare
    # KeyError for just the first one halfway through the pipeline.
    required_columns = {
        'avg_temp', 'transit_hours', 'sku_id', 'shipment_id', 'spoilage_flag',
    }
    missing_columns = sorted(required_columns - set(df.columns))
    if missing_columns:
        raise KeyError(f"{DATA_FILE} is missing required columns: {missing_columns}")

    # --- Step 2: Feature Engineering ---
    print("\n--- Step 2: Performing Feature Engineering ---")
    # Interaction feature between temperature and transit time
    df['temp_x_hours'] = df['avg_temp'] * df['transit_hours']
    # Non-linear temperature feature to penalize high temperatures more
    df['temp_squared'] = df['avg_temp'] ** 2
    print("New features created: ['temp_x_hours', 'temp_squared']")

    # --- Step 3: Data Preprocessing ---
    print("\n--- Step 3: Preprocessing Data ---")
    # shipment_id is dropped alongside the target, so it is not a feature.
    X = df.drop(columns=['shipment_id', 'spoilage_flag'])
    y = df['spoilage_flag']

    # One-hot encode the categorical 'sku_id' feature.
    X_encoded = pd.get_dummies(X, columns=['sku_id'], prefix='sku')
    print("Categorical feature 'sku_id' has been one-hot encoded.")

    # --- Step 4 (optional): Hold-out Evaluation ---
    if evaluate:
        _evaluate_on_holdout(X_encoded, y)

    # --- Step 5: Model Training on FULL Data ---
    print("\n--- Step 5: Training Final Model on FULL Dataset ---")
    model = _build_model()
    model.fit(X_encoded, y)
    print("Final model training complete.")

    # --- Step 7: Save Model and Supporting Files ---
    print("\n--- Step 7: Saving Model and Column List for Deployment ---")

    joblib.dump(model, MODEL_FILENAME)
    print(f"  -> Model saved to '{MODEL_FILENAME}'")

    # Persist the exact training column order so code that loads the model
    # can rebuild its input frame with the same layout.
    model_columns = X_encoded.columns.tolist()
    with open(COLUMNS_FILENAME, 'w', encoding='utf-8') as f:
        json.dump(model_columns, f)
    print(f"  -> Model columns saved to '{COLUMNS_FILENAME}'")

    print("\n✅ Pipeline execution finished successfully!")


# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    run_training_pipeline()

--- Step 1: Loading Data ---
Successfully loaded data/spoilage_data.csv with 500 rows.

--- Step 2: Performing Feature Engineering ---
New features created: ['temp_x_hours', 'temp_squared']

--- Step 3: Preprocessing Data ---
Categorical feature 'sku_id' has been one-hot encoded.

--- Step 5: Training Final Model on FULL Dataset ---
Final model training complete.

--- Step 7: Saving Model and Column List for Deployment ---
  -> Model saved to 'spoilage_model.joblib'
  -> Model columns saved to 'model_columns.json'

✅ Pipeline execution finished successfully!
