In [6]:
import pandas as pd
import pickle
import json
from lightgbm import LGBMClassifier
import numpy as np

# --- Configuration ---
# Input dataset, plus artifact names chosen to match what the Streamlit app (app.py) expects.
DATA_FILE = "data/spoilage_data.csv"
MODEL_FILENAME = "spoilage_modelv2.pkl"
COLUMNS_FILENAME = "spoilage_columnsv2.json"

def run_training_pipeline(data_file=None, model_filename=None, columns_filename=None):
    """
    Executes the final training pipeline on the FULL dataset for deployment.

    The pipeline loads the CSV, adds the engineered features 'temp_x_hours'
    and 'temp_squared', one-hot encodes 'sku_id', fits an LGBMClassifier on
    every row (no hold-out split), and then writes two artifacts: the pickled
    model and a JSON list of the encoded feature columns in training order.

    Args:
        data_file: Path of the CSV to train on. Defaults to DATA_FILE.
        model_filename: Destination of the pickled model. Defaults to
            MODEL_FILENAME.
        columns_filename: Destination of the JSON column list. Defaults to
            COLUMNS_FILENAME.

    Returns:
        None. Progress is reported with print(). If the input cannot be used
        (file missing or empty, required columns absent, no rows) an error is
        printed and the function returns before training, so no artifact is
        written.
    """
    # Fall back to the module-level configuration only when no override is
    # given, so calling with no arguments behaves as before.
    if data_file is None:
        data_file = DATA_FILE
    if model_filename is None:
        model_filename = MODEL_FILENAME
    if columns_filename is None:
        columns_filename = COLUMNS_FILENAME

    # Every column the steps below access by name.
    required_columns = ['shipment_id', 'spoilage_flag', 'avg_temp', 'transit_hours', 'sku_id']

    # --- Step 1: Load Data ---
    print("--- Step 1: Loading Data ---")
    try:
        df = pd.read_csv(data_file)
    except FileNotFoundError:
        print(f"Error: {data_file} not found. Please ensure it is available.")
        return
    except pd.errors.EmptyDataError:
        # Raised by read_csv for a file with no header/content at all.
        print(f"Error: {data_file} is empty. Nothing to train on.")
        return
    print(f"Successfully loaded {data_file} with {len(df)} rows.")

    # Validate up front so a bad file fails with a clear message instead of a
    # KeyError halfway through the pipeline.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: {data_file} is missing required columns: {missing_columns}")
        return
    if df.empty:
        print(f"Error: {data_file} contains no rows. Nothing to train on.")
        return

    # --- Step 2: Feature Engineering ---
    print("\n--- Step 2: Performing Feature Engineering ---")
    df['temp_x_hours'] = df['avg_temp'] * df['transit_hours']
    df['temp_squared'] = df['avg_temp']**2
    print("New features created: ['temp_x_hours', 'temp_squared']")

    # --- Step 3: Data Preprocessing ---
    print("\n--- Step 3: Preprocessing Data ---")
    # Features are everything except the identifier and the target.
    X = df.drop(columns=['shipment_id', 'spoilage_flag'])
    y = df['spoilage_flag']

    # One-hot encode the full dataset; the resulting column set is what the
    # model is fitted on and what gets saved in Step 5.
    X_encoded = pd.get_dummies(X, columns=['sku_id'], prefix='sku')
    print("Categorical feature 'sku_id' has been one-hot encoded on the full dataset.")

    # --- Step 4: Train Final Model ---
    print("\n--- Step 4: Training Final Model on 100% of Data ---")

    # Fit directly on the entire processed dataset; random_state is fixed for
    # reproducibility.
    final_model = LGBMClassifier(random_state=42)
    final_model.fit(X_encoded, y)
    print("Model training complete.")

    # --- Step 5: Save Production Artifacts ---
    print("\n--- Step 5: Saving Model and Columns ---")

    # Save the trained model object using pickle.
    with open(model_filename, 'wb') as f:
        pickle.dump(final_model, f)
    print(f"  -> Final model saved to '{model_filename}'")

    # Save the list of column names. This is CRITICAL for production: it
    # records the exact encoded columns, in order, that the model was fit on.
    model_columns = X_encoded.columns.tolist()
    with open(columns_filename, 'w') as f:
        json.dump(model_columns, f)
    print(f"  -> Model columns saved to '{columns_filename}'")

    print("\n✅ Final model pipeline finished successfully!")

# Kick off the pipeline only when this code runs as the main program, not on import.
if __name__ == '__main__':
    run_training_pipeline()

--- Step 1: Loading Data ---
Successfully loaded data/spoilage_data.csv with 500 rows.

--- Step 2: Performing Feature Engineering ---
New features created: ['temp_x_hours', 'temp_squared']

--- Step 3: Preprocessing Data ---
Categorical feature 'sku_id' has been one-hot encoded on the full dataset.

--- Step 4: Training Final Model on 100% of Data ---
[LightGBM] [Info] Number of positive: 314, number of negative: 186
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 677
[LightGBM] [Info] Number of data points in the train set: 500, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628000 -> initscore=0.523646
[LightGBM] [Info] Start training from score 0.523646
Model training complete.

--- Step 5: Saving Model and Columns ---
  -> Final model saved to 'spoilage_modelv2.pkl'
  -> Model columns saved to 'spoilage_columnsv2.json'

✅ Final model pipeline finished successfully!


In [2]:
import pandas as pd
import pickle  # Use Python's native pickle library
import json    # For saving the column list
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# --- Configuration ---
# Input dataset and the output artifact names used by this cell's pipeline.
DATA_FILE = "data/spoilage_data.csv"
MODEL_FILENAME = "spoilage_model_lightgbm.pkl"
COLUMNS_FILENAME = "spoilage_columnsv2.json"

def run_training_pipeline(data_file=None, model_filename=None, columns_filename=None):
    """
    Executes the train-and-evaluate pipeline:
    1. Load Data
    2. Engineer Features
    3. Preprocess Data (One-Hot Encode)
    4. Train Model and Evaluate on an 80/20 split
    5. Save Model and Columns using stable methods

    Note that the pickled model is the one fitted on the 80% training split
    only; it is not refitted on the full dataset before saving.

    Args:
        data_file: Path of the CSV to load. Defaults to DATA_FILE.
        model_filename: Destination of the pickled model. Defaults to
            MODEL_FILENAME.
        columns_filename: Destination of the JSON list of encoded feature
            columns. Defaults to COLUMNS_FILENAME.

    Returns:
        None. Progress, accuracy and F1 are reported with print(). If the
        input cannot be used (file missing or empty, required columns absent,
        no rows) an error is printed and the function returns before
        training, so no artifact is written.
    """
    # Fall back to the module-level configuration only when no override is
    # given, so calling with no arguments behaves as before.
    if data_file is None:
        data_file = DATA_FILE
    if model_filename is None:
        model_filename = MODEL_FILENAME
    if columns_filename is None:
        columns_filename = COLUMNS_FILENAME

    # Every column the steps below access by name.
    required_columns = ['shipment_id', 'spoilage_flag', 'avg_temp', 'transit_hours', 'sku_id']

    # --- Step 1: Load Data ---
    print("--- Step 1: Loading Data ---")
    try:
        df = pd.read_csv(data_file)
    except FileNotFoundError:
        # The configured path may point into a sub-directory, so the message
        # does not claim a specific location.
        print(f"Error: {data_file} not found. Please ensure it is available.")
        return
    except pd.errors.EmptyDataError:
        # Raised by read_csv for a file with no header/content at all.
        print(f"Error: {data_file} is empty. Nothing to train on.")
        return
    print(f"Successfully loaded {data_file} with {len(df)} rows.")

    # Validate up front so a bad file fails with a clear message instead of a
    # KeyError halfway through the pipeline.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: {data_file} is missing required columns: {missing_columns}")
        return
    if df.empty:
        print(f"Error: {data_file} contains no rows. Nothing to train on.")
        return

    # --- Step 2: Feature Engineering ---
    print("\n--- Step 2: Performing Feature Engineering ---")
    df['temp_x_hours'] = df['avg_temp'] * df['transit_hours']
    df['temp_squared'] = df['avg_temp']**2
    print("New features created: ['temp_x_hours', 'temp_squared']")

    # --- Step 3: Data Preprocessing ---
    print("\n--- Step 3: Preprocessing Data ---")
    # Features are everything except the identifier and the target.
    X = df.drop(columns=['shipment_id', 'spoilage_flag'])
    y = df['spoilage_flag']
    X_encoded = pd.get_dummies(X, columns=['sku_id'], prefix='sku')
    print("Categorical feature 'sku_id' has been one-hot encoded.")

    # --- Step 4: Train Model and Evaluate ---
    print("\n--- Step 4: Training and Evaluating Model ---")
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Fixed random_state keeps the evaluation reproducible and matches the
    # seed used for the split above.
    model = LGBMClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Display results
    print("\nModel Evaluation Results:")
    print(f"Accuracy: {accuracy}, F1 Score: {f1}")

    # --- Step 5: Save Model and Columns ---
    print("\n--- Step 5: Saving Model and Columns ---")

    # Save the trained model object using pickle with a compatible protocol
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f, protocol=4)
    print(f"  -> LightGBM model saved to '{model_filename}'")

    # Save the list of column names as a JSON file
    model_columns = X_encoded.columns.tolist()
    with open(columns_filename, 'w') as f:
        json.dump(model_columns, f)
    print(f"  -> Model columns saved to '{columns_filename}'")

    print("\n✅ Pipeline execution finished successfully!")

# Kick off the pipeline only when this code runs as the main program, not on import.
if __name__ == '__main__':
    run_training_pipeline()

--- Step 1: Loading Data ---
Successfully loaded data/spoilage_data.csv with 500 rows.

--- Step 2: Performing Feature Engineering ---
New features created: ['temp_x_hours', 'temp_squared']

--- Step 3: Preprocessing Data ---
Categorical feature 'sku_id' has been one-hot encoded.

--- Step 4: Training and Evaluating Model ---
[LightGBM] [Info] Number of positive: 258, number of negative: 142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000137 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 519
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.645000 -> initscore=0.597133
[LightGBM] [Info] Start training from score 0.597133
  -> LightGBM model saved to 'spoilage_model_lightgbm.pkl'

Model Evaluation Results:
Accuracy: 0.79, F1 Score: 0.823529411

In [1]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 1.1 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
