In [7]:
import pandas as pd
import pickle
import json
from lightgbm import LGBMClassifier
import numpy as np

# --- Configuration ---
# Match the filenames your Streamlit app expects
DATA_FILE = 'data/spoilage_data.csv' 
MODEL_FILENAME = 'spoilage_model.pkl'  # Updated to match app.py
COLUMNS_FILENAME = 'spoilage_columns.json' # Matched to app.py

def run_training_pipeline():
    """
    Executes the final training pipeline on the FULL dataset for deployment.
    """
    
    # --- Step 1: Load Data ---
    print("--- Step 1: Loading Data ---")
    try:
        df = pd.read_csv(DATA_FILE)
        print(f"Successfully loaded {DATA_FILE} with {len(df)} rows.")
    except FileNotFoundError:
        print(f"Error: {DATA_FILE} not found. Please ensure it is available.")
        return

    # --- Step 2: Feature Engineering ---
    print("\n--- Step 2: Performing Feature Engineering ---")
    df['temp_x_hours'] = df['avg_temp'] * df['transit_hours']
    df['temp_squared'] = df['avg_temp']**2
    print("New features created: ['temp_x_hours', 'temp_squared']")

    # --- Step 3: Data Preprocessing ---
    print("\n--- Step 3: Preprocessing Data ---")
    # Define features (X) and target (y) from the full dataset
    X = df.drop(columns=['shipment_id', 'spoilage_flag']) 
    y = df['spoilage_flag']
    
    # One-Hot Encode the full dataset
    X_encoded = pd.get_dummies(X, columns=['sku_id'], prefix='sku')
    print("Categorical feature 'sku_id' has been one-hot encoded on the full dataset.")

    # --- Step 4: Train Final Model ---
    print("\n--- Step 4: Training Final Model on 100% of Data ---")
    
    # We train the model directly on the entire processed dataset (X_encoded, y)
    final_model = LGBMClassifier(random_state=42) # Use a random_state for reproducibility
    final_model.fit(X_encoded, y)
    print("Model training complete.")

    # --- Step 5: Save Production Artifacts ---
    print("\n--- Step 5: Saving Model and Columns ---")
    
    # Save the trained model object using pickle
    with open(MODEL_FILENAME, 'wb') as f:
        pickle.dump(final_model, f)
    print(f"  -> Final model saved to '{MODEL_FILENAME}'")

    # Save the list of column names. This is CRITICAL for production.
    model_columns = X_encoded.columns.tolist()
    with open(COLUMNS_FILENAME, 'w') as f:
        json.dump(model_columns, f)
    print(f"  -> Model columns saved to '{COLUMNS_FILENAME}'")

    print("\n✅ Final model pipeline finished successfully!")

if __name__ == "__main__":
    run_training_pipeline()

--- Step 1: Loading Data ---
Successfully loaded data/spoilage_data.csv with 500 rows.

--- Step 2: Performing Feature Engineering ---
New features created: ['temp_x_hours', 'temp_squared']

--- Step 3: Preprocessing Data ---
Categorical feature 'sku_id' has been one-hot encoded on the full dataset.

--- Step 4: Training Final Model on 100% of Data ---
[LightGBM] [Info] Number of positive: 314, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 677
[LightGBM] [Info] Number of data points in the train set: 500, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.628000 -> initscore=0.523646
[LightGBM] [Info] Start training from score 0.523646
Model training complete.

--- Step 5: Saving Model and Columns ---
  -> Final model saved to 'spoilage_model.pkl'
  -> Model columns saved to 'spoilage_columns.

In [8]:
import pandas as pd
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from lightgbm import LGBMClassifier

# --- Configuration ---
DATA_FILE = 'data/spoilage_data.csv' 
MODEL_FILENAME = 'spoilage_modelv2.pkl'
COLUMNS_FILENAME = 'spoilage_columnsv2.json'
METRICS_FILENAME = 'spoilage_metrics.json' # File to save our scores

def run_training_pipeline():
    """
    Executes a two-part pipeline:
    1. Evaluates model performance on a test set and saves the metrics.
    2. Trains a final model on the FULL dataset for deployment.
    """
    
    # --- Load and Preprocess Data (Same for both parts) ---
    print("--- Loading and Preprocessing Data ---")
    df = pd.read_csv(DATA_FILE)
    df['temp_x_hours'] = df['avg_temp'] * df['transit_hours']
    df['temp_squared'] = df['avg_temp']**2
    X = df.drop(columns=['shipment_id', 'spoilage_flag']) 
    y = df['spoilage_flag']
    X_encoded = pd.get_dummies(X, columns=['sku_id'], prefix='sku')

    # === Part 1: Evaluate Model and Save Metrics ===
    print("\n--- Part 1: Evaluating Model Performance ---")
    X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

    # Train a temporary model for evaluation
    eval_model = LGBMClassifier(random_state=42)
    eval_model.fit(X_train, y_train)
    y_pred_test = eval_model.predict(X_test)
    y_proba_test = eval_model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_test)
    roc_auc = roc_auc_score(y_test, y_proba_test)
    
    print(f"  -> Test Set Accuracy: {accuracy:.4f}")
    print(f"  -> Test Set ROC-AUC: {roc_auc:.4f}")

    # Save the calculated metrics to a JSON file
    metrics = {
        'accuracy': accuracy,
        'roc_auc': roc_auc
    }
    with open(METRICS_FILENAME, 'w') as f:
        json.dump(metrics, f)
    print(f"  -> Metrics saved to '{METRICS_FILENAME}'")
    
    # === Part 2: Train Final Model on 100% of Data ===
    print("\n--- Part 2: Training Final Model for Deployment ---")
    final_model = LGBMClassifier(random_state=42)
    final_model.fit(X_encoded, y) # Use the entire dataset
    
    # Save the final model
    with open(MODEL_FILENAME, 'wb') as f:
        pickle.dump(final_model, f)
    print(f"  -> Final model saved to '{MODEL_FILENAME}'")

    # Save the columns for the final model
    model_columns = X_encoded.columns.tolist()
    with open(COLUMNS_FILENAME, 'w') as f:
        json.dump(model_columns, f)
    print(f"  -> Model columns saved to '{COLUMNS_FILENAME}'")
    
    print("\n✅ Full pipeline finished successfully!")

if __name__ == "__main__":
    run_training_pipeline()

--- Loading and Preprocessing Data ---

--- Part 1: Evaluating Model Performance ---
[LightGBM] [Info] Number of positive: 258, number of negative: 142
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 519
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.645000 -> initscore=0.597133
[LightGBM] [Info] Start training from score 0.597133
  -> Test Set Accuracy: 0.7900
  -> Test Set ROC-AUC: 0.8738
  -> Metrics saved to 'spoilage_metrics.json'

--- Part 2: Training Final Model for Deployment ---
[LightGBM] [Info] Number of positive: 314, number of negative: 186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_col_wise=true` t

In [1]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 1.1 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd

# Load the CSV file to print basic information
csv_file_path = 'data/spoilage_data.csv'
df = pd.read_csv(csv_file_path)

# Print basic information about the CSV
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   shipment_id    500 non-null    object 
 1   sku_id         500 non-null    object 
 2   transit_hours  500 non-null    float64
 3   avg_temp       500 non-null    float64
 4   shock_events   500 non-null    int64  
 5   spoilage_flag  500 non-null    int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 23.6+ KB
None
       transit_hours    avg_temp  shock_events  spoilage_flag
count     500.000000  500.000000    500.000000     500.000000
mean       60.299400   15.595400      4.590000       0.628000
std        33.116756   11.588651      2.900815       0.483822
min         5.300000   -4.900000      0.000000       0.000000
25%        31.000000    4.975000      2.000000       0.000000
50%        58.800000   15.900000      4.000000       1.000000
75%        88.625000   25.800000      7.00

In [4]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values.tolist()}")


Unique values in column 'shipment_id': ['SHP-001', 'SHP-002', 'SHP-003', 'SHP-004', 'SHP-005', 'SHP-006', 'SHP-007', 'SHP-008', 'SHP-009', 'SHP-010', 'SHP-011', 'SHP-012', 'SHP-013', 'SHP-014', 'SHP-015', 'SHP-016', 'SHP-017', 'SHP-018', 'SHP-019', 'SHP-020', 'SHP-021', 'SHP-022', 'SHP-023', 'SHP-024', 'SHP-025', 'SHP-026', 'SHP-027', 'SHP-028', 'SHP-029', 'SHP-030', 'SHP-031', 'SHP-032', 'SHP-033', 'SHP-034', 'SHP-035', 'SHP-036', 'SHP-037', 'SHP-038', 'SHP-039', 'SHP-040', 'SHP-041', 'SHP-042', 'SHP-043', 'SHP-044', 'SHP-045', 'SHP-046', 'SHP-047', 'SHP-048', 'SHP-049', 'SHP-050', 'SHP-051', 'SHP-052', 'SHP-053', 'SHP-054', 'SHP-055', 'SHP-056', 'SHP-057', 'SHP-058', 'SHP-059', 'SHP-060', 'SHP-061', 'SHP-062', 'SHP-063', 'SHP-064', 'SHP-065', 'SHP-066', 'SHP-067', 'SHP-068', 'SHP-069', 'SHP-070', 'SHP-071', 'SHP-072', 'SHP-073', 'SHP-074', 'SHP-075', 'SHP-076', 'SHP-077', 'SHP-078', 'SHP-079', 'SHP-080', 'SHP-081', 'SHP-082', 'SHP-083', 'SHP-084', 'SHP-085', 'SHP-086', 'SHP-087', 'SH

In [5]:
numeric_columns = df.select_dtypes(include=['number']).columns
for column in numeric_columns:
    min_value = df[column].min()
    max_value = df[column].max()
    print(f"Column '{column}' - Min: {min_value}, Max: {max_value}")


Column 'transit_hours' - Min: 5.3, Max: 119.9
Column 'avg_temp' - Min: -4.9, Max: 34.6
Column 'shock_events' - Min: 0, Max: 9
Column 'spoilage_flag' - Min: 0, Max: 1
