In [21]:

import logging
import warnings
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# ML & Preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    f1_score, roc_auc_score, roc_curve, precision_recall_curve
)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Note: Requires 'pip install imbalanced-learn shap'
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import shap
from streamlit import header

warnings.filterwarnings("ignore")

class LogisticGuardian:
    """
    Unified System for Training and Predicting Logistics Delays.
    """
    def __init__(self, data_path: str = "dataset_logistica_ml_10k.csv"):
        self.data_path = data_path
        self.model_filename = "models/logistic_guardian_v3_2.pkl"
        self.viz_dir = Path("visualizations")
        self.log_dir = Path("logs")

        # Setup folders
        for folder in [self.viz_dir, self.log_dir, Path("models")]:
            folder.mkdir(exist_ok=True)

        self._setup_logging()
        self.pipeline = None

        # Schema definition
        self.numeric_features = [
            "distanza_km", "valore_merce_eur", "peso_kg",
            "numero_transiti", "rischio_meteo", "rischio_doganale"
        ]
        self.categorical_features = ["modalit√†_trasporto", "fragile", "tracking_gps"]
        self.target = "ritardo"

    def _setup_logging(self):
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler(self.log_dir / "system.log"),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger("LogisticGuardian")

    def train_pipeline(self):
        self.logger.info("--- Starting Training Pipeline ---")

        # 1. Load and Validate
        df = pd.read_csv(self.data_path)
        missing = [c for c in self.numeric_features + self.categorical_features if c not in df.columns]
        if missing: raise ValueError(f"Missing columns: {missing}")

        X = df[self.numeric_features + self.categorical_features]
        y = df[self.target]

        # 2. Split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        # 3. Build Transformers
        preprocessor = ColumnTransformer([
            ("num", StandardScaler(), self.numeric_features),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), self.categorical_features)
        ])

        # 4. Ensemble Model
        ensemble = VotingClassifier(
            estimators=[
                ("rf", RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=42)),
                ("gb", GradientBoostingClassifier(n_estimators=100, random_state=42))
            ],
            voting="soft"
        )

        # 5. Full Pipeline with SMOTE
        self.pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("smote", SMOTE(random_state=42)),
            ("classifier", ensemble)
        ])

        # 6. Fit & Evaluate
        self.pipeline.fit(X_train, y_train)
        self._generate_reports(X_test, y_test)

        # 7. Save
        joblib.dump(self.pipeline, self.model_filename)
        self.logger.info(f"Model successfully saved to {self.model_filename}")

    def _generate_reports(self, X_test, y_test):
        y_pred = self.pipeline.predict(X_test)
        y_proba = self.pipeline.predict_proba(X_test)[:, 1]

        # Save Metrics to Log
        self.logger.info(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        self.logger.info(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

        # Visual 1: Confusion Matrix
        plt.figure(figsize=(6, 5))
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
        plt.title("Confusion Matrix")
        plt.savefig(self.viz_dir / "confusion_matrix.png")

        # Visual 2: Precision-Recall Curve (Crucial for logistics)
        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        plt.figure(figsize=(6, 5))
        plt.plot(recall, precision, color='darkorange', lw=2)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.savefig(self.viz_dir / "precision_recall.png")
        plt.close('all')

    def predict_new(self, data: pd.DataFrame):
        """Method to predict on new, unseen data."""
        if self.pipeline is None:
            self.pipeline = joblib.load(self.model_filename)

        preds = self.pipeline.predict(data)
        probs = self.pipeline.predict_proba(data)[:, 1]

        data['predizione_ritardo'] = preds
        data['probabilit√†_ritardo'] = np.round(probs, 3)
        return data

# --- Execution ---
if __name__ == "__main__":
    guardian = LogisticGuardian("dataset_logistica_ml_10k.csv")

    # Run Training
    guardian.train_pipeline()

    # Simulation: Predict on a small sample of the same file
    sample_data = pd.read_csv("dataset_logistica_ml_10k.csv").head(5)
    results = guardian.predict_new(sample_data.drop(columns=['ritardo']))
    print("\n--- Example Predictions ---")
    print(results[['distanza_km', 'modalit√†_trasporto', 'predizione_ritardo', 'probabilit√†_ritardo']])

    ##


2025-12-18 12:23:36,742 - INFO - --- Starting Training Pipeline ---
2025-12-18 12:23:40,734 - INFO - Accuracy: 0.7730
2025-12-18 12:23:40,740 - INFO - ROC-AUC: 0.5873
2025-12-18 12:23:41,759 - INFO - Model successfully saved to models/logistic_guardian_v3_2.pkl



--- Example Predictions ---
   distanza_km modalit√†_trasporto  predizione_ritardo  probabilit√†_ritardo
0          910             Strada                   1                0.698
1         1344             Strada                   1                0.856
2         1180             Strada                   1                0.791
3         1145               Mare                   1                0.824
4         1688           Ferrovia                   1                0.756


In [None]:
#Example  below model also open terminal #cd D:\Github_code_back\esercizi-python-rajaroybca6\logistic_project
#streamlit run logistic_web.py

' ' ' # üì¶ Example Shipment Scenarios

## ‚úÖ SCENARIO 1: Perfect On-Time Delivery (0% Risk)

### Input Values:
- **Distance (km):** 150
- **Cargo Value (‚Ç¨):** 5,000
- **Weight (kg):** 200
- **Number of Transits:** 0
- **Weather Risk:** 1 (Clear conditions)
- **Customs Risk:** 1 (Minimal complexity)
- **Transport Mode:** Strada (Road)
- **Fragile Goods:** No (0)
- **GPS Tracking:** Active (1)

### Why This is Low Risk:
- ‚úÖ Short distance (local delivery)
- ‚úÖ Low cargo value (minimal security concerns)
- ‚úÖ Light weight (easy handling)
- ‚úÖ No transit points (direct route)
- ‚úÖ Perfect weather conditions
- ‚úÖ Simple customs (domestic)
- ‚úÖ GPS tracking enabled
- ‚úÖ Non-fragile goods

---

## ‚ö†Ô∏è SCENARIO 2: Medium Risk (40-60% Delay)

### Input Values:
- **Distance (km):** 800
- **Cargo Value (‚Ç¨):** 25,000
- **Weight (kg):** 800
- **Number of Transits:** 3
- **Weather Risk:** 3 (Moderate concerns)
- **Customs Risk:** 3 (Some complexity)
- **Transport Mode:** Strada (Road)
- **Fragile Goods:** Yes (1)
- **GPS Tracking:** Active (1)

### Why This is Medium Risk:
- ‚ö†Ô∏è Medium distance (regional)
- ‚ö†Ô∏è Moderate cargo value
- ‚ö†Ô∏è Multiple transit points
- ‚ö†Ô∏è Weather could cause delays
- ‚ö†Ô∏è Fragile goods need careful handling
- ‚úÖ GPS tracking helps monitoring

---

## üö® SCENARIO 3: High Risk - Delay Predicted (80-95% Risk)

### Input Values:
- **Distance (km):** 3,500
- **Cargo Value (‚Ç¨):** 150,000
- **Weight (kg):** 2,500
- **Number of Transits:** 8
- **Weather Risk:** 5 (Severe conditions - storms/snow)
- **Customs Risk:** 5 (Complex international shipping)
- **Transport Mode:** Mare (Sea) or Strada (Road)
- **Fragile Goods:** Yes (1)
- **GPS Tracking:** Not Active (0)

### Why This is High Risk:
- üî¥ Very long distance (international)
- üî¥ High value cargo (theft/insurance concerns)
- üî¥ Heavy weight (handling challenges)
- üî¥ Multiple transit points (8 handoffs)
- üî¥ Severe weather conditions
- üî¥ Complex customs procedures
- üî¥ Fragile goods with no GPS tracking
- üî¥ No real-time monitoring

---

## üåä SCENARIO 4: Extreme Risk - Almost Certain Delay (95%+ Risk)

### Input Values:
- **Distance (km):** 5,000
- **Cargo Value (‚Ç¨):** 250,000
- **Weight (kg):** 5,000
- **Number of Transits:** 10
- **Weather Risk:** 5 (Hurricane/Typhoon season)
- **Customs Risk:** 5 (Multiple countries, complex regulations)
- **Transport Mode:** Mare (Sea)
- **Fragile Goods:** Yes (1)
- **GPS Tracking:** Not Active (0)

### Why This is Extreme Risk:
- üî¥ Intercontinental distance
- üî¥ Very high value (requires special security)
- üî¥ Very heavy cargo
- üî¥ Maximum transit complexity
- üî¥ Extreme weather threats
- üî¥ Multiple international borders
- üî¥ Sea transport (longer, more variables)
- üî¥ No tracking capability
- üî¥ Fragile goods at high risk

---

## üöÄ SCENARIO 5: Express Air - Low Risk Despite Distance

### Input Values:
- **Distance (km):** 2,000
- **Cargo Value (‚Ç¨):** 30,000
- **Weight (kg):** 150
- **Number of Transits:** 1
- **Weather Risk:** 2 (Minor concerns)
- **Customs Risk:** 2 (Pre-cleared)
- **Transport Mode:** Aereo (Air)
- **Fragile Goods:** No (0)
- **GPS Tracking:** Active (1)

### Why This is Still Low Risk:
- ‚úÖ Air transport (fastest mode)
- ‚úÖ Single transit point (airport to airport)
- ‚úÖ Light weight (priority handling)
- ‚úÖ GPS tracking throughout
- ‚úÖ Pre-cleared customs
- ‚úÖ Non-fragile goods
- ‚ö†Ô∏è Distance offset by speed

---

## üìä Quick Reference Table

| Scenario | Distance | Value | Transits | Weather | Customs | GPS | Expected Risk |
|----------|----------|-------|----------|---------|---------|-----|---------------|
| Perfect  | 150 km   | ‚Ç¨5K   | 0        | 1       | 1       | ‚úÖ  | 0-10%        |
| Medium   | 800 km   | ‚Ç¨25K  | 3        | 3       | 3       | ‚úÖ  | 40-60%       |
| High     | 3,500 km | ‚Ç¨150K | 8        | 5       | 5       | ‚ùå  | 80-95%       |
| Extreme  | 5,000 km | ‚Ç¨250K | 10       | 5       | 5       | ‚ùå  | 95%+         |
| Express  | 2,000 km | ‚Ç¨30K  | 1        | 2       | 2       | ‚úÖ  | 10-20%       |

---

## üí° Tips for Testing

1. **Start with Perfect scenario** - Should show green "ON TIME"
2. **Gradually increase risk factors** - Watch probability climb
3. **Test extreme scenario** - Should show red "DELAY PREDICTED"
4. **Mix factors** - See how different combinations affect risk
5. **Try transport modes** - Air typically lowest risk for long distance

---

## üéØ Key Insights

**Risk Multipliers:**
- Each transit point adds complexity
- Weather risk ‚â• 4 significantly increases delays
- No GPS tracking removes visibility and control
- Fragile goods require extra handling time
- High value cargo needs additional security checks
- Sea transport has more variables than air/road

**Risk Reducers:**
- GPS tracking provides proactive management
- Fewer transit points = fewer failure points
- Good weather = predictable timeline
- Simple customs = faster processing
- Non-fragile goods = standard handling
- Moderate distances = fewer complications
' ' '

In [20]:
import logging
import warnings
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime

# ML & Preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    f1_score, roc_auc_score, roc_curve, precision_recall_curve
)
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Note: Requires 'pip install imbalanced-learn shap'
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import shap

warnings.filterwarnings("ignore")

class LogisticGuardian:
    """
    Unified System for Training and Predicting Logistics Delays.
    """
    def __init__(self, data_path: str = "dataset_logistica_ml_10k.csv"):
        self.data_path = data_path
        self.model_filename = "models/logistic_guardian_v3_2.pkl"
        self.viz_dir = Path("visualizations")
        self.log_dir = Path("logs")

        # Setup folders
        for folder in [self.viz_dir, self.log_dir, Path("models")]:
            folder.mkdir(exist_ok=True)

        self._setup_logging()
        self.pipeline = None

        # Schema definition
        self.numeric_features = [
            "distanza_km", "valore_merce_eur", "peso_kg",
            "numero_transiti", "rischio_meteo", "rischio_doganale"
        ]
        self.categorical_features = ["modalit√†_trasporto", "fragile", "tracking_gps"]
        self.target = "ritardo"

    def _setup_logging(self):
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s [%(levelname)s] %(message)s",
            handlers=[
                logging.FileHandler(self.log_dir / "system.log"),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger("LogisticGuardian")

    def train_pipeline(self):
        self.logger.info("--- Starting Training Pipeline ---")

        # 1. Load and Validate
        df = pd.read_csv(self.data_path)
        missing = [c for c in self.numeric_features + self.categorical_features if c not in df.columns]
        if missing: raise ValueError(f"Missing columns: {missing}")

        X = df[self.numeric_features + self.categorical_features]
        y = df[self.target]

        # 2. Split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=42
        )

        # 3. Build Transformers
        preprocessor = ColumnTransformer([
            ("num", StandardScaler(), self.numeric_features),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), self.categorical_features)
        ])

        # 4. Ensemble Model
        ensemble = VotingClassifier(
            estimators=[
                ("rf", RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=42)),
                ("gb", GradientBoostingClassifier(n_estimators=100, random_state=42))
            ],
            voting="soft"
        )

        # 5. Full Pipeline with SMOTE
        self.pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("smote", SMOTE(random_state=42)),
            ("classifier", ensemble)
        ])

        # 6. Fit & Evaluate
        self.pipeline.fit(X_train, y_train)
        self._generate_reports(X_test, y_test)

        # 7. Save
        joblib.dump(self.pipeline, self.model_filename)
        self.logger.info(f"Model successfully saved to {self.model_filename}")

    def _generate_reports(self, X_test, y_test):
        y_pred = self.pipeline.predict(X_test)
        y_proba = self.pipeline.predict_proba(X_test)[:, 1]

        # Save Metrics to Log
        self.logger.info(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
        self.logger.info(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

        # Visual 1: Confusion Matrix
        plt.figure(figsize=(6, 5))
        sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
        plt.title("Confusion Matrix")
        plt.savefig(self.viz_dir / "confusion_matrix.png")

        # Visual 2: Precision-Recall Curve (Crucial for logistics)
        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        plt.figure(figsize=(6, 5))
        plt.plot(recall, precision, color='darkorange', lw=2)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.savefig(self.viz_dir / "precision_recall.png")
        plt.close('all')

    def predict_new(self, data: pd.DataFrame):
        """Method to predict on new, unseen data."""
        if self.pipeline is None:
            self.pipeline = joblib.load(self.model_filename)

        preds = self.pipeline.predict(data)
        probs = self.pipeline.predict_proba(data)[:, 1]

        data['predizione_ritardo'] = preds
        data['probabilit√†_ritardo'] = np.round(probs, 3)
        return data

# --- Execution ---
if __name__ == "__main__":
    guardian = LogisticGuardian("dataset_logistica_ml_10k.csv")

    # Run Training
    guardian.train_pipeline()

    # Simulation: Predict on a small sample of the same file
    sample_data = pd.read_csv("dataset_logistica_ml_10k.csv").head(5)
    results = guardian.predict_new(sample_data.drop(columns=['ritardo']))
    print("\n--- Example Predictions ---")
    print(results[['distanza_km', 'modalit√†_trasporto', 'predizione_ritardo', 'probabilit√†_ritardo']])

2025-12-18 12:22:56,838 - INFO - --- Starting Training Pipeline ---
2025-12-18 12:23:01,111 - INFO - Accuracy: 0.7730
2025-12-18 12:23:01,115 - INFO - ROC-AUC: 0.5873
2025-12-18 12:23:01,872 - INFO - Model successfully saved to models/logistic_guardian_v3_2.pkl



--- Example Predictions ---
   distanza_km modalit√†_trasporto  predizione_ritardo  probabilit√†_ritardo
0          910             Strada                   1                0.698
1         1344             Strada                   1                0.856
2         1180             Strada                   1                0.791
3         1145               Mare                   1                0.824
4         1688           Ferrovia                   1                0.756


In [22]:
import pandas as pd

# 1. Use quotes around the path.
# 2. Add 'r' before the path to handle the backslashes (\) correctly in Windows.
path = r"D:\Github_code_back\esercizi-python-rajaroybca6\logistic_project\dataset_logistica_ml_10k.csv"

df = pd.read_csv(path)

# To see the column names (headers)
print(df.columns)

# To see the first 5 rows of data
print(df.head())

Index(['distanza_km', 'valore_merce_eur', 'peso_kg', 'fragile',
       'numero_transiti', 'rischio_meteo', 'rischio_doganale', 'tracking_gps',
       'modalit√†_trasporto', 'ritardo'],
      dtype='object')
   distanza_km  valore_merce_eur  peso_kg  fragile  numero_transiti  \
0          910            131842      366        1                2   
1         1344            265418     7835        0                1   
2         1180            151033      655        0                0   
3         1145             21044      624        0                2   
4         1688            100372     1227        0                2   

   rischio_meteo  rischio_doganale  tracking_gps modalit√†_trasporto  ritardo  
0              2                 5             1             Strada        1  
1              5                 1             0             Strada        1  
2              4                 3             0             Strada        1  
3              4                 5             0 