In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm



In [None]:


 # Split the data
def split(data, id_col="Index", target_col='Cancel', train_size=0.8):
    split_index = int(len(data) * train_size)
    train_df = data.iloc[:split_index].copy()
    test_df  = data.iloc[split_index:].copy()
        
    print(f"Split complete. Training: {len(train_df)} rows, Test: {len(test_df)} rows.")

    # Final Clean-up: drop the ID and the Date column
    train_df = train_df.drop(columns=[id_col])
    test_df = test_df.drop(columns=[id_col])

    X_train = train_df.drop(columns=['Cancel'])
    y_train = train_df['Cancel']

    X_test = test_df.drop(columns=['Cancel'])
    y_test = test_df['Cancel']

    # Convert it to a Series for Statsmodels.
    y_train = y_train.squeeze()
    y_test = y_test.squeeze()

    return X_train, y_train, X_test, y_test

df=pd.read_csv("../data/updated_data2.csv")

df_plane = df[df['Vehicle_Plane'] == 1].copy()

# Rimuoviamo le colonne dei veicoli che ora sono ridondanti (tutti sono Plane)
df_plane = df_plane.drop(columns=['Vehicle_Plane', 'Vehicle_Bus', 'Vehicle_Train'])

# 2. Creazione del DataFrame per TERRESTRI (Treni e Bus)
# Filtriamo dove Vehicle_Train o Vehicle_Bus sono 1
df_terrestrial = df[(df['Vehicle_Train'] == 1) | (df['Vehicle_Bus'] == 1)].copy()
df_terrestrial = df_terrestrial.drop(columns=['Vehicle_Plane', 'Vehicle_Bus', 'Vehicle_Train'])

X_train_plane, y_train_plane, X_test_plane, y_test_plane=split(df_plane)
X_train_train, y_train_train, X_test_train, y_test_train=split(df_terrestrial)

print(X_train_plane.info())
X_train_plane.head()


Split complete. Training: 10728 rows, Test: 2683 rows.
Split complete. Training: 68735 rows, Test: 17184 rows.
<class 'pandas.core.frame.DataFrame'>
Index: 10728 entries, 0 to 79909
Data columns (total 13 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Domestic                           10728 non-null  int64  
 1   TripReason                         10728 non-null  int64  
 2   LeadTime_Days                      10728 non-null  float64
 3   LogPrice                           10728 non-null  float64
 4   TimeOfDay_Afternoon                10728 non-null  int64  
 5   TimeOfDay_Evening                  10728 non-null  int64  
 6   TimeOfDay_Morning                  10728 non-null  int64  
 7   TimeOfDay_Night                    10728 non-null  int64  
 8   From_Rate                          10728 non-null  float64
 9   To_Rate                            10728 non-null  float64
 10  Route_Rate  

Unnamed: 0,Domestic,TripReason,LeadTime_Days,LogPrice,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,From_Rate,To_Rate,Route_Rate,User_Rate,cancel_rate_per_vehicle_and_price
0,1,1,10.597348,15.70258,0,0,0,1,0.145227,0.148492,0.156824,0.114425,0.113812
1,1,0,2.557834,16.066802,0,0,0,1,0.16356,0.148492,0.182062,0.13731,0.113812
6,1,1,7.740009,16.044341,0,0,1,0,0.142249,0.10486,0.113083,0.292909,0.113812
11,1,1,0.482153,15.616065,0,0,0,1,0.129168,0.148492,0.148526,0.114425,0.113812
15,0,1,0.821631,16.593522,0,0,1,0,0.142249,0.082844,0.086369,0.114425,0.113812


In [3]:

# List of reference categories to drop
# (We drop 'Bus' and 'Night' so they become the standard baseline)
def drops(X_train, X_test):                                                         #O sostituisci con solo "cancel_rate_per_vehicle_and_price"
    cols_to_drop = ['TimeOfDay_Night','From_Rate', 'To_Rate', 'Domestic','LogPrice',"cancel_rate_per_vehicle_and_price"]

    # Apply the drops
    X_train = X_train.drop(columns=cols_to_drop)
    X_test = X_test.drop(columns=cols_to_drop)

    print("Reference categories dropped. Ready for VIF check or Modeling.")
    return X_train,X_test

# Create a VIF dataframe
def vif(X_train):
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_train.columns
    vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) 
                    for i in range(len(X_train.columns))]

    print(vif_data.sort_values(by="VIF", ascending=False))


X_train_plane, X_test_plane=drops(X_train_plane, X_test_plane)
X_train_train, X_test_train=drops(X_train_train, X_test_train)

vif(X_train_plane)
vif(X_train_train)

Reference categories dropped. Ready for VIF check or Modeling.
Reference categories dropped. Ready for VIF check or Modeling.
               feature       VIF
5           Route_Rate  7.284695
6            User_Rate  6.901697
2  TimeOfDay_Afternoon  2.569271
4    TimeOfDay_Morning  2.545879
3    TimeOfDay_Evening  2.457656
0           TripReason  1.937560
1        LeadTime_Days  1.851600
               feature       VIF
5           Route_Rate  7.055179
6            User_Rate  5.615913
2  TimeOfDay_Afternoon  2.431548
0           TripReason  2.360429
3    TimeOfDay_Evening  2.204024
1        LeadTime_Days  1.753330
4    TimeOfDay_Morning  1.708805


In [4]:

from imblearn.over_sampling import SMOTE
def oversample(X_train,y_train):
    print("="*70)
    print("STEP 1: APPLY SMOTE TO TRAINING DATA ONLY")
    print("="*70)

    print("\nBEFORE Oversampling (Training Data):")
    print(f"Class 0 (No Cancel): {(y_train == 0).sum()} samples")
    print(f"Class 1 (Cancel):    {(y_train == 1).sum()} samples")
    print(f"Ratio: {((y_train == 1).sum() / len(y_train)) * 100:.2f}% cancellations")

    # Apply SMOTE ONLY to training data
    # SMOTE creates synthetic samples of the minority class (1) to match majority class (0)
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)



    print("\nAFTER Oversampling (Training Data):")
    print(f"Class 0 (No Cancel): {(y_train_balanced == 0).sum()} samples")
    print(f"Class 1 (Cancel):    {(y_train_balanced == 1).sum()} samples")
    print(f"Ratio: {((y_train_balanced == 1).sum() / len(y_train_balanced)) * 100:.2f}% cancellations")
    return X_train_balanced, y_train_balanced

X_train_plane_balanced, y_train_plane_balanced=oversample(X_train_plane, y_train_plane)
X_train_train_balanced, y_train_train_balanced=oversample(X_train_train, y_train_train)


STEP 1: APPLY SMOTE TO TRAINING DATA ONLY

BEFORE Oversampling (Training Data):
Class 0 (No Cancel): 9517 samples
Class 1 (Cancel):    1211 samples
Ratio: 11.29% cancellations

AFTER Oversampling (Training Data):
Class 0 (No Cancel): 9517 samples
Class 1 (Cancel):    9517 samples
Ratio: 50.00% cancellations
STEP 1: APPLY SMOTE TO TRAINING DATA ONLY

BEFORE Oversampling (Training Data):
Class 0 (No Cancel): 59087 samples
Class 1 (Cancel):    9648 samples
Ratio: 14.04% cancellations

AFTER Oversampling (Training Data):
Class 0 (No Cancel): 59087 samples
Class 1 (Cancel):    59087 samples
Ratio: 50.00% cancellations


In [5]:

from sklearn.preprocessing import StandardScaler
print("\n" + "="*70)
print("STEP 2: APPLY STANDARD SCALING")
print("="*70)

# Fit scaler on balanced training data, transform both train and test
def scale(X_train_balanced,X_test):
    scaler = StandardScaler()
    X_train_balanced_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_balanced), 
        columns=X_train_balanced.columns
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), 
        columns=X_test.columns
    )

    print("Scaling applied successfully!")
    print(f"X_train_balanced_scaled shape: {X_train_balanced_scaled.shape}")
    print(f"X_test_scaled shape: {X_test_scaled.shape}")
    return X_train_balanced_scaled,X_test_scaled
X_train_plane_balanced_scaled,X_test_plane_scaled = scale(X_train_plane_balanced,X_test_plane)
X_train_train_balanced_scaled,X_test_train_scaled = scale(X_train_train_balanced,X_test_train)


STEP 2: APPLY STANDARD SCALING
Scaling applied successfully!
X_train_balanced_scaled shape: (19034, 7)
X_test_scaled shape: (2683, 7)
Scaling applied successfully!
X_train_balanced_scaled shape: (118174, 7)
X_test_scaled shape: (17184, 7)


In [7]:
# === STEP 3: BASELINE MODEL (LogLeadTime Only) ===

print("="*70)
print("STEP 3: BASELINE MODEL - cancel_rate_per_vehicle_and_price Only")
print("="*70)

def baseline_mode_create(X_train_balanced_scaled,X_test_scaled,y_train_balanced,feature="cancel_rate_per_vehicle_and_price"):
    # Prepare baseline data (LogLeadTime only) from scaled data
    X_train_balanced_scaled_baseline = X_train_balanced_scaled[[feature]].copy()
    X_train_balanced_scaled_baseline_const = sm.add_constant(X_train_balanced_scaled_baseline)

    X_test_scaled_baseline = X_test_scaled[[feature]].copy()
    X_test_scaled_baseline_const = sm.add_constant(X_test_scaled_baseline)

    # Fit baseline model
    baseline_model = sm.GLM(y_train_balanced, X_train_balanced_scaled_baseline_const, 
                            family=sm.families.Binomial()).fit()

    print(baseline_model.summary())

    # Predictions using threshold = 0.5
    threshold = 0.5
    y_pred_prob_baseline = baseline_model.predict(X_test_scaled_baseline_const)
    y_pred_baseline = (y_pred_prob_baseline >= threshold).astype(int)

    print(f"\nAIC: {baseline_model.aic:.2f}")
    print(f"Log-Likelihood: {baseline_model.llf:.2f}")

    return baseline_model, y_pred_prob_baseline, y_pred_baseline

baseline_model_plane, y_pred_prob_baseline_plane, y_pred_baseline_plane = baseline_mode_create(X_train_plane_balanced_scaled,X_test_plane_scaled,y_train_plane_balanced, feature="User_Rate")
baseline_model_train, y_pred_prob_baseline_train, y_pred_baseline_train = baseline_mode_create(X_train_train_balanced_scaled,X_test_train_scaled,y_train_train_balanced, feature="User_Rate")

STEP 3: BASELINE MODEL - cancel_rate_per_vehicle_and_price Only
                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Cancel   No. Observations:                19034
Model:                            GLM   Df Residuals:                    19032
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -12833.
Date:                Tue, 27 Jan 2026   Deviance:                       25665.
Time:                        11:59:59   Pearson chi2:                 1.92e+04
No. Iterations:                     4   Pseudo R-squ. (CS):            0.03720
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

In [8]:
# === STEP 4: FULL MODEL WITH ALL FEATURES ===
from scipy import stats
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

print("="*70)
print("STEP 4: FULL MODEL - ALL FEATURES")
print("="*70)
def complete_model_create_and_compare(X_train_balanced_scaled,X_test_scaled,y_train_balanced, y_test, y_pred_prob_baseline,baseline_model, y_pred_baseline):
    # Add constant to scaled data
    X_train_balanced_scaled_const = sm.add_constant(X_train_balanced_scaled)
    X_test_scaled_const = sm.add_constant(X_test_scaled, has_constant='add')

    # Per evitare altri ValueError, assicurati che l'ordine delle colonne sia IDENTICO al train
    X_test_scaled_const = X_test_scaled_const[X_train_balanced_scaled_const.columns]

    # Fit full model on balanced and scaled training data
    full_model = sm.GLM(y_train_balanced, X_train_balanced_scaled_const, 
                        family=sm.families.Binomial()).fit()

    print(f"Features nel modello (training): {full_model.params.index.tolist()}")
    print(f"Features nel dataset di test: {X_test_scaled_const.columns.tolist()}")
    print(full_model.summary())

    # Predictions using threshold = 0.5
    threshold = 0.5
    y_pred_prob_full = full_model.predict(X_test_scaled_const)
    y_pred_full = (y_pred_prob_full >= threshold).astype(int)

    print("\n" + "="*70)
    print("MODEL COMPARISON: BASELINE vs FULL MODEL")
    print("="*70)

    # Cross-Entropy Loss
    baseline_cross_entropy = log_loss(y_test, y_pred_prob_baseline)
    full_cross_entropy = log_loss(y_test, y_pred_prob_full)

    # Create comparison dataframe
    comparison_data = {
        'Metric': ['AIC', 'BIC', 'Log-Likelihood', 'Cross-Entropy Loss', 'Features Used'],
        'Baseline (LogLeadTime)': [
            f"{baseline_model.aic:.2f}",
            f"{baseline_model.bic:.2f}",
            f"{baseline_model.llf:.2f}",
            f"{baseline_cross_entropy:.4f}",
            "1"
        ],
        'Full Model (All)': [
            f"{full_model.aic:.2f}",
            f"{full_model.bic:.2f}",
            f"{full_model.llf:.2f}",
            f"{full_cross_entropy:.4f}",
            f"{len(X_train_balanced_scaled.columns)}"
        ]
    }

    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))


    # Performance comparison
    print(f"{'Metric':<15} {'Baseline':<15} {'Full Model':<15}")
    print("-"*45)

    baseline_acc = accuracy_score(y_test, y_pred_baseline)
    full_acc = accuracy_score(y_test, y_pred_full)
    print(f"{'Accuracy':<15} {baseline_acc:<15.4f} {full_acc:<15.4f}")

    baseline_prec = precision_score(y_test, y_pred_baseline, zero_division=0)
    full_prec = precision_score(y_test, y_pred_full, zero_division=0)
    print(f"{'Precision':<15} {baseline_prec:<15.4f} {full_prec:<15.4f}")

    baseline_rec = recall_score(y_test, y_pred_baseline, zero_division=0)
    full_rec = recall_score(y_test, y_pred_full, zero_division=0)
    print(f"{'Recall':<15} {baseline_rec:<15.4f} {full_rec:<15.4f}")

    baseline_f1 = f1_score(y_test, y_pred_baseline, zero_division=0)
    full_f1 = f1_score(y_test, y_pred_full, zero_division=0)
    print(f"{'F1-Score':<15} {baseline_f1:<15.4f} {full_f1:<15.4f}")

    baseline_auc = roc_auc_score(y_test, y_pred_prob_baseline)
    full_auc = roc_auc_score(y_test, y_pred_prob_full)
    print(f"{'ROC-AUC':<15} {baseline_auc:<15.4f} {full_auc:<15.4f}")
    return full_model
complete_model_create_and_compare(X_train_plane_balanced_scaled,X_test_plane_scaled,y_train_plane_balanced, y_test_plane, y_pred_prob_baseline_plane,baseline_model_plane, y_pred_baseline_plane)
complete_model_create_and_compare(X_train_train_balanced_scaled,X_test_train_scaled,y_train_train_balanced, y_test_train, y_pred_prob_baseline_train,baseline_model_train, y_pred_baseline_train)

STEP 4: FULL MODEL - ALL FEATURES
Features nel modello (training): ['const', 'TripReason', 'LeadTime_Days', 'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'Route_Rate', 'User_Rate']
Features nel dataset di test: ['const', 'TripReason', 'LeadTime_Days', 'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'Route_Rate', 'User_Rate']
                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Cancel   No. Observations:                19034
Model:                            GLM   Df Residuals:                    19026
Model Family:                Binomial   Df Model:                            7
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -12471.
Date:                Tue, 27 Jan 2026   Deviance:                       24941.
Time:                        12:00:06   Pearson chi2:                 5.33e+04
No. I



Features nel modello (training): ['const', 'TripReason', 'LeadTime_Days', 'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'Route_Rate', 'User_Rate']
Features nel dataset di test: ['const', 'TripReason', 'LeadTime_Days', 'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'Route_Rate', 'User_Rate']
                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Cancel   No. Observations:               118174
Model:                            GLM   Df Residuals:                   118166
Model Family:                Binomial   Df Model:                            7
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -76686.
Date:                Tue, 27 Jan 2026   Deviance:                   1.5337e+05
Time:                        12:00:06   Pearson chi2:                 1.19e+05
No. Iterations:                     4  



<statsmodels.genmod.generalized_linear_model.GLMResultsWrapper at 0x798a5c1ca720>