In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm



In [19]:
df=pd.read_csv("../data/updated_data2.csv")

 # Split the data
id_col="Index", 
target_col='Cancel', 
train_size=0.8
split_index = int(len(df) * train_size)
train_df = df.iloc[:split_index].copy()
test_df  = df.iloc[split_index:].copy()
    
print(f"Split complete. Training: {len(train_df)} rows, Test: {len(test_df)} rows.")

# Final Clean-up: drop the ID and the Date column
train_df = train_df.drop(columns=[id_col[0]])
test_df = test_df.drop(columns=[id_col[0]])

X_train = train_df.drop(columns=['Cancel'])
y_train = train_df['Cancel']

X_test = test_df.drop(columns=['Cancel'])
y_test = test_df['Cancel']

# Convert it to a Series for Statsmodels.
y_train = y_train.squeeze()
y_test = y_test.squeeze()

print(train_df.info())
train_df.head()


Split complete. Training: 79464 rows, Test: 19866 rows.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79464 entries, 0 to 79463
Data columns (total 17 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Domestic                           79464 non-null  int64  
 1   TripReason                         79464 non-null  int64  
 2   Cancel                             79464 non-null  int64  
 3   LeadTime_Days                      79464 non-null  float64
 4   LogPrice                           79464 non-null  float64
 5   Vehicle_Bus                        79464 non-null  int64  
 6   Vehicle_Plane                      79464 non-null  int64  
 7   Vehicle_Train                      79464 non-null  int64  
 8   TimeOfDay_Afternoon                79464 non-null  int64  
 9   TimeOfDay_Evening                  79464 non-null  int64  
 10  TimeOfDay_Morning                  79464 non-null  int64  
 11

Unnamed: 0,Domestic,TripReason,Cancel,LeadTime_Days,LogPrice,Vehicle_Bus,Vehicle_Plane,Vehicle_Train,TimeOfDay_Afternoon,TimeOfDay_Evening,TimeOfDay_Morning,TimeOfDay_Night,From_Rate,To_Rate,Route_Rate,User_Rate,cancel_rate_per_vehicle_and_price
0,1,1,0,10.597348,15.70258,0,1,0,0,0,0,1,0.145227,0.148492,0.156824,0.114425,0.113812
1,1,0,0,2.557834,16.066802,0,1,0,0,0,0,1,0.16356,0.148492,0.182062,0.13731,0.113812
2,1,1,0,0.732323,14.508658,1,0,0,0,0,1,0,0.104223,0.185071,0.145818,0.114425,0.125352
3,1,0,0,0.872986,10.596635,0,0,1,0,0,1,0,0.127785,0.031021,0.035157,0.114425,0.117717
4,1,0,0,1.478546,13.937728,1,0,0,0,0,0,1,0.142249,0.115205,0.093713,0.098079,0.12535


In [20]:

# List of reference categories to drop
# (We drop 'Bus' and 'Night' so they become the standard baseline)

cols_to_drop = ['Vehicle_Bus','TimeOfDay_Night','From_Rate', 'To_Rate', 'Domestic','LogPrice','cancel_rate_per_vehicle_and_price']

# Apply the drops
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=cols_to_drop)

print("Reference categories dropped. Ready for VIF check or Modeling.")

# Create a VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X_train.columns
vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) 
                   for i in range(len(X_train.columns))]

print(vif_data.sort_values(by="VIF", ascending=False))

Reference categories dropped. Ready for VIF check or Modeling.
               feature       VIF
7           Route_Rate  7.405538
8            User_Rate  5.769223
4  TimeOfDay_Afternoon  2.473931
0           TripReason  2.346586
5    TimeOfDay_Evening  2.248752
3        Vehicle_Train  2.185912
1        LeadTime_Days  2.003988
6    TimeOfDay_Morning  1.832509
2        Vehicle_Plane  1.373849


In [21]:

from imblearn.over_sampling import SMOTE

print("="*70)
print("STEP 1: APPLY SMOTE TO TRAINING DATA ONLY")
print("="*70)

print("\nBEFORE Oversampling (Training Data):")
print(f"Class 0 (No Cancel): {(y_train == 0).sum()} samples")
print(f"Class 1 (Cancel):    {(y_train == 1).sum()} samples")
print(f"Ratio: {((y_train == 1).sum() / len(y_train)) * 100:.2f}% cancellations")

# Apply SMOTE ONLY to training data
# SMOTE creates synthetic samples of the minority class (1) to match majority class (0)

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


print("\nAFTER Oversampling (Training Data):")
print(f"Class 0 (No Cancel): {(y_train_balanced == 0).sum()} samples")
print(f"Class 1 (Cancel):    {(y_train_balanced == 1).sum()} samples")
print(f"Ratio: {((y_train_balanced == 1).sum() / len(y_train_balanced)) * 100:.2f}% cancellations")


STEP 1: APPLY SMOTE TO TRAINING DATA ONLY

BEFORE Oversampling (Training Data):
Class 0 (No Cancel): 68600 samples
Class 1 (Cancel):    10864 samples
Ratio: 13.67% cancellations

AFTER Oversampling (Training Data):
Class 0 (No Cancel): 68600 samples
Class 1 (Cancel):    68600 samples
Ratio: 50.00% cancellations


In [22]:

from sklearn.preprocessing import StandardScaler
print("\n" + "="*70)
print("STEP 2: APPLY STANDARD SCALING")
print("="*70)

# Fit scaler on balanced training data, transform both train and test
scaler = StandardScaler()
X_train_balanced_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_balanced), 
    columns=X_train_balanced.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), 
    columns=X_test.columns
)

print("Scaling applied successfully!")
print(f"X_train_balanced_scaled shape: {X_train_balanced_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")


STEP 2: APPLY STANDARD SCALING
Scaling applied successfully!
X_train_balanced_scaled shape: (137200, 9)
X_test_scaled shape: (19866, 9)


In [23]:
# === STEP 3: BASELINE MODEL (LogLeadTime Only) ===

print("="*70)
print("STEP 3: BASELINE MODEL - cancel_rate_per_vehicle_and_price Only")
print("="*70)

# Prepare baseline data (LogLeadTime only) from scaled data
X_train_balanced_scaled_baseline = X_train_balanced_scaled[['User_Rate']].copy()
X_train_balanced_scaled_baseline_const = sm.add_constant(X_train_balanced_scaled_baseline)

X_test_scaled_baseline = X_test_scaled[['User_Rate']].copy()
X_test_scaled_baseline_const = sm.add_constant(X_test_scaled_baseline)

# Fit baseline model
baseline_model = sm.GLM(y_train_balanced, X_train_balanced_scaled_baseline_const, 
                        family=sm.families.Binomial()).fit()

print(baseline_model.summary())

# Predictions using threshold = 0.5
threshold = 0.5
y_pred_prob_baseline = baseline_model.predict(X_test_scaled_baseline_const)
y_pred_baseline = (y_pred_prob_baseline >= threshold).astype(int)

print(f"\nAIC: {baseline_model.aic:.2f}")
print(f"Log-Likelihood: {baseline_model.llf:.2f}")

STEP 3: BASELINE MODEL - cancel_rate_per_vehicle_and_price Only
                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Cancel   No. Observations:               137200
Model:                            GLM   Df Residuals:                   137198
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -92038.
Date:                Tue, 27 Jan 2026   Deviance:                   1.8408e+05
Time:                        15:10:58   Pearson chi2:                 1.38e+05
No. Iterations:                     4   Pseudo R-squ. (CS):            0.04366
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------

In [24]:
# === STEP 4: FULL MODEL WITH ALL FEATURES ===
from scipy import stats
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

print("="*70)
print("STEP 4: FULL MODEL - ALL FEATURES")
print("="*70)

# Add constant to scaled data
X_train_balanced_scaled_const = sm.add_constant(X_train_balanced_scaled)
X_test_scaled_const = sm.add_constant(X_test_scaled, has_constant='add')

# Per evitare altri ValueError, assicurati che l'ordine delle colonne sia IDENTICO al train
X_test_scaled_const = X_test_scaled_const[X_train_balanced_scaled_const.columns]

# Fit full model on balanced and scaled training data
full_model = sm.GLM(y_train_balanced, X_train_balanced_scaled_const, 
                    family=sm.families.Binomial()).fit()

print(f"Features nel modello (training): {full_model.params.index.tolist()}")
print(f"Features nel dataset di test: {X_test_scaled_const.columns.tolist()}")
print(full_model.summary())

# Predictions using threshold = 0.5
threshold = 0.5
y_pred_prob_full = full_model.predict(X_test_scaled_const)
y_pred_full = (y_pred_prob_full >= threshold).astype(int)

print("\n" + "="*70)
print("MODEL COMPARISON: BASELINE vs FULL MODEL")
print("="*70)

# Cross-Entropy Loss
baseline_cross_entropy = log_loss(y_test, y_pred_prob_baseline)
full_cross_entropy = log_loss(y_test, y_pred_prob_full)

# Create comparison dataframe
comparison_data = {
    'Metric': ['AIC', 'BIC', 'Log-Likelihood', 'Cross-Entropy Loss', 'Features Used'],
    'Baseline (LogLeadTime)': [
        f"{baseline_model.aic:.2f}",
        f"{baseline_model.bic:.2f}",
        f"{baseline_model.llf:.2f}",
        f"{baseline_cross_entropy:.4f}",
        "1"
    ],
    'Full Model (All)': [
        f"{full_model.aic:.2f}",
        f"{full_model.bic:.2f}",
        f"{full_model.llf:.2f}",
        f"{full_cross_entropy:.4f}",
        f"{len(X_train_balanced_scaled.columns)}"
    ]
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))


# Performance comparison
print(f"{'Metric':<15} {'Baseline':<15} {'Full Model':<15}")
print("-"*45)

baseline_acc = accuracy_score(y_test, y_pred_baseline)
full_acc = accuracy_score(y_test, y_pred_full)
print(f"{'Accuracy':<15} {baseline_acc:<15.4f} {full_acc:<15.4f}")

baseline_prec = precision_score(y_test, y_pred_baseline, zero_division=0)
full_prec = precision_score(y_test, y_pred_full, zero_division=0)
print(f"{'Precision':<15} {baseline_prec:<15.4f} {full_prec:<15.4f}")

baseline_rec = recall_score(y_test, y_pred_baseline, zero_division=0)
full_rec = recall_score(y_test, y_pred_full, zero_division=0)
print(f"{'Recall':<15} {baseline_rec:<15.4f} {full_rec:<15.4f}")

baseline_f1 = f1_score(y_test, y_pred_baseline, zero_division=0)
full_f1 = f1_score(y_test, y_pred_full, zero_division=0)
print(f"{'F1-Score':<15} {baseline_f1:<15.4f} {full_f1:<15.4f}")

baseline_auc = roc_auc_score(y_test, y_pred_prob_baseline)
full_auc = roc_auc_score(y_test, y_pred_prob_full)
print(f"{'ROC-AUC':<15} {baseline_auc:<15.4f} {full_auc:<15.4f}")

STEP 4: FULL MODEL - ALL FEATURES
Features nel modello (training): ['const', 'TripReason', 'LeadTime_Days', 'Vehicle_Plane', 'Vehicle_Train', 'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'Route_Rate', 'User_Rate']
Features nel dataset di test: ['const', 'TripReason', 'LeadTime_Days', 'Vehicle_Plane', 'Vehicle_Train', 'TimeOfDay_Afternoon', 'TimeOfDay_Evening', 'TimeOfDay_Morning', 'Route_Rate', 'User_Rate']
                 Generalized Linear Model Regression Results                  
Dep. Variable:                 Cancel   No. Observations:               137200
Model:                            GLM   Df Residuals:                   137190
Model Family:                Binomial   Df Model:                            9
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -89266.
Date:                Tue, 27 Jan 2026   Deviance:                   1.7853e+05
Time:           



test of mixture of variables

In [25]:
print("="*70)
print("STEP 4: BASEmy")
print("="*70)

def mixture_model_create_and_compare(X_train_balanced_scaled,X_test_scaled,y_train_balanced, y_test, y_pred_prob_baseline,baseline_model, y_pred_baseline, columns):
    # Add constant to scaled data
    X_train_balanced_scaled_const = sm.add_constant(X_train_balanced_scaled)
    X_test_scaled_const = sm.add_constant(X_test_scaled, has_constant='add')

    # Per evitare altri ValueError, assicurati che l'ordine delle colonne sia IDENTICO al train
    X_test_scaled_const = X_test_scaled_const[X_train_balanced_scaled_const.columns]

    # Fit full model on balanced and scaled training data
    full_model = sm.GLM(y_train_balanced, X_train_balanced_scaled_const, 
                        family=sm.families.Binomial()).fit()

    print(f"Features nel modello (training): {full_model.params.index.tolist()}")
    print(f"Features nel dataset di test: {X_test_scaled_const.columns.tolist()}")
    print(full_model.summary())

    # Predictions using threshold = 0.5
    threshold = 0.5
    y_pred_prob_full = full_model.predict(X_test_scaled_const)
    y_pred_full = (y_pred_prob_full >= threshold).astype(int)

    print("\n" + "="*70)
    print("MODEL COMPARISON: BASELINE vs FULL MODEL")
    print("="*70)

    # Cross-Entropy Loss
    baseline_cross_entropy = log_loss(y_test, y_pred_prob_baseline)
    full_cross_entropy = log_loss(y_test, y_pred_prob_full)

    # Create comparison dataframe
    comparison_data = {
        'Metric': ['AIC', 'BIC', 'Log-Likelihood', 'Cross-Entropy Loss', 'Features Used'],
        'Baseline (LogLeadTime)': [
            f"{baseline_model.aic:.2f}",
            f"{baseline_model.bic:.2f}",
            f"{baseline_model.llf:.2f}",
            f"{baseline_cross_entropy:.4f}",
            "1"
        ],
        'Full Model (All)': [
            f"{full_model.aic:.2f}",
            f"{full_model.bic:.2f}",
            f"{full_model.llf:.2f}",
            f"{full_cross_entropy:.4f}",
            f"{len(X_train_balanced_scaled.columns)}"
        ]
    }

    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))


    # Performance comparison
    print(f"{'Metric':<15} {'Baseline':<15} {'Full Model':<15}")
    print("-"*45)

    baseline_acc = accuracy_score(y_test, y_pred_baseline)
    full_acc = accuracy_score(y_test, y_pred_full)
    print(f"{'Accuracy':<15} {baseline_acc:<15.4f} {full_acc:<15.4f}")

    baseline_prec = precision_score(y_test, y_pred_baseline, zero_division=0)
    full_prec = precision_score(y_test, y_pred_full, zero_division=0)
    print(f"{'Precision':<15} {baseline_prec:<15.4f} {full_prec:<15.4f}")

    baseline_rec = recall_score(y_test, y_pred_baseline, zero_division=0)
    full_rec = recall_score(y_test, y_pred_full, zero_division=0)
    print(f"{'Recall':<15} {baseline_rec:<15.4f} {full_rec:<15.4f}")

    baseline_f1 = f1_score(y_test, y_pred_baseline, zero_division=0)
    full_f1 = f1_score(y_test, y_pred_full, zero_division=0)
    print(f"{'F1-Score':<15} {baseline_f1:<15.4f} {full_f1:<15.4f}")

    baseline_auc = roc_auc_score(y_test, y_pred_prob_baseline)
    full_auc = roc_auc_score(y_test, y_pred_prob_full)
    print(f"{'ROC-AUC':<15} {baseline_auc:<15.4f} {full_auc:<15.4f}")
    return full_model

STEP 4: BASEmy
