In [2]:
import os
from itertools import product
from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the training CSV and discard its 'Unnamed: 0' column, then preview
# the first rows.
df_train = pd.read_csv('../data/train.csv').drop(columns=['Unnamed: 0'])
df_train.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [4]:
# Fraction of missing values per column (0.0 means no missing entries).
df_train.isna().mean()

id                                   0.000000
Gender                               0.000000
Customer Type                        0.000000
Age                                  0.000000
Type of Travel                       0.000000
Class                                0.000000
Flight Distance                      0.000000
Inflight wifi service                0.000000
Departure/Arrival time convenient    0.000000
Ease of Online booking               0.000000
Gate location                        0.000000
Food and drink                       0.000000
Online boarding                      0.000000
Seat comfort                         0.000000
Inflight entertainment               0.000000
On-board service                     0.000000
Leg room service                     0.000000
Baggage handling                     0.000000
Checkin service                      0.000000
Inflight service                     0.000000
Cleanliness                          0.000000
Departure Delay in Minutes        

In [5]:
# Column labels of the training frame.
df_train.columns

Index(['id', 'Gender', 'Customer Type', 'Age', 'Type of Travel', 'Class',
       'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [6]:
# Median of 'Flight Distance' over the loaded training frame. preprocess_data()
# reads this module-level constant to build its 'Long Flight' flag, so this
# cell has to run before that function is called.
FLIGHT_DISTANCE_MEDIAN = df_train['Flight Distance'].median()
print(f"{FLIGHT_DISTANCE_MEDIAN=}")

FLIGHT_DISTANCE_MEDIAN=np.float64(843.0)


In [7]:
def preprocess_data(
    df: pd.DataFrame,
    flight_distance_median: Optional[float] = None,
) -> pd.DataFrame:
    """Return a copy of ``df`` with the id removed and engineered features added.

    Steps, in order:
      * drop the ``id`` column when present;
      * cast the categorical columns that are present to ``str`` (the training
        function hands these columns to CatBoost as categorical features);
      * add ``Total Delay``, ``Is Delayed`` and ``Long Flight``.

    Args:
        df: Frame containing at least 'Departure Delay in Minutes',
            'Arrival Delay in Minutes' and 'Flight Distance'. It is not
            modified; all work happens on a copy.
        flight_distance_median: Cut-off used for ``Long Flight``. When left
            as ``None`` the notebook-level ``FLIGHT_DISTANCE_MEDIAN`` constant
            is used, which is the original behaviour.

    Returns:
        A new DataFrame with the three feature columns appended.

    Raises:
        KeyError: if one of the required delay/distance columns is missing.
    """
    df = df.copy()

    if 'id' in df.columns:
        df = df.drop(columns=['id'])

    cat_cols = ['Gender', 'Customer Type', 'Type of Travel', 'Class']
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].astype(str)

    departure_delay = df['Departure Delay in Minutes']
    arrival_delay = df['Arrival Delay in Minutes']

    # Plain addition: the total stays NaN when either component is missing,
    # so no delay value is invented for the model.
    df['Total Delay'] = departure_delay + arrival_delay

    # The flag must not inherit that NaN: `NaN > 0` is False, which would mark
    # a flight with a positive departure delay and an unknown arrival delay as
    # "not delayed". Sum only the known components instead. For rows without
    # missing values this is identical to `Total Delay > 0`.
    known_delay = departure_delay.fillna(0) + arrival_delay.fillna(0)
    df['Is Delayed'] = (known_delay > 0).astype(int)

    if flight_distance_median is None:
        # Fall back to the constant computed in an earlier notebook cell.
        flight_distance_median = FLIGHT_DISTANCE_MEDIAN
    df['Long Flight'] = (df['Flight Distance'] > flight_distance_median).astype(int)

    return df

In [8]:
def train_and_evaluate_catboost(
    df: pd.DataFrame,
    target_col: str = 'satisfaction',
    categorical_features: list = ['Gender', 'Customer Type', 'Type of Travel', 'Class'],
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple['CatBoostClassifier', Dict[str, float], pd.DataFrame, pd.Series]:
    """Grid-search a CatBoost classifier and score the winner on a hold-out split.

    The frame is passed through ``preprocess_data``, the target is encoded as
    0/1 and a stratified train/validation split is made. Every combination in
    the (learning rate, depth, iterations) grid is fitted with early stopping
    on the validation split; the candidate with the highest validation F1 is
    kept (the first one wins on ties).

    Note: the validation split is used for early stopping, for choosing the
    hyper-parameters and for the returned metrics, so those metrics are
    optimistic rather than an untouched test estimate.

    Args:
        df: Raw frame including ``target_col``.
        target_col: Name of the label column; its values must be
            'neutral or dissatisfied' or 'satisfied'.
        categorical_features: Column names treated as categorical by CatBoost.
            Names absent from the feature frame are ignored. The default list
            is only read, never modified.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for the split and for CatBoost.

    Returns:
        ``(model, metrics, X_val, y_val)`` where ``metrics`` maps 'accuracy',
        'precision', 'recall', 'f1' and 'roc_auc' to validation scores.

    Raises:
        ValueError: if ``target_col`` contains a value (or a missing entry)
            that is not one of the two known labels.
    """
    df_clean = preprocess_data(df)

    label_to_int = {'neutral or dissatisfied': 0, 'satisfied': 1}
    y = df_clean[target_col].map(label_to_int)
    # Series.map turns every unknown label into NaN; fail loudly here instead
    # of handing NaN targets to the stratified split and the classifier.
    unmapped = y.isna()
    if unmapped.any():
        bad_labels = df_clean.loc[unmapped, target_col].unique().tolist()
        raise ValueError(
            f"Unexpected values in target column {target_col!r}: {bad_labels}"
        )
    X = df_clean.drop(columns=[target_col])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # CatBoost receives the categorical columns as positional indices.
    cat_features = [X.columns.get_loc(c) for c in categorical_features if c in X.columns]

    param_grid = {
        'learning_rate': [0.03, 0.1, 0.15],
        'depth': [4, 6],
        'iterations': [100, 500]
    }

    # Start below any reachable F1 so that a model is always selected, even
    # when every candidate scores exactly 0.
    best_score = float('-inf')
    best_params = {}
    best_model = None

    # product() walks the grid in the same order as three nested loops.
    for lr, depth, iters in product(
        param_grid['learning_rate'], param_grid['depth'], param_grid['iterations']
    ):
        model_candidate = CatBoostClassifier(
            iterations=iters,
            learning_rate=lr,
            depth=depth,
            eval_metric='F1',
            early_stopping_rounds=50,
            random_seed=random_state,
            verbose=False
        )
        model_candidate.fit(
            X_train, y_train,
            cat_features=cat_features,
            eval_set=(X_val, y_val),
            use_best_model=True
        )
        f1 = f1_score(y_val, model_candidate.predict(X_val))

        print(f"{lr=}, {depth=}, {iters=}; f1={f1:.3f}")

        if f1 > best_score:
            best_score = f1
            # 'iterations' is the configured budget; early stopping may have
            # kept fewer trees in the fitted model.
            best_params = {'learning_rate': lr, 'depth': depth, 'iterations': iters}
            best_model = model_candidate

    print(f"Best params: {best_params} (F1 = {best_score:.4f})")
    model = best_model

    y_pred = model.predict(X_val)
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    y_pred_proba = model.predict_proba(X_val)[:, 1]

    metrics = {
        'accuracy': accuracy_score(y_val, y_pred),
        'precision': precision_score(y_val, y_pred),
        'recall': recall_score(y_val, y_pred),
        'f1': f1_score(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, y_pred_proba)
    }

    return model, metrics, X_val, y_val

In [9]:
# Load data: re-read the training CSV and discard its 'Unnamed: 0' column
df_train = pd.read_csv('../data/train.csv').drop(columns=['Unnamed: 0'])

# Train model: run the grid search and keep the winner plus its validation split
model, metrics, X_val, y_val = train_and_evaluate_catboost(df=df_train)

lr=0.03, depth=4, iters=100; f1=0.930
lr=0.03, depth=4, iters=500; f1=0.949
lr=0.03, depth=6, iters=100; f1=0.942
lr=0.03, depth=6, iters=500; f1=0.957
lr=0.1, depth=4, iters=100; f1=0.945
lr=0.1, depth=4, iters=500; f1=0.956
lr=0.1, depth=6, iters=100; f1=0.955
lr=0.1, depth=6, iters=500; f1=0.960
lr=0.15, depth=4, iters=100; f1=0.950
lr=0.15, depth=4, iters=500; f1=0.958
lr=0.15, depth=6, iters=100; f1=0.957
lr=0.15, depth=6, iters=500; f1=0.960
Best params: {'learning_rate': 0.1, 'depth': 6, 'iterations': 500} (F1 = 0.9597)


In [10]:
def find_optimal_threshold(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    thresholds: Optional[np.ndarray] = None,
) -> Tuple[float, float]:
    """Pick the probability cut-off that maximises F1 on the given labels.

    Args:
        y_true: Ground-truth binary labels (0/1).
        y_proba: Predicted probability of the positive class, aligned with
            ``y_true``. Any array-like is accepted.
        thresholds: Candidate cut-offs to try, in order. Defaults to
            0.01, 0.02, ..., 0.99.

    Returns:
        ``(best_threshold, best_f1)`` as plain floats. The first threshold
        reaching the maximum wins on ties. If no candidate yields an F1 above
        0, ``(0.5, 0.0)`` is returned.
    """
    if thresholds is None:
        # linspace fixes both end points and the number of candidates; NumPy
        # advises against arange with a non-integer step because its length
        # and last value are subject to floating-point rounding.
        thresholds = np.linspace(0.01, 0.99, 99)

    # Accept lists / pandas Series as well as ndarrays for the comparison below.
    y_proba = np.asarray(y_proba)

    best_threshold = 0.5
    best_f1 = 0.0

    for th in thresholds:
        y_pred = (y_proba >= th).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            # Store built-in floats so the return value matches the annotation.
            best_f1 = float(f1)
            best_threshold = float(th)

    return best_threshold, best_f1

In [11]:
# Probability of the class encoded as 1 for every validation row.
y_val_proba = model.predict_proba(X_val)[:, 1]

# Best cut-off on the validation split, next to the model's default prediction.
opt_th, opt_f1 = find_optimal_threshold(y_val, y_val_proba)
default_f1 = f1_score(y_val, model.predict(X_val))

print(f"Optimal threshold: {opt_th:.3f}")
print(f"F1 at optimal threshold: {opt_f1:.6f}")
print(f"F1 at default threshold (0.5): {default_f1:.6f}")

Optimal threshold: 0.480
F1 at optimal threshold: 0.959708
F1 at default threshold (0.5): 0.959703


In [12]:
model_path = '../model/catboost_airline_satisfaction.cbm'

# Create the target folder if it is missing so saving also works on a fresh
# checkout where '../model' has not been created yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save_model(model_path)

print(f"✅ Model saved to: {model_path}")

✅ Model saved to: ../model/catboost_airline_satisfaction.cbm
