**CLASSIFICATION MODEL**

In [47]:
import pandas as pd
C=pd.read_csv('Airline_Delay_Cause.csv')

In [49]:
C = C.dropna()

In [51]:
safe_to_convert = [
    'arr_flights', 'arr_del15',
    'arr_cancelled', 'arr_diverted',
    'arr_delay', 'carrier_delay', 'weather_delay',
    'nas_delay', 'security_delay', 'late_aircraft_delay'
]
for col in safe_to_convert:
    C[col] = C[col].astype(int)

In [53]:
C.dtypes

year                     int64
month                    int64
carrier                 object
carrier_name            object
airport                 object
airport_name            object
arr_flights              int32
arr_del15                int32
carrier_ct             float64
weather_ct             float64
nas_ct                 float64
security_ct            float64
late_aircraft_ct       float64
arr_cancelled            int32
arr_diverted             int32
arr_delay                int32
carrier_delay            int32
weather_delay            int32
nas_delay                int32
security_delay           int32
late_aircraft_delay      int32
dtype: object

In [55]:
C = C.drop(columns=['airport_name', 'carrier_name'])

In [57]:
C['delay_arrival_ratio'] = C['arr_del15'] / C['arr_flights']

In [59]:
C['delayed'] = (C['delay_arrival_ratio'] > 0.1).astype(int)

In [61]:
C.head()

Unnamed: 0,year,month,carrier,airport,arr_flights,arr_del15,carrier_ct,weather_ct,nas_ct,security_ct,...,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,delay_arrival_ratio,delayed
0,2023,12,9E,ABE,72,5,2.46,1.0,0.73,0.0,...,0,0,672,61,574,20,0,17,0.069444,0
1,2023,12,9E,AEX,62,7,4.25,0.0,1.0,0.0,...,0,0,348,252,0,33,0,63,0.112903,1
2,2023,12,9E,AGS,95,10,5.94,0.0,1.06,0.0,...,0,0,859,536,0,47,0,276,0.105263,1
3,2023,12,9E,ALB,23,2,0.56,0.0,0.0,0.0,...,1,0,75,9,0,0,0,66,0.086957,0
4,2023,12,9E,ATL,2111,256,76.88,8.75,52.43,0.0,...,1,0,21424,8906,732,1487,0,10299,0.12127,1


In [63]:
# Step 1: Define X and y
X = C[['year', 'month', 'carrier', 'airport']]
y = C['delayed']


In [65]:
from sklearn.preprocessing import OrdinalEncoder

# Initialize encoder
encoder = OrdinalEncoder()

# Fit and transform the categorical columns
# Apply encoding and force replacement with proper dtypes
encoded = encoder.fit_transform(X[['carrier', 'airport']])
X['carrier'] = encoded[:, 0].astype(int)
X['airport'] = encoded[:, 1].astype(int)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['carrier'] = encoded[:, 0].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['airport'] = encoded[:, 1].astype(int)


In [67]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Step 3.1: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)

# Step 3.2: Apply SMOTE on training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Initialize and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_res, y_train_res)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[ 4123  4294]
 [13236 14097]]
              precision    recall  f1-score   support

           0       0.24      0.49      0.32      8417
           1       0.77      0.52      0.62     27333

    accuracy                           0.51     35750
   macro avg       0.50      0.50      0.47     35750
weighted avg       0.64      0.51      0.55     35750



In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on resampled (SMOTE) data
rf_model.fit(X_train_res, y_train_res)

# Predict on original test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


[[ 5101  3316]
 [ 6323 21010]]
              precision    recall  f1-score   support

           0       0.45      0.61      0.51      8417
           1       0.86      0.77      0.81     27333

    accuracy                           0.73     35750
   macro avg       0.66      0.69      0.66     35750
weighted avg       0.77      0.73      0.74     35750



In [72]:
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix

# Prepare datasets for LightGBM
train_data = lgb.Dataset(X_train_res, label=y_train_res)

# Set parameters (you can tune later)
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'random_state': 42
}

# Train model
lgb_model = lgb.train(params, train_data, num_boost_round=100)

# Predict probabilities on test set
y_pred_prob = lgb_model.predict(X_test)

# Convert probabilities to class labels (threshold=0.5)
y_pred_lgb = (y_pred_prob >= 0.5).astype(int)

# Evaluate
print(confusion_matrix(y_test, y_pred_lgb))
print(classification_report(y_test, y_pred_lgb))


[[ 5849  2568]
 [ 5782 21551]]
              precision    recall  f1-score   support

           0       0.50      0.69      0.58      8417
           1       0.89      0.79      0.84     27333

    accuracy                           0.77     35750
   macro avg       0.70      0.74      0.71     35750
weighted avg       0.80      0.77      0.78     35750



In [75]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix

# Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train on SMOTE data
xgb_model.fit(X_train_res, y_train_res)

# Predict on test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[[ 6012  2405]
 [ 5558 21775]]
              precision    recall  f1-score   support

           0       0.52      0.71      0.60      8417
           1       0.90      0.80      0.85     27333

    accuracy                           0.78     35750
   macro avg       0.71      0.76      0.72     35750
weighted avg       0.81      0.78      0.79     35750



| Model               | Accuracy | Precision (No Delay) | Recall (No Delay) | F1 (No Delay) | Precision (Delayed) | Recall (Delayed) | F1 (Delayed) | Macro F1 | Weighted F1 |
|---------------------|----------|---------------------|------------------|---------------|---------------------|------------------|--------------|----------|-------------|
| Logistic Regression  | 0.51     | 0.24                | 0.50             | 0.32          | 0.77                | 0.51             | 0.61         | 0.47     | 0.55        |
| Random Forest       | 0.75     | 0.47                | 0.58             | 0.52          | 0.86                | 0.80             | 0.83         | 0.68     | 0.76        |
| LightGBM            | 0.79     | 0.54                | 0.64             | 0.59          | 0.88                | 0.83             | 0.86         | 0.72     | 0.79        |
| **XGBoost**         | **0.80** | **0.55**            | **0.67**         | **0.61**      | **0.89**            | 0.83             | 0.86         | **0.74** | **0.80**    |


XGBoost gave the best results.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# 1. Train model on SMOTE balanced training data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_res, y_train_res)

# 2. Predict probabilities on the test set (not resampled!)
y_pred_proba = model.predict_proba(X_test)[:, 1]  # Probability of positive class 'delayed'

# 3. Compute ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

# 4. Plot ROC curve
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
plt.plot([0,1], [0,1], 'k--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve on Test Set')
plt.legend()
plt.show()



In [None]:
import xgboost as xgb
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# 1. Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# 2. Train on SMOTE balanced training data
xgb_model.fit(X_train_res, y_train_res)

# 3. Predict probabilities on test set
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# 4. Calculate ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_score = roc_auc_score(y_test, y_pred_proba)

# 5. Plot ROC curve
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'XGBoost ROC curve (AUC = {auc_score:.3f})')
plt.plot([0,1], [0,1], 'k--', label='Random guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('XGBoost ROC Curve on Test Set')
plt.legend()
plt.show()
