In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import sklearn
import os
from sklearn.model_selection import train_test_split

In [2]:
def convert_to_categorical(time_str):
    hour = int(time_str.split(':')[0])
    
    if 0 <= hour < 3:
        return 'Late Night'
    elif 3 <= hour < 6:
        return 'Early Morning'
    elif 6 <= hour < 9:
        return 'Morning'
    elif 9 <= hour < 12:
        return 'Late Morning'
    elif 12 <= hour < 15:
        return 'Noon'
    elif 15 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# 1 HOP

In [3]:
data = pd.read_csv("1_HOP.csv")

In [4]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,LATE
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,ONTIME,1,1,winter,Friday,LATE
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,LATE,1,1,winter,Friday,ONTIME
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,LATE,1,1,winter,Friday,LATE


In [5]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [6]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME,Late Night
1,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,LATE,Morning
2,MQ,2010-01-01,4094,N610MQ,ORD,11:20,100,ONTIME,1,1,winter,Friday,LATE,Late Morning
3,9E,2010-01-01,3818,89289E,DTW,11:44,84,LATE,1,1,winter,Friday,ONTIME,Late Morning
4,B6,2010-01-01,42,N586JB,JFK,11:52,71,LATE,1,1,winter,Friday,LATE,Late Morning


In [7]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [8]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,LATE,1,1,winter,Friday,ONTIME,Late Night
1,B6,JFK,75,LATE,1,1,winter,Friday,LATE,Morning
2,MQ,ORD,100,ONTIME,1,1,winter,Friday,LATE,Late Morning
3,9E,DTW,84,LATE,1,1,winter,Friday,ONTIME,Late Morning
4,B6,JFK,71,LATE,1,1,winter,Friday,LATE,Late Morning


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        encoded_data = self.encoder.transform(X[self.columns])
        encoded_df = pd.DataFrame(encoded_data, columns=self.encoder.get_feature_names_out(self.columns), index=X.index)
        
        # Drop the original columns
        X_encoded = X_encoded.drop(columns=self.columns)
        
        # Concatenate the encoded DataFrame with the original DataFrame, preserving the index
        X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        return X_encoded

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [10]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'day', 'season', 'WeekDay', 'PREV_STAT',
       'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [11]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     52240
LATE      34106
ONTIME    27325
Name: count, dtype: int64

In [12]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month','PREV_STAT', 'WeekDay'])

In [13]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS']))

In [14]:
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS']
)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()


enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [17]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),day,Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,...,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
109767,0.420191,0.718113,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,2.914831,-0.226431,...,-0.30596,-0.300294,1.526678,-0.561416,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
21666,-1.129327,-1.678529,-0.237358,-0.448859,-0.351464,2.899624,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.30596,-0.300294,1.526678,-0.561416,-0.417338,-0.370459,2.449914,-0.420982,-0.409240,-0.411655
107873,2.104451,-1.336151,-0.237358,2.227871,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.30596,-0.300294,1.526678,-0.561416,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
20188,0.049654,-1.107900,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,2.914831,-0.226431,...,-0.30596,-0.300294,-0.655017,1.781212,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
96511,-0.624050,1.288742,-0.237358,-0.448859,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,4.416353,...,-0.30596,-0.300294,1.526678,-0.561416,-0.417338,-0.370459,-0.408178,-0.420982,2.443552,-0.411655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56030,1.969710,1.516994,-0.237358,2.227871,-0.351464,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.30596,-0.300294,-0.655017,-0.561416,-0.417338,2.699351,-0.408178,-0.420982,-0.409240,-0.411655
22547,1.228636,0.375736,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.30596,-0.300294,-0.655017,-0.561416,-0.417338,-0.370459,2.449914,-0.420982,-0.409240,-0.411655
63492,0.790729,-1.336151,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.30596,-0.300294,-0.655017,-0.561416,-0.417338,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655
51736,1.262321,1.745245,-0.237358,-0.448859,2.845245,-0.344872,-0.127516,-0.15405,-0.343073,-0.226431,...,-0.30596,-0.300294,-0.655017,-0.561416,2.396141,-0.370459,-0.408178,-0.420982,-0.409240,-0.411655


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Perform classification using Regularized Logistic Regression for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Logistic Regression model
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results


In [19]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

Epoch 1, change: 1.00000000
Epoch 2, change: 0.40631949
Epoch 3, change: 0.25617942
Epoch 4, change: 0.17130056
Epoch 5, change: 0.13842040
Epoch 6, change: 0.10927224
Epoch 7, change: 0.09148050
Epoch 8, change: 0.07978914
Epoch 9, change: 0.07003899
Epoch 10, change: 0.06163186
Epoch 11, change: 0.05534555
Epoch 12, change: 0.04983306
Epoch 13, change: 0.04531318
Epoch 14, change: 0.04140333
Epoch 15, change: 0.03795715
Epoch 16, change: 0.03508635
Epoch 17, change: 0.03261564
Epoch 18, change: 0.03020416
Epoch 19, change: 0.02821778
Epoch 20, change: 0.02643043
Epoch 21, change: 0.02469567
Epoch 22, change: 0.02327157
Epoch 23, change: 0.02187227
Epoch 24, change: 0.02063510
Epoch 25, change: 0.01952582
Epoch 26, change: 0.01839472
Epoch 27, change: 0.01749525
Epoch 28, change: 0.01659937
Epoch 29, change: 0.01571208
Epoch 30, change: 0.01492517
Epoch 31, change: 0.01420785
Epoch 32, change: 0.01354653
Epoch 33, change: 0.01290001
Epoch 34, change: 0.01231182
Epoch 35, change: 0.011

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Perform classification using Decision Trees for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results

In [21]:
decision_tree_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4087970090169342,
 'classification_report': {'EARLY': {'precision': 0.5222134073780246,
   'recall': 0.5040199081163859,
   'f1-score': 0.512955386713423,
   'support': 10448.0},
  'LATE': {'precision': 0.36149685718462216,
   'recall': 0.3625036646144826,
   'f1-score': 0.3619995608577911,
   'support': 6822.0},
  'ONTIME': {'precision': 0.2676419965576592,
   'recall': 0.2845379688929552,
   'f1-score': 0.27583148558758314,
   'support': 5465.0},
  'accuracy': 0.4087970090169342,
  'macro avg': {'precision': 0.383784087040102,
   'recall': 0.38368718054127454,
   'f1-score': 0.3835954777195991,
   'support': 22735.0},
  'weighted avg': {'precision': 0.41279440295521,
   'recall': 0.4087970090169342,
   'f1-score': 0.4106592458011804,
   'support': 22735.0}}}

In [22]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    model.fit(trainX, trainY_encoded)
    testY_pred = model.predict(testX)
    accuracy = accuracy_score(testY_encoded, testY_pred)
    report = classification_report(testY_encoded, testY_pred, output_dict=True)
    results = {'accuracy': accuracy, 'classification_report': report}
    return results

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    model = SVC(kernel=kernel, C=C)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    model = GaussianNB()
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)


In [23]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4573125137453266,
 'classification_report': {'0': {'precision': 0.5368553261637491,
   'recall': 0.6545750382848392,
   'f1-score': 0.5898995126579549,
   'support': 10448.0},
  '1': {'precision': 0.39609120521172636,
   'recall': 0.35649369686309,
   'f1-score': 0.3752507329115877,
   'support': 6822.0},
  '2': {'precision': 0.2920124481327801,
   'recall': 0.2060384263494968,
   'f1-score': 0.24160497800665165,
   'support': 5465.0},
  'accuracy': 0.4573125137453266,
  'macro avg': {'precision': 0.4083196598360852,
   'recall': 0.4057023871658087,
   'f1-score': 0.40225174119206475,
   'support': 22735.0},
  'weighted avg': {'precision': 0.43576189482115196,
   'recall': 0.4573125137453266,
   'f1-score': 0.4417682785563895,
   'support': 22735.0}}}

In [24]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.44908731031449306,
 'classification_report': {'0': {'precision': 0.5044393970679331,
   'recall': 0.7014739663093414,
   'f1-score': 0.5868599111182288,
   'support': 10448.0},
  '1': {'precision': 0.3835780129737112,
   'recall': 0.32937554969217236,
   'f1-score': 0.35441640378548894,
   'support': 6822.0},
  '2': {'precision': 0.27001703577512776,
   'recall': 0.11601097895699909,
   'f1-score': 0.16229361320875463,
   'support': 5465.0},
  'accuracy': 0.44908731031449306,
  'macro avg': {'precision': 0.386011481938924,
   'recall': 0.38228683165283767,
   'f1-score': 0.36785664270415747,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4118229657173299,
   'recall': 0.44908731031449306,
   'f1-score': 0.41505501008021567,
   'support': 22735.0}}}

In [25]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.49566747305915987,
 'classification_report': {'0': {'precision': 0.5042281006071119,
   'recall': 0.8903139356814701,
   'f1-score': 0.6438261351052049,
   'support': 10448.0},
  '1': {'precision': 0.46487804878048783,
   'recall': 0.2793902081501026,
   'f1-score': 0.34902032594762866,
   'support': 6822.0},
  '2': {'precision': 0.32620320855614976,
   'recall': 0.011161939615736504,
   'f1-score': 0.021585279547062988,
   'support': 5465.0},
  'accuracy': 0.49566747305915987,
  'macro avg': {'precision': 0.43176978598124977,
   'recall': 0.39362202781576977,
   'f1-score': 0.33814391353329887,
   'support': 22735.0},
  'weighted avg': {'precision': 0.44962717302322197,
   'recall': 0.49566747305915987,
   'f1-score': 0.4057917605418343,
   'support': 22735.0}}}

In [26]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.3308115240818122,
 'classification_report': {'0': {'precision': 0.5743099787685775,
   'recall': 0.10356049004594181,
   'f1-score': 0.17547843010055142,
   'support': 10448.0},
  '1': {'precision': 0.308841963166522,
   'recall': 0.9390208150102609,
   'f1-score': 0.4648091713829633,
   'support': 6822.0},
  '2': {'precision': 0.30275229357798167,
   'recall': 0.006038426349496798,
   'f1-score': 0.011840688912809472,
   'support': 5465.0},
  'accuracy': 0.3308115240818122,
  'macro avg': {'precision': 0.3953014118376937,
   'recall': 0.34953991046856653,
   'f1-score': 0.2173760967987747,
   'support': 22735.0},
  'weighted avg': {'precision': 0.42937549220584037,
   'recall': 0.3308115240818122,
   'f1-score': 0.22296178446332265,
   'support': 22735.0}}}

In [27]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.49056520782933805,
 'classification_report': {'0': {'precision': 0.5067747604739498,
   'recall': 0.8555704441041347,
   'f1-score': 0.6365222344857051,
   'support': 10448.0},
  '1': {'precision': 0.43637093536732235,
   'recall': 0.31867487540310757,
   'f1-score': 0.36834971196204674,
   'support': 6822.0},
  '2': {'precision': 0.3508771929824561,
   'recall': 0.007319304666056725,
   'f1-score': 0.01433948736332676,
   'support': 5465.0},
  'accuracy': 0.49056520782933805,
  'macro avg': {'precision': 0.43134096294124274,
   'recall': 0.39385487472443304,
   'f1-score': 0.3397371446036928,
   'support': 22735.0},
  'weighted avg': {'precision': 0.44817458008167244,
   'recall': 0.49056520782933805,
   'f1-score': 0.40649357111732176,
   'support': 22735.0}}}

In [28]:
xgboost_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4930283703540796,
 'classification_report': {'0': {'precision': 0.4992343032159265,
   'recall': 0.9048621745788668,
   'f1-score': 0.6434575463671941,
   'support': 10448.0},
  '1': {'precision': 0.4681493684788578,
   'recall': 0.24992670771034886,
   'f1-score': 0.32587920489296635,
   'support': 6822.0},
  '2': {'precision': 0.32051282051282054,
   'recall': 0.009149130832570906,
   'f1-score': 0.01779042874933286,
   'support': 5465.0},
  'accuracy': 0.4930283703540796,
  'macro avg': {'precision': 0.4292988307358683,
   'recall': 0.3879793377072622,
   'f1-score': 0.3290423933364978,
   'support': 22735.0},
  'weighted avg': {'precision': 0.4469460108143978,
   'recall': 0.4930283703540796,
   'f1-score': 0.3977663106813004,
   'support': 22735.0}}}

In [29]:
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.40602595117659995, 'classification_report': {'0': {'precision': 0.5186051042275472, 'recall': 0.5095712098009189, 'f1-score': 0.5140484696340639, 'support': 10448.0}, '1': {'precision': 0.34932296168251226, 'recall': 0.3554676048079742, 'f1-score': 0.3523684975297878, 'support': 6822.0}, '2': {'precision': 0.26813823050479463, 'recall': 0.27118023787740164, 'f1-score': 0.26965065502183405, 'support': 5465.0}, 'accuracy': 0.40602595117659995, 'macro avg': {'precision': 0.378688765471618, 'recall': 0.3787396841620982, 'f1-score': 0.3786892073952286, 'support': 22735.0}, 'weighted avg': {'precision': 0.4076024984946652, 'recall': 0.40602595117659995, 'f1-score': 0.4067858865440614, 'support': 22735.0}}}


In [30]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.45581702221244774, 'classification_report': {'0': {'precision': 0.5385886240264338, 'recall': 0.6552450229709035, 'f1-score': 0.591217237359126, 'support': 10448.0}, '1': {'precision': 0.39147974826528964, 'recall': 0.35561418938727646, 'f1-score': 0.37268607419924726, 'support': 6822.0}, '2': {'precision': 0.28507969689051477, 'recall': 0.19963403476669717, 'f1-score': 0.23482565647869136, 'support': 5465.0}, 'accuracy': 0.45581702221244774, 'macro avg': {'precision': 0.4050493563940794, 'recall': 0.40349774904162566, 'f1-score': 0.3995763226790216, 'support': 22735.0}, 'weighted avg': {'precision': 0.4335082177260017, 'recall': 0.45581702221244774, 'f1-score': 0.4399746781073879, 'support': 22735.0}}}


In [31]:
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5111062238838795, 'classification_report': {'0': {'precision': 0.5287295106219303, 'recall': 0.855187595712098, 'f1-score': 0.6534537609244159, 'support': 10448.0}, '1': {'precision': 0.4729894394800975, 'recall': 0.3413954851949575, 'f1-score': 0.39656053124467905, 'support': 6822.0}, '2': {'precision': 0.39035087719298245, 'recall': 0.06514181152790485, 'f1-score': 0.11165124666771209, 'support': 5465.0}, 'accuracy': 0.5111062238838795, 'macro avg': {'precision': 0.4640232757650034, 'recall': 0.42057496414498674, 'f1-score': 0.38722184627893563, 'support': 22735.0}, 'weighted avg': {'precision': 0.47874059498442056, 'recall': 0.5111062238838795, 'f1-score': 0.4461312910195094, 'support': 22735.0}}}


In [32]:
model = KNeighborsClassifier(n_neighbors=70)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.4888058060259512, 'classification_report': {'0': {'precision': 0.5088302150725651, 'recall': 0.8355666156202144, 'f1-score': 0.6324941133852563, 'support': 10448.0}, '1': {'precision': 0.4411764705882353, 'recall': 0.316622691292876, 'f1-score': 0.3686635944700461, 'support': 6822.0}, '2': {'precision': 0.32697947214076245, 'recall': 0.04080512351326624, 'f1-score': 0.07255571823653816, 'support': 5465.0}, 'accuracy': 0.4888058060259512, 'macro avg': {'precision': 0.425662052600521, 'recall': 0.3976648101421189, 'f1-score': 0.3579044753639469, 'support': 22735.0}, 'weighted avg': {'precision': 0.4448166608612434, 'recall': 0.4888058060259512, 'f1-score': 0.41873052730532184, 'support': 22735.0}}}


In [33]:
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)



{'accuracy': 0.4947437871123818, 'classification_report': {'0': {'precision': 0.5113799231783523, 'recall': 0.8537519142419602, 'f1-score': 0.6396328564769997, 'support': 10448.0}, '1': {'precision': 0.44191476941039115, 'recall': 0.33289357959542654, 'f1-score': 0.3797341359418109, 'support': 6822.0}, '2': {'precision': 0.37254901960784315, 'recall': 0.010430009149130833, 'f1-score': 0.02029191883232467, 'support': 5465.0}, 'accuracy': 0.4947437871123818, 'macro avg': {'precision': 0.4419479040655289, 'recall': 0.3990251676621725, 'f1-score': 0.34655297041704514, 'support': 22735.0}, 'weighted avg': {'precision': 0.45716386129060815, 'recall': 0.4947437871123818, 'f1-score': 0.41276998884035104, 'support': 22735.0}}}


In [34]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    # Fit the VotingClassifier
    voting_classifier.fit(trainX, trainY_encoded)
    
    # Make predictions
    testY_pred = voting_classifier.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)
    
    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, target_names=label_encoder.classes_, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'voting_classifier': voting_classifier
    }
    
    return results

In [35]:
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [36]:
from sklearn.ensemble import VotingClassifier

# Define multiple classifiers
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
knn = KNeighborsClassifier(n_neighbors=70)
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,objective='multi:softprob')

votingCLF = VotingClassifier(estimators=[('knn', knn), ('ada', ada), ('xgb', xgb)], voting='soft', weights=[5,7,10])
fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.5072355399164284,
 'classification_report': {'EARLY': {'precision': 0.5204802665594301,
   'recall': 0.8671516079632465,
   'f1-score': 0.650511577813678,
   'support': 10448.0},
  'LATE': {'precision': 0.4760016694490818,
   'recall': 0.33435942538844915,
   'f1-score': 0.3928017909419666,
   'support': 6822.0},
  'ONTIME': {'precision': 0.35634328358208955,
   'recall': 0.03494967978042086,
   'f1-score': 0.06365605732377937,
   'support': 5465.0},
  'accuracy': 0.5072355399164284,
  'macro avg': {'precision': 0.4509417398635338,
   'recall': 0.4121535710440389,
   'f1-score': 0.36898980869314135,
   'support': 22735.0},
  'weighted avg': {'precision': 0.46767878859778667,
   'recall': 0.5072355399164284,
   'f1-score': 0.43211432311756576,
   'support': 22735.0}},
 'voting_classifier': VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=70)),
                              ('ada',
                               AdaBoostClassifier(learning_rate=0.8,
  

# 3 HOP

In [37]:
data = pd.read_csv("3_HOP.csv", parse_dates=["Date (MM/DD/YYYY)"])

In [38]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME
1,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME
2,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME
3,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,LATE
4,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,ONTIME


In [39]:
data['SCHED_ARRV_TIME_CAT'] = data['Scheduled Arrival Time'].apply(convert_to_categorical)

In [40]:
data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME,Late Night
1,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME,Late Night
2,B6,2010-01-01,22,N608JB,JFK,00:01,76,LATE,1,1,winter,Friday,ONTIME,Late Night
3,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,LATE,Morning
4,B6,2010-01-01,44,N586JB,JFK,08:55,75,LATE,1,1,winter,Friday,ONTIME,Morning


In [41]:
data['WeekDay'] = data['Date (MM/DD/YYYY)'].dt.day_name()

In [42]:
df = data.drop(columns=['Date (MM/DD/YYYY)', 'Flight Number','Tail Number', 'Scheduled Arrival Time'])

In [43]:
df.head()

Unnamed: 0,Carrier Code,Origin Airport,Scheduled Elapsed Time (Minutes),FLIGHT_STATUS,month,day,season,WeekDay,PREV_STAT,SCHED_ARRV_TIME_CAT
0,B6,JFK,76,LATE,1,1,winter,Friday,ONTIME,Late Night
1,B6,JFK,76,LATE,1,1,winter,Friday,ONTIME,Late Night
2,B6,JFK,76,LATE,1,1,winter,Friday,ONTIME,Late Night
3,B6,JFK,75,LATE,1,1,winter,Friday,LATE,Morning
4,B6,JFK,75,LATE,1,1,winter,Friday,ONTIME,Morning


In [44]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class MultiColumnOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=False, drop='first')
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        X_encoded = X.copy()
        encoded_data = self.encoder.transform(X[self.columns])
        encoded_df = pd.DataFrame(encoded_data, columns=self.encoder.get_feature_names_out(self.columns), index=X.index)
        
        # Drop the original columns
        X_encoded = X_encoded.drop(columns=self.columns)
        
        # Concatenate the encoded DataFrame with the original DataFrame, preserving the index
        X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        return X_encoded

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [45]:
df.columns

Index(['Carrier Code', 'Origin Airport', 'Scheduled Elapsed Time (Minutes)',
       'FLIGHT_STATUS', 'month', 'day', 'season', 'WeekDay', 'PREV_STAT',
       'SCHED_ARRV_TIME_CAT'],
      dtype='object')

In [46]:
df['FLIGHT_STATUS'].value_counts()

FLIGHT_STATUS
EARLY     156720
LATE      102318
ONTIME     81975
Name: count, dtype: int64

In [47]:
encoder = MultiColumnOneHotEncoder(columns=['Carrier Code', 'Origin Airport','season', 'SCHED_ARRV_TIME_CAT', 'month','PREV_STAT', "WeekDay"])

In [48]:
encoded_data = encoder.fit_transform(df.drop(columns=['FLIGHT_STATUS', 'day']))

In [49]:
trainX, testX, trainY, testY = train_test_split(
    encoded_data,
    df['FLIGHT_STATUS'], 
    test_size=0.2, 
    random_state=947,
    stratify=df['FLIGHT_STATUS'],
    shuffle=True
)

In [50]:
from sklearn.preprocessing import StandardScaler

In [51]:
scaler = StandardScaler()


enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [52]:
enc_trainX

Unnamed: 0,Scheduled Elapsed Time (Minutes),Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,Carrier Code_OO,...,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
19431,-0.015570,-0.239316,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,4.417417,-0.279426,...,-0.306281,-0.300953,1.526073,-0.562751,2.404319,-0.371877,-0.408237,-0.420123,-0.409397,-0.413508
157007,-0.755951,-0.239316,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,3.578767,...,-0.306281,3.322781,-0.655277,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,-0.413508
93387,-0.082877,-0.239316,-0.451807,-0.350329,2.899021,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,1.776985,-0.415918,-0.371877,-0.408237,-0.420123,2.442615,-0.413508
325089,-1.058834,-0.239316,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,1.776985,-0.415918,-0.371877,-0.408237,2.380254,-0.409397,-0.413508
26716,-1.058834,-0.239316,2.213335,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,1.776985,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159518,1.969998,-0.239316,2.213335,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,-0.562751,-0.415918,-0.371877,2.449558,-0.420123,-0.409397,-0.413508
122068,-0.183838,-0.239316,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,2.913087,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,-0.562751,-0.415918,-0.371877,2.449558,-0.420123,-0.409397,-0.413508
194247,-1.126142,-0.239316,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,1.526073,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
99846,-0.890566,-0.239316,-0.451807,-0.350329,2.899021,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,3.264972,-0.300953,1.526073,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,-0.413508


In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def logistic_regression_classification(trainX, trainY, testX, testY, penalty='l2', C=1.0, max_iter=1000, solver='lbfgs'):
    """
    Perform classification using Regularized Logistic Regression for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        penalty (str, optional): Regularization type ('l1' for Lasso, 'l2' for Ridge). Default is 'l2'.
        C (float, optional): Inverse of regularization strength; smaller values specify stronger regularization. Default is 1.0.
        max_iter (int, optional): Maximum number of iterations for optimization algorithm. Default is 1000.
        solver (str, optional): Optimization algorithm to use ('lbfgs', 'sag', 'saga', etc.). Default is 'lbfgs'.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Logistic Regression model
    model = LogisticRegression(penalty=penalty, C=C, max_iter=max_iter, solver=solver, verbose=1 if max_iter > 300 else 0)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results


In [54]:
report = logistic_regression_classification(enc_trainX, trainY, enc_testX, testY, max_iter=1500, solver='saga')
print(report)

convergence after 67 epochs took 26 seconds
{'accuracy': 0.49374660938668385, 'classification_report': {'LATE': {'precision': 0.5174918411207514, 'recall': 0.8296643695763144, 'f1-score': 0.6374086965047306, 'support': 31344.0}, 'ONTIME': {'precision': 0.431535025858016, 'recall': 0.35882525410476934, 'f1-score': 0.39183564567769474, 'support': 20464.0}, 'EARLY': {'precision': 0.3497326203208556, 'recall': 0.019945105215004574, 'f1-score': 0.03773802654356607, 'support': 16395.0}, 'accuracy': 0.49374660938668385, 'macro avg': {'precision': 0.4329198290998743, 'recall': 0.40281157629869613, 'f1-score': 0.3556607895753305, 'support': 68203.0}, 'weighted avg': {'precision': 0.45137403556159844, 'recall': 0.49374660938668385, 'f1-score': 0.4195735932667828, 'support': 68203.0}}}


In [55]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def decision_tree_classification(trainX, trainY, testX, testY, criterion='gini', max_depth=None):
    """
    Perform classification using Decision Trees for three-class prediction.
    
    Parameters:
        trainX (DataFrame or array-like): Features for training.
        trainY (Series or array-like): Target variable for training.
        testX (DataFrame or array-like): Features for testing.
        testY (Series or array-like): Target variable for testing.
        criterion (str, optional): Criterion used to measure the quality of a split ('gini' or 'entropy'). Default is 'gini'.
        max_depth (int, optional): Maximum depth of the tree. If None, the tree is fully grown. Default is None.
        
    Returns:
        dict: Dictionary containing accuracy score and classification report.
    """
    # Get unique target names
    target_names = trainY.unique() if isinstance(trainY, pd.Series) else testY.unique()
    
    # Initialize and train the Decision Tree model
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
    model.fit(trainX, trainY)
    
    # Predict on the testing set
    testY_pred = model.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY, testY_pred)
    
    # Generate classification report
    report = classification_report(testY, testY_pred, target_names=target_names, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report
    }
    
    return results

In [56]:
decision_tree_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5651364309487853,
 'classification_report': {'LATE': {'precision': 0.6162837565497783,
   'recall': 0.731718989280245,
   'f1-score': 0.6690587669015009,
   'support': 31344.0},
  'ONTIME': {'precision': 0.5275328652130299,
   'recall': 0.48827208756841284,
   'f1-score': 0.5071437634817917,
   'support': 20464.0},
  'EARLY': {'precision': 0.46625715945878643,
   'recall': 0.34260445257700517,
   'f1-score': 0.39497925602981504,
   'support': 16395.0},
  'accuracy': 0.5651364309487853,
  'macro avg': {'precision': 0.5366912604071982,
   'recall': 0.520865176475221,
   'f1-score': 0.5237272621377026,
   'support': 68203.0},
  'weighted avg': {'precision': 0.5535902636004648,
   'recall': 0.5651364309487853,
   'f1-score': 0.5545922154195688,
   'support': 68203.0}}}

In [57]:
from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate(model, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    model.fit(trainX, trainY_encoded)
    testY_pred = model.predict(testX)
    accuracy = accuracy_score(testY_encoded, testY_pred)
    report = classification_report(testY_encoded, testY_pred, output_dict=True)
    results = {'accuracy': accuracy, 'classification_report': report}
    return results

# Update other classification functions similarly...


# Random Forest
from sklearn.ensemble import RandomForestClassifier

def random_forest_classification(trainX, trainY, testX, testY, n_estimators=100, criterion='gini', max_depth=None):
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Support Vector Machines (SVM)
from sklearn.svm import SVC

def svm_classification(trainX, trainY, testX, testY, kernel='rbf', C=1.0):
    model = SVC(kernel=kernel, C=C)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

def knn_classification(trainX, trainY, testX, testY, n_neighbors=5):
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Gradient Boosting Machines (GBM)
from sklearn.ensemble import GradientBoostingClassifier

def gbm_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

def naive_bayes_classification(trainX, trainY, testX, testY):
    model = GaussianNB()
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

def adaboost_classification(trainX, trainY, testX, testY, n_estimators=50, learning_rate=1.0):
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)

# XGBoost
from xgboost import XGBClassifier

def xgboost_classification(trainX, trainY, testX, testY, n_estimators=100, learning_rate=0.1, max_depth=3):
    model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
    return fit_and_evaluate(model, trainX, trainY, testX, testY)


In [58]:
random_forest_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5579519962464994,
 'classification_report': {'0': {'precision': 0.631508875739645,
   'recall': 0.6809915773353752,
   'f1-score': 0.6553174505710426,
   'support': 31344.0},
  '1': {'precision': 0.5167301345519167,
   'recall': 0.4973123534010946,
   'f1-score': 0.5068353295649792,
   'support': 20464.0},
  '2': {'precision': 0.44411204786510744,
   'recall': 0.3984141506556877,
   'f1-score': 0.4200237919171784,
   'support': 16395.0},
  'accuracy': 0.5579519962464994,
  'macro avg': {'precision': 0.530783686052223,
   'recall': 0.5255726937973858,
   'f1-score': 0.5273921906844001,
   'support': 68203.0},
  'weighted avg': {'precision': 0.552022589906636,
   'recall': 0.5579519962464994,
   'f1-score': 0.5542049238801612,
   'support': 68203.0}}}

In [59]:
knn_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.5249915692858086,
 'classification_report': {'0': {'precision': 0.5743490148979122,
   'recall': 0.7170750382848392,
   'f1-score': 0.6378251060629709,
   'support': 31344.0},
  '1': {'precision': 0.47654516640253564,
   'recall': 0.440822908522283,
   'f1-score': 0.4579885261714982,
   'support': 20464.0},
  '2': {'precision': 0.42495069033530575,
   'recall': 0.2628240317169869,
   'f1-score': 0.3247785943094027,
   'support': 16395.0},
  'accuracy': 0.5249915692858086,
  'macro avg': {'precision': 0.49194829054525124,
   'recall': 0.47357399284136975,
   'f1-score': 0.4735307421812906,
   'support': 68203.0},
  'weighted avg': {'precision': 0.5090902508140256,
   'recall': 0.5249915692858086,
   'f1-score': 0.5086141720703482,
   'support': 68203.0}}}

In [60]:
gbm_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4950515373224052,
 'classification_report': {'0': {'precision': 0.5035388004188784,
   'recall': 0.8897715671260847,
   'f1-score': 0.6431223336792344,
   'support': 31344.0},
  '1': {'precision': 0.4627097830536226,
   'recall': 0.276192337763878,
   'f1-score': 0.3459102175709171,
   'support': 20464.0},
  '2': {'precision': 0.3704318936877076,
   'recall': 0.013601707837755413,
   'f1-score': 0.02623992469259281,
   'support': 16395.0},
  'accuracy': 0.4950515373224052,
  'macro avg': {'precision': 0.44556015905340285,
   'recall': 0.3931885375759061,
   'f1-score': 0.3384241586475814,
   'support': 68203.0},
  'weighted avg': {'precision': 0.4592912930186154,
   'recall': 0.4950515373224052,
   'f1-score': 0.40565571433145503,
   'support': 68203.0}}}

In [61]:
naive_bayes_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4379132882717769,
 'classification_report': {'0': {'precision': 0.5239726981473742,
   'recall': 0.6000510464522716,
   'f1-score': 0.5594372313686997,
   'support': 31344.0},
  '1': {'precision': 0.38959936622906294,
   'recall': 0.33644448788115716,
   'f1-score': 0.36107614852108244,
   'support': 20464.0},
  '2': {'precision': 0.2851872096201148,
   'recall': 0.25458981396767305,
   'f1-score': 0.2690213012793658,
   'support': 16395.0},
  'accuracy': 0.4379132882717769,
  'macro avg': {'precision': 0.39958642466551736,
   'recall': 0.3970284494337006,
   'f1-score': 0.396511560389716,
   'support': 68203.0},
  'weighted avg': {'precision': 0.42625406482067685,
   'recall': 0.4379132882717769,
   'f1-score': 0.4301081641252021,
   'support': 68203.0}}}

In [62]:
adaboost_classification(enc_trainX, trainY, enc_testX, testY)



{'accuracy': 0.4870020380335176,
 'classification_report': {'0': {'precision': 0.5029830368998859,
   'recall': 0.858027054619704,
   'f1-score': 0.6341952295049108,
   'support': 31344.0},
  '1': {'precision': 0.4297743115466906,
   'recall': 0.304290461297889,
   'f1-score': 0.3563070408834721,
   'support': 20464.0},
  '2': {'precision': 0.3836734693877551,
   'recall': 0.005733455321744434,
   'f1-score': 0.011298076923076923,
   'support': 16395.0},
  'accuracy': 0.4870020380335176,
  'macro avg': {'precision': 0.43881027261144384,
   'recall': 0.3893503237464458,
   'f1-score': 0.33393344910381995,
   'support': 68203.0},
  'weighted avg': {'precision': 0.45233682317044327,
   'recall': 0.4870020380335176,
   'f1-score': 0.40108081065928397,
   'support': 68203.0}}}

In [63]:
xgboost_classification(enc_trainX, trainY, enc_testX, testY)

{'accuracy': 0.4916499274225474,
 'classification_report': {'0': {'precision': 0.49784045835169677,
   'recall': 0.9009698825931598,
   'f1-score': 0.6413153322962677,
   'support': 31344.0},
  '1': {'precision': 0.4646254189691095,
   'recall': 0.25063526192337765,
   'f1-score': 0.325619782242961,
   'support': 20464.0},
  '2': {'precision': 0.3712984054669704,
   'recall': 0.00994205550472705,
   'f1-score': 0.019365569680408697,
   'support': 16395.0},
  'accuracy': 0.4916499274225474,
  'macro avg': {'precision': 0.4445880942625922,
   'recall': 0.3871824000070882,
   'f1-score': 0.32876689473987913,
   'support': 68203.0},
  'weighted avg': {'precision': 0.4574555849154791,
   'recall': 0.4916499274225474,
   'f1-score': 0.39708472522065696,
   'support': 68203.0}}}

In [64]:
model = DecisionTreeClassifier(criterion='entropy', max_features='log2')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5622773191795082, 'classification_report': {'0': {'precision': 0.613586971103539, 'recall': 0.7296133231240429, 'f1-score': 0.6665889382788021, 'support': 31344.0}, '1': {'precision': 0.5250356144146046, 'recall': 0.48626856919468336, 'f1-score': 0.5049090493949311, 'support': 20464.0}, '2': {'precision': 0.46155772602053596, 'recall': 0.3372369624885636, 'f1-score': 0.3897229858320998, 'support': 16395.0}, 'accuracy': 0.5622773191795082, 'macro avg': {'precision': 0.5333934371795598, 'recall': 0.5177062849357633, 'f1-score': 0.520406991168611, 'support': 68203.0}, 'weighted avg': {'precision': 0.550471940438932, 'recall': 0.5622773191795082, 'f1-score': 0.5515231121936853, 'support': 68203.0}}}


In [65]:
model = RandomForestClassifier(n_estimators=100, criterion='entropy')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5586411154934534, 'classification_report': {'0': {'precision': 0.6309021735924736, 'recall': 0.6824910668708525, 'f1-score': 0.6556834378017196, 'support': 31344.0}, '1': {'precision': 0.5192444399309434, 'recall': 0.4997068021892103, 'f1-score': 0.509288311170875, 'support': 20464.0}, '2': {'precision': 0.4439802766744282, 'recall': 0.39542543458371454, 'f1-score': 0.41829854502048586, 'support': 16395.0}, 'accuracy': 0.5586411154934534, 'macro avg': {'precision': 0.5313756300659483, 'recall': 0.5258744345479257, 'f1-score': 0.5277567646643602, 'support': 68203.0}, 'weighted avg': {'precision': 0.5524664983051562, 'recall': 0.5586411154934534, 'f1-score': 0.5546944022970948, 'support': 68203.0}}}


In [66]:
model = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,
    n_jobs=20,objective='multi:softprob')
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5422342125712946, 'classification_report': {'0': {'precision': 0.5486045435338149, 'recall': 0.8698315467075038, 'f1-score': 0.6728446095186387, 'support': 31344.0}, '1': {'precision': 0.5304534478132547, 'recall': 0.3864347146207975, 'f1-score': 0.44713332579441367, 'support': 20464.0}, '2': {'precision': 0.5030572540300167, 'recall': 0.11039951204635559, 'f1-score': 0.18106337218026308, 'support': 16395.0}, 'accuracy': 0.5422342125712946, 'macro avg': {'precision': 0.5273717484590287, 'recall': 0.4555552577915523, 'f1-score': 0.4336804358311051, 'support': 68203.0}, 'weighted avg': {'precision': 0.5322094900282459, 'recall': 0.5422342125712946, 'f1-score': 0.486903974996767, 'support': 68203.0}}}


In [67]:
model = KNeighborsClassifier(n_neighbors=70)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)

{'accuracy': 0.5087606117032976, 'classification_report': {'0': {'precision': 0.5322418768034124, 'recall': 0.8121171516079633, 'f1-score': 0.64304660856385, 'support': 31344.0}, '1': {'precision': 0.46756756756756757, 'recall': 0.3719702892885067, 'f1-score': 0.4143261484868278, 'support': 20464.0}, '2': {'precision': 0.3983402489626556, 'recall': 0.09954254345837145, 'f1-score': 0.1592816708959594, 'support': 16395.0}, 'accuracy': 0.5087606117032976, 'macro avg': {'precision': 0.4660498977778785, 'recall': 0.4278766614516138, 'f1-score': 0.40555147598221236, 'support': 68203.0}, 'weighted avg': {'precision': 0.48064865872427304, 'recall': 0.5087606117032976, 'f1-score': 0.45813008512527326, 'support': 68203.0}}}


In [68]:
model = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
res = fit_and_evaluate(model, enc_trainX, trainY, enc_testX, testY)
print(res)



{'accuracy': 0.4922217497764028, 'classification_report': {'0': {'precision': 0.5103201347935973, 'recall': 0.8503381827462991, 'f1-score': 0.6378452113147944, 'support': 31344.0}, '1': {'precision': 0.435108099386899, 'recall': 0.3294566067240031, 'f1-score': 0.37498261909396813, 'support': 20464.0}, '2': {'precision': 0.36666666666666664, 'recall': 0.010734980176883195, 'f1-score': 0.020859259259259258, 'support': 16395.0}, 'accuracy': 0.4922217497764028, 'macro avg': {'precision': 0.4373649669490543, 'recall': 0.39684325654906183, 'f1-score': 0.344562363222674, 'support': 68203.0}, 'weighted avg': {'precision': 0.4532209206460715, 'recall': 0.4922217497764028, 'f1-score': 0.41066012017279935, 'support': 68203.0}}}


In [92]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def fit_and_evaluate_voting(voting_classifier, trainX, trainY, testX, testY):
    # Convert string labels to integers
    label_encoder = LabelEncoder()
    trainY_encoded = label_encoder.fit_transform(trainY)
    testY_encoded = label_encoder.transform(testY)
    
    # Fit the VotingClassifier
    voting_classifier.fit(trainX, trainY_encoded)
    
    # Make predictions
    testY_pred = voting_classifier.predict(testX)
    
    # Calculate accuracy score
    accuracy = accuracy_score(testY_encoded, testY_pred)
    
    # Generate classification report
    report = classification_report(testY_encoded, testY_pred, target_names=label_encoder.classes_, output_dict=True)
    
    results = {
        'accuracy': accuracy,
        'classification_report': report,
        'voting_classifier': voting_classifier
    }
    
    return results, voting_classifier

In [82]:
scaler = StandardScaler()
enc_trainX = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
enc_testX = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)

In [83]:
from sklearn.ensemble import VotingClassifier

# Define multiple classifiers
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.8, random_state=947)
knn = KNeighborsClassifier(n_neighbors=70)
dt = DecisionTreeClassifier(criterion='entropy', max_features='log2')
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')
xgb = XGBClassifier(
    colsample_bytree=0.7,enable_categorical=True,
    gamma=0.0, learning_rate=0.01, max_depth=17, 
    min_child_weight=5,n_estimators=100,objective='multi:softprob')

votingCLF = VotingClassifier(estimators=[('knn', knn), ('ada', ada), ('xgb', xgb), ('dt', dt),('rf', rf)], voting='soft', weights=[5,7,10,6,5])
result, votingCLF = fit_and_evaluate_voting(votingCLF, enc_trainX, trainY, enc_testX, testY)



In [84]:
result

{'accuracy': 0.5706053985895049,
 'classification_report': {'EARLY': {'precision': 0.6203599850451317,
   'recall': 0.7411306789178153,
   'f1-score': 0.6753888646605611,
   'support': 31344.0},
  'LATE': {'precision': 0.5340566287377613,
   'recall': 0.49310985144644254,
   'f1-score': 0.5127670926598745,
   'support': 20464.0},
  'ONTIME': {'precision': 0.471758556735795,
   'recall': 0.34132357426044524,
   'f1-score': 0.39607884771914925,
   'support': 16395.0},
  'accuracy': 0.5706053985895049,
  'macro avg': {'precision': 0.5420583901728959,
   'recall': 0.5251880348749011,
   'f1-score': 0.5280782683465283,
   'support': 68203.0},
  'weighted avg': {'precision': 0.5587434535053812,
   'recall': 0.5706053985895049,
   'f1-score': 0.5594529136030343,
   'support': 68203.0}},
 'voting_classifier': VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=70)),
                              ('ada',
                               AdaBoostClassifier(learning_rate=0.8,
    

In [85]:
final_test = pd.read_csv("CIS662_SECOND_TASK_INPUTS.csv")

In [86]:
final_test

Unnamed: 0,Scheduled Elapsed Time (Minutes),Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,Carrier Code_OO,...,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
0,173,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,173,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,173,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,86,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,86,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
5,86,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
6,170,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,170,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
8,170,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
9,173,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [87]:
trainX.head()

Unnamed: 0,Scheduled Elapsed Time (Minutes),Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,Carrier Code_OO,...,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
19431,103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
157007,81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93387,101,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
325089,72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
26716,72,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [95]:
label_encoder = LabelEncoder()
trainY_encoded = label_encoder.fit_transform(trainY)
testY_encoded = label_encoder.transform(testY)

In [96]:
label_encoder.classes_

array(['EARLY', 'LATE', 'ONTIME'], dtype=object)

In [88]:
enc_final_test = pd.DataFrame(scaler.transform(final_test), index=final_test.index, columns=final_test.columns)

In [89]:
enc_final_test

Unnamed: 0,Scheduled Elapsed Time (Minutes),Carrier Code_AA,Carrier Code_B6,Carrier Code_DL,Carrier Code_EV,Carrier Code_F9,Carrier Code_G4,Carrier Code_MQ,Carrier Code_OH,Carrier Code_OO,...,month_11,month_12,PREV_STAT_LATE,PREV_STAT_ONTIME,WeekDay_Monday,WeekDay_Saturday,WeekDay_Sunday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday
0,2.340189,4.178582,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
1,2.340189,4.178582,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,1.776985,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
2,2.340189,4.178582,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,1.526073,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
3,-0.587682,-0.239316,-0.451807,2.85446,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
4,-0.587682,-0.239316,-0.451807,2.85446,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,1.776985,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
5,-0.587682,-0.239316,-0.451807,2.85446,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,1.526073,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
6,2.239228,-0.239316,2.213335,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
7,2.239228,-0.239316,2.213335,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,1.776985,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
8,2.239228,-0.239316,2.213335,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,1.526073,-0.562751,-0.415918,-0.371877,-0.408237,-0.420123,-0.409397,2.418332
9,2.340189,4.178582,-0.451807,-0.350329,-0.344944,-0.126158,-0.152445,-0.343278,-0.226377,-0.279426,...,-0.306281,-0.300953,-0.655277,-0.562751,-0.415918,-0.371877,-0.408237,2.380254,-0.409397,-0.413508


In [90]:
print(result)

{'accuracy': 0.5706053985895049, 'classification_report': {'EARLY': {'precision': 0.6203599850451317, 'recall': 0.7411306789178153, 'f1-score': 0.6753888646605611, 'support': 31344.0}, 'LATE': {'precision': 0.5340566287377613, 'recall': 0.49310985144644254, 'f1-score': 0.5127670926598745, 'support': 20464.0}, 'ONTIME': {'precision': 0.471758556735795, 'recall': 0.34132357426044524, 'f1-score': 0.39607884771914925, 'support': 16395.0}, 'accuracy': 0.5706053985895049, 'macro avg': {'precision': 0.5420583901728959, 'recall': 0.5251880348749011, 'f1-score': 0.5280782683465283, 'support': 68203.0}, 'weighted avg': {'precision': 0.5587434535053812, 'recall': 0.5706053985895049, 'f1-score': 0.5594529136030343, 'support': 68203.0}}, 'voting_classifier': VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=70)),
                             ('ada',
                              AdaBoostClassifier(learning_rate=0.8,
                                                 n_estimators=2

In [94]:
final_model = result['voting_classifier']

In [98]:
final_results = label_encoder.inverse_transform(final_model.predict(enc_final_test))

In [100]:
len(final_results)

33

In [101]:
final_results

array(['EARLY', 'ONTIME', 'EARLY', 'LATE', 'LATE', 'LATE', 'EARLY',
       'EARLY', 'LATE', 'LATE', 'LATE', 'EARLY', 'LATE', 'LATE', 'LATE',
       'ONTIME', 'LATE', 'ONTIME', 'LATE', 'ONTIME', 'EARLY', 'LATE',
       'LATE', 'LATE', 'LATE', 'EARLY', 'LATE', 'EARLY', 'LATE', 'EARLY',
       'EARLY', 'EARLY', 'EARLY'], dtype=object)