# Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder())
])

In [3]:
raw_df = pd.read_csv("../database/classification_airlines_delay.csv")

In [4]:
raw_df.head(2)

Unnamed: 0,Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313.0,1296.0,141.0,DL,ATL,HOU,1,0
1,6948.0,360.0,146.0,OO,COS,ORD,4,0


In [5]:
features = raw_df.drop('Class', axis=1)
target = raw_df['Class']

train_x, test_x, train_y, test_y = train_test_split(features, target, test_size=0.2, random_state=42)

In [6]:
numeric_cols = ['Flight', 'Time', 'Length']
categorical_cols = ['Airline', 'AirportFrom', 'AirportTo',
       'DayOfWeek']

In [7]:
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_cols),
    ('categorical', categorical_transformer, categorical_cols)
])

In [8]:
model_rf = RandomForestClassifier()
model_svc = SVC()
model_knn = KNeighborsClassifier()
model_lr = LinearRegression()

In [9]:
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model_rf', model_rf)
]
)
pipeline_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model_svc', model_svc)
]
)
pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model_knn', model_knn)
]
)
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model_lr', model_lr)
]
)


In [10]:
pipeline_model_rf = pipeline_rf.fit(train_x, train_y)
pipeline_model_svc = pipeline_svc.fit(train_x, train_y)
pipeline_model_knn = pipeline_knn.fit(train_x, train_y)
pipeline_model_lr = pipeline_lr.fit(train_x, train_y)

In [11]:
pred_y_rf = pipeline_model_rf.predict(test_x)
pred_y_svc = pipeline_model_svc.predict(test_x)
pred_y_knn = pipeline_model_knn.predict(test_x)
pred_y_lr = pipeline_model_lr.predict(test_x)

In [12]:
print(classification_report(pred_y_rf, test_y))
print(classification_report(pred_y_svc, test_y))
print(classification_report(pred_y_knn, test_y))
print(classification_report(pred_y_lr, test_y))

              precision    recall  f1-score   support

           0       0.67      0.65      0.66     61402
           1       0.56      0.58      0.57     46475

    accuracy                           0.62    107877
   macro avg       0.61      0.61      0.61    107877
weighted avg       0.62      0.62      0.62    107877

              precision    recall  f1-score   support

           0       0.92      0.57      0.70     96181
           1       0.14      0.57      0.22     11696

    accuracy                           0.57    107877
   macro avg       0.53      0.57      0.46    107877
weighted avg       0.83      0.57      0.65    107877

              precision    recall  f1-score   support

           0       0.70      0.65      0.67     63421
           1       0.55      0.59      0.57     44456

    accuracy                           0.63    107877
   macro avg       0.62      0.62      0.62    107877
weighted avg       0.63      0.63      0.63    107877



ValueError: Classification metrics can't handle a mix of continuous and binary targets