# CS 4501 Machine Learning Project
**Team Members:** Leo Wang (yw7uc), Alicia Wu (yw7vv), Simon Zhu (mz4cr)
1. [Data Acquiring](#data_acquiring)
2. [Data Pre-processing](#data_pre-processing)
3. [Data Splitting](#data_splitting)
4. [Data Discovery](#data_discovery)
5. [Data Cleaning and Feature Scaling](#cleaning_and_scaling)
6. [Data Training](#data_training)

---
## 1. Data Acquiring <a name="data_acquiring"></a>

In [0]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from mlxtend.classifier import SoftmaxRegression
from pandas.plotting import scatter_matrix # optional
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, SVC

np.random.seed(42)

import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning, UndefinedMetricWarning
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)
warnings.filterwarnings(action = 'ignore', category = DataConversionWarning)
warnings.filterwarnings(action = 'ignore', category = ConvergenceWarning)

flight = pd.read_csv('flights_and_weather_drop_delay_missing.csv', low_memory=False)
flight.head()

Unnamed: 0,FlightDate,Reporting_Airline,CRSDepTime,DepDelayMinutes,DepTimeBlk,Cancelled,Distance,PRCP,SNOW,TMIN
0,2015-01-01,AA,625,0.0,0600-0659,0.0,1172.0,0.0,0.0,15.0
1,2015-01-02,AA,625,0.0,0600-0659,0.0,1172.0,0.0,0.0,16.0
2,2015-01-03,AA,625,0.0,0600-0659,0.0,1172.0,0.03,0.0,23.0
3,2015-01-04,AA,625,0.0,0600-0659,0.0,1172.0,1.13,0.0,32.0
4,2015-01-05,AA,625,0.0,0600-0659,0.0,1172.0,0.1,0.0,32.0


---
## 2. Data Pre-processing <a name="data_pre-processing"></a>
1. [Modify some features from string class to numeric class](#str_to_num)
2. [Class to add delay category](#delay_cat_class)
3. [Class to add frozen category](#frozen_cat_class)
4. [Pipeline to add delay and frozen categories](#add_pipeline)
5. [Drop unnecessary features](#drop_unnecessary_features)
6. [Encode categorical features](#encode_categorical_features)

### 2.1. Modify some features from string class to numeric class <a name="str_to_num"></a>

In [0]:
cols = ['PRCP','SNOW']
flight[cols] = flight[cols].apply(pd.to_numeric, errors='coerce')

### 2.2. Class to add delay category <a name="delay_cat_class"></a>

In [0]:
class AddDelayCat(BaseEstimator, TransformerMixin):
    def __init__(self, add_delayCat=True): # no *args or **kargs
        self.add_delayCat = add_delayCat
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        row_num, col_num = X.shape[0], X.shape[1]
        delayTime_idx = 'DepDelayMinutes'
        cancelled_idx = 'Cancelled'
        X.insert(col_num, "DelayCategory", 0)
        delayCat_idx = "DelayCategory"

        for i in range(0, row_num):
            if X.loc[i, cancelled_idx] == 1.0: X.loc[i, delayCat_idx] = "Heavy Delay"
            elif X.loc[i, delayTime_idx] <= 30.0: X.loc[i, delayCat_idx] = "No Delay"
            elif 30.0 < X.loc[i, delayTime_idx] <= 120.0: X.loc[i, delayCat_idx] = "Slight Delay"
            else: X.loc[i, delayCat_idx] = "Heavy Delay"

        return X

### 2.3. Class to add frozen category <a name="frozen_cat_class"></a>

In [0]:
class AddFrozenCat(BaseEstimator, TransformerMixin):
    def __init__(self, add_frozen_cat=True): # no *args or **kargs
        self.add_frozen_cat = add_frozen_cat
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        row_num, col_num = X.shape[0], X.shape[1]
        PRCP_idx = 'PRCP'
        SNOW_idx = 'SNOW'
        TMIN_idx = 'TMIN'
        X.insert(col_num, "Frozen", 0)
        frozen_cat_idx = 'Frozen'
        
        for i in range(0, row_num):
            if (X.loc[i, PRCP_idx] > 0 or X.loc[i, SNOW_idx] > 0) and X.loc[i, TMIN_idx] <= 38: X.loc[i, frozen_cat_idx] = 1
            else: X.loc[i, TMIN_idx] = 0

        return X

### 2.4. Pipeline to add delay and frozen categories <a name="add_pipeline"></a>

In [0]:
add_pipeline = Pipeline([
    ('frozen', AddFrozenCat()),
    ('delay', AddDelayCat()),
])

flight_prepared = add_pipeline.fit_transform(flight)
flight_prepared.head()

Unnamed: 0,FlightDate,Reporting_Airline,CRSDepTime,DepDelayMinutes,DepTimeBlk,Cancelled,Distance,PRCP,SNOW,TMIN,Frozen,DelayCategory
0,2015-01-01,AA,625,0.0,0600-0659,0.0,1172.0,0.0,0.0,0.0,0,No Delay
1,2015-01-02,AA,625,0.0,0600-0659,0.0,1172.0,0.0,0.0,0.0,0,No Delay
2,2015-01-03,AA,625,0.0,0600-0659,0.0,1172.0,0.03,0.0,23.0,1,No Delay
3,2015-01-04,AA,625,0.0,0600-0659,0.0,1172.0,1.13,0.0,32.0,1,No Delay
4,2015-01-05,AA,625,0.0,0600-0659,0.0,1172.0,0.1,0.0,32.0,1,No Delay


### 2.5. Drop unnecessary features <a name="drop_unnecessary_features"></a>

In [0]:
attributes = ['Reporting_Airline', 'DepTimeBlk', 'Distance', 'PRCP', 'SNOW', 'TMIN', 'Frozen', 'DelayCategory']
flight_prepared = flight_prepared[attributes]
flight_prepared.head()

Unnamed: 0,Reporting_Airline,DepTimeBlk,Distance,PRCP,SNOW,TMIN,Frozen,DelayCategory
0,AA,0600-0659,1172.0,0.0,0.0,0.0,0,No Delay
1,AA,0600-0659,1172.0,0.0,0.0,0.0,0,No Delay
2,AA,0600-0659,1172.0,0.03,0.0,23.0,1,No Delay
3,AA,0600-0659,1172.0,1.13,0.0,32.0,1,No Delay
4,AA,0600-0659,1172.0,0.1,0.0,32.0,1,No Delay


### 2.6. Encode categorical features <a name="encode_categorical_features"></a>
* Encode *Reporting_Airline* and *DepTimeBlk* in advance to avoid mismatch in train and test set after using OneHotEncoder

In [0]:
airline_encoded = pd.get_dummies(flight_prepared['Reporting_Airline'])
dep_time_encoded = pd.get_dummies(flight_prepared['DepTimeBlk'])
flight_encoded = pd.concat([flight_prepared, airline_encoded, dep_time_encoded], axis=1)
# flight_encoded = pd.concat([flight_prepared, airline_encoded], axis=1)
# flight_encoded = flight_prepared
flight_encoded = flight_encoded.drop('Reporting_Airline', axis=1)
flight_encoded = flight_encoded.drop('DepTimeBlk', axis=1)
flight_encoded.head()

Unnamed: 0,Distance,PRCP,SNOW,TMIN,Frozen,DelayCategory,9E,AA,AS,B6,...,1400-1459,1500-1559,1600-1659,1700-1759,1800-1859,1900-1959,2000-2059,2100-2159,2200-2259,2300-2359
0,1172.0,0.0,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1172.0,0.0,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1172.0,0.03,0.0,23.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1172.0,1.13,0.0,32.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1172.0,0.1,0.0,32.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
flight_encoded

Unnamed: 0,Distance,PRCP,SNOW,TMIN,Frozen,DelayCategory,9E,AA,AS,B6,...,1400-1459,1500-1559,1600-1659,1700-1759,1800-1859,1900-1959,2000-2059,2100-2159,2200-2259,2300-2359
0,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1172.0,0.03,0.0,23.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1172.0,1.13,0.0,32.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1172.0,0.10,0.0,32.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1172.0,0.11,1.1,19.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1172.0,0.18,3.1,11.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1172.0,0.16,0.0,8.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
balanced_train_set = pd.DataFrame(columns=list(flight_encoded))
no_delay_count = 8000
slight_delay_count = 8000
heavy_delay_count = 3000

for i in range(96397):
    if flight_encoded.iloc[i, 5] == 'No Delay' and no_delay_count > 0:
        balanced_train_set.loc[i] = flight_encoded.iloc[i]
        no_delay_count -= 1
    elif flight_encoded.iloc[i, 5] == 'Slight Delay' and slight_delay_count > 0:
        balanced_train_set.loc[i] = flight_encoded.iloc[i]
        slight_delay_count -= 1
    elif flight_encoded.iloc[i, 5] == 'Heavy Delay' and heavy_delay_count > 0:
        balanced_train_set.loc[i] = flight_encoded.iloc[i]
        heavy_delay_count -= 1
        
balanced_train_set

Unnamed: 0,Distance,PRCP,SNOW,TMIN,Frozen,DelayCategory,9E,AA,AS,B6,...,1400-1459,1500-1559,1600-1659,1700-1759,1800-1859,1900-1959,2000-2059,2100-2159,2200-2259,2300-2359
0,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1172.0,0.03,0.0,23.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1172.0,1.13,0.0,32.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1172.0,0.10,0.0,32.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1172.0,0.11,1.1,19.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1172.0,0.18,3.1,11.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1172.0,0.00,0.0,0.0,0,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1172.0,0.16,0.0,8.0,1,No Delay,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


---
## 3. Data Splitting <a name="data_splitting"></a>

In [0]:
train_set, test_set = train_test_split(balanced_train_set, test_size=0.2, random_state=42)

X_train = train_set.drop('DelayCategory', axis=1)
# X_train = train_set[['Distance', 'PRCP', 'SNOW', 'TMIN', 'Frozen']]
y_train = train_set[['DelayCategory']]
X_test = test_set.drop('DelayCategory', axis=1)
# X_test = test_set[['Distance', 'PRCP', 'SNOW', 'TMIN', 'Frozen']]
y_test = test_set[['DelayCategory']]

---
## 4. Data Discovery <a name="data_discovery"></a>

In [0]:
flight_prepared.corr()

Unnamed: 0,Distance,PRCP,SNOW,TMIN,Frozen
Distance,1.0,-0.013017,-0.012905,0.011346,0.009003
PRCP,-0.013017,1.0,0.127289,0.159678,0.139696
SNOW,-0.012905,0.127289,1.0,0.258823,0.320052
TMIN,0.011346,0.159678,0.258823,1.0,0.961899
Frozen,0.009003,0.139696,0.320052,0.961899,1.0


---
## 5. Data Cleaning and Feature Scaling <a name="cleaning_and_scaling"></a>
1. [Detect missing value](#detect_missing_value)
2. [Scale and transform X](#scale_and_transform_X)

### 5.1. Detect missing value <a name="detect_missing_value"></a>

In [0]:
sample_incomplete_rows = flight_encoded[flight_encoded.isnull().any(axis = 1)]
sample_incomplete_rows.head()

Unnamed: 0,Distance,PRCP,SNOW,TMIN,Frozen,DelayCategory,9E,AA,AS,B6,...,1400-1459,1500-1559,1600-1659,1700-1759,1800-1859,1900-1959,2000-2059,2100-2159,2200-2259,2300-2359
9426,288.0,,0.0,0.0,0,No Delay,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9428,908.0,,0.0,0.0,0,No Delay,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9511,1452.0,,0.0,0.0,0,No Delay,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9512,2419.0,,0.0,0.0,0,No Delay,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9513,2419.0,,0.0,0.0,0,No Delay,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### 5.2. Scale and transform X <a name="scale_and_transform_X"></a>

In [0]:
num_attributes = ['Distance', 'PRCP', 'SNOW', 'TMIN']

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attributes)
], remainder='passthrough')

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.fit_transform(X_test)

---
## 6. Data Training <a name="data_training"></a>
1. [Logistic Regression](#logistic_regression)
2. [Linear SVM](#linear_svm)
3. [Polynomial Kernel SVM](#poly_kernel_svm)
4. [Gaussian RBF Kernel SVM](#rbf_kernel_svm)
5. [Random Forest](#random_forest)
5. [K Neighbors Classifier](#k_neighbors_classifier)

### 6.1. Softmax Logistic Regression <a name="logistic_regression"></a>

In [0]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=0.01)
print("C = 0.01")
softmax_reg.fit(X_train_prepared, y_train)
y_pred = softmax_reg.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 0.01
Accuracy =  56.42105263157895 %
Precision =  [59.40540541 53.58974359  0.        ] %
Recall =  [68.21849783 66.39135959  0.        ] %
F1 score =  [63.50765675 59.30760499  0.        ] %
[[1099  512    0]
 [ 529 1045    0]
 [ 222  393    0]]


In [0]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=0.1)
print("C = 0.1")
softmax_reg.fit(X_train_prepared, y_train)
y_pred = softmax_reg.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 0.1
Accuracy =  56.15789473684211 %
Precision =  [59.53031131 53.02491103 50.        ] %
Recall =  [67.65983861 66.26429479  0.16260163] %
F1 score =  [63.33527019 58.90991245  0.32414911] %
[[1090  521    0]
 [ 530 1043    1]
 [ 211  403    1]]


In [0]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=1)
print("C = 1")
softmax_reg.fit(X_train_prepared, y_train)
y_pred = softmax_reg.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 1
Accuracy =  56.15789473684211 %
Precision =  [59.48885264 53.06435138 33.33333333] %
Recall =  [67.9081316  66.01016518  0.16260163] %
F1 score =  [63.42028986 58.83352208  0.3236246 ] %
[[1094  516    1]
 [ 534 1039    1]
 [ 211  403    1]]


In [0]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10)
print("C = 10")
softmax_reg.fit(X_train_prepared, y_train)
y_pred = softmax_reg.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 10
Accuracy =  56.15789473684211 %
Precision =  [59.48885264 53.06435138 33.33333333] %
Recall =  [67.9081316  66.01016518  0.16260163] %
F1 score =  [63.42028986 58.83352208  0.3236246 ] %
[[1094  516    1]
 [ 534 1039    1]
 [ 211  403    1]]


In [0]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=100)
print("C = 100")
softmax_reg.fit(X_train_prepared, y_train)
y_pred = softmax_reg.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 100
Accuracy =  56.184210526315795 %
Precision =  [59.52121872 53.08831036 33.33333333] %
Recall =  [67.9081316  66.07369759  0.16260163] %
F1 score =  [63.43867788 58.87347863  0.3236246 ] %
[[1094  516    1]
 [ 533 1040    1]
 [ 211  403    1]]


### 6.2. Linear SVM <a name="linear_svm"></a>

In [0]:
svm_clf = LinearSVC(C=0.01, loss="hinge", random_state=42)
print("C = 0.01")
svm_clf.fit(X_train_prepared, y_train)
y_pred = svm_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 0.01
Accuracy =  56.05263157894736 %
Precision =  [57.36917907 54.45544554  0.        ] %
Recall =  [74.17752948 59.40279543  0.        ] %
F1 score =  [64.69951272 56.82163476  0.        ] %
[[1195  416    0]
 [ 639  935    0]
 [ 249  366    0]]


In [0]:
svm_clf = LinearSVC(C=0.1, loss="hinge", random_state=42)
print("C = 0.1")
svm_clf.fit(X_train_prepared, y_train)
y_pred = svm_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 0.1
Accuracy =  50.421052631578945 %
Precision =  [53.56643357 51.47375988 19.31818182] %
Recall =  [71.32216015 45.48919949  8.29268293] %
F1 score =  [61.18210863 48.29679595 11.60409556] %
[[1149  378   84]
 [ 729  716  129]
 [ 267  297   51]]


In [0]:
svm_clf = LinearSVC(C=1, loss="hinge", random_state=42)
print("C = 1")
svm_clf.fit(X_train_prepared, y_train)
y_pred = svm_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

C = 1
Accuracy =  47.921052631578945 %
Precision =  [52.20588235 53.49075975 18.09338521] %
Recall =  [74.92240844 33.10038119 15.12195122] %
F1 score =  [61.53453989 40.89481947 16.47475642] %
[[1207  224  180]
 [ 812  521  241]
 [ 293  229   93]]


### 6.3. Polynomial Kernal SVM <a name="poly_kernel_svm"></a>

In [0]:
# poly_kernel_svm_clf = SVC(kernel='poly', degree=2, coef0=1, C=1)
# print("Degree = 2, C = 1")
# poly_kernel_svm_clf.fit(X_train_prepared, y_train)
# y_pred = poly_kernel_svm_clf.predict(X_test_prepared)
# print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
# print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
# print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
# print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
# print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

### 6.4. Gaussian RBF Kernal SVM <a name="rbf_kernel_svm"></a>

In [0]:
# rbf_kernel_svm_clf = SVC(kernel='rbf', gamma=5, C=0.001)
# print("Gamma = 5, C = 0.001")
# rbf_kernel_svm_clf.fit(X_train_prepared, y_train)
# y_pred = rbf_kernel_svm_clf.predict(X_test_prepared)
# print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
# print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
# print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
# print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
# print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

### 6.5. Random Forest <a name="random_forest"></a>

In [0]:
rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, n_jobs=-1)
print("n_estimators = 100, max_leaf_nodes = 16")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 100, max_leaf_nodes = 16
Accuracy =  58.21052631578948 %
Precision =  [61.25598723 55.23165018  0.        ] %
Recall =  [71.44630664 67.40787802  0.        ] %
F1 score =  [65.95988539 60.71530758  0.        ] %
[[1151  460    0]
 [ 513 1061    0]
 [ 215  400    0]]


In [0]:
rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=64, n_jobs=-1)
print("n_estimators = 100, max_leaf_nodes = 64")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 100, max_leaf_nodes = 64
Accuracy =  60.131578947368425 %
Precision =  [63.46255975 56.85967658  0.        ] %
Recall =  [74.17752948 69.25031766  0.        ] %
F1 score =  [68.40297653 62.44629046  0.        ] %
[[1195  416    0]
 [ 484 1090    0]
 [ 204  411    0]]


In [0]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=8, n_jobs=-1)
print("n_estimators = 500, max_leaf_nodes = 8")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 500, max_leaf_nodes = 8
Accuracy =  57.36842105263158 %
Precision =  [60.25437202 54.52169367  0.        ] %
Recall =  [70.57728119 66.26429479  0.        ] %
F1 score =  [65.00857633 59.82219673  0.        ] %
[[1137  474    0]
 [ 531 1043    0]
 [ 219  396    0]]


In [0]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
print("n_estimators = 500, max_leaf_nodes = 16")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 500, max_leaf_nodes = 16
Accuracy =  58.28947368421053 %
Precision =  [61.24275935 55.33929511  0.        ] %
Recall =  [72.1911856 66.8360864  0.       ] %
F1 score =  [66.26780627 60.54676259  0.        ] %
[[1163  448    0]
 [ 522 1052    0]
 [ 214  401    0]]


In [0]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=64, n_jobs=-1)
print("n_estimators = 500, max_leaf_nodes = 64")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 500, max_leaf_nodes = 64
Accuracy =  60.3421052631579 %
Precision =  [63.90374332 56.89119171  0.        ] %
Recall =  [74.17752948 69.75857687  0.        ] %
F1 score =  [68.65843149 62.67123288  0.        ] %
[[1195  416    0]
 [ 476 1098    0]
 [ 199  416    0]]


In [0]:
rnd_clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=16, n_jobs=-1)
print("n_estimators = 1000, max_leaf_nodes = 16")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 1000, max_leaf_nodes = 16
Accuracy =  58.36842105263158 %
Precision =  [61.4893617 55.3125     0.       ] %
Recall =  [71.75667287 67.47141042  0.        ] %
F1 score =  [66.22744199 60.78992559  0.        ] %
[[1156  455    0]
 [ 512 1062    0]
 [ 212  403    0]]


In [0]:
rnd_clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=64, n_jobs=-1)
print("n_estimators = 1000, max_leaf_nodes = 64")
rnd_clf.fit(X_train_prepared, y_train)
y_pred = rnd_clf.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_estimators = 1000, max_leaf_nodes = 64
Accuracy =  60.21052631578947 %
Precision =  [63.87546967 56.68559628  0.        ] %
Recall =  [73.86716325 69.75857687  0.        ] %
F1 score =  [68.50892343 62.54628311  0.        ] %
[[1190  421    0]
 [ 476 1098    0]
 [ 197  418    0]]


### 6.6. K Neighbors Classifier <a name="k_neighbors_classifier"></a>

In [0]:
neigh = KNeighborsClassifier(n_neighbors=3)
print("n_neighbors = 3")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 3
Accuracy =  55.65789473684211 %
Precision =  [65.29069767 57.25915875 24.42244224] %
Recall =  [69.70825574 53.62134689 24.06504065] %
F1 score =  [67.42719904 55.38057743 24.24242424] %
[[1123  313  175]
 [ 447  844  283]
 [ 150  317  148]]


In [0]:
neigh = KNeighborsClassifier(n_neighbors=5)
print("n_neighbors = 5")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 5
Accuracy =  56.84210526315789 %
Precision =  [62.57637475 58.28611898 25.47169811] %
Recall =  [76.28801986 52.28716645 17.56097561] %
F1 score =  [68.75524476 55.12391159 20.7892204 ] %
[[1229  265  117]
 [ 552  823  199]
 [ 183  324  108]]


In [0]:
neigh = KNeighborsClassifier(n_neighbors=10)
print("n_neighbors = 10")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 10
Accuracy =  58.21052631578948 %
Precision =  [61.4738806  58.85304659 27.96934866] %
Recall =  [81.8125388  52.16010165 11.8699187 ] %
F1 score =  [70.19973369 55.30481644 16.66666667] %
[[1318  240   53]
 [ 618  821  135]
 [ 208  334   73]]


In [0]:
neigh = KNeighborsClassifier(n_neighbors=20)
print("n_neighbors = 20")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 20
Accuracy =  58.73684210526315 %
Precision =  [60.61159288 58.73655914 24.79338843] %
Recall =  [82.43327126 55.52731893  4.87804878] %
F1 score =  [69.85796949 57.08687133  8.15217391] %
[[1328  254   29]
 [ 638  874   62]
 [ 225  360   30]]


In [0]:
neigh = KNeighborsClassifier(n_neighbors=30)
print("n_neighbors = 30")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 30
Accuracy =  57.99999999999999 %
Precision =  [59.13396482 58.26144658 19.73684211] %
Recall =  [81.37802607 55.78144854  2.43902439] %
F1 score =  [68.49529781 56.99448231  4.34153401] %
[[1311  276   24]
 [ 659  878   37]
 [ 247  353   15]]


In [0]:
neigh = KNeighborsClassifier(n_neighbors=40)
print("n_neighbors = 40")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 40
Accuracy =  58.631578947368425 %
Precision =  [59.39883356 58.5620915  19.51219512] %
Recall =  [82.18497827 56.92503177  1.30081301] %
F1 score =  [68.95833333 57.73195876  2.43902439] %
[[1324  270   17]
 [ 662  896   16]
 [ 243  364    8]]


In [0]:
neigh = KNeighborsClassifier(n_neighbors=50)
print("n_neighbors = 50")
neigh.fit(X_train_prepared, y_train)
y_pred = neigh.predict(X_test_prepared)
print("Accuracy = ", accuracy_score(y_test, y_pred) * 100, '%')
print("Precision = ", precision_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("Recall = ", recall_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print("F1 score = ", f1_score(y_test, y_pred, average=None, labels=['No Delay', 'Slight Delay', 'Heavy Delay']) * 100, '%')
print(confusion_matrix(y_test, y_pred, labels=['No Delay', 'Slight Delay', 'Heavy Delay']))

n_neighbors = 50
Accuracy =  58.78947368421053 %
Precision =  [60.05509642 57.89804909 18.18181818] %
Recall =  [81.19180633 58.4498094   0.97560976] %
F1 score =  [69.04196358 58.17262093  1.85185185] %
[[1308  288   15]
 [ 642  920   12]
 [ 228  381    6]]
