Order of Contents of the Notebook:
1. Preprocessing
2. Scaling, Encoding, and Splitting
3. Exploratory Visualizations
4. Defining Model Comparison Function
5. Model Comparisons

Summary of Findings:

The one-hot encoded data tends to perform slightly better than the label-encoded data, although this is not always the case.  Furthermore, the Random Forest model performs best on both data sets; for that model, the label-encoded data slightly outperforms the one-hot encoded data.  

In [None]:
#preprocessing

import numpy as np
import pandas as pd
import os

# Load the JFK take-off data set from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/flight-take-off-data-jfk-airport/M1_final.csv")

# Replace the raw CSV headers with the names used throughout the notebook.
data.columns = [
    'Month', 'Day_Month', 'Day_Week', 'Airline', 'Tail_Num',
    'Dest', 'Dep_Delay', 'Flight_Time', 'Distance', 'Sch_Push_Time',
    'Actual_Push_Time', 'Sch_Arr_Time', 'Temperature', 'Dew_Point', 'Humidity',
    'Wind', 'Wind_Speed', 'Wind_Gust', 'Pressure', 'Condition', 'Dep_Traffic',
    'Arr_Traffic', 'Taxi_Out',
]

# Compass label -> bearing in degrees ('CALM' and 'VAR' are both mapped to 0).
dic = {
    'NNW': 340, 'CALM': 0, 'NNE': 20, 'NE': 45, 'VAR': 0, 'WSW': 230,
    'S': 180, 'SSW': 200, 'WNW': 290, 'ESE': 115, 'N': 360, 'SW': 225,
    'E': 90, 'W': 270, 'SSE': 155, 'ENE': 70, 'NW': 315, 'SE': 135,
}

# Overwrite each compass label in the Wind column with its bearing.
for direction, degrees in dic.items():
    data.loc[data['Wind'] == direction, "Wind"] = degrees

# Fill missing Wind entries with the truncated mean of the known bearings.
wind_missing = data['Wind'].isnull()
data.loc[wind_missing, "Wind"] = int(data.loc[~wind_missing, "Wind"].mean())

# Both columns hold whole numbers by now; store them with an integer dtype.
for int_column in ('Dew_Point', 'Wind'):
    data[int_column] = data[int_column].astype('int')

# Show the resulting column types as the cell output.
data.dtypes

In [None]:
#scaling, encoding, and splitting

#need to scale since we are using gradient descent and distance based models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Standardize every numeric column (zero mean, unit variance).
# 'number' is used instead of ['int64', 'float64'] so that columns cast with the
# platform-dependent astype('int') are still scaled when that resolves to int32.
# NOTE(review): the scaler is fit on the full data set, Taxi_Out included, before
# the train/test split. The test rows therefore influence the scaling statistics
# and the reported errors are in standardized units. Left unchanged so results
# stay comparable - confirm whether this is intended.
numeric_data = data.select_dtypes(include='number')
scaler = StandardScaler()
datascaled = pd.DataFrame(
    data=scaler.fit_transform(numeric_data),
    columns=numeric_data.columns,
    index=numeric_data.index,  # keep the original index so the join below aligns row for row
)
datascaled = data.select_dtypes(exclude='number').join(datascaled)


#label encode data: every text column becomes its integer category code
dftrain_lab = datascaled.copy()

for column in dftrain_lab.select_dtypes(include=['object']):
    dftrain_lab[column] = dftrain_lab[column].astype('category').cat.codes

#one-hot encode data
#drop Tail Num since it adds little value and messes up linear regression
dftrain_o_h = datascaled.drop("Tail_Num", axis='columns')
dftrain_o_h = pd.get_dummies(
    data=dftrain_o_h,
    columns=dftrain_o_h.select_dtypes(include=['object']).columns,
)

#organizing and splitting data
X_l, y_l = dftrain_lab.drop(['Taxi_Out'], axis='columns'), dftrain_lab['Taxi_Out']
X_o, y_o = dftrain_o_h.drop(['Taxi_Out'], axis='columns'), dftrain_o_h['Taxi_Out']

# Use one seed for both splits: the two frames have the same rows in the same
# order, so an identical random_state yields the same train/test partition.
# Without it each encoding is scored on different rows and the runs are not
# reproducible.
SPLIT_SEED = 42
X_l_train, X_l_test, y_l_train, y_l_test = train_test_split(
    X_l, y_l, test_size=0.1, random_state=SPLIT_SEED
)
X_o_train, X_o_test, y_o_train, y_o_test = train_test_split(
    X_o, y_o, test_size=0.1, random_state=SPLIT_SEED
)



In [None]:
#exploratory visualizations

import matplotlib.pyplot as plt
import seaborn as sns

# Pearson correlation heatmap of the label-encoded frame, drawn on a large
# figure so the per-cell annotations remain readable.
heatmap_figure = plt.figure(figsize=(20, 20))
label_correlations = dftrain_lab.corr(method='pearson')
sns.heatmap(label_correlations, annot=True)

# Box plots of the scaled int64/float64 columns, then of every one-hot column.
scaled_numeric = datascaled.select_dtypes(include=['int64', 'float64'])
scaled_numeric.plot.box(figsize=(30, 30))
dftrain_o_h.plot.box(figsize=(30, 30))

In [None]:
#defining model comparison function

#inputs below as in order:
#list of models, list of model names, list of [trainfeat,testfeat], list of [trainlab,testlab], list of dataset names, error function, error function name, root of error
def compare_models(models, model_names, lst_datafeatures, lst_datalables, dataset_names, error_func, error_name, root=1):
    """Fit every model on every dataset, tabulate the test error and plot it.

    Parameters
    ----------
    models : list
        Estimators exposing ``fit(X, y)`` and ``predict(X)``. Each one is
        refit in place for every dataset.
    model_names : list of str
        Display names, in the same order as ``models``; used as the result
        columns and as the x-axis categories of the plot.
    lst_datafeatures : list
        One ``[X_train, X_test]`` pair per dataset.
    lst_datalables : list
        One ``[y_train, y_test]`` pair per dataset, in the same order as
        ``lst_datafeatures``.
    dataset_names : list of str
        One name per dataset; used for the ``Dataset_Name`` column and the
        plot legend.
    error_func : callable
        Metric called as ``error_func(y_test, predictions)``.
    error_name : str
        Label for the y-axis of the plot.
    root : int or float, default 1
        The metric value is raised to ``1 / root`` (e.g. ``root=2`` turns a
        mean squared error into an RMSE).

    Returns
    -------
    pandas.DataFrame
        One row per dataset: a ``Dataset_Name`` column followed by one error
        column per entry of ``model_names``.
    """
    model_names = list(model_names)

    # Keep the scores in an all-float frame and add the text column at the end,
    # rather than writing strings into a column that was created as float64.
    scores = pd.DataFrame(
        data=np.zeros([len(dataset_names), len(model_names)]),
        columns=model_names,
    )

    for i, model in enumerate(models):
        for j in range(len(dataset_names)):
            X_train, y_train = lst_datafeatures[j][0], lst_datalables[j][0]
            X_test, y_test = lst_datafeatures[j][1], lst_datalables[j][1]
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            # Ground truth goes first; the order matters for asymmetric metrics.
            error = error_func(y_test, predictions) ** (1 / root)
            scores.iloc[j, i] = error

    # One line per dataset, with the models along the x-axis.
    plt.figure(figsize=(20, 20))
    for j in range(len(dataset_names)):
        print(scores.iloc[j])
        plt.plot(model_names, scores.iloc[j], marker='*')

    plt.legend(dataset_names)
    plt.xlabel("Model_Names")
    plt.ylabel(error_name)

    plt.show()

    scores.insert(0, 'Dataset_Name', list(dataset_names))
    return scores

In [None]:
#model comparisons

#Linear Models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

#Nonlinear Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

#Error Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# The `normalize` argument of the linear models was deprecated in scikit-learn
# 1.0 and removed in 1.2, so passing it raises TypeError on current versions.
# The documented replacement is a StandardScaler(with_mean=False) pipeline with
# alpha rescaled by the number of training samples.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Both training sets hold the same number of rows (same source rows, same
# test_size), so one sample count serves both.
n_train_samples = len(X_o_train)

# Ordinary least squares predictions should not depend on feature rescaling,
# so no scaling pipeline is added for the plain linear model.
linreg = LinearRegression()
# Equivalent of Lasso(alpha=0.05, normalize=True): alpha * sqrt(n_samples).
lassoreg = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.05 * np.sqrt(n_train_samples)),
)
# Equivalent of Ridge(alpha=0.05, normalize=True): alpha * n_samples.
ridgereg = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.05 * n_train_samples),
)
kNNreg = KNeighborsRegressor(n_neighbors=50)
SVreg = SVR()
bayridgereg = BayesianRidge()
rfreg = RandomForestRegressor()
LGBMreg = LGBMRegressor()


# Models and their display names, in matching order.
models = [linreg, lassoreg, ridgereg, kNNreg, SVreg, bayridgereg, rfreg, LGBMreg]
model_names = ["Linear", "Lasso", "Ridge", "kNN", "SV", "Bayesian Ridge", "Random Forest", "LGBM"]
# [train, test] pairs, one-hot first, matching dataset_names below.
datasets_X = [[X_o_train, X_o_test], [X_l_train, X_l_test]]
datasets_y = [[y_o_train, y_o_test], [y_l_train, y_l_test]]
dataset_names = ["One Hot", "Label Encoding"]

# Alternative metrics; uncomment to compare on MAE or plain MSE instead.
#compare_models(models, model_names, datasets_X, datasets_y, dataset_names, mean_absolute_error, "MAE")
#compare_models(models, model_names, datasets_X, datasets_y, dataset_names, mean_squared_error, "MSE", root=1)
compare_models(models, model_names, datasets_X, datasets_y, dataset_names, mean_squared_error, "RMSE", root=2)