In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

# Read the JFK take-off dataset from the Kaggle input directory and print the frame.
csv_path = "../input/flight-take-off-data-jfk-airport/M1_final.csv"
df = pd.read_csv(csv_path)
print(df)

In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Separate the regression target (TAXI_OUT) from the remaining predictor columns.
Y = df["TAXI_OUT"]
X = df.drop(columns = ["TAXI_OUT"])
print(X.shape, Y.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. Values are cast to str
# before encoding, and the single encoder is re-fitted for every column, so
# after the loop LE only remembers the classes of the last column.
LE = LabelEncoder()
for column in ['OP_UNIQUE_CARRIER', 'TAIL_NUM', 'DEST', 'Wind', 'Condition']:
    X[column] = LE.fit_transform(X[column].astype(str))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.1,
    random_state = 2,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

In [None]:
# One RMSE slot per model, filled in by the eight label-encoding cells.
# Start from NaN instead of np.empty's uninitialised memory so that a skipped
# or failed model shows up as a gap in the final plot, not an arbitrary number.
rmseL = np.full(8, np.nan, dtype = float)

# Label Encoding

## 1. Linear Regression with Label Encoding

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ordinary least-squares baseline on the label-encoded features.
LinearReg = LinearRegression().fit(X_train, Y_train)
pred0 = LinearReg.predict(X_test)

# Test-set RMSE (square root of the mean squared error) goes into slot 0.
rmseL[0] = mean_squared_error(y_true = Y_test, y_pred = pred0) ** 0.5
print(rmseL[0])

## 2. Ridge Regression with Label Encoding

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Ridge(normalize = True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. The replacement given by the
# deprecation notice is an explicit scaling step. StandardScaler divides by the
# column standard deviation rather than the column L2 norm, so alpha is
# multiplied by n_samples to keep the penalty equivalent to the original
# alpha = 0.05.
# RidgeReg is now a Pipeline; the fitted Ridge is its last step (RidgeReg[-1]).
RidgeReg = make_pipeline(
    StandardScaler(with_mean = False),
    Ridge(alpha = 0.05 * len(X_train)),
)
RidgeReg.fit(X_train, Y_train)

# Test-set RMSE goes into slot 1.
pred1 = RidgeReg.predict(X_test)
rmseL[1] = mean_squared_error(Y_test, pred1)**0.5
print(rmseL[1])

## 3. Lasso Regression with Label Encoding

In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Lasso(normalize = True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. The replacement given by the
# deprecation notice is an explicit scaling step. StandardScaler divides by the
# column standard deviation rather than the column L2 norm, so alpha is
# multiplied by sqrt(n_samples) to keep the L1 penalty equivalent to the
# original alpha = 0.05.
# LassoReg is now a Pipeline; the fitted Lasso is its last step (LassoReg[-1]).
LassoReg = make_pipeline(
    StandardScaler(with_mean = False),
    Lasso(alpha = 0.05 * np.sqrt(len(X_train))),
)
LassoReg.fit(X_train, Y_train)

# Test-set RMSE goes into slot 2.
pred2 = LassoReg.predict(X_test)
rmseL[2] = mean_squared_error(Y_test, pred2)**0.5
print(rmseL[2])

## 4. KNN Regression with Label Encoding

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using 200 neighbours on the label-encoded features.
KNNReg = KNeighborsRegressor(n_neighbors = 200).fit(X_train, Y_train)
pred3 = KNNReg.predict(X_test)

# Test-set RMSE goes into slot 3.
rmseL[3] = mean_squared_error(y_true = Y_test, y_pred = pred3) ** 0.5
print(rmseL[3])

## 5. Support Vector Regression with Label Encoding

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# SVR is sensitive to feature scale, so standardise the inputs first.
SC = StandardScaler()

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Re-fitting on X_test would scale the two sets differently and leak
# test-set statistics into the evaluation.
X1 = SC.fit_transform(X_train)
X2 = SC.transform(X_test)

svr = SVR()
svr.fit(X1, Y_train)

# Test-set RMSE goes into slot 4.
pred4 = svr.predict(X2)
rmseL[4] = mean_squared_error(Y_test, pred4)**0.5
print(rmseL[4])

## 6. Random Forest Regression with Label Encoding

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that the reported RMSE is reproducible across runs; the
# bootstrap sampling and feature selection are otherwise random. The seed
# matches the one used for the train/test split.
RandReg = RandomForestRegressor(random_state = 2)
RandReg.fit(X_train, Y_train)

# Test-set RMSE goes into slot 5.
pred5 = RandReg.predict(X_test)
rmseL[5] = mean_squared_error(Y_test, pred5)**0.5
print(rmseL[5])

## 7. Light GBM model with Label Encoding

In [None]:
from lightgbm import LGBMRegressor

# LightGBM is trained and evaluated without the "Dew Point" column.
# NOTE(review): presumably that column has a dtype LightGBM rejects — confirm.
# X1 / X2 are reassigned here, replacing whatever they held before.
X1 = X_train.drop(columns = ["Dew Point"])
X2 = X_test.drop(columns = ["Dew Point"])

# Gradient-boosted trees with the library's default hyper-parameters.
LGBMReg = LGBMRegressor()
LGBMReg.fit(X1, Y_train)
pred6 = LGBMReg.predict(X2)

# Test-set RMSE goes into slot 6.
rmseL[6] = mean_squared_error(y_true = Y_test, y_pred = pred6) ** 0.5
print(rmseL[6])

## 8. Bayesian Ridge Regression with Label Encoding

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with default priors on the label-encoded features.
BayReg = BayesianRidge().fit(X_train, Y_train)
pred7 = BayReg.predict(X_test)

# Test-set RMSE goes into slot 7.
rmseL[7] = mean_squared_error(y_true = Y_test, y_pred = pred7) ** 0.5
print(rmseL[7])

# One Hot Encoding

In [None]:
# Show the frame's columns, dtypes and non-null counts again before building the one-hot features.
df.info()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

# Rebuild features and target from the raw frame for the one-hot experiment.
X0 = df.drop("TAXI_OUT",axis=1)
Y0 = df["TAXI_OUT"]

# One-hot encode the calendar, carrier and destination columns. The remaining
# categorical columns (TAIL_NUM, Wind, Condition) are still label-encoded.
X0 = pd.get_dummies(X0, columns=["MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK", "OP_UNIQUE_CARRIER", "DEST"])
X0['TAIL_NUM'] = LE.fit_transform(X0['TAIL_NUM'].astype(str))
X0['Wind'] = LE.fit_transform(X0['Wind'].astype(str))
X0['Condition'] = LE.fit_transform(X0['Condition'].astype(str))

# Use the same seed as the label-encoding split (random_state = 2). Without it
# the split changes on every run and the two encodings are scored on different
# test rows, which makes the RMSE comparison between them unreliable.
X_train0, X_test0, Y_train0, Y_test0 = train_test_split(X0, Y0, test_size = 0.1, random_state = 2)
print(X_train0.shape, X_test0.shape, Y_train0.shape, Y_test0.shape)

In [None]:
# One RMSE slot per model, filled in by the eight one-hot-encoding cells.
# Start from NaN instead of np.empty's uninitialised memory so that a skipped
# or failed model shows up as a gap in the final plot, not an arbitrary number.
rmseH = np.full(8, np.nan, dtype = float)

## 1. Linear Regression with One Hot Encoding

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Ordinary least-squares baseline on the one-hot encoded features.
LinearReg = LinearRegression().fit(X_train0, Y_train0)
pred0 = LinearReg.predict(X_test0)

# Test-set RMSE (square root of the mean squared error) goes into slot 0.
rmseH[0] = mean_squared_error(y_true = Y_test0, y_pred = pred0) ** 0.5
print(rmseH[0])

## 2. Ridge Regression with One Hot Encoding

In [None]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Ridge(normalize = True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. The replacement given by the
# deprecation notice is an explicit scaling step. StandardScaler divides by the
# column standard deviation rather than the column L2 norm, so alpha is
# multiplied by n_samples to keep the penalty equivalent to the original
# alpha = 0.05.
# RidgeReg is now a Pipeline; the fitted Ridge is its last step (RidgeReg[-1]).
RidgeReg = make_pipeline(
    StandardScaler(with_mean = False),
    Ridge(alpha = 0.05 * len(X_train0)),
)
RidgeReg.fit(X_train0, Y_train0)

# Test-set RMSE goes into slot 1.
pred1 = RidgeReg.predict(X_test0)
rmseH[1] = mean_squared_error(Y_test0, pred1)**0.5
print(rmseH[1])

## 3. Lasso Regression with One Hot Encoding

In [None]:
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# `Lasso(normalize = True)` was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. The replacement given by the
# deprecation notice is an explicit scaling step. StandardScaler divides by the
# column standard deviation rather than the column L2 norm, so alpha is
# multiplied by sqrt(n_samples) to keep the L1 penalty equivalent to the
# original alpha = 0.05.
# LassoReg is now a Pipeline; the fitted Lasso is its last step (LassoReg[-1]).
LassoReg = make_pipeline(
    StandardScaler(with_mean = False),
    Lasso(alpha = 0.05 * np.sqrt(len(X_train0))),
)
LassoReg.fit(X_train0, Y_train0)

# Test-set RMSE goes into slot 2.
pred2 = LassoReg.predict(X_test0)
rmseH[2] = mean_squared_error(Y_test0, pred2)**0.5
print(rmseH[2])

## 4. KNN Regression with One Hot Encoding

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using 200 neighbours on the one-hot encoded features.
KNNReg = KNeighborsRegressor(n_neighbors = 200).fit(X_train0, Y_train0)
pred3 = KNNReg.predict(X_test0)

# Test-set RMSE goes into slot 3.
rmseH[3] = mean_squared_error(y_true = Y_test0, y_pred = pred3) ** 0.5
print(rmseH[3])

## 5. Support Vector Regression with One Hot Encoding

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# SVR is sensitive to feature scale, so standardise the inputs first.
SC = StandardScaler()

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows. Re-fitting on X_test0 would scale the two sets differently and
# leak test-set statistics into the evaluation.
X1_ = SC.fit_transform(X_train0)
X2_ = SC.transform(X_test0)

svr = SVR()

svr.fit(X1_, Y_train0)

# Test-set RMSE goes into slot 4.
pred4 = svr.predict(X2_)
rmseH[4] = mean_squared_error(Y_test0, pred4)**0.5
print(rmseH[4])

## 6. Random Forest Regression with One Hot Encoding

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that the reported RMSE is reproducible across runs; the
# bootstrap sampling and feature selection are otherwise random.
RandReg = RandomForestRegressor(random_state = 2)
RandReg.fit(X_train0, Y_train0)

# Test-set RMSE goes into slot 5.
pred5 = RandReg.predict(X_test0)
rmseH[5] = mean_squared_error(Y_test0, pred5)**0.5
print(rmseH[5])

## 7. Light GBM model with One Hot Encoding

In [None]:
from lightgbm import LGBMRegressor

# LightGBM is trained and evaluated without the "Dew Point" column.
# NOTE(review): presumably that column has a dtype LightGBM rejects — confirm.
X_train1 = X_train0.drop(columns = ["Dew Point"])
X_test1 = X_test0.drop(columns = ["Dew Point"])

# Gradient-boosted trees with the library's default hyper-parameters.
LGBMReg = LGBMRegressor()
LGBMReg.fit(X_train1, Y_train0)
pred6 = LGBMReg.predict(X_test1)

# Test-set RMSE goes into slot 6.
rmseH[6] = mean_squared_error(y_true = Y_test0, y_pred = pred6) ** 0.5
print(rmseH[6])

## 8. Bayesian Ridge Regression with One Hot Encoding

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression with default priors on the one-hot encoded features.
BayReg = BayesianRidge().fit(X_train0, Y_train0)
pred7 = BayReg.predict(X_test0)

# Test-set RMSE goes into slot 7.
rmseH[7] = mean_squared_error(y_true = Y_test0, y_pred = pred7) ** 0.5
print(rmseH[7])

In [None]:
# Full model names paired with the short tick labels used on the x-axis.
# The eighth model is sklearn's BayesianRidge, a Bayesian linear regression and
# not a Naive Bayes model, so it is labelled "Bayesian Ridge" / "BR".
models = ["Linear Regression", "Ridge Regression", "Lasso Regression", "KNN Regression", "Support Vector Regression", "Random forest regression", "Light GBM", "Bayesian Ridge"]
mod1 = ["LR", "RR", "LSR", "KNNR", "SVR", "RFR", "LGBM", "BR"]

# One line per encoding. Markers make the per-model values readable, and the
# axis labels and title let the figure stand on its own.
plt.plot(mod1, rmseL, marker = "o", label = "Label Encoding")
plt.plot(mod1, rmseH, 'r', marker = "o", label = "One Hot Encoding")
plt.xlabel("Model")
plt.ylabel("Test RMSE (TAXI_OUT)")
plt.title("Test RMSE by model and categorical encoding")
plt.legend()

# Print the abbreviation key before the figure is rendered.
for short_name, full_name in zip(mod1, models):
    print(short_name, ":", full_name)
plt.show()