In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Show every column when a frame is displayed. The full option name is spelled
# out because pandas only resolves abbreviated names by pattern matching, which
# stops working as soon as the abbreviation matches more than one option.
pd.set_option('display.max_columns', None)

# Load the flight take-off data set from the relative ../input directory.
df = pd.read_csv("../input/flight-take-off-data-jfk-airport/M1_final.csv")
df.head()

In [None]:
# Print the per-column dtypes and non-null counts of the loaded frame.
df.info()
# Last expression of the cell: the (rows, columns) tuple is shown as the output.
df.shape

In [None]:
# Drop incomplete rows first and only then cast "Dew Point" to int, so that a
# missing value cannot make the integer cast fail.
df = df.dropna().assign(**{"Dew Point": lambda frame: frame["Dew Point"].astype(int)})

# Correlate the numeric columns only: pandas >= 2.0 raises on non-numeric
# columns (the categorical ones encoded later in this notebook) instead of
# silently skipping them.
df.corr(numeric_only=True)

In [None]:
# Bar chart of every numeric feature's correlation with the target TAXI_OUT.
fig, ax = plt.subplots(figsize=(10, 10))
(
    df.corr(numeric_only=True)["TAXI_OUT"]  # numeric_only: text columns cannot be correlated
    .drop("TAXI_OUT")  # the target's correlation with itself carries no information
    .plot(kind="bar", ax=ax, title="Correlation of features with TAXI_OUT")
)

# Label Encoding

In [None]:
# Keep an independent copy of the un-encoded frame for the one-hot section;
# a plain slice (df[:]) may share its underlying data with df.
df1 = df.copy()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

# Replace each categorical column with integer codes. The single encoder is
# re-fitted for every column, so after the loop it only remembers the classes
# of the last column.
for col in ['OP_UNIQUE_CARRIER', 'TAIL_NUM', 'DEST', 'Wind', 'Condition']:
    df[col] = le.fit_transform(df[col])

df.head()

**Train - Test Split**

In [None]:
# Features are every column except the last one; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.1, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# RMSE of each model on the label-encoded features, appended in fitting order.
e1 = []

**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Baseline: ordinary least squares on the label-encoded features.
lr = LinearRegression().fit(X_train, y_train)
p = lr.predict(X_test)
e1.append(mse(y_test, p) ** 0.5)  # square root of the MSE gives the RMSE
print("RMSE:", e1[0])

**Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with alpha=1 on the label-encoded features.
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)
p1 = ridge.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p1) ** 0.5)
print("RMSE:", e1[1])

**Lasso regression**

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with alpha=0.1 on the label-encoded features.
la = Lasso(alpha=0.1)
la.fit(X_train, y_train)
p2 = la.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p2) ** 0.5)
print("RMSE:", e1[2])

**K-Nearest Neighbors (KNN) Model**

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression using 200 neighbours per prediction.
knn = KNeighborsRegressor(n_neighbors=200)
knn.fit(X_train, y_train)
p3 = knn.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p3) ** 0.5)
print("RMSE:", e1[3])

**Support Vector Regression**

In [None]:
from sklearn.svm import SVR

# Support vector regression with the library's default hyper-parameters.
svr = SVR()
svr.fit(X_train, y_train)
p4 = svr.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p4) ** 0.5)
print("RMSE:", e1[4])

**Bayesian Ridge Regression**

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression (a linear model, not a Naive Bayes classifier).
br = BayesianRidge()
br.fit(X_train, y_train)
p5 = br.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p5) ** 0.5)
print("RMSE:", e1[5])

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regression. The seed (same value as the train/test split) makes
# the otherwise randomised forest, and therefore its RMSE, reproducible.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y_train)
p6 = rfr.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p6) ** 0.5)
print("RMSE:", e1[6])

**LightGBM**

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees (LightGBM) with the library's default hyper-parameters.
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
p7 = lgbm.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e1.append(mse(y_test, p7) ** 0.5)
print("RMSE:", e1[7])

# One Hot Encoding

In [None]:
# Start again from the un-encoded backup and expand every categorical column
# into one indicator column per distinct value.
df = pd.get_dummies(
    df1[:],
    columns=['OP_UNIQUE_CARRIER', 'TAIL_NUM', 'DEST', 'Wind', 'Condition'],
)
df.head()

In [None]:
# (rows, columns) of the one-hot encoded frame produced by the previous cell.
df.shape

**Train - Test Split**

In [None]:
# Separate the target column from the one-hot encoded features.
X = df.drop(columns=["TAXI_OUT"])
y = df["TAXI_OUT"]

# Same 10% hold-out and seed as the split used for the label-encoded data.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.1, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# RMSE of each model on the one-hot encoded features, appended in fitting order.
e2 = []

**Linear Regression**

In [None]:
# Baseline: ordinary least squares on the one-hot encoded features.
lr = LinearRegression()
lr.fit(X_train, y_train)
q = lr.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, q) ** 0.5)
print("RMSE:", e2[0])

**Ridge Regression**

In [None]:
# Ridge regression with alpha=1 on the one-hot encoded features.
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)
p1 = ridge.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p1) ** 0.5)
print("RMSE:", e2[1])

**Lasso regression**

In [None]:
# Lasso regression with alpha=0.1 on the one-hot encoded features.
la = Lasso(alpha=0.1)
la.fit(X_train, y_train)
p2 = la.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p2) ** 0.5)
print("RMSE:", e2[2])

**K-Nearest Neighbors (KNN) Model**

In [None]:
# k-nearest-neighbours regression using 200 neighbours per prediction.
knn = KNeighborsRegressor(n_neighbors=200)
knn.fit(X_train, y_train)
p3 = knn.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p3) ** 0.5)
print("RMSE:", e2[3])

**Support Vector Regression**

In [None]:
# Support vector regression with the library's default hyper-parameters.
svr = SVR()
svr.fit(X_train, y_train)
p4 = svr.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p4) ** 0.5)
print("RMSE:", e2[4])

**Bayesian Ridge Regression**

In [None]:
# Bayesian ridge regression (a linear model, not a Naive Bayes classifier).
br = BayesianRidge()
br.fit(X_train, y_train)
p5 = br.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p5) ** 0.5)
print("RMSE:", e2[5])

**Random Forest**

In [None]:
# Random forest regression. The seed (same value as the train/test split) makes
# the otherwise randomised forest, and therefore its RMSE, reproducible.
rfr = RandomForestRegressor(random_state=0)
rfr.fit(X_train, y_train)
p6 = rfr.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p6) ** 0.5)
print("RMSE:", e2[6])

**LightGBM**

In [None]:
# Gradient-boosted trees (LightGBM) with the library's default hyper-parameters.
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
p7 = lgbm.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6.
e2.append(mse(y_test, p7) ** 0.5)
print("RMSE:", e2[7])

# Evaluation

In [None]:
# Tick labels in the order the models were fitted, which is also the order in
# which their RMSE values were appended to e1 and e2.
algo = ["LNR", "RR", "LSR", "KNN", "SVR", "NB", "RF", "LGBM"]

# One line per encoding; legend entries follow the order of the plot calls.
plt.plot(algo, e1)
plt.plot(algo, e2, 'r')

plt.legend(["Label Encoding","One Hot Encoding"])

plt.xlabel("Models")
plt.ylabel("RMSE")

# show must be called; the bare attribute `plt.show` only evaluates to the
# function object and never renders the figure explicitly.
plt.show()

LNR : Linear Regression<br>
RR  : Ridge Regression<br>
LSR : Lasso Regression<br>
KNN : K-Nearest Neighbors Model<br>
SVR : Support Vector Regression<br>
NB  : Bayesian Ridge Regression (`BayesianRidge`; not a Naive Bayes model)<br>
RF  : Random Forest<br>
LGBM: LightGBM

Clearly, we can see that one-hot encoding improves the accuracy of the trained models: their RMSE is lower than with label encoding.