In [None]:
import numpy as np
import pandas as pd
import math
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the JFK take-off CSV into a DataFrame and preview its first ten rows.
DATA_PATH = '../input/flight-take-off-data-jfk-airport/M1_final.csv'
dataset = pd.read_csv(DATA_PATH)
dataset.head(10)

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
dataset.info()

In [None]:
# "Dew Point" is stored as strings: cast it to numbers, turning any value
# that cannot be parsed into NaN instead of raising.
dataset["Dew Point"] = pd.to_numeric(dataset["Dew Point"], errors='coerce')

# Discard every row that contains a missing value (this includes the NaNs
# produced by the coercion above), then re-check the frame summary.
dataset = dataset.dropna()
dataset.info()

In [None]:
# Two independent copies of the cleaned data, one per encoding strategy:
# dataset1 is label encoded, dataset2 is one-hot encoded.
dataset1, dataset2 = dataset.copy(), dataset.copy()

# Names of the object-typed columns, i.e. the ones that need encoding.
ObjList = dataset1.select_dtypes(include=["object"]).columns
print(ObjList)

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each object-typed column with integer codes. The single encoder is
# re-fitted for every column, so afterwards it only holds the classes of the
# last column processed.
le = LabelEncoder()
for column_name in ObjList:
    dataset1[column_name] = le.fit_transform(dataset1[column_name])
dataset1.info()

In [None]:
from sklearn.model_selection import train_test_split

# Pick the target by name rather than by position so this section is guaranteed
# to predict the same column ('TAXI_OUT') as the one-hot section, regardless of
# the column order in the CSV.
x = dataset1.drop('TAXI_OUT', axis = 1).values
y = dataset1['TAXI_OUT'].values

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature scaling: learn mean/variance on the training rows only, then apply
# that same transformation to both the training and the test rows.
ss = StandardScaler()
ss.fit(x_train)
x_train_feat = ss.transform(x_train)
x_test_feat = ss.transform(x_test)

## Linear Regression with Label Encoding

In [None]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardised features.
lr = LinearRegression()
lr.fit(x_train_feat, y_train)
pred_lr = lr.predict(x_test_feat)

# Despite the `mse_` prefix this holds the ROOT mean squared error. The root is
# taken with NumPy because the `squared=False` argument was deprecated in
# scikit-learn 1.4 and later removed.
mse_lr = np.sqrt(mse(y_test, pred_lr))
print("error for Linear Regression = {}".format(mse_lr))

## Ridge Regression with Label Encoding

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model on the standardised features.
ridge = Ridge(alpha = 1.0)
ridge.fit(x_train_feat, y_train)
pred_ridge = ridge.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_ridge = np.sqrt(mse(y_test, pred_ridge))
print("error for Ridge Regression = {}".format(mse_ridge))

## Lasso Regression with Label Encoding

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear model on the standardised features.
lasso = Lasso(alpha = 0.5)
lasso.fit(x_train_feat, y_train)
pred_lasso = lasso.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_lasso = np.sqrt(mse(y_test, pred_lasso))
print("error for Lasso Regression = {}".format(mse_lasso))

## KNN with Label Encoding

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Neighbourhood size k = floor(sqrt(N / 2)), where N is the number of rows in
# dataset1 (the full label-encoded frame, not only the training split).
i = int(math.sqrt(len(dataset1)/2))
knr = KNeighborsRegressor(n_neighbors = i)
knr.fit(x_train_feat, y_train)
pred_knr = knr.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_knr = np.sqrt(mse(y_test, pred_knr))
print("error when k = {} for KNN  is {}".format(i, mse_knr))

## Random Forest with Label Encoding

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Tree ensembles do not need scaled inputs, so the raw split is used.
# The forest is seeded (same seed as the train/test split) so that the reported
# error is reproducible from run to run.
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(x_train, y_train)
pred_rfr = rfr.predict(x_test)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_rfr = np.sqrt(mse(y_test, pred_rfr))
print("error for Random Forest Regressor = {}".format(mse_rfr))

## Bayesian Ridge with Label Encoding

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression, fitted on the UNSCALED features.
# NOTE(review): the other linear models use x_train_feat; confirm the raw
# inputs are intentional here.
br = BayesianRidge()
br.fit(x_train, y_train)
pred_br = br.predict(x_test)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_br = np.sqrt(mse(y_test, pred_br))
print("error for Bayesian Ridge = {}".format(mse_br))

## SVR with Label Encoding

In [None]:
from sklearn.svm import SVR

# Support vector regression with default hyper-parameters on the standardised features.
svr = SVR()
svr.fit(x_train_feat, y_train)
pred_svr = svr.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_svr = np.sqrt(mse(y_test, pred_svr))
print("error for Support Vector Regressor = {}".format(mse_svr))

## LGBM with Label Encoding

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees with default hyper-parameters on the unscaled features.
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
pred_lgbm = lgbm.predict(x_test)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_lgbm = np.sqrt(mse(y_test, pred_lgbm))
print("error for LGBMRegressor = {}".format(mse_lgbm))

In [None]:
# Snapshot the label-encoding scores now: the one-hot section reuses the same
# variable names and would otherwise overwrite them.
mse_label = [
    mse_lr,
    mse_ridge,
    mse_lasso,
    mse_knr,
    mse_rfr,
    mse_br,
    mse_svr,
    mse_lgbm,
]

# One Hot Encoding

In [None]:
# Print the column dtypes and non-null counts of the copy reserved for one-hot encoding.
dataset2.info()

In [None]:
# Re-derive the object-typed column names, this time from the one-hot copy,
# and display them as the cell output.
ObjList = dataset2.select_dtypes(include=["object"]).columns
ObjList

In [None]:
# For each categorical column, print the shape of its array of distinct
# values, i.e. how many unique categories it has.
for categorical_column in ObjList:
    print(dataset2[categorical_column].unique().shape)

In [None]:
# Remove TAIL_NUM before one-hot encoding.
# NOTE(review): presumably dropped because it has too many distinct values to
# expand into dummy columns; confirm against the cardinalities printed earlier.
dataset2.drop(columns = ['TAIL_NUM'], inplace = True)

In [None]:
# Build one indicator (dummy) frame per remaining categorical column.
# drop_first=True leaves out the first level of each column.
Op_Unique_Carrier, Dest, Wind, Condition = [
    pd.get_dummies(dataset2[source_column], drop_first = True)
    for source_column in ('OP_UNIQUE_CARRIER', 'DEST', 'Wind', 'Condition')
]

In [None]:
# Append the dummy frames column-wise, then remove the original categorical
# columns they were derived from.
encoded_source_columns = ['OP_UNIQUE_CARRIER', 'DEST', 'Wind', 'Condition']
dataset2 = pd.concat(
    [dataset2, Op_Unique_Carrier, Dest, Wind, Condition],
    axis = 1,
).drop(encoded_source_columns, axis = 1)

In [None]:
from sklearn.model_selection import train_test_split

# Force a float feature matrix. Recent pandas versions emit boolean dummy
# columns, and converting a frame that mixes bool with numeric columns via
# `.values` yields an object-dtype array instead of a numeric one.
x = dataset2.drop('TAXI_OUT',axis = 1).to_numpy(dtype = float)
y = dataset2['TAXI_OUT'].values

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.1, random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature scaling: learn mean/variance on the training rows only, then apply
# that same transformation to both the training and the test rows.
ss = StandardScaler()
ss.fit(x_train)
x_train_feat = ss.transform(x_train)
x_test_feat = ss.transform(x_test)

## Linear Regression with One Hot Encoding

In [None]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardised features.
lr = LinearRegression()
lr.fit(x_train_feat, y_train)
pred_lr = lr.predict(x_test_feat)

# Despite the `mse_` prefix this holds the ROOT mean squared error. The root is
# taken with NumPy because the `squared=False` argument was deprecated in
# scikit-learn 1.4 and later removed.
mse_lr = np.sqrt(mse(y_test, pred_lr))
print("error for Linear Regression = {}".format(mse_lr))

## Ridge Regression with One Hot Encoding

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model on the standardised features.
ridge = Ridge(alpha = 1.0)
ridge.fit(x_train_feat, y_train)
pred_ridge = ridge.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_ridge = np.sqrt(mse(y_test, pred_ridge))
print("error for Ridge Regression = {}".format(mse_ridge))

## Lasso Regression with One Hot Encoding

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear model on the standardised features.
lasso = Lasso(alpha = 0.5)
lasso.fit(x_train_feat, y_train)
pred_lasso = lasso.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_lasso = np.sqrt(mse(y_test, pred_lasso))
print("error for Lasso Regression = {}".format(mse_lasso))

## KNN with One Hot Encoding

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Neighbourhood size k = floor(sqrt(N / 2)), where N is the number of rows in
# dataset2 (the full one-hot frame, not only the training split).
i = int(math.sqrt(len(dataset2)/2))
knr = KNeighborsRegressor(n_neighbors = i)
knr.fit(x_train_feat, y_train)
pred_knr = knr.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_knr = np.sqrt(mse(y_test, pred_knr))
print("error when k = {} for KNN  is {}".format(i, mse_knr))

## Random Forest with One Hot Encoding

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Tree ensembles do not need scaled inputs, so the raw split is used.
# The forest is seeded (same seed as the train/test split) so that the reported
# error is reproducible from run to run.
rfr = RandomForestRegressor(random_state = 42)
rfr.fit(x_train, y_train)
pred_rfr = rfr.predict(x_test)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_rfr = np.sqrt(mse(y_test, pred_rfr))
print("error for Random Forest Regressor = {}".format(mse_rfr))

## Bayesian Ridge with One Hot Encoding

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression, fitted on the UNSCALED features.
# NOTE(review): the other linear models use x_train_feat; confirm the raw
# inputs are intentional here.
br = BayesianRidge()
br.fit(x_train, y_train)
pred_br = br.predict(x_test)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_br = np.sqrt(mse(y_test, pred_br))
print("error for Bayesian Ridge = {}".format(mse_br))

## SVR with One Hot Encoding

In [None]:
from sklearn.svm import SVR

# Support vector regression with default hyper-parameters on the standardised features.
svr = SVR()
svr.fit(x_train_feat, y_train)
pred_svr = svr.predict(x_test_feat)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_svr = np.sqrt(mse(y_test, pred_svr))
print("error for Support Vector Regressor = {}".format(mse_svr))

## LGBM with One Hot Encoding

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees with default hyper-parameters on the unscaled features.
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
pred_lgbm = lgbm.predict(x_test)

# Root mean squared error (the `mse_` prefix is historical). Computed with
# np.sqrt because `squared=False` is no longer accepted by recent scikit-learn.
mse_lgbm = np.sqrt(mse(y_test, pred_lgbm))
print("error for LGBMRegressor = {}".format(mse_lgbm))

In [None]:
# Scores of the one-hot runs, listed in the same order as the display names
# in `models` so the two lists can be plotted against each other.
mse_one_hot = [
    mse_lr,
    mse_ridge,
    mse_lasso,
    mse_knr,
    mse_rfr,
    mse_br,
    mse_svr,
    mse_lgbm,
]
models = [
    "Linear",
    "Ridge",
    "Lasso",
    "KNN",
    "Random_forest",
    "Bayesian_Ridge",
    "SVR",
    "LGBM",
]

## Comparison using Graph

In [None]:
# Overlay the per-model test error of both encodings on a single chart.
plt.figure(figsize = (15,5))
for scores, series_label, face_colour in (
    (mse_label, "Label encoded", 'blue'),
    (mse_one_hot, "One Hot Encoded", 'red'),
):
    plt.plot(models, scores, label = series_label, linestyle='dashed', linewidth = 3,
             marker='o', markerfacecolor=face_colour, markersize=12)
plt.title("Comparison between One Hot Encoded and Label Encoded data")
plt.xlabel("Models")
# The plotted scores are root mean squared errors, so label the axis accordingly.
plt.ylabel("Root Mean Squared Error")
plt.legend()
plt.show()