In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for
# the rest of the session, including deprecation warnings from the libraries
# used below.
warnings.filterwarnings("ignore")
%matplotlib inline

**Reading Dataset**

In [None]:
# Location of the JFK take-off CSV relative to the notebook's working directory.
path = "../input/flight-take-off-data-jfk-airport/M1_final.csv"
data = pd.read_csv(path)

# Show every column when a frame is displayed. The full option name is spelled
# out: abbreviated names are resolved by pattern matching and can become
# ambiguous if pandas adds a similarly named option.
pd.set_option('display.max_columns', None)

# Preview the first rows as the cell output.
data.head()

**Checking Correlation**

In [None]:
# Spearman rank-correlation heatmap of the numeric columns.
# The numeric columns are selected explicitly: the frame also holds object
# columns (they are label-encoded further down), and DataFrame.corr raises on
# those in pandas >= 2.0, where `numeric_only` defaults to False. Older pandas
# versions dropped them silently, so the plotted matrix is unchanged there.
plt.figure(figsize = (15,15))
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr('spearman'), cmap='YlGnBu', annot=True)
plt.title('Spearman correlation between numeric columns')
plt.show()

In [None]:
# Bar chart of each numeric feature's correlation with the target, using
# corrwith's default method.
# Object columns are removed first: DataFrame.corrwith raises on them in
# pandas >= 2.0 (numeric_only defaults to False), whereas older versions
# ignored them silently.
numeric_features = data.drop(['TAXI_OUT'], axis=1).select_dtypes(include='number')
numeric_features.corrwith(data['TAXI_OUT']).plot(kind = 'bar', title = 'correlation of features with target(TAXI_OUT)')

In [None]:
# Separate the regression target from the feature columns.
y = data['TAXI_OUT']
X = data.drop(['TAXI_OUT'],axis = 1)

# Cast before the dtype-based selection below, so that "Dew Point" ends up in
# the numerical group whatever dtype it was read with.
X["Dew Point"] = X["Dew Point"].astype(int)

obj_cols = list(X.select_dtypes(include = 'object').columns) #object columns
# Built by filtering X.columns rather than by set difference: a set has no
# stable ordering for strings across interpreter sessions, which would make
# the column order of num_cols (and of anything derived from it) vary from
# run to run.
num_cols = [col for col in X.columns if col not in obj_cols] #numerical columns

# **Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# A single encoder instance is refitted for each categorical column, so after
# the loop it only remembers the classes of the last column processed.
labelencoder = LabelEncoder()

for col in obj_cols:
    # Stringify first so every value (missing ones included) is encoded as a label.
    as_text = X[col].astype(str)
    X[col] = labelencoder.fit_transform(as_text)

# 90/10 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=10,
)

**Importing ML Models**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

**Scaling Data**

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train[num_cols])
for split_frame in (X_train, X_test):
    split_frame[num_cols] = scaler.transform(split_frame[num_cols])

**Creating a function to display the error metrics and score**

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math

# Score accumulators: Score_diplay appends one value to each list per call.
X_MAE = []
X_RMSE = []
X_R2 = []

def Score_diplay(predictions, y_test):
    """Print MAE, RMSE and R^2 for one model and record them.

    Parameters
    ----------
    predictions : array-like
        Values predicted by the model for the evaluation rows.
    y_test : array-like
        Ground-truth target values for the same rows.

    Side effects
    ------------
    Appends the three scores to the module-level lists ``X_MAE``, ``X_RMSE``
    and ``X_R2`` (looked up at call time) and prints them.
    """
    # scikit-learn metrics take (y_true, y_pred). MAE and MSE are symmetric in
    # their arguments, but R^2 is not: it normalises by the variance of
    # y_true, so the ground truth must be passed first.
    mae = mean_absolute_error(y_test, predictions)
    rmse = math.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print("MAE: {}".format(mae))
    X_MAE.append(mae)
    print("RMSE: {}".format(rmse))
    X_RMSE.append(rmse)
    print("r2_score: {}".format(r2))
    X_R2.append(r2)
    print('\n\n')

# **ERROR FOR LABEL ENCODED DATA**

In [None]:
# Label Encoded
# Fit each regressor on the current train split, score it on the test split
# with Score_diplay, and keep the collected scores as LE_MAE / LE_RMSE / LE_R2.
from sklearn.pipeline import make_pipeline

# Start from empty accumulators so that re-running this cell does not append a
# second set of scores to the lists Score_diplay writes to.
X_MAE = []
X_RMSE = []
X_R2 = []

n_train = len(X_train)

lreg = LinearRegression()

# `normalize=True` was removed from scikit-learn's linear models in 1.2 and now
# raises TypeError. The old option centred each feature and divided it by its
# L2 norm (= sqrt(n_samples) * std). The same fit is obtained by dividing by
# the standard deviation and rescaling the penalty: alpha * sqrt(n_samples)
# for Lasso and alpha * n_samples for Ridge. The fitted estimator is the last
# step of each pipeline (e.g. lasreg[-1]).
lasreg = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.05 * np.sqrt(n_train)),
)
ridgereg = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.05 * n_train),
)

# Seeded so the forest, like the train/test split, is reproducible.
rfreg = RandomForestRegressor(random_state=10)
knnreg = KNeighborsRegressor(n_neighbors = 200)
lgbm = LGBMRegressor()
bayreg = BayesianRidge()
svreg = SVR()

# Order matters: the comparison plots label the scores positionally.
# BayesianRidge is Bayesian ridge regression, not a Naive Bayes model, so it
# is printed under its own name.
models = [
    ('Linear Regression', lreg),
    ('Lasso Regression', lasreg),
    ('Ridge Regression', ridgereg),
    ('Random Forest Regression', rfreg),
    ('K-Neighbours Regression', knnreg),
    ('LGBM Regression', lgbm),
    ('Bayesian Ridge Regression', bayreg),
    ('SVM', svreg),
]

for label, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print('{}:\n'.format(label))
    Score_diplay(predictions, y_test)

LE_MAE = X_MAE
LE_RMSE = X_RMSE
LE_R2 = X_R2

# Rebind (not clear) the accumulators so the LE_* lists keep their contents
# and the next experiment starts empty.
X_MAE = []
X_RMSE = []
X_R2 = []

In [None]:
# Number of distinct values in each originally-categorical column.
categorical_features = X[obj_cols]
categorical_features.nunique()

# **One Hot Encoding**

In [None]:
# Expand the listed columns into indicator (dummy) columns.
X_OE = pd.get_dummies(X, columns=['OP_UNIQUE_CARRIER', 'DEST', 'Dew Point', 'Wind', 'Condition'])

# Same split parameters as the label-encoded experiment.
X_train, X_test, y_train, y_test = train_test_split(X_OE,y, test_size = 0.1, random_state =10)

# X (and therefore X_OE) still holds the raw, unscaled numeric values: the
# earlier standardisation was applied to the previous X_train / X_test only.
# Standardise the numeric columns here too, so both experiments feed the
# models comparably scaled features. Columns that get_dummies replaced with
# indicators (e.g. "Dew Point") are no longer present and are skipped.
X_train = X_train.copy()
X_test = X_test.copy()
ohe_num_cols = [col for col in num_cols if col in X_OE.columns]
if ohe_num_cols:
    # A separate scaler keeps the one fitted for the label-encoded data intact.
    ohe_scaler = StandardScaler()
    X_train[ohe_num_cols] = ohe_scaler.fit_transform(X_train[ohe_num_cols])
    X_test[ohe_num_cols] = ohe_scaler.transform(X_test[ohe_num_cols])

# **ERROR FOR ONE HOT ENCODED DATA**

In [None]:
# One Hot Encoded
# Fit each regressor on the current train split, score it on the test split
# with Score_diplay, and keep the collected scores as OHE_MAE / OHE_RMSE / OHE_R2.
from sklearn.pipeline import make_pipeline

# Start from empty accumulators so that re-running this cell does not leave
# the OHE_* lists with more entries than there are models (which would break
# the comparison plots).
X_MAE = []
X_RMSE = []
X_R2 = []

n_train = len(X_train)

lreg = LinearRegression()

# `normalize=True` was removed from scikit-learn's linear models in 1.2 and now
# raises TypeError. The old option centred each feature and divided it by its
# L2 norm (= sqrt(n_samples) * std). The same fit is obtained by dividing by
# the standard deviation and rescaling the penalty: alpha * sqrt(n_samples)
# for Lasso and alpha * n_samples for Ridge. The fitted estimator is the last
# step of each pipeline (e.g. lasreg[-1]).
lasreg = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.05 * np.sqrt(n_train)),
)
ridgereg = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=0.05 * n_train),
)

# Seeded so the forest, like the train/test split, is reproducible.
rfreg = RandomForestRegressor(random_state=10)
knnreg = KNeighborsRegressor(n_neighbors = 200)
lgbm = LGBMRegressor()
bayreg = BayesianRidge()
svreg = SVR()

# Order matters: the comparison plots label the scores positionally.
# BayesianRidge is Bayesian ridge regression, not a Naive Bayes model, so it
# is printed under its own name.
models = [
    ('Linear Regression', lreg),
    ('Lasso Regression', lasreg),
    ('Ridge Regression', ridgereg),
    ('Random Forest Regression', rfreg),
    ('K-Neighbours Regression', knnreg),
    ('LGBM Regression', lgbm),
    ('Bayesian Ridge Regression', bayreg),
    ('SVM', svreg),
]

for label, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print('{}:\n'.format(label))
    Score_diplay(predictions, y_test)

OHE_MAE = X_MAE
OHE_RMSE = X_RMSE
OHE_R2 = X_R2

In [None]:
# Compare the MAE of every model under the two encodings.
array1 = np.array(LE_MAE)
array2 = np.array(OHE_MAE)

# Tick labels follow the order in which the models were scored. The seventh
# model is sklearn's BayesianRidge, so it is labelled as such rather than as
# "Naive Bayes".
model_names = ["Linear","Lasso","Ridge","Random Forest","KNN","LGBM","Bayesian Ridge","SVM"]

plt.plot(model_names,array1)
plt.plot(model_names,array2)

plt.legend(["Label Encoding","One Hot Encoding"])

plt.title("MAE by model and encoding")
plt.xlabel("models")
plt.ylabel("MAE")
plt.show()

# MAE COMPARISON

In [None]:
# Compare the RMSE of every model under the two encodings.
array1 = np.array(LE_RMSE)
array2 = np.array(OHE_RMSE)

# Tick labels follow the order in which the models were scored. The seventh
# model is sklearn's BayesianRidge, so it is labelled as such rather than as
# "Naive Bayes".
model_names = ["Linear","Lasso","Ridge","Random Forest","KNN", "LGBM","Bayesian Ridge","SVM"]

plt.plot(model_names,array1)
plt.plot(model_names,array2)

plt.legend(["Label Encoding","One Hot Encoding"])

plt.title("RMSE by model and encoding")
plt.xlabel("models")
plt.ylabel("RMSE")
plt.show()

# RMSE COMPARISON

In [None]:
# Compare the R2 score of every model under the two encodings.
array1 = np.array(LE_R2)
array2 = np.array(OHE_R2)

# Tick labels follow the order in which the models were scored. The seventh
# model is sklearn's BayesianRidge, so it is labelled as such rather than as
# "Naive Bayes".
model_names = ["Linear","Lasso","Ridge","Random Forest","KNN", "LGBM","Bayesian Ridge","SVM"]

plt.plot(model_names,array1)
plt.plot(model_names,array2)

plt.legend(["Label Encoding","One Hot Encoding"])

plt.title("R2 score by model and encoding")
plt.xlabel("models")
plt.ylabel("R2_score")
plt.show()

# R2 SCORE COMPARISON