## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn import preprocessing

In [None]:
from sklearn.model_selection import train_test_split

### Data is about flights leaving from JFK airport between Nov 2019 and Dec 2020.

## Importing Dataset

In [None]:
# Read the JFK take-off CSV into a DataFrame and preview its first ten rows.
csv_path = "../input/flight-take-off-data-jfk-airport/M1_final.csv"
df = pd.read_csv(csv_path)
df.head(10)

In [None]:
# Display the column labels of the loaded frame.
df.columns

## Data Preprocessing

In [None]:
# Display the distinct values present in the 'Dew Point' column.
df['Dew Point'].unique()

In [None]:
def _strip_nbsp(value):
    """Return ``value`` as text with every non-breaking space (U+00A0) removed."""
    return str(value).replace('\xa0', '')


# Clean each 'Dew Point' entry element-wise.
df['Dew Point'] = df['Dew Point'].map(_strip_nbsp)

In [None]:
# Display the distinct values present in the 'Dew Point' column.
df['Dew Point'].unique()

In [None]:
# Cast the 'Dew Point' column to integer dtype.
df['Dew Point']=df['Dew Point'].astype(int)

In [None]:
# Display the dtype of every column.
df.dtypes

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [None]:
# drop_duplicates() returns a new frame and leaves df untouched, so the result
# must be assigned back for the duplicate rows to actually be removed.
df = df.drop_duplicates()

In [None]:
# Display the (rows, columns) dimensions of df.
df.shape

In [None]:
# Pearson correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 raises on object columns instead of silently skipping them.
numeric_df = df.select_dtypes(include="number")

plt.figure(figsize=(15,8))
sns.heatmap(numeric_df.corr(),annot=True,cmap="crest", linewidths=1)
plt.show()

In [None]:
# Correlate every numeric feature with the TAXI_OUT target. Object columns are
# excluded up front because pandas >= 2.0 no longer drops them implicitly.
numeric_features = df.select_dtypes(include="number").drop(columns=['TAXI_OUT'])
numeric_features.corrwith(df['TAXI_OUT']).plot(kind = 'bar', title = 'correlation of features with TAXI_OUT')
plt.show()

### TAXI_OUT does not have high correlation with any of the features

In [None]:
# Replace the column labels with the integers 1..23 (the assignment fails if df
# does not have exactly 23 columns).
df.columns = list(range(1, 24))

#### 1- 'MONTH' , 2- 'DAY_OF_MONTH' ,3- 'DAY_OF_WEEK' ,4- 'OP_UNIQUE_CARRIER' (Carrier Code) ,
#### 5- 'TAIL_NUM' (Aircraft tail number),6- 'DEST' (Destination), 7-  'DEP_DELAY' (Departure delay of the flight)
#### 8- 'CRS_ELAPSED_TIME' (Scheduled journey time of the flight), 9- 'DISTANCE' (Distance of the flight.),
#### 10-  'CRS_DEP_M' (Scheduled Departure Time),
#### 11- 'DEP_TIME_M' (Actual Departure Time (gate checkout of the flight, not the take-off time)),
#### 12- 'CRS_ARR_M' (Scheduled Arrival Time),13- 'Temperature',14- 'Dew Point',15- 'Humidity',16- 'Wind',
#### 17- 'Wind Speed',18- 'Wind Gust',19- 'Pressure', 20- 'Condition' (Condition of the climate),
####  21- 'sch_dep' (No. of flights scheduled for departure.),
####  22- 'sch_arr' (No. of flights scheduled for arrival.),23- 'TAXI_OUT' (Taxi-out / runway time)
    

In [None]:
# Collect the non-numeric (object dtype) columns. The result is deliberately not
# named `object`, which would shadow Python's built-in base class for the rest
# of the kernel session.
object_cols = df.select_dtypes(include="object")
object_cols.head()

## LABEL ENCODING

In [None]:
# Integer-encode columns 4, 5, 6, 16 and 20 one after another, re-fitting the
# same LabelEncoder instance on each column in turn.
label_encoder = preprocessing.LabelEncoder()
for col in (4, 5, 6, 16, 20):
    df[col] = label_encoder.fit_transform(df[col])


In [None]:
# Column 23 is the regression target; every other column is a feature.
y = df[23]
x = df.drop(columns=[23])

In [None]:
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling: fitting the scaler on all rows would let test-set means
# and variances leak into the features the models are trained on.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=0)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to the held-out rows.
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [None]:
#finding error (Root mean Square Error)
from sklearn.metrics import mean_squared_error 
RMSE=[]
def error(y_pred,y_test):
    """Return the root mean squared error and record it in the global RMSE list.

    Parameters:
        y_pred: predicted target values.
        y_test: true target values, same length as ``y_pred``.

    Returns:
        The RMSE between ``y_test`` and ``y_pred``.
    """
    # Take the square root of the MSE ourselves instead of passing
    # `squared=False`: that keyword is deprecated in scikit-learn 1.4 and
    # removed in 1.6. Computing it once also avoids the duplicate MSE call.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)
    return rmse

## RMSE (Label Encoding)

In [None]:
from sklearn.metrics import mean_squared_error

# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, so it
# can no longer be passed to Ridge/Lasso. The features are already standardized,
# so rescaling alpha closely reproduces the old behaviour:
# alpha * n_samples for Ridge and alpha * sqrt(n_samples) for Lasso.
n_train = x_train.shape[0]

lr = LinearRegression()
ridge = Ridge(alpha=0.05 * n_train)
lasso = Lasso(alpha=0.05 * np.sqrt(n_train))
svr = SVR()
# Seeded so the forest (and therefore its RMSE) is reproducible across runs.
rf = RandomForestRegressor(random_state=0)
knn = KNeighborsRegressor(n_neighbors = 100)
lgbm = LGBMRegressor()
# BayesianRidge is Bayesian ridge regression, not a naive Bayes model, so it is
# labelled accordingly below.
nb = BayesianRidge()

# (label, estimator) pairs, evaluated in this order; the order determines the
# position of each score in the RMSE list.
models = [
    ("Linear Regression", lr),
    ("Ridge Regression", ridge),
    ("Lasso Regression", lasso),
    ("SVR ", svr),
    ("Random Forest Regression", rf),
    ("KNN Regression", knn),
    ("LGBM Regression", lgbm),
    ("Bayesian Ridge", nb),
]

# Fit each model on the training split and report its RMSE on the test split.
for label, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(label + ":\n RMSE:", error(y_pred, y_test))


In [None]:
# Keep the collected scores under LE_Rmse and rebind RMSE to a fresh empty list.
LE_Rmse, RMSE = RMSE, []
print(LE_Rmse)

In [None]:
# Display the dtype of every column.
df.dtypes

In [None]:
# Print the number of distinct values in columns 4, 5, 6, 16 and 20.
for col in (4, 5, 6, 16, 20):
    print(f"{col} :", df[col].nunique())

## One Hot Encoding

In [None]:
# One-hot encode columns 4, 6, 16 and 20, dropping the first level of each to
# avoid a redundant indicator; all other columns are left as they are.
onehot_columns = [4, 6, 16, 20]
df = pd.get_dummies(df, columns=onehot_columns, drop_first=True)
df.head()

In [None]:
# Column 23 is the regression target; every other column is a feature.
y_1 = df[23]
x_1 = df.drop(columns=[23])

In [None]:
#standardization of independent variables
from sklearn.preprocessing import StandardScaler

# get_dummies adds string-named columns next to the remaining integer-named
# ones. scikit-learn >= 1.2 raises a TypeError on mixed int/str column names,
# so make every label a string first.
x_1.columns = x_1.columns.astype(str)

# Split BEFORE scaling: fitting the scaler on all rows would let test-set means
# and variances leak into the features the models are trained on.
x_train, x_test, y_train, y_test = train_test_split(x_1, y_1, test_size=0.10, random_state=0)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to the held-out rows.
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

In [None]:
#finding error (Root mean Square Error)
from sklearn.metrics import mean_squared_error 
RMSE_1=[]
def error(y_pred,y_test):
    """Return the root mean squared error and record it in the global RMSE_1 list.

    Parameters:
        y_pred: predicted target values.
        y_test: true target values, same length as ``y_pred``.

    Returns:
        The RMSE between ``y_test`` and ``y_pred``.
    """
    # Take the square root of the MSE ourselves instead of passing
    # `squared=False`: that keyword is deprecated in scikit-learn 1.4 and
    # removed in 1.6. Computing it once also avoids the duplicate MSE call.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE_1.append(rmse)
    return rmse

## RMSE ( One Hot Encoding)

In [None]:
# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2, so it
# can no longer be passed to Ridge/Lasso. The features are already standardized,
# so rescaling alpha closely reproduces the old behaviour:
# alpha * n_samples for Ridge and alpha * sqrt(n_samples) for Lasso.
n_train = x_train.shape[0]

lr = LinearRegression()
ridge = Ridge(alpha=0.05 * n_train)
lasso = Lasso(alpha=0.05 * np.sqrt(n_train))
svr = SVR()
# Seeded so the forest (and therefore its RMSE) is reproducible across runs.
rf = RandomForestRegressor(random_state=0)
knn = KNeighborsRegressor(n_neighbors = 100)
lgbm = LGBMRegressor()
# BayesianRidge is Bayesian ridge regression, not a naive Bayes model, so it is
# labelled accordingly below.
nb = BayesianRidge()

# (label, estimator) pairs, evaluated in this order; the order determines the
# position of each score in the RMSE_1 list.
models = [
    ("Linear Regression", lr),
    ("Ridge Regression", ridge),
    ("Lasso Regression", lasso),
    ("SVR ", svr),
    ("Random Forest Regression", rf),
    ("KNN Regression", knn),
    ("LGBM Regression", lgbm),
    ("Bayesian Ridge", nb),
]

# Fit each model on the training split and report its RMSE on the test split.
for label, model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(label + ":\n RMSE_1:", error(y_pred, y_test))

In [None]:
# Keep the collected scores under OHE_Rmse and rebind RMSE_1 to a fresh empty list.
OHE_Rmse, RMSE_1 = RMSE_1, []
print(OHE_Rmse)

In [None]:
# Plot the per-model RMSE of both encodings on one figure.
le = np.array(LE_Rmse)
ohe = np.array(OHE_Rmse)

# One label per score, in the order the models were evaluated. The last model
# is sklearn's BayesianRidge (Bayesian ridge regression), not a naive Bayes
# model, so it is labelled as such.
model_names = ["Linear","Ridge","Lasso","SVR","Random Forest","KNN", "LGBM","Bayesian Ridge"]
fig = plt.figure(figsize =(15, 6))
plt.plot(model_names,le)
plt.plot(model_names,ohe)
plt.legend(["Label Encoding","One Hot Encoding"])
plt.title("Comparison between One Hot Encoding and Label Encoding with various models")
plt.xlabel("Models")
plt.ylabel("RMSE")
plt.show()

### One Hot Encoding performs better than Label Encoding: the RMSE is lower with One Hot Encoding than with Label Encoding.
### Random Forest is the best algorithm among all the 8 algorithms.