### Import Libraries to Get Started

In [None]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import sklearn 
import seaborn as sns
import math


In [None]:
# Read the flight take-off CSV into a DataFrame and preview the first rows.
csv_path = "../input/flight-take-off-data-jfk-airport/M1_final.csv"
df = pd.read_csv(csv_path)
df.head()

### Missing Data

In [None]:
# Draw the boolean null mask as a heatmap (no row labels, no colour bar).
null_mask = df.isnull()
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)

The heatmap gives no clue about how much data is missing. The likely reason is that the dataset is large while the amount of missing data is negligible.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# The dataset is large, so the few rows containing missing values can simply be dropped.
df = df.dropna()
# A bare `df.isnull().sum()` in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is displayed), so check the result explicitly.
assert not df.isnull().values.any(), "dropna() left missing values behind"
print('Dataframe dimensions:', df.shape)
df.head()

In [None]:
# Remove the TAIL_NUM column before modelling.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
df = df.drop(columns=['TAIL_NUM'], errors='ignore')
df.head()

### Histogram Plots

In [None]:
# One 50-bin histogram per column that pandas can plot, on a large canvas.
hist_options = {"bins": 50, "figsize": (40, 30)}
df.hist(**hist_options)
plt.show()

Histogram plots give some idea of the distribution of the independent variables (numerical and dates).
(1). Most of the 'Wind Gust' values are near 0.
(2). Not all values in the 'Dew Point' column are numerical; some of them are strings, which is why its histogram is not shown.
(3). Categorical variables are - 
      'OP_UNIQUE_CARRIER', 'DEST', 'Dew Point', 'Wind', 'Condition' (We have to apply both label encoding and one-hot encoding to these variables)

#### Before going to ML models we have to handle 'Dew Point' column.

In [None]:
# Keep only the rows whose 'Dew Point' string consists purely of ASCII characters.
is_ascii = df['Dew Point'].map(lambda text: text.isascii())
df = df.loc[is_ascii]
print('Dataframe dimensions:', df.shape)

### Input and Output variables in Dataframe

In [None]:
# TAXI_OUT is the regression target; every other column is an input feature.
Y = df['TAXI_OUT']
X = df.drop(columns=['TAXI_OUT'])
print('Input dimensions:', X.shape)
print('Output dimensions:', Y.shape)
X.head()

### Train-Test Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. A fixed random_state makes the split
# (and therefore every error reported from it) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)
print('Input training dimensions:', X_train.shape)
print('Input testing dimensions:', X_test.shape)
print('Output training dimensions:', Y_train.shape)
print('Output testing dimensions:', Y_test.shape)

# (A). Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance; it is refitted each time fit/fit_transform is called on it.
labelencoder = LabelEncoder()

In [None]:
# Work on copies so the label-encoded frames do not alter X_train / X_test.
X1_train, X1_test = X_train.copy(), X_test.copy()

In [None]:
# Label-encode every categorical column.
# The encoder is fitted ONCE per column on the union of the train and test values
# and then used to transform both splits. Calling fit_transform separately on each
# split (as before) can assign different integers to the same category in train
# and test, which makes the test features meaningless to a model fitted on train.
categorical_columns = ['OP_UNIQUE_CARRIER', 'DEST', 'Dew Point', 'Wind', 'Condition']
for column in categorical_columns:
    train_values = X1_train[column].astype(str)
    test_values = X1_test[column].astype(str)
    # Fitting on the union also avoids an "unseen label" error for categories
    # that occur only in the test split.
    labelencoder.fit(pd.concat([train_values, test_values]))
    X1_train[column] = labelencoder.transform(train_values)
    X1_test[column] = labelencoder.transform(test_values)

In [None]:
# Preview the first five label-encoded training rows.
X1_train.head(n=5)

## Training and Predicting

In [None]:
from sklearn.metrics import mean_squared_error

#### 1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary least squares on the label-encoded features and score the held-out split.
linear_reg = LinearRegression()
linear_reg.fit(X1_train, Y_train)
Y_predicted1 = linear_reg.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l1 = np.sqrt(mean_squared_error(Y_test, Y_predicted1))
print('Linear Regression Error: ', l1)

#### 2. Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings on the label-encoded features.
ridge_reg = Ridge()
ridge_reg.fit(X1_train, Y_train)
Y_predicted2 = ridge_reg.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l2 = np.sqrt(mean_squared_error(Y_test, Y_predicted2))
print('Ridge Regression Error: ', l2)

#### 3. Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with default settings on the label-encoded features.
lasso_reg = Lasso()
lasso_reg.fit(X1_train, Y_train)
Y_predicted3 = lasso_reg.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l3 = np.sqrt(mean_squared_error(Y_test, Y_predicted3))
print('Lasso Regression Error: ', l3)

#### 4. KNN Model

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression (k = 200) on the label-encoded features.
knn_reg = KNeighborsRegressor(n_neighbors=200)
knn_reg.fit(X1_train, Y_train)
Y_predicted4 = knn_reg.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l4 = np.sqrt(mean_squared_error(Y_test, Y_predicted4))
print('KNN Model Error: ', l4)

#### 5. SVR

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Standardise the features before fitting SVR.
# The scaler is fitted on the training split only and the SAME fitted scaler is
# applied to the test split. Fitting a second scaler on the test data (as before)
# scales train and test with different means/variances, so the model is evaluated
# on inputs from a different feature space than it was trained on.
s1 = StandardScaler()
x1_train = s1.fit_transform(X1_train)
x1_test = s1.transform(X1_test)
sv_reg = SVR()
sv_reg.fit(x1_train, Y_train)
Y_predicted5 = sv_reg.predict(x1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l5 = np.sqrt(mean_squared_error(Y_test, Y_predicted5))
print('SVR Error: ', l5)

#### 6. Bayesian Ridge

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression on the label-encoded features.
bayes_reg = BayesianRidge()
bayes_reg.fit(X1_train, Y_train)
Y_predicted6 = bayes_reg.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l6 = np.sqrt(mean_squared_error(Y_test, Y_predicted6))
# The estimator is BayesianRidge, not Naive Bayes, so the printed label says so.
print('Bayesian Ridge Error:', l6)

#### 7. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the label-encoded features. The forest is randomised, so a
# fixed random_state is needed for the reported error to be reproducible.
random_for = RandomForestRegressor(random_state=42)
random_for.fit(X1_train, Y_train)
Y_predicted7 = random_for.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l7 = np.sqrt(mean_squared_error(Y_test, Y_predicted7))
print('Random Forest Error:', l7)

#### 8. LightGBM

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default settings on the label-encoded features.
light_reg = LGBMRegressor()
light_reg.fit(X1_train, Y_train)
Y_predicted8 = light_reg.predict(X1_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
l8 = np.sqrt(mean_squared_error(Y_test, Y_predicted8))
print('LightGBM Error:', l8)


# (B). One-Hot Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# One-hot encode the three listed columns on a copy of X; DEST and Dew Point
# are turned into integer labels instead.
one_hot_columns = ['OP_UNIQUE_CARRIER', 'Wind', 'Condition']
X2 = pd.get_dummies(X.copy(), columns=one_hot_columns)
label_encoder = LabelEncoder()
X2['DEST'] = label_encoder.fit_transform(X2['DEST'].astype(str))
X2['Dew Point'] = label_encoder.fit_transform(X2['Dew Point'])

# One_Hot encoding only for 'OP_UNIQUE_CARRIER','Wind','Condition' variables
# Label encoding for DEST and Dew Point because unique values for them are more than 50.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing. A fixed random_state makes the split
# (and therefore every error reported from it) reproducible across runs.
X2_train, X2_test, Y_train, Y_test = train_test_split(
    X2, Y, test_size=0.1, random_state=42
)
print('Input training dimensions:', X2_train.shape)
print('Input testing dimensions:', X2_test.shape)
print('Output training dimensions:', Y_train.shape)
print('Output testing dimensions:', Y_test.shape)

### Training and Predicting

In [None]:
from sklearn.metrics import mean_squared_error

#### 1. Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the one-hot-encoded features and score the held-out split.
linear_reg = LinearRegression()
linear_reg.fit(X2_train, Y_train)
Y_predicted1 = linear_reg.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h1 = np.sqrt(mean_squared_error(Y_test, Y_predicted1))
print('Linear Regression Error: ', h1)

#### 2. Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with default settings on the one-hot-encoded features.
ridge_reg = Ridge()
ridge_reg.fit(X2_train, Y_train)
Y_predicted2 = ridge_reg.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h2 = np.sqrt(mean_squared_error(Y_test, Y_predicted2))
print('Ridge Regression Error: ', h2)

#### 3. Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with default settings on the one-hot-encoded features.
lasso_reg = Lasso()
lasso_reg.fit(X2_train, Y_train)
Y_predicted3 = lasso_reg.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h3 = np.sqrt(mean_squared_error(Y_test, Y_predicted3))
print('Lasso Regression Error: ', h3)

#### 4. KNN Model

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression (k = 200) on the one-hot-encoded features.
knn_reg = KNeighborsRegressor(n_neighbors=200)
knn_reg.fit(X2_train, Y_train)
Y_predicted4 = knn_reg.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h4 = np.sqrt(mean_squared_error(Y_test, Y_predicted4))
print('KNN Model Error: ', h4)

#### 5. SVR

In [None]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Standardise the features before fitting SVR.
# The scaler is fitted on the training split only and the SAME fitted scaler is
# applied to the test split. Fitting a second scaler on the test data (as before)
# scales train and test with different means/variances, so the model is evaluated
# on inputs from a different feature space than it was trained on.
s1 = StandardScaler()
x2_train = s1.fit_transform(X2_train)
x2_test = s1.transform(X2_test)
sv_reg = SVR()
sv_reg.fit(x2_train, Y_train)
Y_predicted5 = sv_reg.predict(x2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h5 = np.sqrt(mean_squared_error(Y_test, Y_predicted5))
print('SVR Error: ', h5)

#### 6. Bayesian Ridge

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge regression on the one-hot-encoded features.
bayes_reg = BayesianRidge()
bayes_reg.fit(X2_train, Y_train)
Y_predicted6 = bayes_reg.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h6 = np.sqrt(mean_squared_error(Y_test, Y_predicted6))
# The estimator is BayesianRidge, not Naive Bayes, so the printed label says so.
print('Bayesian Ridge Error:', h6)

#### 7. Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the one-hot-encoded features. The forest is randomised, so a
# fixed random_state is needed for the reported error to be reproducible.
random_for = RandomForestRegressor(random_state=42)
random_for.fit(X2_train, Y_train)
Y_predicted7 = random_for.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h7 = np.sqrt(mean_squared_error(Y_test, Y_predicted7))
print('Random Forest Error:', h7)

#### 8. LightGBM

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default settings on the one-hot-encoded features.
light_reg = LGBMRegressor()
light_reg.fit(X2_train, Y_train)
Y_predicted8 = light_reg.predict(X2_test)
# RMSE as sqrt(MSE): the `squared=False` flag is deprecated in scikit-learn 1.4 and
# removed in 1.6. Arguments are passed in the documented (y_true, y_pred) order.
h8 = np.sqrt(mean_squared_error(Y_test, Y_predicted8))
print('LightGBM Error:', h8)

### Comparison And Conclusion

#### Label Encoding Errors(In Order):

In [None]:
# RMSE of the eight models trained on label-encoded features, in notebook order.
print(*(l1, l2, l3, l4, l5, l6, l7, l8))

#### One Hot Encoding Errors(in Order)

In [None]:
# RMSE of the eight models trained on one-hot-encoded features, in notebook order.
print(*(h1, h2, h3, h4, h5, h6, h7, h8))

In [None]:
import matplotlib.pyplot as plt
from matplotlib import style

In [None]:
# Compare the RMSE of every model under the two encodings on one line chart.
model_names = ["linear", "Ridge", "Lasso", "KNN", "SVR", "BR", "RandFor", "LGBM"]
label_errors = np.array([l1, l2, l3, l4, l5, l6, l7, l8])
onehot_errors = np.array([h1, h2, h3, h4, h5, h6, h7, h8])
style.use("ggplot")
plt.plot(model_names, label_errors, "g", label="Label Encoding")
plt.plot(model_names, onehot_errors, "b", label="One Hot Encoding")
# The legend entries come from the `label=` given to each plot call.
plt.legend()
plt.xlabel("ML Models")
plt.ylabel("RMSE")
# `plt.show` without parentheses only references the function; it must be called.
plt.show()

#### 1. In general, one-hot encoding gives good predictions for almost every machine learning model.
#### 2. Random Forest is the best of the 8 algorithms for this prediction task.