In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
import seaborn as sns
%matplotlib inline

In [None]:
import warnings

# Notebook-wide display settings: suppress every warning and let pandas
# print all columns of a frame instead of truncating them.
warnings.simplefilter("ignore")
pd.set_option('display.max_column', None)

In [None]:
# Load the raw CSV into the working DataFrame `df`.
csv_path = '../input/flight-take-off-data-jfk-airport/M1_final.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# (rows, columns) of the frame before any cleaning.
df.shape

In [None]:
# Column labels available in the dataset.
df.columns

In [None]:
# Per-column dtypes; non-numeric columns show up as `object`.
df.dtypes

Data Cleaning:

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [None]:
# Shape of the frame after dropping null rows and duplicate rows.
df.shape

In [None]:
# Inspect the distinct raw values of 'Dew Point' before cleaning the column.
df['Dew Point'].unique()

In [None]:
# Clean 'Dew Point' so it can be cast to int.
# The non-breaking space (U+00A0) is whitespace, not a digit: substituting '0'
# for it would turn a padded value such as '6\xa0' into '60'. Remove it instead,
# and only fall back to '0' when nothing but whitespace was present.
# NOTE(review): assumes the NBSP is padding around the number - confirm against
# the unique() output of this column.
dew_point = (
    df['Dew Point']
    .astype(str)
    .str.replace(u'\xa0', u'', regex=False)
    .str.strip()
)
df['Dew Point'] = dew_point.replace('', '0')

In [None]:
# Re-check the distinct 'Dew Point' values after the clean-up above.
df['Dew Point'].unique()

In [None]:
# Cast the cleaned 'Dew Point' strings to integers.
df = df.astype({'Dew Point': int})

In [None]:
# Confirm the column dtypes after the 'Dew Point' conversion.
df.dtypes

In [None]:
#correlation heat map
# Correlate numeric/bool columns only. At this point the frame still carries the
# non-numeric columns that are encoded later, and DataFrame.corr() no longer drops
# them silently on pandas >= 2.0 (it raises instead). Selecting the columns
# explicitly behaves the same on old and new pandas.
numeric_df = df.select_dtypes(include=[np.number, 'bool'])
plt.figure(figsize=(15,15))
sns.heatmap(numeric_df.corr(), annot=True,cmap="YlGnBu")
plt.show()

In [None]:
#correlation of variables with taxi out time
# Restrict to numeric/bool columns first: DataFrame.corr() raises on string
# columns with pandas >= 2.0, while older versions dropped them silently.
correlations = (
    df.select_dtypes(include=[np.number, 'bool'])
    .corr()['TAXI_OUT']
    .sort_values()
)
print(correlations)

Modelling:

In [None]:
# Target (dependent variable) is TAXI_OUT; every other column is a feature.
y_1 = df["TAXI_OUT"]
x_1 = df.drop(columns=["TAXI_OUT"])

In [None]:
# Collect the names of the non-numeric feature columns.
categorical_data = x_1.select_dtypes(exclude=[np.number])
cat_col = list(categorical_data.columns)
cat_col

Label Encoding:

In [None]:
# Replace every categorical column by integer codes, one column at a time.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cat_col:
    # The single encoder is refitted for each column, so afterwards it only
    # remembers the classes of the last column processed.
    x_1[col] = le.fit_transform(x_1[col])
x_1[cat_col]

In [None]:
#standardization of independent variables
from sklearn.preprocessing import StandardScaler 
std = StandardScaler() 
x_1= std.fit_transform(x_1)

In [None]:
#Splitting of data set into test and train
from sklearn.model_selection import  train_test_split
X_train,X_test,y_train,y_test=train_test_split(x_1,y_1,test_size=0.1,random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
# Running score logs: model_error() appends one entry to each per evaluated model.
MAE = []
RMSE = []
def model_error(y_pred, y_test):
    """Print MAE and RMSE for one set of predictions and record both scores.

    Parameters
    ----------
    y_pred : array-like
        Predicted target values.
    y_test : array-like
        True target values for the same rows.

    Returns
    -------
    None
        The scores are printed and appended to the module-level ``MAE`` and
        ``RMSE`` lists (one entry each per call).
    """
    # Compute each metric once and reuse it for printing and logging.
    # scikit-learn's metric signature is (y_true, y_pred); both metrics are
    # symmetric, so the values are the same as with the arguments swapped.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("MAE: {}".format(mae))
    MAE.append(mae)
    print("RMSE: {}".format(rmse))
    RMSE.append(rmse)
    print('\n')

In [None]:
#importing models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [None]:
# Predictions with label encoding
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Start from empty score logs so that re-running this cell does not stack
# duplicate entries (model_error appends to these module-level lists).
MAE = []
RMSE = []

n_train = X_train.shape[0]

#linear regression
lr = LinearRegression()

#Ridge Regression
# `normalize=True` was removed from Ridge in scikit-learn 1.2. It divided each
# centred column by its L2 norm (= std * sqrt(n_samples)); scaling by the std in
# a pipeline and multiplying alpha by n_samples gives the same fit.
ridge = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.05 * n_train))

#Lasso Regression
# Same replacement for Lasso, where the equivalent alpha is alpha * sqrt(n_samples).
lasso = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=0.05 * np.sqrt(n_train)))

#Support Vector Regression
svr = SVR()

#Random Forest
rf = RandomForestRegressor()

#KNN model
knn = KNeighborsRegressor(n_neighbors = 200)

#Light GBM
lgbm = LGBMRegressor()

#Naive Bayes'
# NOTE(review): BayesianRidge is Bayesian ridge regression, not a naive Bayes
# model; the printed title is kept unchanged.
nb = BayesianRidge()

# Fit, predict and score the models in a fixed order; model_error prints the
# metrics and appends them to MAE / RMSE in that same order.
for title, model in [
    ('Linear Regression:\n', lr),
    ('Ridge Regression:\n', ridge),
    ('Lasso Regression:\n', lasso),
    ('SVM:\n', svr),
    ('Random Forest Regression:\n', rf),
    ('KNN Regression:\n', knn),
    ('LGBM Regression:\n', lgbm),
    ('Naive Bayes:\n', nb),
]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(title)
    model_error(y_pred, y_test)

# Snapshot (copy) the scores so that later appends to MAE / RMSE cannot alter
# the label-encoding results.
LABEL_MAE = list(MAE)
LABEL_RMSE = list(RMSE)

In [None]:
# Leave TAIL_NUM out of the frame before one-hot encoding.
df = df.drop(columns=['TAIL_NUM'])

In [None]:
# Names of the remaining non-numeric columns of `df`.
categorical_data = df.select_dtypes(exclude=[np.number])
cat_col = list(categorical_data.columns)
cat_col

In [None]:
# One-hot encode the categorical columns; the first level of each column is
# dropped (drop_first=True).
df = pd.get_dummies(data=df, columns=cat_col, drop_first=True)
df.head()

In [None]:
# Target is TAXI_OUT; the one-hot encoded remainder of the frame is the feature set.
y_2 = df["TAXI_OUT"]
x_2 = df.drop(columns=["TAXI_OUT"])

In [None]:
#standardization of independent variables
from sklearn.preprocessing import StandardScaler 
std = StandardScaler() 
x_2= std.fit_transform(x_2)

In [None]:
#Splitting of data set into test and train
from sklearn.model_selection import  train_test_split
X_train,X_test,y_train,y_test=train_test_split(x_2,y_2,test_size=0.1,random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
# Predictions with one hot encoding
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

n_train = X_train.shape[0]

#linear regression
lr = LinearRegression()

#Ridge Regression
# `normalize=True` was removed from Ridge in scikit-learn 1.2. It divided each
# centred column by its L2 norm (= std * sqrt(n_samples)); scaling by the std in
# a pipeline and multiplying alpha by n_samples gives the same fit.
ridge = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.05 * n_train))

#Lasso Regression
# Same replacement for Lasso, where the equivalent alpha is alpha * sqrt(n_samples).
lasso = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=0.05 * np.sqrt(n_train)))

#Support Vector Regression
svr = SVR()

#Random Forest
rf = RandomForestRegressor()

#KNN model
knn = KNeighborsRegressor(n_neighbors = 200)

#Light GBM
lgbm = LGBMRegressor()

#Naive Bayes'
# NOTE(review): BayesianRidge is Bayesian ridge regression, not a naive Bayes
# model; the printed title is kept unchanged.
nb = BayesianRidge()

# Fit, predict and score the models in a fixed order; model_error prints the
# metrics and appends them to MAE / RMSE in that same order.
for title, model in [
    ('Linear Regression:\n', lr),
    ('Ridge Regression:\n', ridge),
    ('Lasso Regression:\n', lasso),
    ('SVM:\n', svr),
    ('Random Forest Regression:\n', rf),
    ('KNN Regression:\n', knn),
    ('LGBM Regression:\n', lgbm),
    ('Naive Bayes:\n', nb),
]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(title)
    model_error(y_pred, y_test)

# OHE_MAE / OHE_RMSE refer to the running logs as they stand now: the scores
# appended by this cell come last, after anything that was already in the lists
# when the cell started.
OHE_MAE = MAE
OHE_RMSE = RMSE

# Rebind the running logs to fresh lists for any later evaluation.
MAE = []
RMSE = []

In [None]:
# Compare the MAE of every model under the two encodings.
array1 = np.array(LABEL_MAE)
array2 = np.array(OHE_MAE)

models=["Linear Regression","Ridge Regression","Lasso Regression","Support Vector Regression","Random forest regression","KNN Regression","Light GBM","Naive Bayes"]
mod=["LR","RR","LS","SVR","RF","KNN","LGBM","NB"]
n_models = len(mod)
# The score lists may hold more than n_models entries because both runs append
# to one shared log: label-encoding scores come first, one-hot scores last.
# Taking the first / last n_models entries works in either case.
plt.plot(mod,array1[:n_models])
plt.plot(mod,array2[-n_models:])
plt.legend(["Label Encoding","One Hot Encoding"])
plt.xlabel("models")
plt.ylabel("MAE")
# Print the abbreviation key for all models.
for short_name, full_name in zip(mod, models):
  print(short_name,"-",full_name)
plt.show()

In [None]:
# Compare the RMSE of every model under the two encodings.
array1 = np.array(LABEL_RMSE)
array2 = np.array(OHE_RMSE)

models=["Linear Regression","Ridge Regression","Lasso Regression","Support Vector Regression","Random forest regression","KNN Regression","Light GBM","Naive Bayes"]
mod=["LR","RR","LS","SVR","RF","KNN","LGBM","NB"]
n_models = len(mod)
# The score lists may hold more than n_models entries because both runs append
# to one shared log: label-encoding scores come first, one-hot scores last.
# Taking the first / last n_models entries works in either case.
plt.plot(mod,array1[:n_models])
plt.plot(mod,array2[-n_models:])
plt.legend(["Label Encoding","One Hot Encoding"])
plt.xlabel("models")
plt.ylabel("RMSE")
# Print the abbreviation key for all models.
for short_name, full_name in zip(mod, models):
  print(short_name,"-",full_name)
plt.show()