In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print
# its full path, one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
 
%matplotlib inline

In [None]:
# Display option: never truncate the column list when a frame is rendered.
pd.set_option('display.max_columns', None)

# Read the CSV at `path` into `df` and preview its first rows.
path = "../input/flight-take-off-data-jfk-airport/M1_final.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.head()

In [None]:
# Discard the four listed columns, then remove every row that still has a
# missing value, and summarise what is left.
unused_columns = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'MONTH', 'TAIL_NUM']
df = df.drop(columns=unused_columns).dropna()
df.info()


In [None]:
# Distinct values of the target column.
df.loc[:, 'TAXI_OUT'].unique()

Pre-processing the Dew Point column

In [None]:
# Convert the 'Dew Point' column to integers; all other columns keep their dtype.
df = df.astype({'Dew Point': 'int'})

In [None]:
# Distinct values of 'Dew Point' after the integer conversion.
df.loc[:, 'Dew Point'].unique()

Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Target vector and feature matrix as plain NumPy arrays; the feature matrix
# is every column of `df` except 'TAXI_OUT'.
y = df['TAXI_OUT'].to_numpy()
X = df.drop(columns='TAXI_OUT').to_numpy()

# Hold out 10% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=101
)

Regression using one-hot encoding

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

def get_pipeline(machine_learning_model, one_hot_cols, scale=False):
    """Build a pipeline that one-hot encodes selected columns, then regresses.

    Parameters
    ----------
    machine_learning_model : estimator
        Regressor instance placed as the final pipeline step.
    one_hot_cols : list
        Column selectors handed to ``ColumnTransformer`` for one-hot
        encoding; every other column is passed through unchanged.
    scale : bool, default False
        When True, a ``MinMaxScaler`` step named ``'min_maxer'`` is inserted
        between the encoder and the regressor. The default reproduces the
        original two-step pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'one_hot'``, optionally
        ``'min_maxer'``, and ``'regressor'``.
    """
    encoder_options = {'categories': 'auto', 'handle_unknown': 'ignore'}
    try:
        # scikit-learn >= 1.2 names the dense-output switch ``sparse_output``.
        encoder = OneHotEncoder(sparse_output=False, **encoder_options)
    except TypeError:
        # Older releases only accept the original ``sparse`` keyword.
        encoder = OneHotEncoder(sparse=False, **encoder_options)

    one_hotter = ColumnTransformer(
        [('onehot_cols', encoder, one_hot_cols)],
        remainder='passthrough',
    )

    steps = [('one_hot', one_hotter)]
    if scale:
        steps.append(('min_maxer', MinMaxScaler()))
    steps.append(('regressor', machine_learning_model))

    return Pipeline(steps)
    

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


def evaluate1(regressors, datasets, verbose=True):
    """Fit each regressor on the global train split and chart its test RMSE.

    For every entry in ``datasets`` and every estimator in ``regressors`` a
    pipeline is built with ``get_pipeline`` (one-hot columns taken from the
    entry's ``'1h'`` key), fitted on the module-level ``X_train``/``y_train``
    and scored on ``X_test``/``y_test``. The entry's ``'label'`` key is used
    only for reporting.

    The scores are stored as a DataFrame on ``evaluate1.df_one_hot_encoder``
    (columns ``Dataset``, ``Regressor``, ``RMSE``) and drawn as a bar plot of
    RMSE per regressor. Nothing is returned.
    """
    records = []
    for data in datasets:
        dataset_label = data['label']
        print(f'For {data["label"]}:')
        for regressor in regressors:
            model = get_pipeline(regressor, data['1h'])
            model.fit(X_train, y_train)

            # The text before the first '(' of the estimator's string form
            # serves as its display name.
            regressor_name = str(regressor).partition('(')[0]
            predictions = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))

            if verbose:
                print(f'Done {dataset_label} using {regressor_name}: {rmse}')
            records.append((dataset_label, regressor_name, rmse))

    # Results live on the function object so later cells can read them.
    evaluate1.df_one_hot_encoder = pd.DataFrame(
        records, columns=['Dataset', 'Regressor', 'RMSE']
    )
    plt.figure(figsize=(10,10))
    sns.barplot(x='RMSE', y='Regressor', data=evaluate1.df_one_hot_encoder)

    
# Benchmark every candidate regressor, one-hot encoding the feature columns
# at positions 0, 1, 11 and 15 of the feature matrix.
evaluate1(
    [
        LinearRegression(),
        Ridge(),
        Lasso(),
        KNeighborsRegressor(n_neighbors=3),
        SVR(),
        BayesianRidge(),
        # Seeded (same seed as the train/test split) so the forest's RMSE is
        # reproducible from one run to the next.
        RandomForestRegressor(random_state=101),
        LGBMRegressor(),
    ],
    [
        {'label': 'Taxi time', 'df': df, '1h': [0, 1, 11, 15]},
    ],
)

Regression using label encoding

In [None]:
# Target series and feature frame (every column except the target).
y = df['TAXI_OUT']
X = df.drop(columns=['TAXI_OUT'])

X["Dew Point"] = X["Dew Point"].astype(int)

# Columns stored with object dtype, in frame order.
obj_cols = X.select_dtypes(include='object').columns.tolist()
# Remaining columns, also in frame order. A set difference would return them
# in an arbitrary order that can change between runs.
num_cols = [col for col in X.columns if col not in obj_cols]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# One fitted encoder per object column, kept so the integer codes can be
# mapped back to the original labels.
label_encoders = {}

for col in obj_cols:
    labelencoder = LabelEncoder()
    # Encode from `df`, whose object columns are never overwritten, instead
    # of from `X[col]` itself. Re-running this cell then gives the same
    # codes rather than re-encoding the stringified codes of a previous run.
    X[col] = labelencoder.fit_transform(df[col].astype(str))
    label_encoders[col] = labelencoder

# Hold out 10% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# NOTE: get_pipeline is also defined in an earlier cell of this notebook;
# once this cell runs, the definition below is the one in effect.
def get_pipeline(machine_learning_model, one_hot_cols, scale=False):
    """Assemble an encode-then-regress pipeline.

    Parameters
    ----------
    machine_learning_model : estimator
        Regressor instance used as the last step of the pipeline.
    one_hot_cols : list
        Columns to one-hot encode via ``ColumnTransformer``; may be empty.
        All remaining columns are passed through untouched.
    scale : bool, default False
        If True, add a ``MinMaxScaler`` step (``'min_maxer'``) after the
        encoder. Leaving it False gives the original two-step pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline: ``'one_hot'`` -> [``'min_maxer'``] -> ``'regressor'``.
    """
    shared_kwargs = dict(categories='auto', handle_unknown='ignore')
    try:
        # Keyword used by scikit-learn 1.2 and later for dense output.
        one_hot_encoder = OneHotEncoder(sparse_output=False, **shared_kwargs)
    except TypeError:
        # Fall back to the keyword understood by older releases.
        one_hot_encoder = OneHotEncoder(sparse=False, **shared_kwargs)

    one_hotter = ColumnTransformer([
        ('onehot_cols', one_hot_encoder, one_hot_cols)
    ], remainder='passthrough')

    pipeline_steps = [('one_hot', one_hotter)]
    if scale:
        pipeline_steps.append(('min_maxer', MinMaxScaler()))
    pipeline_steps.append(('regressor', machine_learning_model))

    return Pipeline(pipeline_steps)
    

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor


def evaluate2(regressors, datasets, verbose=True):
    """Score each regressor on the global test split and plot the RMSEs.

    Each combination of a ``datasets`` entry and a ``regressors`` estimator
    gets its own pipeline from ``get_pipeline`` (using the entry's ``'1h'``
    columns), which is fitted on the module-level ``X_train``/``y_train``
    and evaluated on ``X_test``/``y_test``. The entry's ``'label'`` is only
    used in messages and in the results table.

    Side effects: progress is printed, the results table (columns
    ``Dataset``, ``Regressor``, ``RMSE``) is saved on
    ``evaluate2.df_label_encoder``, and a bar plot of RMSE per regressor is
    drawn. The function returns nothing.
    """
    dataset_column, regressor_column, rmse_column = [], [], []

    for data in datasets:
        dataset_label = data['label']
        print(f'For {data["label"]}:')

        for regressor in regressors:
            pipeline = get_pipeline(regressor, data['1h'])
            pipeline.fit(X_train, y_train)

            # Display name: the estimator's string form up to the first '('.
            regressor_name = str(regressor).partition('(')[0]
            test_error = mean_squared_error(y_test, pipeline.predict(X_test))
            rmse = np.sqrt(test_error)

            if verbose:
                print(f'Done {dataset_label} using {regressor_name}: {rmse}')

            dataset_column.append(dataset_label)
            regressor_column.append(regressor_name)
            rmse_column.append(rmse)

    # Stored on the function object so later cells can pick the table up.
    evaluate2.df_label_encoder = pd.DataFrame({
        'Dataset': dataset_column,
        'Regressor': regressor_column,
        'RMSE': rmse_column,
    })
    plt.figure(figsize=(10,10))
    sns.barplot(x='RMSE', y='Regressor', data=evaluate2.df_label_encoder)

    
# Benchmark every candidate regressor on the label-encoded features; no
# columns are one-hot encoded here (the '1h' list is empty).
evaluate2(
    [
        LinearRegression(),
        Ridge(),
        Lasso(),
        KNeighborsRegressor(n_neighbors=3),
        SVR(),
        BayesianRidge(),
        # Seeded (same seed as the train/test split) so the forest's RMSE is
        # reproducible from one run to the next.
        RandomForestRegressor(random_state=101),
        LGBMRegressor(),
    ],
    [
        {'label': 'Taxi time', 'df': df, '1h': []},
    ],
)

In [None]:
# RMSE per model for each encoding strategy, in the order the regressors
# were evaluated.
array1 = np.array(evaluate2.df_label_encoder['RMSE'])
array2 = np.array(evaluate1.df_one_hot_encoder['RMSE'])

# Short tick labels, one per regressor in evaluation order. The sixth model
# is BayesianRidge, so it is labelled as such rather than as Naive Bayes.
model_names = ['Linear', 'Ridge', 'Lasso', 'KNN', 'SVM', 'Bayesian Ridge', 'Random Forest', 'LGBM']

plt.figure(figsize=(12,10))
plt.plot(model_names, array1)
plt.plot(model_names, array2)

plt.legend(['Label Encoder', 'One Hot Encoder'])

plt.title('Test RMSE by model and encoding')
plt.xlabel('Models')
plt.ylabel('RMSE')
plt.show()