In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the flight records from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="flight-data.csv")

In [3]:
# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,temp,...,precip,pressure,visib,type,manufacturer,model,engines,seats,engine,arr_delay
0,B6,1201,N643JB,JFK,FLL,135.0,1069,21,50,73.04,...,0.0,1019.4,10.0,Fixed wing multi engine,AIRBUS,A320-232,2.0,200.0,Turbo-fan,1.0
1,DL,1773,N3743H,JFK,LAS,287.0,2248,15,35,71.06,...,0.0,1020.4,10.0,Fixed wing multi engine,BOEING,737-832,2.0,189.0,Turbo-jet,0.0
2,EV,4572,N11107,EWR,GSP,84.0,594,15,9,78.8,...,0.0,,5.0,Fixed wing multi engine,EMBRAER,EMB-145XR,2.0,55.0,Turbo-fan,1.0
3,B6,179,N526JB,JFK,PHX,305.0,2153,17,35,44.06,...,0.0,1009.2,10.0,,,,,,,1.0
4,US,1733,N162UW,LGA,CLT,78.0,544,7,53,69.98,...,0.0,1021.3,10.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A321-211,2.0,199.0,Turbo-jet,0.0


In [4]:
# List the column labels of the loaded frame.
data.columns

Index(['carrier', 'flight', 'tailnum', 'origin', 'dest', 'air_time',
       'distance', 'hour', 'minute', 'temp', 'dewp', 'humid', 'wind_dir',
       'wind_speed', 'wind_gust', 'precip', 'pressure', 'visib', 'type',
       'manufacturer', 'model', 'engines', 'seats', 'engine', 'arr_delay'],
      dtype='object')

In [5]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

carrier            0
flight             0
tailnum           74
origin             0
dest               0
air_time         284
distance           0
hour               0
minute             0
temp              48
dewp              48
humid             48
wind_dir         269
wind_speed        49
wind_gust       7613
precip            46
pressure        1192
visib             46
type            1501
manufacturer    1501
model           1501
engines         1501
seats           1501
engine          1501
arr_delay        284
dtype: int64

In [6]:
# Print per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carrier       10000 non-null  object 
 1   flight        10000 non-null  int64  
 2   tailnum       9926 non-null   object 
 3   origin        10000 non-null  object 
 4   dest          10000 non-null  object 
 5   air_time      9716 non-null   float64
 6   distance      10000 non-null  int64  
 7   hour          10000 non-null  int64  
 8   minute        10000 non-null  int64  
 9   temp          9952 non-null   float64
 10  dewp          9952 non-null   float64
 11  humid         9952 non-null   float64
 12  wind_dir      9731 non-null   float64
 13  wind_speed    9951 non-null   float64
 14  wind_gust     2387 non-null   float64
 15  precip        9954 non-null   float64
 16  pressure      8808 non-null   float64
 17  visib         9954 non-null   float64
 18  type          8499 non-null

In [7]:
# Replace missing arr_delay values with 0 so the target column has no NaNs.
# NOTE(review): this labels every flight whose delay is unknown as 0 — confirm
# that is intended rather than dropping those rows before modelling.
data = data.assign(arr_delay=data['arr_delay'].fillna(0))

In [8]:
# Print per-column non-null counts and dtypes again, after filling arr_delay.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   carrier       10000 non-null  object 
 1   flight        10000 non-null  int64  
 2   tailnum       9926 non-null   object 
 3   origin        10000 non-null  object 
 4   dest          10000 non-null  object 
 5   air_time      9716 non-null   float64
 6   distance      10000 non-null  int64  
 7   hour          10000 non-null  int64  
 8   minute        10000 non-null  int64  
 9   temp          9952 non-null   float64
 10  dewp          9952 non-null   float64
 11  humid         9952 non-null   float64
 12  wind_dir      9731 non-null   float64
 13  wind_speed    9951 non-null   float64
 14  wind_gust     2387 non-null   float64
 15  precip        9954 non-null   float64
 16  pressure      8808 non-null   float64
 17  visib         9954 non-null   float64
 18  type          8499 non-null

In [9]:
# Missing values per column: total rows minus the non-null count of each column.
len(data) - data.count()

carrier            0
flight             0
tailnum           74
origin             0
dest               0
air_time         284
distance           0
hour               0
minute             0
temp              48
dewp              48
humid             48
wind_dir         269
wind_speed        49
wind_gust       7613
precip            46
pressure        1192
visib             46
type            1501
manufacturer    1501
model           1501
engines         1501
seats           1501
engine          1501
arr_delay          0
dtype: int64

In [10]:
# Separate the predictor columns from the arr_delay target.
x = data.drop(columns="arr_delay")
y = data.loc[:, "arr_delay"]

In [11]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [12]:
# Remove columns from `data` that are not used as features. Reassigning with
# errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
# a second run no longer raises KeyError for columns that are already gone.
# Note: x / x_train / x_test were split off before this cell and still contain
# these columns. They never reach the models because the ColumnTransformer
# below only selects `cats` and `nums`; every other column is dropped by its
# default remainder setting.
data = data.drop(
    columns=['tailnum', 'dest', 'manufacturer', 'model', 'carrier'],
    errors='ignore',
)

# Feature groups fed to the preprocessing branches.
cats = ['engine', 'origin', 'type']
nums = ['temp', 'dewp', 'humid', 'wind_dir', 'wind_speed', 'wind_gust',
        'precip', 'pressure', 'visib', 'air_time']

# Numeric branch: fill NaNs with the column mean, then standardize.
num_pipe = Pipeline([
    ('impute_missing', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('standardize_num', StandardScaler()),
])

# Categorical branch: fill NaNs with the most frequent level, then one-hot
# encode. handle_unknown='ignore' lets levels unseen during fit encode as all
# zeros instead of raising at predict time.
# NOTE(review): `sparse=False` was renamed to `sparse_output=False` in newer
# scikit-learn releases — switch the keyword if this raises on your version.
cat_pipe = Pipeline([
    ('impute_missing', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('create_dummies', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Apply each branch to its own column group.
processing_pipeline = ColumnTransformer(transformers=[
    ('proc_numeric', num_pipe, nums),
    ('create_dummies', cat_pipe, cats),
])
print("Pipeline created")

Pipeline created


In [13]:
# Three candidate classifiers, each behind the same preprocessing step.
# The step names ('LR', 'DT', 'SVC') are part of the parameter paths used when
# tuning, so they must stay stable.
pipe1 = Pipeline([
    ('processing_pipeline', processing_pipeline),
    ('LR', LogisticRegression(C=0.1, solver='lbfgs', max_iter=500, penalty='l2')),
])

pipe2 = Pipeline([
    ('processing_pipeline', processing_pipeline),
    ('DT', DecisionTreeClassifier(max_depth=2, criterion='entropy')),
])

# probability=True is required so the SVC exposes predict_proba
# (needed for soft voting).
pipe3 = Pipeline([
    ('processing_pipeline', processing_pipeline),
    ('SVC', SVC(kernel='rbf', C=1, probability=True)),
])

labs = ['Logistic Regression', 'Decision Tree', 'SVM']
clfs = [pipe1, pipe2, pipe3]

# Zip at the point of use so `clfs` stays a reusable list rather than being
# rebound to a one-shot iterator that is exhausted after the first loop.
for lab, clf in zip(labs, clfs):
    # 10-fold cross-validated ROC AUC on the training split only.
    scores = cross_val_score(estimator=clf, X=x_train, y=y_train, cv=10, scoring='roc_auc')
    print(f'ROC AUC {scores.mean():.2f} (+/- {scores.std():.2f}) [{lab}]')

ROC AUC 0.58 (+/- 0.02) [Logistic Regression]
ROC AUC 0.58 (+/- 0.02) [Decision Tree]
ROC AUC 0.60 (+/- 0.02) [SVM]


In [14]:
from sklearn.ensemble import VotingClassifier

# Combine the three pipelines into one soft-voting ensemble. The short names
# ('lr', 'dt', 'svc') become the prefixes of the ensemble's parameter paths.
ems = [
    ('lr', pipe1),
    ('dt', pipe2),
    ('svc', pipe3),
]
clf4 = VotingClassifier(estimators=ems, voting='soft', weights=None)

# Score the ensemble with the same 10-fold ROC AUC protocol as the base models.
scores = cross_val_score(
    estimator=clf4,
    X=x_train,
    y=y_train,
    scoring='roc_auc',
    cv=10,
)
print(f'ROC AUC {scores.mean():.2f} (+/- {scores.std():.2f}) [Ensemble]')

ROC AUC 0.61 (+/- 0.02) [Ensemble]


In [None]:
# GridSearchCV is already imported in the first cell.
# Parameter paths follow <ensemble member>__<pipeline step>__<parameter>:
# 'lr' / 'dt' / 'svc' are the VotingClassifier member names and
# 'LR' / 'DT' / 'SVC' are the final step names inside each pipeline.
params = {
    'lr__LR__C': [0.001, 0.1, 1, 10],
    'dt__DT__max_depth': [1, 2, 3, 5],
    'svc__SVC__C': [0.001, 0.01, 0.1, 1],
}

# Exhaustive search over 4 x 4 x 4 = 64 combinations with 10-fold CV.
# refit=True retrains the best combination on the full training split.
gs = GridSearchCV(estimator=clf4, param_grid=params, scoring='roc_auc', refit=True, cv=10)
gs = gs.fit(x_train, y_train)

# Score on the held-out split. The split variables are lowercase
# (x_test / y_test); `X_test` was never defined and raised NameError.
gs_score = gs.score(x_test, y_test)

print(f'VotingClassifier Test ROC AUC: {gs_score:.2%}')

In [None]:
from sklearn.metrics import accuracy_score

# Fit the decision-tree pipeline on the training split. The pipeline is named
# `pipe2`; `p2` was never defined and raised NameError.
tree = pipe2
tree = tree.fit(x_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
tree_train_predict = tree.predict(x_train)
tree_test_predict = tree.predict(x_test)

tree_train_score = accuracy_score(y_train, tree_train_predict)
tree_test_score = accuracy_score(y_test, tree_test_predict)

print(f'Tree Training Score: {tree_train_score:.2%}')
print(f'Tree Test Score: {tree_test_score:.2%}')

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost (default settings) behind the shared preprocessing step.
p4 = Pipeline([('processing', processing_pipeline),
               ('AdaBoost', AdaBoostClassifier())])
ada = p4.fit(x_train, y_train)

# Pass the DataFrames themselves, not `.values`: the preprocessing step selects
# columns by name, which only works on DataFrame input and fails on a bare
# NumPy array.
ada_train_predict = ada.predict(x_train)
ada_test_predict = ada.predict(x_test)

# accuracy_score is imported in the preceding decision-tree cell.
ada_train_score = accuracy_score(y_train, ada_train_predict)
ada_test_score = accuracy_score(y_test, ada_test_predict)

print(f'AdaBoost Training Score: {ada_train_score:.2%}')
print(f'AdaBoost Test Score: {ada_test_score:.2%}')