In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [11]:
f_data = pd.read_csv('FlightDelays.csv')

In [12]:
from collections import defaultdict

In [13]:
d = defaultdict(LabelEncoder)

In [14]:
#selecting cols that need to be transformed
df = pd.DataFrame(f_data, columns = ['CARRIER', 'DEST', 'FL_DATE', 'ORIGIN','TAIL_NUM','Flight_Status'])

In [15]:
# Encoding the variable
fit = df.apply(lambda x: d[x.name].fit_transform(x))

In [16]:
# Inverse the encoded
fit.apply(lambda x: d[x.name].inverse_transform(x))

Unnamed: 0,CARRIER,DEST,FL_DATE,ORIGIN,TAIL_NUM,Flight_Status
0,OH,JFK,01-01-2004,BWI,N940CA,ontime
1,DH,JFK,01-01-2004,DCA,N405FJ,ontime
2,DH,LGA,01-01-2004,IAD,N695BR,ontime
3,DH,LGA,01-01-2004,IAD,N662BR,ontime
4,DH,LGA,01-01-2004,IAD,N698BR,ontime
...,...,...,...,...,...,...
2196,RU,EWR,1/31/2004,DCA,N15555,ontime
2197,RU,EWR,1/31/2004,IAD,N16976,ontime
2198,RU,EWR,1/31/2004,DCA,N14902,ontime
2199,RU,EWR,1/31/2004,DCA,N16961,ontime


In [17]:
# Using the dictionary to label future data
flight_df=pd.DataFrame(df.apply(lambda x: d[x.name].transform(x)))

In [20]:
#add the rest of the cols to the dataframe
flight_df['CRS_DEP_TIME']=f_data['CRS_DEP_TIME']
flight_df['DEP_TIME']=f_data['DEP_TIME']
flight_df['DISTANCE']=f_data['DISTANCE']
flight_df['FL_NUM']=f_data['FL_NUM']
flight_df['Weather']=f_data['Weather']
flight_df['DAY_WEEK']=f_data['DAY_WEEK']
flight_df['DAY_OF_MONTH']=f_data['DAY_OF_MONTH']

In [21]:
#print top 10 values of the dataset
flight_df.head(10)

Unnamed: 0,CARRIER,DEST,FL_DATE,ORIGIN,TAIL_NUM,Flight_Status,CRS_DEP_TIME,DEP_TIME,DISTANCE,FL_NUM,Weather,DAY_WEEK,DAY_OF_MONTH
0,4,1,0,0,525,1,1455,1455,184,5935,0,4,1
1,1,1,0,1,262,1,1640,1640,213,6155,0,4,1
2,1,2,0,2,381,1,1245,1245,229,7208,0,4,1
3,1,2,0,2,349,1,1715,1709,229,7215,0,4,1
4,1,2,0,2,384,1,1039,1035,229,7792,0,4,1
5,1,1,0,2,373,1,840,839,228,7800,0,4,1
6,1,1,0,2,240,1,1240,1243,228,7806,0,4,1
7,1,1,0,2,226,1,1645,1644,228,7810,0,4,1
8,1,1,0,2,245,1,1715,1710,228,7812,0,4,1
9,1,1,0,2,371,1,2120,2129,228,7814,0,4,1


In [22]:
#select dependent and independent variables

X = flight_df.drop({'Flight_Status'}, axis=1)
y = flight_df['Flight_Status']

##  Variance thresholding

In [23]:
# Create VarianceThreshold object with a variance with a threshold of 0.5
thresholder = VarianceThreshold(threshold=1.5)

# Conduct variance thresholding
X_high_variance =pd.DataFrame(thresholder.fit_transform(X))

In [24]:
#high variance features
X_high_variance.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,4,0,525,1455,1455,184,5935,4,1
1,1,0,262,1640,1640,213,6155,4,1
2,1,0,381,1245,1245,229,7208,4,1
3,1,0,349,1715,1709,229,7215,4,1
4,1,0,384,1039,1035,229,7792,4,1


In [25]:
#Variable Selection- droping the less useful features
X_filter = flight_df.drop({'Weather','DEST','ORIGIN','Flight_Status'}, axis=1)
#reduced model
X_filter.head()

Unnamed: 0,CARRIER,FL_DATE,TAIL_NUM,CRS_DEP_TIME,DEP_TIME,DISTANCE,FL_NUM,DAY_WEEK,DAY_OF_MONTH
0,4,0,525,1455,1455,184,5935,4,1
1,1,0,262,1640,1640,213,6155,4,1
2,1,0,381,1245,1245,229,7208,4,1
3,1,0,349,1715,1709,229,7215,4,1
4,1,0,384,1039,1035,229,7792,4,1


In [26]:
#split dataset to training and test data 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X_filter, y, test_size=0.3, random_state=12)

# Logistic Regression

In [27]:
# Fitting Logistic Regression to the Training set
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_logreg = classifier.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_logreg)))

Accuracy of Logistic Regression classifier on test set: 0.89


# Decision Tree Classifier

In [29]:
# Fitting Decision Tree Classification to the Training set
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_dtc = classifier.predict(X_test)

In [30]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_dtc)))

Accuracy of Decision Tree classifier on test set: 0.86


# Random Forest Classifier

In [31]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_rand = classifier.predict(X_test)

In [32]:
# Model Accuracy, how often is the classifier correct?
print('Accuracy of Random Forest classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_rand)))

Accuracy of Random Forest classifier on test set: 0.87


# K-NN Classifier

In [33]:
# Fitting K-NN to the Training set
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_knn = classifier.predict(X_test)

In [34]:
# Model Accuracy, how often is the classifier correct?
print('Accuracy of KNN classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_knn)))

Accuracy of KNN classifier on test set: 0.87


# SVM Classifier

In [35]:
# Fitting SVM to the Training set
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_svc = classifier.predict(X_test)

In [36]:
# Model Accuracy, how often is the classifier correct?
print('Accuracy of Support vector classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_svc)))

Accuracy of Support vector classifier on test set: 0.89


# Naive Bayes

In [37]:
# Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_naive = classifier.predict(X_test)

In [41]:
# Model Accuracy, how often is the classifier correct?
print('Accuracy of Naive Bayes GaussianNB classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_naive)))

Accuracy of Naive Bayes GaussianNB classifier on test set: 0.63


In [40]:
# Fitting Naive Bayes to the Training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_naive = classifier.predict(X_test)

In [42]:
# Model Accuracy, how often is the classifier correct?
print('Accuracy of Naive Bayes MultinomialNB classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_naive)))

Accuracy of Naive Bayes MultinomialNB classifier on test set: 0.63


In [44]:
# Fitting Naive Bayes to the Training set
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred_naive = classifier.predict(X_test)

In [45]:
# Model Accuracy, how often is the classifier correct?
print('Accuracy of Naive Bayes BernoulliNB classifier on test set: {:.2f}'.format(accuracy_score(y_test, y_pred_naive)))

Accuracy of Naive Bayes BernoulliNB classifier on test set: 0.82
