### AI Bootcamp - June 2024 Cohort - Analyzing and Predicting Flight Delays
Documentation and Context: See the project's ReadMe.md file for documentation on the column names and purposes.

### What's in this workbook?
This is Jenn's analysis, working notebook.

In [1]:
# Do imports
import pandas as pd
import numpy as np
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression


In [2]:
# pip install --upgrade pandas scikit-learn


In [3]:
# Create DataFrame
df = pd.read_csv('./data/Airlines.csv')
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0


In [4]:
# Get number of rows and columns, # of null values, and the data types. There are no null values, so we don't need to do cleanup for that.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539383 entries, 0 to 539382
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           539383 non-null  int64 
 1   Airline      539383 non-null  object
 2   Flight       539383 non-null  int64 
 3   AirportFrom  539383 non-null  object
 4   AirportTo    539383 non-null  object
 5   DayOfWeek    539383 non-null  int64 
 6   Time         539383 non-null  int64 
 7   Length       539383 non-null  int64 
 8   Delay        539383 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 37.0+ MB


In [5]:
# Get the list of unique values and number of them for the Delay target
df['Delay'].value_counts()

Delay
0    299119
1    240264
Name: count, dtype: int64

In [6]:
# Clean the data to make strings numeric
df_clean = df.copy()

# Co

In [7]:
# Create an X and y variable. We'll use the 'Delay' column as our target since it indicates whether a flight was delayed or not.
X = df.copy()
X.drop('Delay', axis=1, inplace=True)
X.head()
# y = df['Delay']

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length
0,1,CO,269,SFO,IAH,3,15,205
1,2,US,1558,PHX,CLT,3,15,222
2,3,AA,2400,LAX,DFW,3,20,165
3,4,AA,2466,SFO,DFW,3,20,195
4,5,AS,108,ANC,SEA,3,30,202


In [8]:
y = df['Delay'].values.reshape(-1, 1)
y[:5]

array([[1],
       [1],
       [1],
       [1],
       [0]])

In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
X_train['Airline'].value_counts()

Airline
WN    70520
DL    45847
OO    37691
AA    34243
MQ    27317
US    25816
XE    23300
EV    21034
UA    20805
CO    15868
FL    15736
9E    15456
B6    13586
YV    10256
OH     9477
AS     8587
F9     4810
HA     4188
Name: count, dtype: int64

In [11]:
# Create an encoder for the Airline column
airline_ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Train the encoder
airline_ohe.fit(X_train['Airline'].values.reshape(-1,1))

In [12]:
# Decide how to encode the AirportFrom column
X_train['AirportFrom'].value_counts()

AirportFrom
ATL    25906
ORD    18566
DFW    16540
DEN    14959
LAX    12441
       ...  
SJT       13
FLO       10
GUM        9
ADK        6
ABR        2
Name: count, Length: 293, dtype: int64

In [13]:
# Create an encoder for the AirportFrom column
airport_from_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Train the encoder
airport_from_ohe.fit(X_train['AirportFrom'].values.reshape(-1,1))

In [14]:
# Encode the AirportTo column like we did the AirportFrom column

# Create an encoder for the AirportFrom column
airport_to_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Train the encoder
airport_to_ohe.fit(X_train['AirportTo'].values.reshape(-1,1))

In [15]:
# Create a function using the pretrained encoders to use on
# any new data (including the testing data)

def X_preprocess(X_data):
    # Transform each column into numpy arrays
    airline_encoded = airline_ohe.transform(X_data['Airline'].values.reshape(-1,1))
    airport_from_encoded = airport_from_ohe.transform(X_data['AirportFrom'].values.reshape(-1,1))
    airport_to_encoded = airport_to_ohe.transform(X_data['AirportTo'].values.reshape(-1,1))

    # Reorganize the numpy arrays into a DataFrame
    airline_df = pd.DataFrame(airline_encoded, columns = airline_ohe.get_feature_names_out())
    airport_from_df = pd.DataFrame(airport_from_encoded, columns= airport_from_ohe.get_feature_names_out())
    airport_to_df = pd.DataFrame(airport_to_encoded, columns = airport_to_ohe.get_feature_names_out())
    out_df = pd.concat([airline_df, airport_from_df, airport_to_df], axis = 1)

    # Return the DataFrame
    return out_df

In [16]:
# Preprocess the training data
X_train_encoded = X_preprocess(X_train)
X_train_encoded.head()

Unnamed: 0,x0_AA,x0_AS,x0_B6,x0_CO,x0_DL,x0_EV,x0_F9,x0_FL,x0_HA,x0_MQ,...,x0_TXK,x0_TYR,x0_TYS,x0_UTM,x0_VLD,x0_VPS,x0_WRG,x0_XNA,x0_YAK,x0_YUM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Preprocess the testing data
X_test_encoded = X_preprocess(X_test)
X_test_encoded.head()

Unnamed: 0,x0_AA,x0_AS,x0_B6,x0_CO,x0_DL,x0_EV,x0_F9,x0_FL,x0_HA,x0_MQ,...,x0_TXK,x0_TYR,x0_TYS,x0_UTM,x0_VLD,x0_VPS,x0_WRG,x0_XNA,x0_YAK,x0_YUM
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Encode y with OneHotEncoder
encode_y = OneHotEncoder(drop='first', sparse_output = False)

# Train the encoder
encode_y.fit(y_train)

# Apply it to both the y_train and y_test
# Use np.ravel to reshape for logistic regression

y_train_encoded = np.ravel(encode_y.transform(y_train))
y_test_encoded = np.ravel(encode_y.transform(y_test))
y_train_encoded

array([1., 0., 1., ..., 1., 0., 0.])

In [19]:
# Create and train an SVC model
model = RandomForestClassifier(n_estimators=500, max_depth=5)
model.fit(X_train_encoded, y_train_encoded)

In [20]:
# Check the model's balanced accuracy on the test set

y_test_pred = model.predict(X_test_encoded)
print(balanced_accuracy_score(y_test_encoded, y_test_pred))

0.5546539962164153


In [21]:
# Check the model's balanced accuracy on the training set

y_train_pred = model.predict(X_train_encoded)
print(balanced_accuracy_score(y_train_encoded, y_train_pred))

0.5548001931120401


In [22]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_train_scaled

array([[-0.3040973 , -0.14726534, -0.18641662, ..., -0.04512305,
        -0.01031046, -0.02481728],
       [-0.3040973 , -0.14726534, -0.18641662, ..., -0.04512305,
        -0.01031046, -0.02481728],
       [-0.3040973 , -0.14726534, -0.18641662, ..., -0.04512305,
        -0.01031046, -0.02481728],
       ...,
       [-0.3040973 , -0.14726534, -0.18641662, ..., -0.04512305,
        -0.01031046, -0.02481728],
       [-0.3040973 , -0.14726534, -0.18641662, ..., -0.04512305,
        -0.01031046, -0.02481728],
       [-0.3040973 , -0.14726534, -0.18641662, ..., -0.04512305,
        -0.01031046, -0.02481728]])

In [23]:
# Create a LogisticRegression function
logistic_regression_model = LogisticRegression()

In [24]:
# Fit the model for Logistic Regression
logistic_regression_model.fit(X_train_encoded, y_train_encoded)

In [25]:
# Score the model
print(f'The training data score: {logistic_regression_model.score(X_train_encoded, y_train_encoded)}')
print(f'The testing data score: {logistic_regression_model.score(X_test_encoded, y_test_encoded)}')
      

The training data score: 0.6388414409559571
The testing data score: 0.6380834433353604


In [26]:
# Generate predictions from the model we just fit

predictions = logistic_regression_model.predict(X_train_encoded)

# Convert those predictions (and actual values) to a DataFrame
results_df = pd.DataFrame({'Prediction': predictions, 'Actual': y_train_encoded})

In [27]:
# Apply the fitted model to the test dataset
testing_predictions = logistic_regression_model.predict(X_test_encoded)

# Save both the test predictions and actual test values to a DataFrame
results_df = pd.DataFrame({
    "Testing Data Predictions": testing_predictions, 
    "Testing Data Actual Targets": y_test_encoded})

In [28]:
# Display the results DataFrame
results_df

Unnamed: 0,Testing Data Predictions,Testing Data Actual Targets
0,0.0,0.0
1,1.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
...,...,...
134841,0.0,0.0
134842,1.0,1.0
134843,1.0,1.0
134844,1.0,0.0


In [29]:
# Import the accuracy score function
from sklearn.metrics import accuracy_score

# Calculate the model's accuracy on the test dataset
accuracy_score(y_test_encoded, testing_predictions)

0.6380834433353604

In [30]:
# Create a Random Forest model
model = RandomForestClassifier(max_depth = 7, n_estimators=100)

# Fit (train) or model using the training data
model.fit(X_train_encoded, y_train_encoded)

# Calculate the accuracy of the model on the testing data
model.score(X_test_encoded, y_test_encoded)

0.6234741853670113