In [None]:
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
# import tensorflow as tf

In [None]:
# Read the 2018 flight data from CSV. pandas is already imported as `pd` in
# the notebook's first cell, so it is not re-imported here.
flight_18_df = pd.read_csv("Data_files/2018.csv")
# Preview the first rows to confirm the file was parsed as expected.
flight_18_df.head()

In [None]:
# Print a summary of the raw frame (column names, non-null counts, dtypes)
# to see which columns need type conversion or contain missing values.
flight_18_df.info()

In [None]:
# Parse FL_DATE into a datetime column so that the `.dt` accessor can be used
# on it when the year/month/day/weekday parts are extracted.
flight_18_df["FL_DATE"] = pd.to_datetime(flight_18_df["FL_DATE"])

In [None]:
# Split the parsed flight date into separate calendar feature columns.
flight_18_df["YEAR"] = flight_18_df["FL_DATE"].dt.year
flight_18_df["MONTH"] = flight_18_df["FL_DATE"].dt.month
flight_18_df["DAY"] = flight_18_df["FL_DATE"].dt.day
# Store the weekday as its English name (e.g. "Monday"). `day_name()` is used
# rather than `strftime('%A')` because the latter follows the process locale,
# which would make the values (and any dummy columns derived from them)
# differ between machines.
flight_18_df["WEEKDAY"] = flight_18_df["FL_DATE"].dt.day_name()

In [None]:
# Build the working frame by removing every column the models will not use.
# FL_DATE, YEAR and DAY are dropped here; MONTH and WEEKDAY are kept.
columns_not_needed = [
    "FL_DATE",
    "OP_CARRIER_FL_NUM",
    "DEP_TIME",
    "TAXI_OUT",
    "WHEELS_OFF",
    "WHEELS_ON",
    "TAXI_IN",
    "ARR_TIME",
    "CANCELLATION_CODE",
    "ACTUAL_ELAPSED_TIME",
    "AIR_TIME",
    "DISTANCE",
    "CARRIER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "Unnamed: 27",
    "YEAR",
    "DAY",
    "CRS_ARR_TIME",
]
prio_flight_18_df = flight_18_df.drop(columns=columns_not_needed)
prio_flight_18_df.head()

In [None]:
# Replace every missing value in the working frame with 0.
# NOTE(review): this applies to all columns, including ARR_DELAY. Any row whose
# ARR_DELAY was missing becomes 0 and is therefore labelled DELAY = 0 when the
# DELAY column is derived from `ARR_DELAY > 0` — confirm that is intended.
prio_flight_18_df = prio_flight_18_df.fillna(0)

In [None]:
# Bucket the scheduled departure time (CRS_DEP_TIME) into four time-of-day labels.
# Each entry is (label, lower bound inclusive, upper bound exclusive).
# Presumably CRS_DEP_TIME is an HHMM-style number — verify against the data.
# Values outside [0, 2400) match no bucket and are left unset (NaN).
dep_time_buckets = [
    ("Night", 0, 600),
    ("Morning", 600, 1200),
    ("Afternoon", 1200, 1800),
    ("Evening", 1800, 2400),
]
for bucket_label, lower, upper in dep_time_buckets:
    in_bucket = (
        prio_flight_18_df["CRS_DEP_TIME"].ge(lower)
        & prio_flight_18_df["CRS_DEP_TIME"].lt(upper)
    )
    prio_flight_18_df.loc[in_bucket, "sched_dep_time"] = bucket_label
prio_flight_18_df.head(20)

In [None]:
# Translate the month number into a season label.
# Months are grouped as Dec-Feb, Mar-May, Jun-Aug and Sep-Nov; a MONTH value
# outside 1-12 matches no group and is left unset (NaN).
season_months = {
    "Winter": [12, 1, 2],
    "Spring": [3, 4, 5],
    "Summer": [6, 7, 8],
    "Autumn": [9, 10, 11],
}
for season_name, month_numbers in season_months.items():
    in_season = prio_flight_18_df["MONTH"].isin(month_numbers)
    prio_flight_18_df.loc[in_season, "season"] = season_name

In [None]:
# Flag each flight by whether it has a positive weather delay.
# Rows with a positive WEATHER_DELAY get "yes", rows with exactly 0 get "no";
# any other value (e.g. negative) is left unset.
had_weather_delay = prio_flight_18_df["WEATHER_DELAY"].gt(0)
no_weather_delay = prio_flight_18_df["WEATHER_DELAY"].eq(0)
prio_flight_18_df.loc[had_weather_delay, "delays_weather"] = "yes"
prio_flight_18_df.loc[no_weather_delay, "delays_weather"] = "no"

In [None]:
# Flag each flight by whether it has a positive late-aircraft delay.
# Rows with a positive LATE_AIRCRAFT_DELAY get "yes", rows with exactly 0 get
# "no"; any other value (e.g. negative) is left unset.
had_late_aircraft_delay = prio_flight_18_df["LATE_AIRCRAFT_DELAY"].gt(0)
no_late_aircraft_delay = prio_flight_18_df["LATE_AIRCRAFT_DELAY"].eq(0)
prio_flight_18_df.loc[had_late_aircraft_delay, "delays_late_aircraft"] = "yes"
prio_flight_18_df.loc[no_late_aircraft_delay, "delays_late_aircraft"] = "no"

In [None]:
# Remove the raw columns now represented by the derived categorical columns
# (sched_dep_time, delays_weather, delays_late_aircraft, season).
prio_flight_18_df = prio_flight_18_df.drop(
    columns=["CRS_DEP_TIME", "WEATHER_DELAY", "LATE_AIRCRAFT_DELAY", "MONTH"]
)

In [None]:
# Preview the working frame after the raw columns were replaced by categories.
prio_flight_18_df.head()

In [None]:
# Label each flight as delayed (1) or not (0): a flight counts as delayed
# when its arrival delay is strictly greater than zero.
prio_flight_18_df["DELAY"] = prio_flight_18_df["ARR_DELAY"].gt(0).astype(int)
prio_flight_18_df.head()

In [None]:
# Count flights per ORIGIN airport (most frequent first) to identify the
# busiest departure airports, so the analysis can be narrowed to the top 5.
prio_flight_18_df["ORIGIN"].value_counts()

In [None]:
# Count flights per DEST airport (most frequent first) to identify the
# busiest arrival airports, so the analysis can be narrowed to the top 5.
prio_flight_18_df["DEST"].value_counts()

In [None]:
# Count flights per OP_CARRIER (most frequent first) to identify the
# busiest airlines, so the analysis can be narrowed to the top 5.
prio_flight_18_df["OP_CARRIER"].value_counts()

In [None]:
# Airport and airline codes that the analysis is restricted to.
# Presumably chosen from the value_counts() outputs — verify against them.
top_airports = [
    "ATL",
    "ORD",
    "DFW",
    "CLT",
    "DEN",
]
top_airlines = [
    "WN",
    "DL",
    "AA",
    "OO",
    "UA",
]

In [None]:
# Keep only flights that depart from AND arrive at a top-5 airport and are
# operated by a top-5 airline.
# All three conditions are evaluated on the same frame that is being filtered
# and combined into one mask. Previously the DEST and OP_CARRIER masks were
# built from the unfiltered frame and applied to an already-filtered one,
# which only worked through implicit index alignment.
keep_row = (
    prio_flight_18_df["ORIGIN"].isin(top_airports)
    & prio_flight_18_df["DEST"].isin(top_airports)
    & prio_flight_18_df["OP_CARRIER"].isin(top_airlines)
)
# Reset to a fresh 0..n-1 index so this frame lines up row-for-row with frames
# later rebuilt from plain arrays (which get a default index) when they are
# concatenated column-wise.
top_prio_flight_18_df = prio_flight_18_df.loc[keep_row].reset_index(drop=True)

In [None]:
# Print the filtered frame's columns and dtypes to decide which columns are
# numeric (to be scaled) and which are categorical (to be one-hot encoded).
top_prio_flight_18_df.info()

In [None]:
# Standardize the numeric columns with scikit-learn's StandardScaler.
# The result is a plain array whose columns follow the order listed here.
# NOTE(review): ARR_DELAY is scaled as a feature while the DELAY label is
# derived from ARR_DELAY > 0 — confirm ARR_DELAY is excluded when predicting DELAY.
numeric_columns = ["DEP_DELAY", "ARR_DELAY", "CRS_ELAPSED_TIME"]
numeric_scaler = StandardScaler()
flight_data_scaled = numeric_scaler.fit_transform(top_prio_flight_18_df[numeric_columns])

In [None]:
# Wrap the scaled array in a DataFrame, restoring the column names in the
# same order in which the columns were passed to the scaler.
scaled_column_names = ["DEP_DELAY", "ARR_DELAY", "CRS_ELAPSED_TIME"]
top_prio_flight_scaled = pd.DataFrame(data=flight_data_scaled, columns=scaled_column_names)

In [None]:
# Display the scaled numeric frame to check its values and shape.
top_prio_flight_scaled

In [None]:
# One-hot encode the categorical columns; each distinct value of each listed
# column becomes its own indicator column.
categorical_columns = [
    "OP_CARRIER",
    "ORIGIN",
    "DEST",
    "WEEKDAY",
    "sched_dep_time",
    "season",
    "delays_weather",
    "delays_late_aircraft",
]
flight_dummies = pd.get_dummies(top_prio_flight_18_df[categorical_columns])

In [None]:
# Display the one-hot encoded frame to check its values and shape.
flight_dummies

In [None]:
# List every generated indicator column name to confirm the encoding.
flight_dummies.columns.tolist()

In [None]:
# Pull out the target columns (the outcomes the models will predict).
target_columns = ["CANCELLED", "DIVERTED", "DELAY"]
variables_predicting_df = top_prio_flight_18_df.loc[:, target_columns]

In [None]:
# Display the target frame (CANCELLED, DIVERTED, DELAY) to check its contents.
variables_predicting_df

In [None]:
# Join the scaled numeric features, the one-hot encoded features and the
# targets side by side. Column-wise concat aligns rows on the index.
frames_to_join = [top_prio_flight_scaled, flight_dummies, variables_predicting_df]
cleaned_flights_df = pd.concat(frames_to_join, axis="columns")

In [None]:
# Display the combined frame to check that features and targets line up.
cleaned_flights_df

In [None]:
# Write the cleaned data to CSV. index=False omits the row index so the file
# contains only the feature and target columns.
cleaned_flights_df.to_csv("Data_files/2018_cleaned.csv", index=False)