In [5]:
# Import all the necessary packages
import pandas as pd
import numpy as np

### Preparing Dataframe for GLM Regression Analysis

The basis of the data preparation is the assumption that if the flight was not diverted ***AND*** there was no AVH problem reported, then the flight continued to its final destination despite the emergency and the emergency was therefore minor/procedural in nature. 

As such, each reported problem type is a predictor that may or may not have resulted in a diversion, and we can fit a GLM on these binary (0/1) indicators.

In [29]:
# Read the processed squawk-7700 table
df_squawk7700 = pd.read_csv("../data/processed/squawk7700_processed_final_v2.csv")

# Track-based diversion flag: a landing airport is recorded, it is not the planned
# destination, and the planned flight was not origin == destination.
# NOTE(review): `ne` evaluates to True when either side is NaN, so a row with a known
# landing but a missing destination is flagged as diverted - confirm this is intended.
landing_known = df_squawk7700['landing'].notna()
landed_elsewhere = df_squawk7700['landing'].ne(df_squawk7700['destination'])
not_round_trip = df_squawk7700['origin'].ne(df_squawk7700['destination'])
df_squawk7700['diverted'] = landing_known & landed_elsewhere & not_round_trip

# Report-based diversion flag: the AVH result is either 'return' or 'diverted'.
# NOTE(review): this column is not used again in the visible part of the notebook.
df_squawk7700['avh_diverted'] = df_squawk7700['avh_result'].isin(['return', 'diverted'])

# Rows flagged as diverted that have no AVH result cannot be attributed to a problem,
# so they are excluded from the modelling frame
unreported_diversion = df_squawk7700['diverted'] & df_squawk7700['avh_result'].isna()
drop_rows = df_squawk7700[unreported_diversion]

# Keep only the target and the raw problem label; the original row index is preserved
df_squawk_model = df_squawk7700.loc[~unreported_diversion, ['diverted', 'avh_problem']].copy()

# Encode the target as 0/1 integers (vectorised cast instead of a row-wise lambda;
# the explicit 'int64' keeps the dtype identical on every platform)
df_squawk_model['diverted'] = df_squawk_model['diverted'].astype('int64')

# Distinct reported AVH problems in order of first appearance; this order also fixes
# the order of the indicator columns in the saved file
problem_list = df_squawk_model['avh_problem'].dropna().unique()

# A problem label equal to the target name would silently overwrite the target column
assert 'diverted' not in set(problem_list), "AVH problem label collides with target column 'diverted'"

# One-hot encode the problem label. Rows without a reported problem (NaN) get 0 in
# every indicator column. get_dummies sorts its columns, so reindex restores the
# first-appearance order held in problem_list.
problem_flags = (
    pd.get_dummies(df_squawk_model['avh_problem'], dtype='int64')
    .reindex(columns=problem_list)
)

# Replace the raw avh_problem column with the indicator columns (row index unchanged)
df_squawk_model = pd.concat([df_squawk_model[['diverted']], problem_flags], axis=1)

# Save the modelling frame and show a summary
df_squawk_model.to_csv('../data/processed/squawk7700_model.csv', index=False)

df_squawk_model.info()
df_squawk_model.head()

<class 'pandas.core.frame.DataFrame'>
Index: 495 entries, 0 to 634
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   diverted                 495 non-null    int64
 1   engine                   495 non-null    int64
 2   smoke_burn_smell_flames  495 non-null    int64
 3   cabin_pressure           495 non-null    int64
 4   instrument               495 non-null    int64
 5   hot_air_leak             495 non-null    int64
 6   cracked_windshield       495 non-null    int64
 7   weather_damage           495 non-null    int64
 8   hydraulics               495 non-null    int64
 9   landing_gear             495 non-null    int64
 10  fuel_leak                495 non-null    int64
 11  medical                  495 non-null    int64
 12  misc                     495 non-null    int64
 13  slats                    495 non-null    int64
 14  maintenance              495 non-null    int64
 15  air_conditi

Unnamed: 0,diverted,engine,smoke_burn_smell_flames,cabin_pressure,instrument,hot_air_leak,cracked_windshield,weather_damage,hydraulics,landing_gear,fuel_leak,medical,misc,slats,maintenance,air_condition,bird,brakes,heating,flaps
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
