In [38]:
#load packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
# Show up to 999 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 999

In [39]:
# Load the flight records; the 'date' column is parsed into datetimes on read.
fw = pd.read_csv(
    "../Datasets/NEW/FW_with_airports.csv",
    parse_dates=['date'],
)

In [40]:
# Keep the first 11 columns (by position) and re-attach the last column of the
# original frame; every column in between is dropped.
fw = (
    fw.iloc[:, :11]
    .join(other=fw.iloc[:, -1])
)

In [41]:
# Preview the first five rows of the trimmed frame.
fw.head(5)

Unnamed: 0,tail_number,date,aircraft,origin_code,origin,destination_code,destination,departure,arrival,duration,distance_mi,Owner
0,N100KB,2021-02-20,BE9L,US-0571,Williston Basin International Airport (KXWA),KBIS,Bismarck Muni (KBIS),04:57PM CST,05:44PM CST,0:46,174.005874,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor..."
1,N100KB,2021-02-20,BE9L,KBIS,Bismarck Muni (KBIS),US-0571,Williston Basin International Airport (KXWA),01:36PM CST,02:27PM CST,0:51,174.005874,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor..."
2,N100KB,2021-02-18,BE9L,KMOT,Minot Intl (KMOT),KBIS,Bismarck Muni (KBIS),08:20AM CST,08:53AM CST,0:32,105.704153,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor..."
3,N100KB,2021-02-15,BE9L,US-0571,Williston Basin International Airport (KXWA),KMOT,Minot Intl (KMOT),02:15PM CST,02:49PM CST,0:33,113.89946,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor..."
4,N100KB,2021-02-15,BE9L,KMOT,Minot Intl (KMOT),US-0571,Williston Basin International Airport (KXWA),08:00AM CST,08:32AM CST,0:32,113.89946,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor..."


In [42]:
# Print the column dtypes and non-null counts of the frame.
fw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23160 entries, 0 to 23159
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tail_number       23160 non-null  object        
 1   date              23160 non-null  datetime64[ns]
 2   aircraft          22966 non-null  object        
 3   origin_code       23142 non-null  object        
 4   origin            23160 non-null  object        
 5   destination_code  23121 non-null  object        
 6   destination       23159 non-null  object        
 7   departure         23160 non-null  object        
 8   arrival           22730 non-null  object        
 9   duration          22680 non-null  object        
 10  distance_mi       23151 non-null  float64       
 11  Owner             23160 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(10)
memory usage: 2.1+ MB


In [43]:
# Count the missing values in each column.
fw.isna().sum(axis=0)

tail_number           0
date                  0
aircraft            194
origin_code          18
origin                0
destination_code     39
destination           1
departure             0
arrival             430
duration            480
distance_mi           9
Owner                 0
dtype: int64

In [44]:
# Drop flights that cannot be analysed: rows whose origin equals the destination,
# and rows missing arrival, destination, duration or distance.
# (The original note describes these as diverted and practice flights.)
fw = fw.loc[
    fw.origin.ne(fw.destination)
    & fw[['arrival', 'destination', 'duration', 'distance_mi']].notna().all(axis=1)
]

In [45]:
# Convert the "H:MM" duration strings to whole minutes in a new column.
# The strings are parsed once as a clock time; values that do not match the
# format are coerced to missing and end up as NaN in duration_minutes.
# NOTE(review): %H only accepts hours 0-23, so a duration of 24 hours or more
# would also be coerced to NaN -- confirm no such flights are expected.
fw['duration_minutes'] = (
    pd.to_datetime(fw.duration, format="%H:%M", errors='coerce')
    .pipe(lambda parsed: parsed.dt.hour * 60 + parsed.dt.minute)
)

In [46]:
# Keep only flights longer than 2 minutes; anything at or below that is
# considered diverted or cancelled. Rows with a missing duration_minutes are
# dropped here as well, because the comparison is False for NaN.
fw = fw[fw['duration_minutes'].gt(2)]

In [47]:
# Use (Owner, tail_number) as a two-level row index.
fw = fw.set_index(keys=['Owner', 'tail_number'])

In [48]:
# Per-aircraft flight-duration statistics, grouped by (Owner, tail_number).
# transform() broadcasts each group's quantile back onto every row of that
# group, so the new columns line up with fw row-for-row without depending on
# implicit index alignment between a group-level result and fw's index.
duration_by_aircraft = fw.groupby(['Owner', 'tail_number']).duration_minutes

# 25th percentile (Q1) of duration for each tail number of the air-medical service
fw['duration_q_25'] = duration_by_aircraft.transform('quantile', q=0.25)

# 75th percentile (Q3) of duration for each tail number of the air-medical service
fw['duration_q_75'] = duration_by_aircraft.transform('quantile', q=0.75)

# interquartile range: Q3 - Q1
fw['duration_iqr'] = fw['duration_q_75'] - fw['duration_q_25']

# lower whisker: Q1 - 1.5 * IQR
fw['duration_lower_whisker'] = fw['duration_q_25'] - (1.5 * fw['duration_iqr'])

# upper whisker: Q3 + 1.5 * IQR
fw['duration_upper_whisker'] = fw['duration_q_75'] + (1.5 * fw['duration_iqr'])

In [50]:
# Flag a flight "Yes" when its duration falls below the lower whisker or above
# the upper whisker of its aircraft; every other row is flagged "No".
fw['duration_outlier'] = np.where(
    (fw.duration_lower_whisker > fw.duration_minutes)
    | (fw.duration_upper_whisker < fw.duration_minutes),
    "Yes",
    "No",
)

In [51]:
# Per-aircraft flight-distance statistics, grouped by (Owner, tail_number).
# transform() broadcasts each group's quantile back onto every row of that
# group, so the new columns line up with fw row-for-row without depending on
# implicit index alignment between a group-level result and fw's index.
distance_by_aircraft = fw.groupby(['Owner', 'tail_number']).distance_mi

# 25th percentile (Q1) of distance for each tail number of the air-medical service
fw['distance_q_25'] = distance_by_aircraft.transform('quantile', q=0.25)

# 75th percentile (Q3) of distance for each tail number of the air-medical service
fw['distance_q_75'] = distance_by_aircraft.transform('quantile', q=0.75)

# interquartile range: Q3 - Q1
fw['distance_iqr'] = fw['distance_q_75'] - fw['distance_q_25']

# lower whisker: Q1 - 1.5 * IQR
fw['distance_lower_whisker'] = fw['distance_q_25'] - (1.5 * fw['distance_iqr'])

# upper whisker: Q3 + 1.5 * IQR
fw['distance_upper_whisker'] = fw['distance_q_75'] + (1.5 * fw['distance_iqr'])

In [52]:
# Flag a flight "Yes" when its distance falls below the lower whisker or above
# the upper whisker of its aircraft; every other row is flagged "No".
fw['distance_outlier'] = np.where(
    (fw.distance_lower_whisker > fw.distance_mi)
    | (fw.distance_upper_whisker < fw.distance_mi),
    "Yes",
    "No",
)

In [53]:
# Preview the first five rows with the new quantile, whisker and outlier columns.
fw.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,date,aircraft,origin_code,origin,destination_code,destination,departure,arrival,duration,distance_mi,duration_minutes,duration_q_25,duration_q_75,duration_iqr,duration_lower_whisker,duration_upper_whisker,duration_outlier,distance_q_25,distance_q_75,distance_iqr,distance_lower_whisker,distance_upper_whisker,distance_outlier
Owner,tail_number,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corporation)",N100KB,2021-02-20,BE9L,US-0571,Williston Basin International Airport (KXWA),KBIS,Bismarck Muni (KBIS),04:57PM CST,05:44PM CST,0:46,174.005874,46,32.5,78.0,45.5,-35.75,146.25,No,111.954264,244.59526,132.640996,-87.00723,443.556754,No
"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corporation)",N100KB,2021-02-20,BE9L,KBIS,Bismarck Muni (KBIS),US-0571,Williston Basin International Airport (KXWA),01:36PM CST,02:27PM CST,0:51,174.005874,51,32.5,78.0,45.5,-35.75,146.25,No,111.954264,244.59526,132.640996,-87.00723,443.556754,No
"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corporation)",N100KB,2021-02-18,BE9L,KMOT,Minot Intl (KMOT),KBIS,Bismarck Muni (KBIS),08:20AM CST,08:53AM CST,0:32,105.704153,32,32.5,78.0,45.5,-35.75,146.25,No,111.954264,244.59526,132.640996,-87.00723,443.556754,No
"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corporation)",N100KB,2021-02-15,BE9L,US-0571,Williston Basin International Airport (KXWA),KMOT,Minot Intl (KMOT),02:15PM CST,02:49PM CST,0:33,113.89946,33,32.5,78.0,45.5,-35.75,146.25,No,111.954264,244.59526,132.640996,-87.00723,443.556754,No
"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corporation)",N100KB,2021-02-15,BE9L,KMOT,Minot Intl (KMOT),US-0571,Williston Basin International Airport (KXWA),08:00AM CST,08:32AM CST,0:32,113.89946,32,32.5,78.0,45.5,-35.75,146.25,No,111.954264,244.59526,132.640996,-87.00723,443.556754,No


In [37]:
# Write the annotated frame to CSV; the (Owner, tail_number) index is written too.
# NOTE(review): the path contains a doubled slash ('..//'), unlike the input path
# used when loading -- looks like a typo; confirm before normalising.
fw.to_csv(path_or_buf='..//Datasets/NEW/FW_Outliers.csv')