In [1]:
import os
import sys
import numpy as np
import pandas as pd
from geopy import Point
from geopy import distance
# show up to 999 columns when a wide frame is displayed
pd.options.display.max_columns = 999

In [2]:
# read the four input tables (paths are relative to the notebook folder)
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")
airports = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/BTS_Airports_LAT_LON.csv")
fw_info = pd.read_csv(filepath_or_buffer="..//Datasets/NEW/FW_services_info.csv")
# the flight log is the only table whose `date` column is parsed to datetime
fw = pd.read_csv(filepath_or_buffer="..//Datasets/NEW/FW_flights.csv", parse_dates=['date'])

In [3]:
# how many distinct tail numbers appear in the flight history
fw['tail_number'].nunique()

181

AttributeError: 'DataFrame' object has no attribute 'Owner'

In [4]:
# share (in percent) of the tail numbers listed in fw_info that have any flight history
tails_with_flights = fw['tail_number'].nunique()
tails_listed = fw_info['Tail_number'].nunique()
print(round(tails_with_flights / tails_listed * 100, 2), "%")

54.35 %


In [5]:
# print column names, non-null counts and dtypes of the raw flight log
fw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23160 entries, 0 to 23159
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   tail_number           23160 non-null  object        
 1   date                  23160 non-null  datetime64[ns]
 2   aircraft              23160 non-null  object        
 3   origin                23160 non-null  object        
 4   origin_location       23160 non-null  object        
 5   destination           23159 non-null  object        
 6   destination_location  23160 non-null  object        
 7   departure             23160 non-null  object        
 8   arrival               22734 non-null  object        
 9   duration              23148 non-null  object        
dtypes: datetime64[ns](1), object(9)
memory usage: 1.8+ MB


In [6]:
# missing values per column in the raw flight log
fw.isnull().sum()

tail_number               0
date                      0
aircraft                  0
origin                    0
origin_location           0
destination               1
destination_location      0
departure                 0
arrival                 426
duration                 12
dtype: int64

#### Data preprocessing
---

In [7]:
# how many airport rows repeat a gps_code that already appeared earlier
airport_codes['gps_code'].duplicated().sum()

16578

In [8]:
# how many airport rows repeat a local_code that already appeared earlier
airport_codes['local_code'].duplicated().sum()

28768

In [9]:
# Strip filler words and turn status placeholders into missing values.
# The mapping is applied as regular expressions to every column of `fw`.
# NOTE(review): some placeholders map to None and others to np.nan; kept exactly
# as written — confirm both are meant to end up as missing values.
placeholder_patterns = {
    'Near ': '',
    'First seen ': '',
    'Last seen ': '',
    'result unknown': np.nan,
    'En Route': None,
    'unknown': np.nan,
    'n/a': np.nan,
    'Unknown': None,
    'Diverted': np.nan,
    '¬†': ' ',
    'Scheduled': None,
    'Cancelled': np.nan,
}
fw = fw.replace(regex=placeholder_patterns)

In [10]:
# flights whose origin equals their destination (treated here as training flights)
int((fw['origin'] == fw['destination']).sum())

731

In [11]:
# remove every flight whose origin equals its destination
same_airport_rows = fw.index[fw['origin'] == fw['destination']]
fw = fw.drop(index=same_airport_rows)

In [12]:
# flights that lack an arrival time or a duration
int(fw[['arrival', 'duration']].isna().any(axis=1).sum())

470

In [13]:
# remove flights that lack an arrival time or a duration
incomplete_rows = fw.index[fw['arrival'].isna() | fw['duration'].isna()]
fw = fw.drop(index=incomplete_rows)

In [14]:
# Split origin_Latitude and origin_Longitude out of raw-position locations.
# Rows whose origin_location starts with "L " hold "<lat> <lon>" after that prefix;
# every other row is left as NaN in both new columns.
# na=False keeps the mask boolean even if origin_location contains missing values
# (same convention as the destination cell below).
origin_is_position = fw.origin_location.str.startswith('L ', na=False)
origin_lat_lon = (
    fw.loc[origin_is_position, 'origin_location']
    .str.replace(r'^L ', '', regex=True)   # drop only the leading marker, not every "L " in the text
    .str.split(" ", n=1, expand=True)
    .reindex(columns=[0, 1])               # guarantees both columns exist even when no row matches
)
fw['origin_Latitude'] = origin_lat_lon[0]
fw['origin_Longitude'] = origin_lat_lon[1]

In [15]:
# destination locations starting with "L " hold "<lat> <lon>"; split them into two columns
dest_is_position = fw['destination_location'].str.startswith('L ', na=False)
dest_lat_lon = (
    fw[dest_is_position]
    .destination_location
    .replace(regex={'L ': ''})
    .str.split(" ", n=1, expand=True)
)
fw['destination_Latitude'] = dest_lat_lon[0]
fw['destination_Longitude'] = dest_lat_lon[1]

In [16]:
# turn the four coordinate columns into numbers; unparsable text becomes NaN
for coord_col in ('origin_Latitude', 'origin_Longitude',
                  'destination_Latitude', 'destination_Longitude'):
    fw[coord_col] = pd.to_numeric(fw[coord_col], errors='coerce')

In [17]:
# for rows without parsed coordinates, take the text after the last " - " as the airport code
origin_text = fw.loc[fw['origin_Latitude'].isna(), 'origin_location']
fw['ICAO_code_origin'] = origin_text.str.rsplit(" - ", n=1, expand=True)[1]

destination_text = fw.loc[fw['destination_Latitude'].isna(), 'destination_location']
fw['ICAO_code_destination'] = destination_text.str.rsplit(" - ", n=1, expand=True)[1]

In [18]:
# codes written as "AAA / BBB": keep only the part before " / "
for code_col in ('ICAO_code_origin', 'ICAO_code_destination'):
    is_dual = fw[code_col].str.contains(' / ', na=False)
    fw.loc[is_dual, code_col] = fw.loc[is_dual, code_col].str.split(" / ", n=1, expand=True)[0]

#### Origin Location
---

In [19]:
# origin lookup #1: match the parsed origin code against the airport `ident`
origin_ident_lookup = airport_codes[['ident']].rename(columns={'ident': 'ident_origin_1'})
fw = pd.merge(
    fw,
    origin_ident_lookup,
    how='left',
    left_on='ICAO_code_origin',
    right_on='ident_origin_1',
)

In [20]:
# Switch dual ICAO codes: when the first code of an "... (AAA / BBB)" origin did not
# match any airport ident, retry with the second code taken from the `origin` text.
origin_needs_second_code = (
    fw.ICAO_code_origin.notnull()
    & fw.ident_origin_1.isna()
    & fw.origin.str.contains(' / ', na=False)
)
# guard: with no matching rows the expanded split has no column 1 to select
if origin_needs_second_code.any():
    fw.loc[origin_needs_second_code, 'ICAO_code_origin'] = (
        fw.loc[origin_needs_second_code, 'origin']
        .str.split(' / ', n=1, expand=True)[1]
        # ')' is meant literally; regex=False makes that explicit and independent of
        # the pandas version's default for `regex`
        .str.replace(')', '', regex=False)
    )

In [21]:
# origin lookup #2: repeat the `ident` match with the switched codes
origin_ident_retry = airport_codes[['ident']].rename(columns={'ident': 'ident_origin_2'})
fw = pd.merge(
    fw,
    origin_ident_retry,
    how='left',
    left_on='ICAO_code_origin',
    right_on='ident_origin_2',
)

In [22]:
# copy the retry result into ident_origin_1 for dual-code origins that are still unmatched
origin_has_code = fw['ICAO_code_origin'].notnull()
origin_unmatched = fw['ident_origin_1'].isna()
origin_is_dual = fw['origin'].str.contains(' / ')
origin_take_retry = origin_has_code & origin_unmatched & origin_is_dual
fw.loc[origin_take_retry, 'ident_origin_1'] = fw.loc[origin_take_retry, 'ident_origin_2']

In [23]:
# the helper column from the retry merge is no longer needed
fw = fw.drop(columns='ident_origin_2')

In [24]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [25]:
# keep only the first airport row for each gps_code
airport_codes = airport_codes.drop_duplicates(subset='gps_code', keep='first')

In [26]:
# origin lookup #3: match the origin code against `gps_code`
origin_gps_lookup = airport_codes[['gps_code', 'ident']].rename(
    columns={'gps_code': 'gps_code_origin_2', 'ident': 'ident_origin_2'}
)
fw = pd.merge(
    fw,
    origin_gps_lookup,
    how='left',
    left_on='ICAO_code_origin',
    right_on='gps_code_origin_2',
)

In [27]:
# rows that have a code but no ident yet take the ident found via gps_code
origin_still_open = fw['ICAO_code_origin'].notnull() & fw['ident_origin_1'].isna()
fw.loc[origin_still_open, 'ident_origin_1'] = fw.loc[origin_still_open, 'ident_origin_2']

In [28]:
# remove the helper columns added by the gps_code merge
fw = fw.drop(columns=['ident_origin_2', 'gps_code_origin_2'])

In [29]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [30]:
# keep only the first airport row for each local_code
airport_codes = airport_codes.drop_duplicates(subset='local_code', keep='first')

In [31]:
# origin lookup #4: match the origin code against `local_code`
origin_local_lookup = airport_codes[['local_code', 'ident']].rename(
    columns={'local_code': 'local_code_origin_2', 'ident': 'ident_origin_2'}
)
fw = pd.merge(
    fw,
    origin_local_lookup,
    how='left',
    left_on='ICAO_code_origin',
    right_on='local_code_origin_2',
)

In [32]:
# rows that have a code but no ident yet take the ident found via local_code
origin_still_open = fw['ICAO_code_origin'].notnull() & fw['ident_origin_1'].isna()
fw.loc[origin_still_open, 'ident_origin_1'] = fw.loc[origin_still_open, 'ident_origin_2']

In [33]:
# remove the helper columns added by the local_code merge
fw = fw.drop(columns=['ident_origin_2', 'local_code_origin_2'])

In [34]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [35]:
# `coordinates` is split on ", ": the second piece becomes Latitude, the first Longitude
coordinate_parts = airport_codes['coordinates'].str.split(', ', n=1, expand=True)
airport_codes["Latitude"] = pd.to_numeric(coordinate_parts[1])
airport_codes["Longitude"] = pd.to_numeric(coordinate_parts[0])

In [36]:
# remove generic facility words (regex substitution across the whole airport table)
facility_words = {
    r' Heliport': '',
    r' Airpark': '',
    r' Airport': '',
    r' Regional': '',
    r' Municipal': '',
    r' International': '',
    r' LRRS': '',
}
airport_codes = airport_codes.replace(regex=facility_words)

In [37]:
# append ", <part of iso_region after the dash>" to both the airport name and the municipality
region_suffix = airport_codes['iso_region'].str.split('-', expand=True)[1]
for text_col in ('name', 'municipality'):
    airport_codes.loc[:, text_col] = airport_codes.loc[:, text_col] + ", " + region_suffix

In [38]:
# keep only the first airport row for each municipality
airport_codes = airport_codes.drop_duplicates(subset='municipality', keep='first')

In [39]:
# origin lookup #5: match the `origin` text against the airport name
# NOTE(review): the airport table was de-duplicated on municipality, not on name —
# a repeated name would multiply flight rows in this merge; confirm names are unique.
origin_name_lookup = airport_codes[['name', 'ident']].rename(
    columns={'name': 'name_origin_2', 'ident': 'ident_origin_2'}
)
fw = pd.merge(
    fw,
    origin_name_lookup,
    how='left',
    left_on='origin',
    right_on='name_origin_2',
)

In [40]:
# rows without any origin code take both ident and code from the name match
origin_code_missing = fw['ICAO_code_origin'].isna()
fw.loc[origin_code_missing, 'ident_origin_1'] = fw.loc[origin_code_missing, 'ident_origin_2']
fw.loc[origin_code_missing, 'ICAO_code_origin'] = fw.loc[origin_code_missing, 'ident_origin_2']

In [41]:
# remove the helper columns added by the name merge
fw = fw.drop(columns=['ident_origin_2', 'name_origin_2'])

In [42]:
# origin lookup #6: match the `origin` text against the municipality
origin_city_lookup = airport_codes[['municipality', 'ident']].rename(
    columns={'municipality': 'municipality_origin_2', 'ident': 'ident_origin_2'}
)
fw = pd.merge(
    fw,
    origin_city_lookup,
    how='left',
    left_on='origin',
    right_on='municipality_origin_2',
)

In [43]:
# rows still without an origin code take both ident and code from the municipality match
origin_code_missing = fw['ICAO_code_origin'].isna()
fw.loc[origin_code_missing, 'ident_origin_1'] = fw.loc[origin_code_missing, 'ident_origin_2']
fw.loc[origin_code_missing, 'ICAO_code_origin'] = fw.loc[origin_code_missing, 'ident_origin_2']

In [44]:
# remove the helper columns added by the municipality merge
fw = fw.drop(columns=['ident_origin_2', 'municipality_origin_2'])

In [45]:
# missing values per column after the origin lookups
fw.isnull().sum()

tail_number                  0
date                         0
aircraft                   115
origin                       0
origin_location              0
destination                  0
destination_location         0
departure                    0
arrival                      0
duration                     0
origin_Latitude          21635
origin_Longitude         21635
destination_Latitude     21458
destination_Longitude    21458
ICAO_code_origin            16
ICAO_code_destination      504
ident_origin_1              18
dtype: int64

#### Destination Location 
---

In [46]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [47]:
# destination lookup #1: match the parsed destination code against the airport `ident`
destination_ident_lookup = airport_codes[['ident']].rename(columns={'ident': 'ident_destination_1'})
fw = pd.merge(
    fw,
    destination_ident_lookup,
    how='left',
    left_on='ICAO_code_destination',
    right_on='ident_destination_1',
)

In [48]:
# Switch dual ICAO codes: when the first code of an "... (AAA / BBB)" destination did not
# match any airport ident, retry with the second code taken from the `destination` text.
destination_needs_second_code = (
    fw.ICAO_code_destination.notnull()
    & fw.ident_destination_1.isna()
    & fw.destination.str.contains(' / ', na=False)
)
# guard: with no matching rows the expanded split has no column 1 to select
if destination_needs_second_code.any():
    fw.loc[destination_needs_second_code, 'ICAO_code_destination'] = (
        fw.loc[destination_needs_second_code, 'destination']
        .str.split(' / ', n=1, expand=True)[1]
        # ')' is meant literally; regex=False makes that explicit and independent of
        # the pandas version's default for `regex`
        .str.replace(')', '', regex=False)
    )

In [49]:
# destination lookup #2: repeat the `ident` match with the switched codes
destination_ident_retry = airport_codes[['ident']].rename(columns={'ident': 'ident_destination_2'})
fw = pd.merge(
    fw,
    destination_ident_retry,
    how='left',
    left_on='ICAO_code_destination',
    right_on='ident_destination_2',
)

In [50]:
# copy the retry result into ident_destination_1 for dual-code destinations still unmatched
destination_has_code = fw['ICAO_code_destination'].notnull()
destination_unmatched = fw['ident_destination_1'].isna()
destination_is_dual = fw['destination'].str.contains(' / ')
destination_take_retry = destination_has_code & destination_unmatched & destination_is_dual
fw.loc[destination_take_retry, 'ident_destination_1'] = fw.loc[destination_take_retry, 'ident_destination_2']

In [51]:
# the helper column from the retry merge is no longer needed
fw = fw.drop(columns='ident_destination_2')

In [52]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [53]:
# keep only the first airport row for each gps_code
airport_codes = airport_codes.drop_duplicates(subset='gps_code', keep='first')

In [54]:
# destination lookup #3: match the destination code against `gps_code`
destination_gps_lookup = airport_codes[['gps_code', 'ident']].rename(
    columns={'gps_code': 'gps_code_destination_2', 'ident': 'ident_destination_2'}
)
fw = pd.merge(
    fw,
    destination_gps_lookup,
    how='left',
    left_on='ICAO_code_destination',
    right_on='gps_code_destination_2',
)

In [55]:
# rows that have a code but no ident yet take the ident found via gps_code
destination_still_open = fw['ICAO_code_destination'].notnull() & fw['ident_destination_1'].isna()
fw.loc[destination_still_open, 'ident_destination_1'] = fw.loc[destination_still_open, 'ident_destination_2']

In [56]:
# remove the helper columns added by the gps_code merge
fw = fw.drop(columns=['ident_destination_2', 'gps_code_destination_2'])

In [57]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [58]:
# keep only the first airport row for each local_code
airport_codes = airport_codes.drop_duplicates(subset='local_code', keep='first')

In [59]:
# destination lookup #4: match the destination code against `local_code`
destination_local_lookup = airport_codes[['local_code', 'ident']].rename(
    columns={'local_code': 'local_code_destination_2', 'ident': 'ident_destination_2'}
)
fw = pd.merge(
    fw,
    destination_local_lookup,
    how='left',
    left_on='ICAO_code_destination',
    right_on='local_code_destination_2',
)

In [60]:
# rows that have a code but no ident yet take the ident found via local_code
destination_still_open = fw['ICAO_code_destination'].notnull() & fw['ident_destination_1'].isna()
fw.loc[destination_still_open, 'ident_destination_1'] = fw.loc[destination_still_open, 'ident_destination_2']

In [61]:
# remove the helper columns added by the local_code merge
fw = fw.drop(columns=['ident_destination_2', 'local_code_destination_2'])

In [62]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [63]:
# `coordinates` is split on ", ": the second piece becomes Latitude, the first Longitude
coordinate_parts = airport_codes['coordinates'].str.split(', ', n=1, expand=True)
airport_codes["Latitude"] = pd.to_numeric(coordinate_parts[1])
airport_codes["Longitude"] = pd.to_numeric(coordinate_parts[0])

In [64]:
# remove generic facility words (regex substitution across the whole airport table)
facility_words = {
    r' Heliport': '',
    r' Airpark': '',
    r' Airport': '',
    r' Regional': '',
    r' Municipal': '',
    r' International': '',
    r' LRRS': '',
}
airport_codes = airport_codes.replace(regex=facility_words)

In [65]:
# append ", <part of iso_region after the dash>" to both the airport name and the municipality
region_suffix = airport_codes['iso_region'].str.split('-', expand=True)[1]
for text_col in ('name', 'municipality'):
    airport_codes.loc[:, text_col] = airport_codes.loc[:, text_col] + ", " + region_suffix

In [66]:
# keep only the first airport row for each municipality
airport_codes = airport_codes.drop_duplicates(subset='municipality', keep='first')

In [67]:
# destination lookup #5: match the `destination` text against the airport name
# NOTE(review): the airport table was de-duplicated on municipality, not on name —
# a repeated name would multiply flight rows in this merge; confirm names are unique.
destination_name_lookup = airport_codes[['name', 'ident']].rename(
    columns={'name': 'name_destination_2', 'ident': 'ident_destination_2'}
)
fw = pd.merge(
    fw,
    destination_name_lookup,
    how='left',
    left_on='destination',
    right_on='name_destination_2',
)

In [68]:
# rows without any destination code take both ident and code from the name match
destination_code_missing = fw['ICAO_code_destination'].isna()
fw.loc[destination_code_missing, 'ident_destination_1'] = fw.loc[destination_code_missing, 'ident_destination_2']
fw.loc[destination_code_missing, 'ICAO_code_destination'] = fw.loc[destination_code_missing, 'ident_destination_2']

In [69]:
# remove the helper columns added by the name merge
fw = fw.drop(columns=['ident_destination_2', 'name_destination_2'])

In [70]:
# destination lookup #6: match the `destination` text against the municipality
destination_city_lookup = airport_codes[['municipality', 'ident']].rename(
    columns={'municipality': 'municipality_destination_2', 'ident': 'ident_destination_2'}
)
fw = pd.merge(
    fw,
    destination_city_lookup,
    how='left',
    left_on='destination',
    right_on='municipality_destination_2',
)

In [71]:
# rows still without a destination code take both ident and code from the municipality match
destination_code_missing = fw['ICAO_code_destination'].isna()
fw.loc[destination_code_missing, 'ident_destination_1'] = fw.loc[destination_code_missing, 'ident_destination_2']
fw.loc[destination_code_missing, 'ICAO_code_destination'] = fw.loc[destination_code_missing, 'ident_destination_2']

In [72]:
# remove the helper columns added by the municipality merge
fw = fw.drop(columns=['ident_destination_2', 'municipality_destination_2'])

In [73]:
# missing values per column after the destination lookups
fw.isnull().sum()

tail_number                  0
date                         0
aircraft                   115
origin                       0
origin_location              0
destination                  0
destination_location         0
departure                    0
arrival                      0
duration                     0
origin_Latitude          21635
origin_Longitude         21635
destination_Latitude     21458
destination_Longitude    21458
ICAO_code_origin            16
ICAO_code_destination       36
ident_origin_1              18
ident_destination_1         38
dtype: int64

#### Coordinates merging
---

In [74]:
# start again from the unmodified airport table on disk
airport_codes = pd.read_csv(filepath_or_buffer="..//Datasets/Airports/airport-codes.csv")

In [75]:
# `coordinates` is split on ", ": the second piece becomes Latitude, the first Longitude
coordinate_parts = airport_codes['coordinates'].str.split(', ', n=1, expand=True)
airport_codes["Latitude"] = pd.to_numeric(coordinate_parts[1])
airport_codes["Longitude"] = pd.to_numeric(coordinate_parts[0])

In [76]:
# attach every airport column (suffixed "_origin") to the resolved origin ident
origin_airports = airport_codes.rename(columns=lambda col: col + '_origin')
fw = pd.merge(
    fw,
    origin_airports,
    how='left',
    left_on='ident_origin_1',
    right_on='ident_origin',
)

In [77]:
# where no position was parsed from the flight record, use the airport's coordinates
fw['origin_Latitude'] = fw['origin_Latitude'].fillna(fw['Latitude_origin'])
fw['origin_Longitude'] = fw['origin_Longitude'].fillna(fw['Longitude_origin'])

In [78]:
# discard the merged origin-airport columns that are not kept in the final table
unused_airport_fields = ['coordinates', 'Latitude', 'Longitude', 'continent', 'elevation_ft', 'type',
                         'local_code', 'gps_code', 'municipality', 'iso_region', 'ident', 'iata_code']
fw = fw.drop(columns=[field + '_origin' for field in unused_airport_fields])

In [79]:
# attach every airport column (suffixed "_destination") to the resolved destination ident
destination_airports = airport_codes.rename(columns=lambda col: col + '_destination')
fw = pd.merge(
    fw,
    destination_airports,
    how='left',
    left_on='ident_destination_1',
    right_on='ident_destination',
)

In [80]:
# where no position was parsed from the flight record, use the airport's coordinates
fw['destination_Latitude'] = fw['destination_Latitude'].fillna(fw['Latitude_destination'])
fw['destination_Longitude'] = fw['destination_Longitude'].fillna(fw['Longitude_destination'])

In [81]:
# discard the merged destination-airport columns that are not kept in the final table
unused_airport_fields = ['coordinates', 'Latitude', 'Longitude', 'continent', 'elevation_ft', 'type',
                         'local_code', 'gps_code', 'municipality', 'iso_region', 'ident', 'iata_code']
fw = fw.drop(columns=[field + '_destination' for field in unused_airport_fields])

In [82]:
# missing values per column after the coordinates were filled in
fw.isnull().sum()

tail_number                  0
date                         0
aircraft                   115
origin                       0
origin_location              0
destination                  0
destination_location         0
departure                    0
arrival                      0
duration                     0
origin_Latitude              3
origin_Longitude             3
destination_Latitude         5
destination_Longitude        5
ICAO_code_origin            16
ICAO_code_destination       36
ident_origin_1              18
ident_destination_1         38
name_origin                 18
iso_country_origin          18
name_destination            38
iso_country_destination     38
dtype: int64

#### Additional Information
---

In [83]:
# look up each flight's owner by tail number, then drop the duplicate key column
owner_lookup = fw_info[['Tail_number', 'Owner']]
fw = pd.merge(
    fw,
    owner_lookup,
    how='left',
    left_on='tail_number',
    right_on='Tail_number',
).drop(columns='Tail_number')

In [84]:
# calculate the distance (miles) between origin and destination for every flight
def _flight_distance_miles(origin_lat, origin_lon, dest_lat, dest_lon):
    """Return the geopy distance in miles between two latitude/longitude pairs.

    Coordinates are passed to geopy as numbers rather than being formatted into a
    string and re-parsed. If geopy rejects the values with a ValueError the result
    is NaN; any other exception is allowed to surface instead of being hidden.
    NOTE(review): ValueError is assumed to be geopy's signal for unusable
    coordinates — confirm against the installed geopy version.
    """
    try:
        return distance.distance(Point(origin_lat, origin_lon), Point(dest_lat, dest_lon)).miles
    except ValueError:
        return np.nan


coordinate_columns = ['origin_Latitude', 'origin_Longitude',
                      'destination_Latitude', 'destination_Longitude']
# only flights with all four coordinates can get a distance; the rest stay NaN
flights_with_coords = fw.loc[fw[coordinate_columns].notna().all(axis=1), coordinate_columns]
flight_miles = pd.Series(
    [_flight_distance_miles(*coords) for coords in flights_with_coords.itertuples(index=False, name=None)],
    index=flights_with_coords.index,
    dtype='float64',
)
# reindex aligns on the frame's own index, so the column always exists and no
# assumption about a 0..n-1 index is needed
fw['distance_mi'] = flight_miles.reindex(fw.index)

In [85]:
# state = text after the first ", " in the place name; where that is missing, fall back
# to the location text (after its first ", ", cut at the last ") ")
for side in ('origin', 'destination'):
    state_col = side + '_state'
    fw[state_col] = fw[side].str.split(', ', n=1, expand=True)[1]
    state_from_location = (
        fw[side + '_location']
        .str.split(', ', n=1, expand=True)[1]
        .str.rsplit(') ', n=1, expand=True)[0]
    )
    state_missing = fw[state_col].isna()
    fw.loc[state_missing, state_col] = state_from_location

In [86]:
# city = text inside the last "(...)" of the location, up to the first comma;
# where that is missing, fall back to the place name up to its first comma
for side in ('origin', 'destination'):
    city_col = 'city_' + side
    inside_parens = fw[side + '_location'].str.rsplit(" (", n=1, expand=True)[1]
    fw[city_col] = (
        inside_parens
        .str.split(")", n=1, expand=True)[0]
        .str.split(',', n=1, expand=True)[0]
    )
    city_missing = fw[city_col].isna()
    fw.loc[city_missing, city_col] = fw.loc[city_missing, side].str.split(',', n=1, expand=True)[0]

In [88]:
# label each departure/arrival as 'day' when its clock time is strictly between
# 07:00:00 and 19:00:00, otherwise 'night' (both boundary times count as night)
for leg in ('departure', 'arrival'):
    clock_text = fw[leg].str.split(' ', n=1, expand=True)[0]
    clock_time = pd.to_datetime(clock_text).dt.time.astype('str')
    is_day = (clock_time < '19:00:00') & (clock_time > '07:00:00')
    fw[leg + '_shift'] = np.where(is_day, 'day', 'night')

In [89]:
# give the lookup results their final names and remove the intermediate text/code columns
final_names = {
    'ident_origin_1': 'origin_code',
    'ident_destination_1': 'destination_code',
    'name_origin': 'airport_origin',
    'iso_country_origin': 'country_origin',
    'name_destination': 'airport_destination',
    'iso_country_destination': 'country_destination',
}
intermediate_columns = ['origin_location', 'destination_location',
                        'ICAO_code_origin', 'ICAO_code_destination']
fw = fw.rename(columns=final_names).drop(columns=intermediate_columns)

In [90]:
# print column names, non-null counts and dtypes of the enriched flight table
fw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21959 entries, 0 to 21958
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   tail_number            21959 non-null  object        
 1   date                   21959 non-null  datetime64[ns]
 2   aircraft               21844 non-null  object        
 3   origin                 21959 non-null  object        
 4   destination            21959 non-null  object        
 5   departure              21959 non-null  object        
 6   arrival                21959 non-null  object        
 7   duration               21959 non-null  object        
 8   origin_Latitude        21956 non-null  float64       
 9   origin_Longitude       21956 non-null  float64       
 10  destination_Latitude   21954 non-null  float64       
 11  destination_Longitude  21954 non-null  float64       
 12  origin_code            21941 non-null  object        
 13  d

In [91]:
# Rearrange columns into the final presentation order.
# Columns are selected by name instead of by position: a positional reorder silently
# produces a wrong layout if any earlier step adds, removes or moves a column, whereas
# this selection raises a KeyError naming the missing column.
final_column_order = [
    'tail_number', 'date', 'aircraft',
    'origin_code', 'origin', 'destination_code', 'destination',
    'departure', 'arrival', 'duration', 'distance_mi', 'Owner',
    'origin_Latitude', 'origin_Longitude', 'destination_Latitude', 'destination_Longitude',
    'airport_origin', 'city_origin', 'origin_state', 'country_origin',
    'airport_destination', 'city_destination', 'destination_state', 'country_destination',
    'departure_shift', 'arrival_shift',
]
fw = fw[final_column_order]

In [97]:
# preview the first five rows of the final table
fw.head(n=5)

Unnamed: 0,tail_number,date,aircraft,origin_code,origin,destination_code,destination,departure,arrival,duration,distance_mi,Owner,origin_Latitude,origin_Longitude,destination_Latitude,destination_Longitude,airport_origin,city_origin,origin_state,country_origin,airport_destination,city_destination,destination_state,country_destination,departure_shift,arrival_shift
0,N100KB,2021-02-20,BE9L,US-0571,Williston Basin International Airport (KXWA),KBIS,Bismarck Muni (KBIS),04:57PM CST,05:44PM CST,0:46,174.005874,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor...",48.258387,-103.748797,46.772701,-100.746002,Williston Basin International Airport,Williston,ND,US,Bismarck Municipal Airport,Bismarck,ND,US,day,day
1,N100KB,2021-02-20,BE9L,KBIS,Bismarck Muni (KBIS),US-0571,Williston Basin International Airport (KXWA),01:36PM CST,02:27PM CST,0:51,174.005874,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor...",46.772701,-100.746002,48.258387,-103.748797,Bismarck Municipal Airport,Bismarck,ND,US,Williston Basin International Airport,Williston,ND,US,day,day
2,N100KB,2021-02-18,BE9L,KMOT,Minot Intl (KMOT),KBIS,Bismarck Muni (KBIS),08:20AM CST,08:53AM CST,0:32,105.704153,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor...",48.259399,-101.279999,46.772701,-100.746002,Minot International Airport,Minot,ND,US,Bismarck Municipal Airport,Bismarck,ND,US,day,day
3,N100KB,2021-02-15,BE9L,US-0571,Williston Basin International Airport (KXWA),KMOT,Minot Intl (KMOT),02:15PM CST,02:49PM CST,0:33,113.89946,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor...",48.258387,-103.748797,48.259399,-101.279999,Williston Basin International Airport,Williston,ND,US,Minot International Airport,Minot,ND,US,day,day
4,N100KB,2021-02-15,BE9L,KMOT,Minot Intl (KMOT),US-0571,Williston Basin International Airport (KXWA),08:00AM CST,08:32AM CST,0:32,113.89946,"EXECUTIVE AIR TAXI CORPBISMARCK, ND, US(Corpor...",48.259399,-101.279999,48.258387,-103.748797,Minot International Airport,Minot,ND,US,Williston Basin International Airport,Williston,ND,US,day,day


In [93]:
# (rows, columns) of the final table
fw.shape

(21959, 26)

In [94]:
# missing values per column in the final table
fw.isnull().sum()

tail_number                0
date                       0
aircraft                 115
origin_code               18
origin                     0
destination_code          38
destination                0
departure                  0
arrival                    0
duration                   0
distance_mi                8
Owner                      0
origin_Latitude            3
origin_Longitude           3
destination_Latitude       5
destination_Longitude      5
airport_origin            18
city_origin                0
origin_state              31
country_origin            18
airport_destination       38
city_destination           0
destination_state         33
country_destination       38
departure_shift            0
arrival_shift              0
dtype: int64

In [98]:
# how many distinct tail numbers remain in the final table
fw['tail_number'].nunique()

177

In [101]:
# how many distinct owners appear in the final table
fw['Owner'].nunique()

102

In [95]:
# # export to csv
# fw.to_csv("..//Datasets/NEW/FW_with_airports.csv", index=False)