In [1]:
import pandas as pd
import numpy as np

In [2]:
def flights_preclean(df):
    """
    Split cancelled flights out of the raw Flights table and tidy the rest.

    Input: Raw dataframe of Flights table (it is not modified).
    Output: New dataframe holding only the rows with cancelled == 0, where:
        - Columns ['Unnamed: 0', 'branded_code_share', 'mkt_carrier',
           'cancelled', 'cancellation_code', 'flights', 'air_time',
           'first_dep_time', 'total_add_gtime', 'longest_add_gtime',
           'no_name'] have been dropped ('fl_date' is kept)
        - Nulls in 'carrier_delay', 'weather_delay', 'nas_delay',
           'security_delay' and 'late_aircraft_delay' are replaced with 0
        - Every row that still contains a null has been removed

    Side effects: the rows with cancelled == 1 are copied into the
    module-level global "df_can", and a notice is printed.
    """
    global df_can

    cancelled_flag = df.cancelled

    # Keep the cancelled flights reachable for later inspection.
    df_can = df[cancelled_flag == 1].copy()
    print("Removed cancelled flights - now available in dataframe 'df_can'")

    unused_columns = [
        'Unnamed: 0', 'branded_code_share', 'mkt_carrier', 'cancelled',
        'cancellation_code', 'flights', 'air_time', 'first_dep_time',
        'total_add_gtime', 'longest_add_gtime', 'no_name',
    ]
    delay_columns = (
        'carrier_delay', 'weather_delay', 'nas_delay',
        'security_delay', 'late_aircraft_delay',
    )

    flown = df[cancelled_flag == 0].drop(columns=unused_columns)

    # Zero-fill the delay columns first so the final dropna() only removes
    # rows with nulls in the other columns.
    zero_filled = {name: flown[name].fillna(value=0) for name in delay_columns}
    return flown.assign(**zero_filled).dropna()

In [13]:
# Read data/raw_flights_test.csv; index_col=0 uses the file's first column as the row index.
df = pd.read_csv('data/raw_flights_test.csv', index_col=0)
# Preview the first rows as this cell's displayed output.
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


In [22]:
# Get the list of airports: the distinct values of the `origin` column.
apts = df.origin.unique()
# NOTE(review): the factor 7 is not explained anywhere in this notebook —
# presumably a per-airport multiplier (e.g. number of dates per airport); confirm.
len(apts) * 7

2541

In [27]:
# Unique airport-date pairs (export disabled; uncomment the line below to write the CSV)
# df[['origin','fl_date']][~df[['origin','fl_date']].duplicated()].to_csv('vc_api/test_airport_dates.csv')