In [1]:
import pandas as pd
import numpy as np

In [58]:
def flights_preclean(df):
    """
    Pre-clean the raw Flights table.

    Input: Raw dataframe of the Flights table. It must contain a
        'cancelled' column, every column listed under "Drop columns"
        below, and the five delay columns; a missing column raises an
        error. The input frame itself is not modified.
    Output: Cleaned flights table (a new dataframe):
        - Remove cancelled rows (cancelled == 1); an unmodified copy of
          those rows is made available in the module-level dataframe
          "df_can" (this global is overwritten on every call)
        - Keep only rows where cancelled == 0
        - Drop columns ['Unnamed: 0', 'branded_code_share',
           'mkt_carrier', 'cancelled', 'cancellation_code', 'flights', 'air_time',
            'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'no_name']
          ('fl_date' is retained)
        - Fill null values in the delay columns with 0
        - Drop every remaining row that still has a null in any column

    The index is not reset, so surviving rows keep their original labels.
    """
    global df_can

    dropped_columns = [
        'Unnamed: 0', 'branded_code_share', 'mkt_carrier', 'cancelled',
        'cancellation_code', 'flights', 'air_time', 'first_dep_time',
        'total_add_gtime', 'longest_add_gtime', 'no_name',
    ]
    delay_columns = [
        'carrier_delay', 'weather_delay', 'nas_delay',
        'security_delay', 'late_aircraft_delay',
    ]

    # Side effect: publish the cancelled flights (all original columns) as a
    # global so they stay available after being removed from the result.
    df_can = df[df.cancelled == 1].copy()
    print("Removed cancelled flights - now available in dataframe 'df_can'")

    # drop() returns a new frame, so the assignments below never touch the
    # caller's dataframe.
    flown = df[df.cancelled == 0].drop(columns=dropped_columns)

    # A missing delay value is treated as "no delay"; fill before dropna()
    # so these rows are not discarded for that reason alone.
    flown[delay_columns] = flown[delay_columns].fillna(0)

    return flown.dropna()

In [59]:
# Read the flights sample CSV and run it straight through the pre-cleaning
# step; `df` is the cleaned frame, and the call also sets the global `df_can`.
df = flights_preclean(pd.read_csv('data/flights_random_50k.csv'))

Removed cancelled flights - now available in dataframe 'df_can'


In [60]:
# Preview the first rows of the pre-cleaned flights frame.
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,...,diverted,dup,crs_elapsed_time,actual_elapsed_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2019-02-22,UA,1723,UA,N27511,1723,14771,SFO,"San Francisco, CA",12758,...,0.0,N,338.0,353.0,2367.0,0.0,0.0,0.0,0.0,0.0
2,2018-12-17,UA,3964,ZW,N425AW,3964,12323,ILM,"Wilmington, NC",13930,...,0.0,N,166.0,152.0,760.0,0.0,0.0,0.0,0.0,0.0
3,2019-08-25,AA,3976,MQ,N810AE,3976,15401,TXK,"Texarkana, AR",11298,...,0.0,N,65.0,49.0,181.0,0.0,0.0,0.0,0.0,0.0
4,2019-03-20,UA,3601,YX,N745YX,3601,13487,MSP,"Minneapolis, MN",11618,...,0.0,N,170.0,146.0,1008.0,0.0,0.0,0.0,0.0,0.0
5,2018-03-31,AA,607,AA,N661AW,607,14100,PHL,"Philadelphia, PA",11298,...,0.0,N,228.0,214.0,1303.0,0.0,0.0,0.0,0.0,0.0
