In [None]:
# load the passengers CSV from the DB folder into a DataFrame;
# the 'Unnamed: 0' column of the file is used as the index
pas_df = pd.read_csv(
    '../DB/passengers_data.csv',
    index_col='Unnamed: 0',
)


# helper function to find total passengers per airport for each month of a given year
def make_df(df, year):
    """
    Return a list of DataFrames ordered by month
    showing the total passenger counts for the corresponding airport


    Arguments:
    df -- DataFrame from passengers table
            - must include 'passengers', 'month', 'year',
              and 'origin_airport_id' features
    year -- year to aggregate; rows whose 'year' differs are ignored

    Returns:
    List of 12 DataFrames, one per month from 1 (January) to 12 (December)
        - Columns: passengers (summed total), month, year
        - Row: origin_airport_ids (used as the index)
        - A month with no matching rows yields a DataFrame with no rows
    """
    # Filter on the year once instead of re-scanning the whole table
    # for every month.
    in_year = df.loc[df['year'] == year]

    return [
        in_year.loc[in_year['month'] == month]
        .groupby('origin_airport_id')[['passengers']]
        .sum()
        # tag every row so the monthly frames stay identifiable after concat
        .assign(month=month, year=year)
        for month in range(1, 13)
    ]


# use function to find passenger counts for 2018 and 2019
passengers_per_month_2018 = make_df(pas_df, 2018)
passengers_per_month_2019 = make_df(pas_df, 2019)


# take each year's list of monthly DataFrames and concatenate it
# into a single DataFrame per year
pass_2018 = pd.concat(passengers_per_month_2018)
pass_2019 = pd.concat(passengers_per_month_2019)


# merge yearly DataFrames
# - 'origin_airport_id' is the index produced by the groupby in make_df;
#   pd.merge accepts index level names in `on`
# - overlapping columns get the default suffixes: '_x' is 2018, '_y' is 2019
# - NOTE: how='left' keeps only the (airport, month) pairs present in 2018;
#   pairs that appear only in 2019 are dropped
avg_monthly_pas = pd.merge(pass_2018, pass_2019, how='left', on=['origin_airport_id', 'month'])


# fill NaN values: where 2019 has no count for an airport/month, reuse the
# 2018 count so the average below equals the 2018 value.
# The result is assigned back explicitly; calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which newer pandas warns
# about and which does not update the parent DataFrame under Copy-on-Write.
# The year columns are not back-filled because they are discarded below.
avg_monthly_pas['passengers_y'] = avg_monthly_pas['passengers_y'].fillna(
    avg_monthly_pas['passengers_x']
)


# calculate an average monthly passenger feature for each airport
avg_monthly_pas['avg_monthly_pas'] = (
    avg_monthly_pas['passengers_x'] + avg_monthly_pas['passengers_y']
) / 2


# drop unnecessary features
avg_monthly_pas = avg_monthly_pas[['month', 'avg_monthly_pas']]


# merge onto training DataFrame
# NOTE(review): X is not defined in this cell -- presumably the training
# DataFrame created earlier in the notebook, carrying 'origin_airport_id'
# and 'month'; confirm before running this cell on its own.
final = pd.merge(X, avg_monthly_pas, how='left', on=['origin_airport_id', 'month'])