**Notes for Everyone!**
1.   Make sure you have the [Shared folder](https://drive.google.com/drive/folders/1nUkS4PXVNQc8kbDRlQiR_Oy2iNAPMqWU?usp=sharing) as a Shortcut in your "My Drive"
2.   Run the "Connect to Group Folder" section and, in its first cell, follow the link in the prompt to get the authentication code

# Connect to Group Folder


In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# When Colab prompts for authorization, follow the prompt to grant access.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the shared project folder the working directory so that the relative
# data-file paths used by the loading cell resolve against it.
import os
os.chdir("/content/drive/My Drive/DS2500GroupProject")

# Imports

In [3]:
#imports
import pandas as pd
import re
from datetime import date

# Read and Clean Data


*   Demographics - survey_data
*   Ridership - ridership_2019data / ridership_2020data
*   Location Data - stop_loc






In [4]:
# Demographic survey: read the first sheet (the read_excel default) of the workbook.
survey_data = pd.read_excel(r'survey.xlsx')

# Ridership for each year: load the CSV, then parse 'service_date' from
# text into a proper datetime column so the .dt accessors work later on.
ridership_2019data = pd.read_csv(r'ridership19.csv').assign(
    service_date=lambda frame: pd.to_datetime(frame['service_date'], format="%Y/%m/%d")
)
ridership_2020data = pd.read_csv(r'ridership20.csv').assign(
    service_date=lambda frame: pd.to_datetime(frame['service_date'], format="%Y/%m/%d")
)

# Stop location data
stop_loc = pd.read_csv(r'stop_loc.csv')

In [5]:
# Drop survey rows whose Mode is "Commuter Rail".
# .copy() makes the filtered frame independent of the original, so columns
# can be added to it later without pandas raising SettingWithCopyWarning.
survey_data = survey_data.loc[survey_data['Mode'] != "Commuter Rail"].copy()

# Drop stations with long-term construction from both ridership years.
constructionstops = ['Wollaston', 'Science Park', 'Lechmere']
# Explicit copies: later cells add date columns to these frames in place.
ridership_2020data = ridership_2020data.loc[~ridership_2020data['station_name'].isin(constructionstops)].copy()
ridership_2019data = ridership_2019data.loc[~ridership_2019data['station_name'].isin(constructionstops)].copy()

# Joining 2019 and 2020 Ridership Data
*   By Year       - joineddf
*   By Month      - joinedmonthdf
*   By WeekofYear - joinedwoydf

In [15]:
def addweekofyear_df(df):
  """Add calendar and ISO-week columns derived from 'service_date', in place.

  Writes four columns onto ``df``: 'Month' and 'Year' (calendar values) and
  'WeekOfYear' and 'WeekYear' (ISO week number and the ISO year that week
  belongs to). ``df['service_date']`` must already be a datetime column.
  Returns None; the frame passed in is modified.
  """
  service_dates = df["service_date"].dt
  # One isocalendar() call yields both the ISO week and its ISO year.
  iso_parts = service_dates.isocalendar()
  df['Month'] = service_dates.month
  df['Year'] = service_dates.year
  df['WeekOfYear'] = iso_parts.week
  df['WeekYear'] = iso_parts.year

def aggregate(df, listofaggbycolumn, aggfunction, aggcol):
  """Group ``df`` by the given columns and aggregate one target column.

  Args:
    df: DataFrame to summarise.
    listofaggbycolumn: list of column names to group by.
    aggfunction: aggregation handed to ``agg`` (e.g. ``sum`` or ``"sum"``).
    aggcol: name of the column to aggregate.

  Returns:
    A DataFrame whose columns are the group-by columns followed by the
    aggregated ``aggcol`` values (the group keys are not left in the index).
  """
  # Select the target column BEFORE aggregating: only that column is
  # processed, so unrelated columns the function cannot handle (e.g. a
  # datetime column under sum) are never touched.
  aggregated = df.groupby(listofaggbycolumn)[aggcol].agg(aggfunction)
  # Move the group keys out of the index so the result is a flat DataFrame.
  return aggregated.reset_index()
  
def addpercentchange_df(df):
  """Add a 'PercentChange' column to ``df`` in place.

  Expects columns 'gated_entries2019' and 'gated_entries2020'. The new
  column holds the change from 2019 to 2020 as a fraction of the 2019
  value: (2020 - 2019) / 2019. Returns None.
  """
  baseline = df['gated_entries2019']
  current = df['gated_entries2020']
  df['PercentChange'] = (current - baseline) / baseline

In [16]:
# Add the Month / Year / WeekOfYear / WeekYear columns to both ridership frames
for ridership_frame in (ridership_2019data, ridership_2020data):
  addweekofyear_df(ridership_frame)

In [17]:
# Aggregate by Year, Station, Line
# Each year's totals are built from that year's own ridership frame, so the
# '2019'/'2020' merge suffixes below label the matching data.
agg_2019  = aggregate(ridership_2019data, ["route_or_line", "station_name", "stop_id"], sum, "gated_entries")
agg_2020  = aggregate(ridership_2020data, ["route_or_line", "station_name", "stop_id"], sum, "gated_entries")
# Join both years to compare: the left frame's column becomes
# 'gated_entries2019' and the right frame's becomes 'gated_entries2020'.
joineddf = agg_2019.merge(agg_2020, how='outer', on=["route_or_line", "station_name", "stop_id"], suffixes=['2019','2020'])
# Add Percent Change (2020 relative to 2019)
addpercentchange_df(joineddf)

In [18]:
# Aggregate by Month, Station, Line
# Each year's totals are built from that year's own ridership frame, so the
# '2019'/'2020' merge suffixes below label the matching data.
aggmonth_2019  = aggregate(ridership_2019data, ["Month", "route_or_line", "station_name", "stop_id"], sum, "gated_entries")
aggmonth_2020  = aggregate(ridership_2020data, ["Month", "route_or_line", "station_name", "stop_id"], sum, "gated_entries")
# Join both years to compare; dropna() removes month/station rows that are
# present in only one of the two years.
joinedmonthdf = aggmonth_2019.merge(aggmonth_2020, how='outer', on=["Month", "route_or_line", "station_name", "stop_id"], suffixes=['2019','2020']).dropna()
# Add Percent Change (2020 relative to 2019)
addpercentchange_df(joinedmonthdf)

In [19]:
# Aggregate by Week of Year, Station, Line
# Stack both years first: rows are assigned to a year by their ISO
# 'WeekYear' rather than by source file (adjusts for week-of-year overlaps).
week_merge = pd.concat([ridership_2019data, ridership_2020data])

# Total gated entries per ISO week / ISO year / line / station / stop
aggwoy_merged = aggregate(
    week_merge,
    ["WeekOfYear", 'WeekYear', "route_or_line", "station_name", "stop_id"],
    sum,
    "gated_entries",
)

# Split the weekly totals back out by ISO year
woy_2019 = aggwoy_merged[aggwoy_merged['WeekYear'] == 2019]
woy_2020 = aggwoy_merged[aggwoy_merged['WeekYear'] == 2020]

# Join both years on week + station keys; rows found in only one year come
# out of the outer join with NaNs and are removed by dropna().
# NOTE(review): reset_index() on the merge result also adds an 'index'
# column to joinedwoydf - confirm whether that column is wanted.
joinedwoydf = (
    woy_2019
    .merge(
        woy_2020,
        how='outer',
        on=["WeekOfYear", "route_or_line", "station_name", "stop_id"],
        suffixes=['2019','2020'],
    )
    .reset_index()
    .dropna()
)

# Add Percent Change
addpercentchange_df(joinedwoydf)