In [3]:
import pandas as pd
import holidays
import json
import requests

In [6]:
### Assumed master data is available in local machine
master_data = pd.read_csv("./master_dataset.csv")

In [3]:
master_data["Start date"] = master_data["Start date"].apply(lambda x: (x[:-6] + ":00").replace(" ","T"))
master_data["End date"] = master_data["End date"].apply(lambda x: (x[:-6] + ":00").replace(" ","T"))

In [7]:

baseUrl = "https://archive-api.open-meteo.com/v1/era5"
stationFile = "stationData.json"

with open(f"./{stationFile}", "r") as file:
    stationData = json.load(file)
stationData = pd.DataFrame(stationData["data"]["stations"])
lat = stationData["lat"].mean()
lon = stationData["lon"].mean()
startDate = "2015-01-01"
endDate = "2020-12-31"
variables = ["temperature_2m", "relativehumidity_2m" ,"precipitation" ,"windspeed_10m"]

body = f"latitude={lat}&longitude={lon}&start_date={startDate}&end_date={endDate}&hourly={','.join(variables)}"
url = f"{baseUrl}?{body}"

response = requests.get(f"{baseUrl}?{body}")
weatherJson = response.json()
weatherData = pd.DataFrame(weatherJson["hourly"])
del weatherJson["hourly"]
weatherDataMeta = weatherJson

In [8]:
master_data_final = master_data.merge(weatherData, left_on="Start date", right_on="time")

In [9]:
dcHolidays = {}
for year in range(2015, 2021):
    dcHolidays.update(holidays.country_holidays("US", subdiv="DC", years=year))

dcHolidayDates = list(dcHolidays.keys())

for date in dcHolidayDates:
    dateString = date.strftime("%Y-%m-%d")
    dcHolidays[dateString] = dcHolidays[date]

for date in dcHolidayDates:
    dcHolidays.pop(date)

dcHolidayDates = list(dcHolidays.keys())

In [10]:
master_data_final["date"] = master_data_final["time"].apply(lambda x: x.split("T")[0])
master_data_final["holiday"] = master_data_final["date"].apply(lambda x: dcHolidays[x] if x in dcHolidayDates else "")
master_data_final["isHoliday"] = master_data_final["holiday"].apply(lambda x: 0 if x == "" else 1)

In [11]:
master_data_final.to_csv("./MasterDataset_WithWeatherAndEvent.csv")

In [4]:
master_data_final = pd.read_csv("./MasterDataset_WithWeatherAndEvent.csv")

  master_data_final = pd.read_csv("./MasterDataset_WithWeatherAndEvent.csv")


In [8]:
master_data_final["Start station number"].value_counts()

Start station number
31623.0    349067
31258.0    308901
31247.0    269894
31200.0    237188
31201.0    222503
            ...  
31667.0        11
31938.0        11
32902.0         8
31668.0         5
31936.0         4
Name: count, Length: 630, dtype: int64

In [11]:
master_data_final[master_data_final["Start station number"] == 31623].reset_index().loc[0.:]

Unnamed: 0.2,index,Unnamed: 0.1,Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Member type,time,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,date,holiday,isHoliday
0,81,81,81,223,2015-01-01T01:00,2015-01-01T01:00,31623.0,Columbus Circle / Union Station,31631.0,8th & F St NE,Member,2015-01-01T01:00,-3.1,65,0.0,7.7,2015-01-01,New Year's Day,1
1,109,109,109,517,2015-01-01T01:00,2015-01-01T01:00,31623.0,Columbus Circle / Union Station,31622.0,13th & D St NE,Member,2015-01-01T01:00,-3.1,65,0.0,7.7,2015-01-01,New Year's Day,1
2,110,110,110,546,2015-01-01T01:00,2015-01-01T01:00,31623.0,Columbus Circle / Union Station,31622.0,13th & D St NE,Member,2015-01-01T01:00,-3.1,65,0.0,7.7,2015-01-01,New Year's Day,1
3,144,144,144,666,2015-01-01T01:00,2015-01-01T01:00,31623.0,Columbus Circle / Union Station,31512.0,Neal St & Trinidad Ave NE,Member,2015-01-01T01:00,-3.1,65,0.0,7.7,2015-01-01,New Year's Day,1
4,189,189,189,708,2015-01-01T02:00,2015-01-01T02:00,31623.0,Columbus Circle / Union Station,31619.0,Lincoln Park / 13th & East Capitol St NE,Member,2015-01-01T02:00,-3.1,65,0.0,9.2,2015-01-01,New Year's Day,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349062,19434405,19434405,19348639,6,2020-12-26T08:00,2020-12-26T08:00,31623.0,Columbus Circle / Union Station,31627.0,3rd & M St NE,member,2020-12-26T08:00,-4.4,41,0.0,16.9,2020-12-26,,0
349063,19434525,19434525,19338105,3,2020-12-08T23:00,2020-12-08T23:00,31623.0,Columbus Circle / Union Station,31612.0,D St & Maryland Ave NE,member,2020-12-08T23:00,3.6,54,0.0,15.1,2020-12-08,,0
349064,19434798,19434798,19337961,10,2020-12-17T06:00,2020-12-17T06:00,31623.0,Columbus Circle / Union Station,31650.0,1st & M St SE,member,2020-12-17T06:00,-0.1,91,0.8,19.3,2020-12-17,,0
349065,19435123,19435123,19418597,12,2020-12-16T23:00,2020-12-16T23:00,31623.0,Columbus Circle / Union Station,31617.0,Bladensburg Rd & Benning Rd NE,member,2020-12-16T23:00,2.3,97,2.1,25.8,2020-12-16,,0
