In [60]:
import os
import datetime
import requests
from json import JSONDecodeError
import pandas as pd

# from dotenv import load_dotenv
# load_dotenv()

# Processing special city - `Seattle`.
### We need data from several different stations across Seattle.

I downloaded daily `PM2.5` data manually from [here](https://www.epa.gov/outdoor-air-quality-data/download-daily-data)

In [167]:
# Read one manually downloaded EPA export per year (2013 through 2023 inclusive)
# and stack them with a single concat; concatenating inside the loop re-copies
# the accumulated frame on every iteration.
yearly_frames = [
    pd.read_csv(f"data/seattle_pm25_{year}.csv")
    for year in range(2013, 2023 + 1)
]

# ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
seattle_df = pd.concat(yearly_frames, ignore_index=True)

seattle_df.shape

(67901, 20)

In [169]:
seattle_df.tail(2)

Unnamed: 0,Date,Source,Site ID,POC,Daily Mean PM2.5 Concentration,UNITS,DAILY_AQI_VALUE,Site Name,DAILY_OBS_COUNT,PERCENT_COMPLETE,AQS_PARAMETER_CODE,AQS_PARAMETER_DESC,CBSA_CODE,CBSA_NAME,STATE_CODE,STATE,COUNTY_CODE,COUNTY,SITE_LATITUDE,SITE_LONGITUDE
67899,04/02/2023,AirNow,530611007,5,4.8,ug/m3 LC,20,MARYSVILLE - 7TH AVE (Marysville Junior High),1,100.0,88101,PM2.5 - Local Conditions,42660,"Seattle-Tacoma-Bellevue, WA",53,Washington,61,Snohomish,48.054315,-122.171529
67900,04/03/2023,AirNow,530611007,5,4.8,ug/m3 LC,20,MARYSVILLE - 7TH AVE (Marysville Junior High),1,100.0,88101,PM2.5 - Local Conditions,42660,"Seattle-Tacoma-Bellevue, WA",53,Washington,61,Snohomish,48.054315,-122.171529


In [170]:
# Map the EPA export headers onto short snake_case names, then keep only the
# five columns used in the rest of the notebook (in this order).
column_aliases = {
    'Site Name': 'site_name',
    'Date': 'date',
    'Daily Mean PM2.5 Concentration': 'pm25',
    'SITE_LATITUDE': 'latitude',
    'SITE_LONGITUDE': 'longitude',
}
seattle_df = seattle_df.rename(columns=column_aliases)
seattle_df = seattle_df[['site_name', 'date', 'pm25', 'latitude', 'longitude']]

In [171]:
seattle_df = seattle_df.drop_duplicates(subset=['date', 'site_name'])

In [176]:
seattle_df.site_name.value_counts()

NORTH BEND - NORTH BEND WAY                                       3705
TACOMA - L STREET                                                 3696
SEATTLE - BEACON HILL                                             3691
MARYSVILLE - 7TH AVE (Marysville Junior High)                     3648
DARRINGTON - FIR ST (Darrington High School)                      3614
SEATTLE - SOUTH PARK #2                                           3577
TACOMA - ALEXANDER AVE                                            3569
KENT - JAMES & CENTRAL                                            3556
SEATTLE - DUWAMISH                                                3439
Seattle-10th & Weller                                             3097
LAKE FOREST PARK TOWNE CENTER                                     2999
PUYALLUP - 128TH ST                                               2700
Tacoma-S 36th St                                                  2574
Bellevue-SE 12th St                                               2172
LYNNWO

In [178]:
# Count observations per site once and reuse the result for both the site
# names (index, most-observed first) and the counts (values).
site_counts = seattle_df.site_name.value_counts()
sites, records = site_counts.index, site_counts.values

In [179]:
# For every site print its name, its number of observations and the date of
# its last row in seattle_df, followed by a separator line.
for site, record in zip(sites, records):
    site_dates = seattle_df.loc[seattle_df.site_name == site, "date"]
    print(site, f"# of observations - {record}", site_dates.tail(1).values, "---", sep="\n")

NORTH BEND - NORTH BEND WAY
# of observations - 3705
['04/03/2023']
---
TACOMA - L STREET
# of observations - 3696
['04/03/2023']
---
SEATTLE - BEACON HILL
# of observations - 3691
['04/03/2023']
---
MARYSVILLE - 7TH AVE (Marysville Junior High)
# of observations - 3648
['04/03/2023']
---
DARRINGTON - FIR ST (Darrington High School)
# of observations - 3614
['04/03/2023']
---
SEATTLE - SOUTH PARK #2
# of observations - 3577
['04/03/2023']
---
TACOMA - ALEXANDER AVE
# of observations - 3569
['04/02/2023']
---
KENT - JAMES & CENTRAL
# of observations - 3556
['04/03/2023']
---
SEATTLE - DUWAMISH
# of observations - 3439
['04/03/2023']
---
Seattle-10th & Weller
# of observations - 3097
['04/03/2023']
---
LAKE FOREST PARK TOWNE CENTER
# of observations - 2999
['04/03/2023']
---
PUYALLUP - 128TH ST
# of observations - 2700
['11/06/2020']
---
Tacoma-S 36th St
# of observations - 2574
['04/03/2023']
---
Bellevue-SE 12th St
# of observations - 2172
['04/03/2023']
---
LYNNWOOD - 212TH
# of obser

## Considering data quantity and the freshness of data for each site, I decided to cut off some sites:

In [180]:
sites_to_delete = sites[-6:]
sites_to_delete

Index(['Tulalip-Tulalip Tribe', 'PUYALLUP-66TH AVE E (PUYALLUP TRIBE)',
       'SEATTLE - OLIVE ST', 'Auburn M St SE',
       'ENUMCLAW - MUD MTN (Army Corp of Engineers site)',
       'ISSAQUAH -  LAKE SAMMAMISH (Wiithin Lake Sammamish State Park)'],
      dtype='object')

In [181]:
seattle_df = seattle_df[~seattle_df.site_name.isin(sites_to_delete)].reset_index(drop=True)

In [182]:
seattle_df = seattle_df.dropna()

In [183]:
seattle_df.shape

(53806, 5)

In [184]:
# Prefix every station name with the city so that this frame can later be
# concatenated with data from other cities.
# NOTE: re-running this cell adds the prefix a second time.
seattle_df["site_name"] = "Seattle - " + seattle_df["site_name"]

In [185]:
seattle_df.site_name.value_counts()

Seattle - NORTH BEND - NORTH BEND WAY                      3705
Seattle - TACOMA - L STREET                                3696
Seattle - SEATTLE - BEACON HILL                            3691
Seattle - MARYSVILLE - 7TH AVE (Marysville Junior High)    3648
Seattle - DARRINGTON - FIR ST (Darrington High School)     3614
Seattle - SEATTLE - SOUTH PARK #2                          3577
Seattle - TACOMA - ALEXANDER AVE                           3569
Seattle - KENT - JAMES & CENTRAL                           3556
Seattle - SEATTLE - DUWAMISH                               3439
Seattle - Seattle-10th & Weller                            3097
Seattle - LAKE FOREST PARK TOWNE CENTER                    2999
Seattle - PUYALLUP - 128TH ST                              2700
Seattle - Tacoma-S 36th St                                 2574
Seattle - Bellevue-SE 12th St                              2172
Seattle - LYNNWOOD - 212TH                                 2079
Seattle - Tukwila Allentown             

In [229]:
# The EPA export stores dates as MM/DD/YYYY strings (e.g. '04/02/2023' followed
# by '04/03/2023' in the rows displayed earlier). Passing the format explicitly
# avoids relying on pandas' day/month inference.
seattle_df["date"] = pd.to_datetime(seattle_df["date"], format="%m/%d/%Y")

In [230]:
seattle_df

Unnamed: 0,site_name,date,pm25,latitude,longitude
0,Seattle - NORTH BEND - NORTH BEND WAY,2013-01-01,4.7,47.490220,-121.772780
1,Seattle - NORTH BEND - NORTH BEND WAY,2013-01-02,2.8,47.490220,-121.772780
2,Seattle - NORTH BEND - NORTH BEND WAY,2013-01-03,3.2,47.490220,-121.772780
3,Seattle - NORTH BEND - NORTH BEND WAY,2013-01-04,4.3,47.490220,-121.772780
4,Seattle - NORTH BEND - NORTH BEND WAY,2013-01-05,5.3,47.490220,-121.772780
...,...,...,...,...,...
54526,Seattle - MARYSVILLE - 7TH AVE (Marysville Jun...,2023-03-30,7.9,48.054315,-122.171529
54527,Seattle - MARYSVILLE - 7TH AVE (Marysville Jun...,2023-03-31,3.7,48.054315,-122.171529
54528,Seattle - MARYSVILLE - 7TH AVE (Marysville Jun...,2023-04-01,3.4,48.054315,-122.171529
54529,Seattle - MARYSVILLE - 7TH AVE (Marysville Jun...,2023-04-02,3.1,48.054315,-122.171529


In [231]:
# Persist the cleaned per-station PM2.5 table (without the index column).
# NOTE(review): the file name says 2013_2022, but the loading loop reads
# range(2013, 2023 + 1), so the data also contains 2023 rows. Rename only
# after checking which other notebooks/scripts read this path.
seattle_df.to_csv("data/processed_seattle_pm25_2013_2022.csv", index=False)

# Weather data from open meteo

- Maximum Temperature (2 m)
- Minimum Temperature (2 m)
- Precipitation Sum
- Rain Sum
- Snowfall Sum
- Precipitation Hours
- Maximum Wind Speed (10 m)
- Maximum Wind Gusts (10 m)
- Dominant Wind Direction (10 m)


In [224]:
def convert_date_to_unix(x):
    """
    Convert a date/datetime to Unix time in milliseconds.

    Parameters
    ----------
    x : str | datetime.datetime | pandas.Timestamp
        Anything whose ``str()`` is an ISO-8601 date or datetime, e.g.
        ``"2023-03-23"``, ``"2023-03-23 00:00:00"`` or a ``Timestamp``.

    Returns
    -------
    int
        Milliseconds since the Unix epoch.

    Raises
    ------
    ValueError
        If ``str(x)`` is not an ISO-8601 date/datetime.

    Notes
    -----
    Values without timezone information are interpreted in the machine's local
    timezone (behaviour of ``datetime.timestamp()`` on naive datetimes).
    """
    # Always go through str() -> datetime.datetime so every input type takes
    # the same code path. fromisoformat accepts everything the former
    # '%Y-%m-%d %H:%M:%S' pattern did, plus date-only strings and fractional
    # seconds, which used to raise ValueError.
    dt_obj = datetime.datetime.fromisoformat(str(x))
    return int(dt_obj.timestamp() * 1000)

In [225]:
def get_city_coordinates(city_name: str):
    """
    Geocode a city name with Nominatim and return its coordinates.

    Parameters
    ----------
    city_name : str
        Free-text place name, e.g. ``"Seattle"``.

    Returns
    -------
    tuple[float, float]
        ``(latitude, longitude)``, each rounded to 2 decimal places.

    Raises
    ------
    ValueError
        If the geocoder finds no match for ``city_name``.
    """
    # geopy is only needed by this helper, so it is imported here.
    from geopy.geocoders import Nominatim

    # Initialize Nominatim API (for getting lat and long of the city)
    geolocator = Nominatim(user_agent="MyApp")
    city = geolocator.geocode(city_name)

    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.latitude`.
    if city is None:
        raise ValueError(f"Could not geocode city name: {city_name!r}")

    latitude = round(city.latitude, 2)
    longitude = round(city.longitude, 2)

    return latitude, longitude

In [226]:
get_city_coordinates("Seattle")

(47.6, -122.33)

In [204]:
def get_weather_data_from_open_meteo(city_name: str = None,
                                     coordinates: list = None,
                                     start_date: str = None,
                                     end_date: str = None,
                                     forecast: bool = False):
    """
    Fetch daily weather for one location from Open-Meteo as a pandas DataFrame.

    Parameters
    ----------
    city_name : str, optional
        Label written to the ``city_name`` column. When ``coordinates`` is not
        given it is also geocoded with ``get_city_coordinates``.
    coordinates : list, optional
        ``[latitude, longitude]``. Takes precedence over ``city_name`` for
        locating the point.
    start_date, end_date : str
        Date range, passed to the API unchanged (the notebook uses
        ``"YYYY-MM-DD"`` strings).
    forecast : bool, default False
        If True, query the forecast endpoint; otherwise the archive endpoint.

    Returns
    -------
    pandas.DataFrame
        One row per day with columns ``city_name``, ``date`` (datetime64),
        ``temperature_max``, ``temperature_min``, ``precipitation_sum``,
        ``rain_sum``, ``snowfall_sum``, ``precipitation_hours``,
        ``wind_speed_max``, ``wind_gusts_max``, ``wind_direction_dominant``.

    Raises
    ------
    ValueError
        If neither ``coordinates`` nor ``city_name`` is provided.
    RuntimeError
        If the API response does not contain a ``"daily"`` section.
    """
    if coordinates is not None and len(coordinates) > 0:
        latitude, longitude = coordinates
    elif city_name:
        latitude, longitude = get_city_coordinates(city_name=city_name)
    else:
        # Previously this fell through and failed later with an unbound-variable error.
        raise ValueError("Either `coordinates` or `city_name` must be provided.")

    params = {
        'latitude': latitude,
        'longitude': longitude,
        'daily': ["temperature_2m_max", "temperature_2m_min",
                  "precipitation_sum", "rain_sum", "snowfall_sum",
                  "precipitation_hours", "windspeed_10m_max",
                  "windgusts_10m_max", "winddirection_10m_dominant"],
        'start_date': start_date,
        'end_date': end_date,
        # NOTE(review): the timezone is fixed to Europe/London for every
        # location, including US stations - confirm this is intended.
        'timezone': "Europe/London"
    }

    if forecast:
        # historical forecast endpoint
        base_url = 'https://api.open-meteo.com/v1/forecast'
    else:
        # historical observations endpoint
        base_url = 'https://archive-api.open-meteo.com/v1/archive'

    response = requests.get(base_url, params=params)
    response_json = response.json()

    # Surface the API's own explanation instead of a bare KeyError: 'daily'.
    if "daily" not in response_json:
        raise RuntimeError(
            f"Open-Meteo request failed (HTTP {response.status_code}): "
            f"{response_json.get('reason', response_json)}"
        )

    res_df = pd.DataFrame(response_json["daily"])
    res_df["city_name"] = city_name

    # rename columns
    res_df = res_df.rename(columns={
        "time": "date",
        "temperature_2m_max": "temperature_max",
        "temperature_2m_min": "temperature_min",
        "windspeed_10m_max": "wind_speed_max",
        "winddirection_10m_dominant": "wind_direction_dominant",
        "windgusts_10m_max": "wind_gusts_max"
    })

    # convert the 'date' column before sub-setting, so no column is ever
    # assigned on a frame derived by column selection
    res_df["date"] = pd.to_datetime(res_df["date"])

    # change columns order
    return res_df[
        ['city_name', 'date', 'temperature_max', 'temperature_min',
         'precipitation_sum', 'rain_sum', 'snowfall_sum',
         'precipitation_hours', 'wind_speed_max',
         'wind_gusts_max', 'wind_direction_dominant']
    ]

In [209]:
get_weather_data_from_open_meteo(city_name="Seattle", start_date="2023-01-01", end_date="2023-01-01")

Unnamed: 0,city_name,date,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
0,Seattle,2023-01-01,8.6,2.7,1.8,1.8,0.0,8.0,7.1,23.0,155


In [210]:
get_weather_data_from_open_meteo(city_name="Seattle", start_date="2023-01-01", end_date="2023-01-01", forecast=1)

Unnamed: 0,city_name,date,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
0,Seattle,2023-01-01,8.6,4.5,0.0,0.0,0.0,0.0,8.1,15.1,99


In [213]:
row = seattle_df.sample(1)
row

Unnamed: 0,site_name,date,pm25,latitude,longitude
36384,Seattle - LAKE FOREST PARK TOWNE CENTER,08/16/2020,11.8,47.755,-122.2806


In [221]:
site_name = row.site_name.values[0]
coordinates = list(row[['latitude', 'longitude']].values[0])

In [222]:
site_name, coordinates

('Seattle - LAKE FOREST PARK TOWNE CENTER', [47.755, -122.2806])

In [223]:
get_weather_data_from_open_meteo(city_name=site_name, coordinates=coordinates, start_date="2023-01-01", end_date="2023-01-01")

Unnamed: 0,city_name,date,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
0,Seattle - LAKE FOREST PARK TOWNE CENTER,2023-01-01,8.1,3.9,2.2,2.2,0.0,10.0,7.8,14.8,140


In [55]:
date_today = datetime.datetime.now().strftime("%Y-%m-%d")

In [56]:
date_today

'2023-04-04'

In [101]:
# Pass the dates by keyword: the second positional parameter of
# get_weather_data_from_open_meteo is `coordinates`, not `start_date`.
df_w1 = get_weather_data_from_open_meteo(city_name="Krakow", start_date=date_today,
                                         end_date=date_today, forecast=True)

Parsed weather for Krakow since 2023-03-23 till 2023-03-23.
Took 0.86 sec.



In [101]:
# Pass the dates by keyword: the second positional parameter of
# get_weather_data_from_open_meteo is `coordinates`, not `start_date`.
df_w1 = get_weather_data_from_open_meteo(city_name="Krakow", start_date=date_today,
                                         end_date=date_today, forecast=True)

Parsed weather for Krakow since 2023-03-23 till 2023-03-23.
Took 0.86 sec.



In [102]:
df_w1

Unnamed: 0,city_name,time,temperature_2m_max,temperature_2m_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,unix_time
0,Krakow,2023-03-23,20.0,7.7,0.0,0.0,0.0,0.0,23.5,53.3,250,1679526000000


In [12]:
df_w2

Unnamed: 0,city_name,time,temperature_2m_max,temperature_2m_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,windspeed_10m_max,windgusts_10m_max,winddirection_10m_dominant,unix_time
0,Seattle,2022-01-01,0.7,-10.3,0.0,0.0,0.0,0.0,9.8,20.2,322,1640991600000


In [13]:
df_w1.shape, df_w2.shape

((1, 12), (1, 12))

In [14]:
def convert_to_daily(df, pollutant: str):
    """
    Aggregate a pollutant column to one rounded value per calendar day.

    The input must have a 'time' column and a `pollutant` column. The input
    frame is not modified. Days are averaged; days left without a value are
    filled with the median of the daily means, and the result is rounded to
    zero decimals. Returns a frame with columns 'time' and `pollutant`.
    """
    # Parse timestamps on a derived frame so the caller's data stays untouched.
    timestamped = df.assign(time=pd.to_datetime(df["time"]))

    # One row per day: mean of all readings that fall on that day.
    daily = (
        timestamped
        .set_index('time')[pollutant]
        .resample('1d')
        .mean()
        .reset_index()
    )

    # Fill gaps with the median daily value, then round each value.
    gap_filler = daily[pollutant].median()
    filled = daily[pollutant].fillna(gap_filler)
    daily[pollutant] = filled.map(lambda value: round(value, 0))

    return daily

In [15]:
def get_air_quality_from_open_meteo(city_name: str,
                                    pollutant: str,
                                    start_date: str = None,
                                    end_date: str = None):
    """
    Fetch hourly air-quality data from Open-Meteo and return it aggregated per day.

    Parameters
    ----------
    city_name : str
        City to geocode with ``get_city_coordinates``; also written to the
        ``city_name`` column.
    pollutant : str
        Open-Meteo hourly variable name (case-insensitive). ``"no2"`` is
        accepted as an alias for ``"nitrogen_dioxide"``.
    start_date, end_date : str
        Date range, passed to the API unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``city_name``, ``time``, the pollutant (named ``no2`` for
        nitrogen dioxide) and ``unix_time`` (milliseconds), one row per day as
        produced by ``convert_to_daily``.

    Raises
    ------
    RuntimeError
        If the API response does not contain an ``"hourly"`` section.

    Notes
    -----
    Original author's note: the API has data STARTING FROM '2022-07-29'.
    """
    latitude, longitude = get_city_coordinates(city_name=city_name)
    pollutant = pollutant.lower()
    # make it work with both "no2" and "nitrogen_dioxide" passed.
    if pollutant == "no2":
        pollutant = "nitrogen_dioxide"

    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': pollutant,
        'start_date': start_date,
        'end_date': end_date
    }

    # observations endpoint
    base_url = 'https://air-quality-api.open-meteo.com/v1/air-quality'

    response = requests.get(base_url, params=params)
    response_json = response.json()

    # Surface the API's own explanation instead of a bare KeyError: 'hourly'.
    if "hourly" not in response_json:
        raise RuntimeError(
            f"Open-Meteo air-quality request failed (HTTP {response.status_code}): "
            f"{response_json.get('reason', response_json)}"
        )

    res_df = convert_to_daily(pd.DataFrame(response_json["hourly"]), pollutant)

    # Add all derived columns first and select/reorder at the very end, so no
    # column is assigned on a frame derived by column selection.
    res_df["city_name"] = city_name
    res_df["unix_time"] = res_df["time"].apply(convert_date_to_unix)

    value_column = pollutant
    if pollutant == "nitrogen_dioxide":
        value_column = "no2"
        res_df = res_df.rename(columns={pollutant: value_column})

    return res_df[["city_name", "time", value_column, "unix_time"]]


In [105]:
df_aq = get_air_quality_from_open_meteo("Krakow", "no2", date_today, date_today)

In [106]:
df_aq.head(3)

Unnamed: 0,city_name,time,no2,unix_time
0,Krakow,2023-03-23,13.0,1679526000000


# [EEA](https://discomap.eea.europa.eu/map/fme/AirQualityExport.htm)
## EEA stands for the European Environment Agency

In [76]:
def find_fullest_csv(csv_links: list, year: str):
    """
    Load the CSV with the most rows among the links that mention ``year``.

    Parameters
    ----------
    csv_links : list
        Paths/URLs readable by ``pandas.read_csv``.
    year : str | int
        Year to look for; a link is a candidate when ``str(year)`` occurs
        anywhere in it (plain substring match).

    Returns
    -------
    pandas.DataFrame
        The candidate with the most rows; on a tie the earliest link wins.

    Raises
    ------
    ValueError
        If no link contains ``year``.
    """
    year_token = str(year)
    candidates = [link for link in csv_links if year_token in link]
    if not candidates:
        # Previously this surfaced as an IndexError on candidates[0].
        raise ValueError(f"No CSV link found for year {year_token}.")

    # Keep only the current best frame in memory while scanning.
    biggest_df = None
    for link in candidates:
        candidate_df = pd.read_csv(link)
        if biggest_df is None or len(candidate_df) > len(biggest_df):
            biggest_df = candidate_df
    return biggest_df

In [77]:
def get_air_quality_from_eea(city_name: str,
                             pollutant: str,
                             start_year: str = None,
                             end_year: str = None):
    """
    Download air-quality data for a city from the EEA export service and return it per day.

    EEA stands for the European Environment Agency (original author's note: it
    has data for European Union countries ONLY).

    Parameters
    ----------
    city_name : str
        Sent as ``CityName`` to the service and written to the ``city_name`` column.
    pollutant : str
        Pollutant code; sent upper-cased, used lower-cased as the column name.
    start_year, end_year : str
        First and last year (inclusive), e.g. ``"2014"``, ``"2022"``. Data is
        processed in 1-year batches, so give years, not dates. Both are required.

    Returns
    -------
    pandas.DataFrame
        Columns ``city_name``, ``time``, ``<pollutant>`` and ``unix_time``
        (milliseconds), one row per day as produced by ``convert_to_daily``.

    Raises
    ------
    ValueError
        If a year is missing, if ``start_year > end_year``, or if no CSV link
        exists for one of the requested years.
    """
    import time
    start_of_cell = time.time()

    if start_year is None or end_year is None:
        raise ValueError("Both `start_year` and `end_year` are required (e.g. \"2014\", \"2022\").")
    first_year, last_year = int(start_year), int(end_year)
    if first_year > last_year:
        raise ValueError(f"start_year ({start_year}) must not be after end_year ({end_year}).")

    params = {
        'CountryCode': '',
        'CityName': city_name,
        'Pollutant': pollutant.upper(),
        'Year_from': start_year,
        'Year_to': end_year,
        'Station': '',
        'Source': 'All',
        'Samplingpoint': '',
        'Output': 'TEXT',
        'UpdateDate': '',
        'TimeCoverage': "Year"
    }

    # observations endpoint
    base_url = "https://fme.discomap.eea.europa.eu/fmedatastreaming/AirQualityDownload/AQData_Extract.fmw?"

    response = requests.get(base_url, params=params)

    # The service answers with a plain-text list of CSV links, one per line.
    response.encoding = response.apparent_encoding
    csv_links = response.text.split("\r\n")

    # For every year take the CSV with the most observations, then stack them
    # with a single concat instead of growing a frame inside the loop.
    yearly_frames = [
        find_fullest_csv(csv_links, year)
        for year in range(first_year, last_year + 1)
    ]
    res_df = pd.concat(yearly_frames)

    pollutant = pollutant.lower()
    res_df = res_df.rename(columns={
        'DatetimeBegin': 'time',
        'Concentration': pollutant
    })

    # cut timezones info (drops the last 6 characters of each timestamp string)
    res_df['time'] = res_df['time'].str[:-6]
    # convert dates in 'time' column
    res_df['time'] = pd.to_datetime(res_df['time'])

    res_df = convert_to_daily(res_df, pollutant)

    res_df['city_name'] = city_name

    # create 'unix' column
    res_df['unix_time'] = res_df['time'].apply(convert_date_to_unix)
    res_df = res_df[['city_name', 'time', pollutant, 'unix_time']]

    end_of_cell = time.time()

    print(f"Processed {pollutant.upper()} for {city_name} since {start_year} till {end_year}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df

In [117]:
import unicodedata

def remove_diacritics(city_name):
    """
    Return `city_name` without diacritical marks (e.g. "Kraków" -> "Krakow").

    The text is decomposed with NFD normalization and every character whose
    Unicode category is 'Mn' is dropped.
    """
    decomposed = unicodedata.normalize('NFD', city_name)
    kept_chars = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)


In [118]:
remove_diacritics("Kraków")

'Krakow'

In [115]:
df_eea = get_air_quality_from_eea(
    city_name="Kraków", pollutant="PM10",
    start_year="2013", end_year="2014"
)

Processed PM10 for Kraków since 2013 till 2014.
Took 8.43 sec.



In [116]:
df_eea

Unnamed: 0,city_name,time,pm10,unix_time
0,Kraków,2013-01-01,214.0,1356994800000
1,Kraków,2013-01-02,85.0,1357081200000
2,Kraków,2013-01-03,31.0,1357167600000
3,Kraków,2013-01-04,17.0,1357254000000
4,Kraków,2013-01-05,18.0,1357340400000
...,...,...,...,...
725,Kraków,2014-12-27,65.0,1419634800000
726,Kraków,2014-12-28,52.0,1419721200000
727,Kraków,2014-12-29,57.0,1419807600000
728,Kraków,2014-12-30,95.0,1419894000000


# [USEPA](https://aqs.epa.gov/aqsweb/documents/data_api.html#daily)
## USEPA means United States Environmental Protection Agency
[Manual downloading](https://www.epa.gov/outdoor-air-quality-data/download-daily-data)

In [41]:
# Cache of CBSA full name -> CBSA code, filled on the first get_city_code call.
city_code_dict = {}
# Pollutant name -> AQS parameter code used in USEPA API requests.
pollutant_dict = {
    'CO': '42101',
    'SO2': '42401',
    'NO2': '42602',
    'O3': '44201',
    'PM10': '81102',
    'PM2.5': '88101'
}

def get_city_code(city_name: str):
    """
    Return the USEPA CBSA code of the first area whose name contains ``city_name``.

    The list of areas is downloaded once from the AQS API and cached in the
    module-level ``city_code_dict``; later calls only consult the cache.

    Parameters
    ----------
    city_name : str
        Case-sensitive substring of the CBSA name, e.g. ``"Seattle"``.

    Returns
    -------
    str
        The CBSA code, e.g. ``'42660'``.

    Raises
    ------
    RuntimeError
        If the API returns no areas at all.
    ValueError
        If no cached area name contains ``city_name``.
    """
    if not city_code_dict:
        params = {
            "email": "test@aqs.api",
            "key": "test"
        }
        response = requests.get("https://aqs.epa.gov/data/api/list/cbsas?", params)
        response_json = response.json()
        for item in response_json["Data"]:
            city_code_dict[item['value_represented']] = item['code']

        # An empty payload used to trigger unbounded recursion (the function
        # called itself again while the cache was still empty).
        if not city_code_dict:
            raise RuntimeError("USEPA API returned no CBSA entries.")

    matching_codes = [
        code for full_name, code in city_code_dict.items() if city_name in full_name
    ]
    if not matching_codes:
        # Previously this surfaced as an IndexError on an empty list.
        raise ValueError(f"No CBSA found whose name contains {city_name!r}.")
    return matching_codes[0]

In [42]:
get_city_code("Seattle")

'42660'

In [20]:
import datetime

def make_date_intervals(start_date, end_date):
    """
    Split a date range into per-calendar-year sub-ranges.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive bounds in ``'YYYY-MM-DD'`` format.

    Returns
    -------
    list[tuple[str, str]]
        ``(interval_start, interval_end)`` pairs formatted as ``'YYYYMMDD'``,
        one per calendar year touched by the range, in chronological order.
        Empty when ``start_date`` is after ``end_date``.
    """
    start_dt = datetime.datetime.strptime(start_date, '%Y-%m-%d')
    end_dt = datetime.datetime.strptime(end_date, '%Y-%m-%d')
    date_intervals = []
    for year in range(start_dt.year, end_dt.year + 1):
        # Clip the calendar year to the requested range.
        interval_start = max(start_dt, datetime.datetime(year, 1, 1))
        interval_end = min(end_dt, datetime.datetime(year, 12, 31))
        # `<=` (not `<`): a one-day interval is valid, e.g. when start_date ==
        # end_date or when the range ends on January 1st.
        if interval_start <= interval_end:
            date_intervals.append((interval_start.strftime('%Y%m%d'), interval_end.strftime('%Y%m%d')))
    return date_intervals


In [94]:
import time


def get_air_quality_from_usepa(city_name: str,
                               pollutant: str,
                               start_date: str = None,
                               end_date: str = None):
    """
    Download daily air-quality data for a US city from the USEPA AQS API.

    USEPA means United States Environmental Protection Agency (original
    author's note: it has data for the US ONLY).

    Parameters
    ----------
    city_name : str
        Resolved to a CBSA code with ``get_city_code``; also written to the
        ``city_name`` column.
    pollutant : str
        Key of ``pollutant_dict`` (case-insensitive), e.g. ``"NO2"``.
    start_date, end_date : str
        Inclusive range in ``'YYYY-MM-DD'`` format. It is split with
        ``make_date_intervals`` and one request is sent per sub-range.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``city_name`` and ``<pollutant>`` (lower-case): the
        mean of ``arithmetic_mean`` over all records sharing the same date and
        city. Unlike the other loaders in this notebook there is no
        ``unix_time`` column. Empty (with these columns) when the API returned
        no records.
    """
    start_of_cell = time.time()

    pollutant = pollutant.lower()

    # Both lookups are the same for every interval, so do them once.
    param_code = pollutant_dict[pollutant.upper()]  # encoded pollutant
    cbsa_code = get_city_code(city_name)  # Core-based statistical area

    # observations endpoint
    base_url = "https://aqs.epa.gov/data/api/dailyData/byCBSA?"

    interval_frames = []
    # to print the status log only once.
    status_logged = False
    for start_date_, end_date_ in make_date_intervals(start_date, end_date):
        params = {
            "email": "test@aqs.api",
            "key": "test",
            "param": param_code,
            "bdate": start_date_,
            "edate": end_date_,
            "cbsa": cbsa_code
        }

        response = requests.get(base_url, params=params)
        response_json = response.json()
        if not status_logged:
            print(response_json["Header"][0]["status"])
            status_logged = True

        records = response_json["Data"]
        if not records:
            # Nothing for this interval; an empty frame has no 'date_local'
            # column and would fail below.
            continue

        df_ = pd.DataFrame(records).rename(columns={
            'date_local': 'time',
            'arithmetic_mean': pollutant
        })

        # convert dates in 'time' column
        df_['time'] = pd.to_datetime(df_['time'])
        df_['city_name'] = city_name

        # A 'unix_time' column used to be computed here, but the aggregation
        # below keeps only time, city_name and the pollutant, so it was discarded.
        interval_frames.append(df_[['city_name', 'time', pollutant]])

    if interval_frames:
        # there are several records for the same day (and city). average them.
        res_df = (
            pd.concat(interval_frames)
            .groupby(['time', 'city_name'], as_index=False)[pollutant]
            .mean()
        )
    else:
        res_df = pd.DataFrame(columns=['time', 'city_name', pollutant])

    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

    return res_df

In [96]:
df = get_air_quality_from_usepa(city_name="Chicago", pollutant="NO2",
                                start_date="2018-01-01", end_date="2018-10-01")

Success
Processed NO2 for Chicago since 2018-01-01 till 2018-10-01.
Took 1.39 sec.



In [86]:
df2 = get_air_quality_from_eea(city_name="Berlin", pollutant="PM10",
                               start_year="2018", end_year="2022")

Processed PM10 for Berlin since 2018 till 2022.
Took 40.58 sec.



In [100]:
df2

Unnamed: 0,city_name,time,pm10,unix_time
0,Berlin,2018-01-01,14.0,1514761200000
1,Berlin,2018-01-02,11.0,1514847600000
2,Berlin,2018-01-03,7.0,1514934000000
3,Berlin,2018-01-04,9.0,1515020400000
4,Berlin,2018-01-05,9.0,1515106800000
...,...,...,...,...
1820,Berlin,2022-12-26,8.0,1672009200000
1821,Berlin,2022-12-27,10.0,1672095600000
1822,Berlin,2022-12-28,9.0,1672182000000
1823,Berlin,2022-12-29,6.0,1672268400000


# I want different stations across Seattle, WA

## Candidate API: [OpenAQ API](https://docs.openaq.org/) (note: the cells below actually query the WAQI API at `api.waqi.info`, not OpenAQ)

In [130]:
import requests

# set the API endpoint URL
url = "https://api.waqi.info/map/bounds/"

# set the parameters for the API request
parameters = {
    "latlng": "47.4962,-122.3182,47.7341,-122.1096",
    "inc": "placeholders",
    # Read the token from the environment instead of committing it to the
    # notebook: set WAQI_API_TOKEN before running this cell.
    "token": os.environ["WAQI_API_TOKEN"]
}

# make the API request and retrieve the response
response = requests.get(url, params=parameters)

# Fail loudly on an HTTP error; printing a message and carrying on left `data`
# undefined (or holding a stale value from an earlier run).
response.raise_for_status()

# retrieve the data from the response
data = response.json()["data"]


In [131]:
data

[{'lat': 47.5682,
  'lon': -122.3086,
  'uid': 140,
  'aqi': '28',
  'station': {'name': 'Beacon Hill, Seattle, Washington, USA',
   'time': '2023-03-24T23:00:00+09:00'}},
 {'lat': 47.6171136,
  'lon': -122.3044689,
  'uid': 142,
  'aqi': '9',
  'station': {'name': 'Olive St, Seattle, Washington, USA',
   'time': '2023-03-24T22:00:00+09:00'}},
 {'lat': 47.600863,
  'lon': -122.148397,
  'uid': 9284,
  'aqi': '4',
  'station': {'name': 'SE 12th St, Bellevue, Washington, USA',
   'time': '2023-03-24T23:00:00+09:00'}},
 {'lat': 47.498535,
  'lon': -122.278385,
  'uid': 9508,
  'aqi': '17',
  'station': {'name': 'Tukwilla Allentown, Washington, USA',
   'time': '2023-03-24T22:00:00+09:00'}}]

In [163]:
import requests
import json

def get_historical_air_quality(city_name, pollutant, start_date, end_date):
    """
    Query the Weatherbit ``history/daily`` endpoint for a city and date range.

    Parameters
    ----------
    city_name : str
        Sent as the ``city`` query parameter.
    pollutant : str
        Currently unused; kept so existing calls keep working.
    start_date, end_date : str
        Sent unchanged as ``start_date`` / ``end_date``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``"data"`` list.

    Raises
    ------
    RuntimeError
        If the ``WEATHERBIT_API_KEY`` environment variable is not set, or if
        the response has no ``"data"`` section.

    Notes
    -----
    NOTE(review): despite the function name, the frame displayed further down
    in the notebook holds weather columns (clouds, temp, wind_spd, ...) -
    confirm this is the endpoint you want for air quality.
    """
    # The key must come from the environment; never commit it to the notebook.
    api_key = os.environ.get("WEATHERBIT_API_KEY")
    if not api_key:
        raise RuntimeError("Set the WEATHERBIT_API_KEY environment variable first.")

    # Let requests build and escape the query string; use HTTPS so the key is
    # not sent in clear text.
    params = {
        "city": city_name,
        "start_date": start_date,
        "end_date": end_date,
        "key": api_key,
    }
    response = requests.get("https://api.weatherbit.io/v2.0/history/daily", params=params)
    response_json = response.json()

    # Show what the API actually answered instead of a bare KeyError: 'data'.
    if "data" not in response_json:
        raise RuntimeError(
            f"Weatherbit request failed (HTTP {response.status_code}): {response_json}"
        )
    return pd.DataFrame(response_json["data"])

In [166]:
df = get_historical_air_quality("Seattle", "pm10", "2016-04-01", "2017-05-01")

KeyError: 'data'

In [165]:
df

Unnamed: 0,clouds,datetime,dewpt,dhi,dni,ghi,max_dhi,max_dni,max_ghi,max_temp,...,solar_rad,t_dhi,t_dni,t_ghi,t_solar_rad,temp,ts,wind_dir,wind_gust_spd,wind_spd
0,90,2017-04-01,8.1,45.5,371.7,239.1,113.3,886.6,757.1,13.1,...,63.1,1092.1,8921.4,5738.0,1515.3,10.9,1491030000,180,3.9,2.0
1,64,2017-04-02,4.6,45.8,374.2,241.7,113.5,887.7,762.2,12.5,...,206.5,1099.7,8981.1,5799.9,4955.1,8.9,1491116400,198,4.5,1.8
2,60,2017-04-03,3.2,46.1,376.6,244.2,113.8,888.7,767.2,11.1,...,189.0,1107.0,9039.4,5861.5,4535.8,8.5,1491202800,243,5.8,2.2
3,100,2017-04-04,4.1,46.4,379.0,246.8,114.0,889.8,772.2,15.7,...,62.2,1114.1,9096.0,5922.8,1493.5,9.9,1491289200,231,7.8,1.7
4,100,2017-04-05,9.0,46.7,381.3,249.3,114.3,890.8,777.1,12.2,...,29.2,1121.1,9150.8,5983.7,700.6,10.5,1491375600,180,4.5,2.1
5,92,2017-04-06,8.7,47.0,383.5,251.8,114.5,891.8,781.9,13.9,...,108.1,1127.8,9203.7,6044.2,2595.2,11.4,1491462000,254,5.1,2.5
6,98,2017-04-07,6.8,47.3,385.6,254.3,114.8,892.7,786.7,14.8,...,74.1,1134.3,9254.9,6104.2,1778.0,10.8,1491548400,193,12.2,4.4
7,72,2017-04-08,4.4,47.5,387.7,256.8,115.0,893.7,791.4,10.6,...,132.6,1140.7,9304.4,6163.8,3182.1,8.4,1491634800,185,9.9,3.8
8,69,2017-04-09,2.2,47.8,389.7,259.3,115.2,894.6,796.0,11.7,...,85.1,1146.9,9352.1,6222.9,2042.5,8.9,1491721200,199,5.8,2.1
9,69,2017-04-10,5.5,48.0,391.6,261.7,115.4,895.4,800.6,10.9,...,187.4,1152.9,9398.1,6281.4,4498.6,8.8,1491807600,190,5.1,3.1


In [141]:
data = get_air_quality_from_waqi()

In [142]:
data

{'aqi': 9,
 'idx': 5855,
 'attributions': [{'url': 'http://www.ecy.wa.gov/',
   'name': 'Washington State Department of Ecology',
   'logo': 'US-Washignton-State-Department-of-Ecology.png'},
  {'url': 'http://www.airnow.gov/', 'name': 'Air Now - US EPA'},
  {'url': 'https://waqi.info/', 'name': 'World Air Quality Index Project'}],
 'city': {'geo': [47.597222, -122.319722],
  'name': '10th and Weller, Seattle, Washington, USA',
  'url': 'https://aqicn.org/city/usa/washington/seattle/10th-and-weller',
  'location': ''},
 'dominentpol': 'pm25',
 'iaqi': {'h': {'v': 85.2},
  'no2': {'v': 15.7},
  'p': {'v': 1018.4},
  'pm25': {'v': 9},
  't': {'v': 3.8},
  'w': {'v': 4.5},
  'wg': {'v': 8.7}},
 'time': {'s': '2023-03-24 06:00:00',
  'tz': '-07:00',
  'v': 1679637600,
  'iso': '2023-03-24T06:00:00-07:00'},
 'forecast': {'daily': {'o3': [{'avg': 4,
     'day': '2023-03-22',
     'max': 18,
     'min': 1},
    {'avg': 15, 'day': '2023-03-23', 'max': 18, 'min': 10},
    {'avg': 10, 'day': '202

# Put everything together

In [None]:
city_names = {
    "EU": [
        ""        
    ],
    "USA": [
        
    ]
}

## 1. Weather data

# Data engineering

In [None]:
# Data engineering

def moving_average(df, window=7):
    """
    Add ``mean_<window>_days``: per-station rolling mean of ``users_count`` over
    the ``window`` rows preceding each row (the current row is excluded).

    ``df`` needs ``station_id`` and ``users_count`` columns. It is modified in
    place and also returned.
    """
    # Roll and shift inside each station. Shifting after the groups had been
    # flattened let the first row of a station receive the previous station's
    # last value.
    df[f'mean_{window}_days'] = df.groupby('station_id')['users_count'].transform(
        lambda counts: counts.rolling(window=window).mean().shift(1)
    )
    return df

# def moving_average(df, window=7):
#     df[f'mean_{window}_days'] = df["users_count"].rolling(window=window).mean()
#     return df


def moving_std(df, window):
    """
    Add ``std_<window>_days``: per-station rolling standard deviation of
    ``users_count`` over the ``window`` rows preceding each row (the current
    row is excluded).

    ``df`` needs ``station_id`` and ``users_count`` columns. It is modified in
    place and also returned.
    """
    # Roll and shift inside each station so no value crosses from one station
    # into the first row of the next.
    df[f'std_{window}_days'] = df.groupby('station_id')['users_count'].transform(
        lambda counts: counts.rolling(window=window).std().shift(1)
    )
    return df


def exponential_moving_average(df, window):
    """
    Add ``exp_mean_<window>_days``: per-station exponentially weighted mean of
    ``users_count`` (``span=window``) over the rows preceding each row (the
    current row is excluded).

    ``df`` needs ``station_id`` and ``users_count`` columns. It is modified in
    place and also returned.
    """
    # Compute and shift inside each station so no value crosses from one
    # station into the first row of the next.
    df[f'exp_mean_{window}_days'] = df.groupby('station_id')['users_count'].transform(
        lambda counts: counts.ewm(span=window).mean().shift(1)
    )
    return df


def exponential_moving_std(df, window):
    """
    Add ``exp_std_<window>_days``: per-station exponentially weighted standard
    deviation of ``users_count`` (``span=window``) over the rows preceding each
    row (the current row is excluded).

    ``df`` needs ``station_id`` and ``users_count`` columns. It is modified in
    place and also returned.
    """
    # Compute and shift inside each station so no value crosses from one
    # station into the first row of the next.
    df[f'exp_std_{window}_days'] = df.groupby('station_id')['users_count'].transform(
        lambda counts: counts.ewm(span=window).std().shift(1)
    )
    return df


def engineer_citibike_features(df):
    """
    Build lag and rolling features of ``users_count`` per station.

    The input (columns ``date``, ``station_id``, ``users_count``) is not
    modified. Rows sharing a date and station are summed first; then the
    previous count and the 7/14-row mean, std, exponential mean and exponential
    std columns are added. Rows with any missing value are dropped and the
    result is sorted by date and station.
    """
    # several records may exist for the same day and station: collapse them.
    daily = df.copy().groupby(['date', 'station_id'], as_index=False)['users_count'].sum()

    # previous observation of the same station; the first row per station has none.
    daily['prev_users_count'] = daily.groupby('station_id')['users_count'].shift(+1)
    daily = daily.dropna()

    windows = (7, 14)
    for window in windows:
        daily = moving_average(daily, window)

    remaining_features = (moving_std, exponential_moving_average, exponential_moving_std)
    for window in windows:
        for add_feature in remaining_features:
            daily = add_feature(daily, window)

    daily = daily.reset_index(drop=True)
    ordered = daily.sort_values(by=["date", "station_id"])
    return ordered.dropna()

In [2]:

import pandas as pd

In [3]:
df = pd.read_csv("seattle_data_2022.csv")

In [53]:
import folium

# Create a folium map centered on the first location in the list.
# The map object is named `station_map` so it no longer shadows the builtin `map`.
# NOTE(review): `locations` and `stations_df` are not defined in any cell shown
# in this notebook - define them above before running this cell.
station_map = folium.Map(location=locations[0], zoom_start=14)

# Add markers for each location to the map
for station in stations_df.itertuples(index=False):
    folium.Marker(location=[station.latitude, station.longitude]).add_to(station_map)

# Save the map to an HTML file
station_map.save("map.html")
