In [2]:
import os
import datetime
import time
import requests
import pandas as pd
import json
import ast

from geopy.geocoders import Nominatim

In [3]:
def convert_date_to_unix(x):
    """
    Convert datetime to unix time in milliseconds.
    """
    dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    dt_obj = int(dt_obj.timestamp() * 1000)
    return dt_obj

In [4]:
def get_city_coordinates(city_name: str):
    """
    Takes city name and returns its latitude and longitude (rounded to 2 digits after dot).
    """ 
    # Initialize Nominatim API (for getting lat and long of the city)
    geolocator = Nominatim(user_agent="MyApp")
    city = geolocator.geocode(city_name)

    latitude = round(city.latitude, 2)
    longitude = round(city.longitude, 2)
    
    return latitude, longitude

# Representing the Target cities 

In [32]:
with open('target_cities.json') as json_file:
    target_cities = json.load(json_file)

In [33]:
for i in target_cities:
    print(i)
    print(len(target_cities[i]))

EU
17
US
13
Seattle
15


In [34]:
from pprint import pprint

pprint(target_cities)

{'EU': ['Amsterdam',
        'Athina',
        'Berlin',
        'Gdansk',
        'Kraków',
        'London',
        'Madrid',
        'Marseille',
        'Milano',
        'München',
        'Napoli',
        'Paris',
        'Sevilla',
        'Stockholm',
        'Tallinn',
        'Varna',
        'Wien'],
 'Seattle': {'Seattle - Bellevue-SE 12th St': [47.60086, -122.1484],
             'Seattle - DARRINGTON - FIR ST (Darrington High School)': [48.2469,
                                                                        -121.6031],
             'Seattle - KENT - JAMES & CENTRAL': [47.38611, -122.23028],
             'Seattle - LAKE FOREST PARK TOWNE CENTER': [47.755, -122.2806],
             'Seattle - MARYSVILLE - 7TH AVE (Marysville Junior High)': [48.05432,
                                                                         -122.17153],
             'Seattle - NORTH BEND - NORTH BEND WAY': [47.49022, -121.77278],
             'Seattle - SEATTLE - BEACON HILL': [47.56

In [135]:
# with open("target_cities.json", "w") as json_file:
#     json.dump(target_cities, json_file)

### ALL target cities on the one map

In [139]:
# Create a folium map centered on the first location in the list
map = folium.Map(location=[42.57, -44.092], zoom_start=3)

for city in target_cities["EU"]:
    latitude, longitude = get_city_coordinates(city)
    folium.Marker(location=[latitude, longitude]).add_to(map)
    
for city in target_cities["US"]:
    latitude, longitude = get_city_coordinates(city)
    folium.Marker(location=[latitude, longitude]).add_to(map)

for city in target_cities["Seattle"]:
    latitude, longitude = target_cities["Seattle"][city]
    folium.Marker(location=[latitude, longitude]).add_to(map)

# Save the map to an HTML file
map.save("map_all_target_cities.html")

# Air Quality [Open Meteo](https://open-meteo.com/en/docs/air-quality-api)

### I will be using this functionality to fill gaps / parse recent data for all my target cities (locations).

#### Note that Open Meteo gives data from `2022-08-04` till `(today + 6 days)` time period.

In [36]:
def get_aqi_data_from_open_meteo(city_name: str,
                                 start_date: str,
                                 end_date: str,
                                 coordinates: list = None,
                                 pollutant: str = "pm2_5"):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with AQI data.
    
    Examples of arguments:
        ...
        coordinates=(47.755, -122.2806),
        start_date="2023-01-01",
        pollutant="no2"
        ...
    """
    start_of_cell = time.time()
    
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)
    
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"
    
    # make it work with both "no2" and "nitrogen_dioxide" passed.
    if pollutant == "no2":
        pollutant = "nitrogen_dioxide"
        
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': [pollutant],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }
    
    # base endpoint
    base_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
        
    response = requests.get(base_url, params=params)

    response_json = response.json()    
    res_df = pd.DataFrame(response_json["hourly"])       
    
    # convert dates
    res_df["time"] = pd.to_datetime(res_df["time"])
    
    # resample to days
    res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index()
    
    res_df[pollutant] = round(res_df[pollutant], 1)
    
    # rename columns
    res_df = res_df.rename(columns={
        "time": "date"
    })
    
    res_df["city_name"] = city_name
    
    # change columns order
    res_df = res_df[
        ['city_name', 'date', pollutant]
    ]
    
#     # create 'unix' columns
#     res_df["unix_time"] = res_df["base_time"].apply(convert_date_to_unix)

    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
    
    return res_df


In [38]:
df1 = get_aqi_data_from_open_meteo(city_name="Milano",
                                   start_date="2022-08-04", end_date="2023-01-01")
df1.tail(3)

Processed PM2_5 for Milano since 2022-08-04 till 2023-01-01.
Took 0.65 sec.



Unnamed: 0,city_name,date,pm2_5
148,Milano,2022-12-30,60.1
149,Milano,2022-12-31,48.9
150,Milano,2023-01-01,50.9


In [39]:
coords_test = (47.60086, -122.14839)

In [40]:
df2 = get_aqi_data_from_open_meteo(city_name="somewhere near Seattle", coordinates=coords_test,
                                   start_date="2023-01-10", end_date="2023-01-19")
df2.tail(3)

Processed PM2_5 for somewhere near Seattle since 2023-01-10 till 2023-01-19.
Took 0.29 sec.



Unnamed: 0,city_name,date,pm2_5
7,somewhere near Seattle,2023-01-17,5.7
8,somewhere near Seattle,2023-01-18,3.7
9,somewhere near Seattle,2023-01-19,6.3


# Filling the gap in Air Quality data (PM2.5)

In [41]:
df_eu = pd.read_csv("data/backfill_pm2_5_eu.csv")
df_us = pd.read_csv("data/backfill_pm2_5_us.csv")
df_seattle = pd.read_csv("data/backfill_pm2_5_seattle.csv")

In [42]:
last_dates_eu = df_eu[["date", "city_name"]].groupby("city_name").max()
last_dates_eu.date = last_dates_eu.date.astype(str)
last_dates_eu = last_dates_eu.to_dict()["date"]

last_dates_us = df_us[["date", "city_name"]].groupby("city_name").max()
last_dates_us.date = last_dates_us.date.astype(str)
last_dates_us = last_dates_us.to_dict()["date"]

last_dates_seattle = df_seattle[["date", "city_name"]].groupby("city_name").max()
last_dates_seattle.date = last_dates_seattle.date.astype(str)
last_dates_seattle = last_dates_seattle.to_dict()["date"]

In [43]:
target_cities["Seattle"]['Seattle - Bellevue-SE 12th St']

[47.60086, -122.1484]

In [44]:
last_dates_eu

{'Amsterdam': '2023-04-11',
 'Athina': '2023-04-11',
 'Berlin': '2023-04-11',
 'Gdansk': '2023-04-11',
 'Kraków': '2023-04-11',
 'London': '2023-04-11',
 'Madrid': '2023-04-11',
 'Marseille': '2023-04-11',
 'Milano': '2023-04-11',
 'München': '2023-04-11',
 'Napoli': '2023-04-11',
 'Paris': '2023-04-11',
 'Sevilla': '2023-04-11',
 'Stockholm': '2023-04-11',
 'Tallinn': '2023-04-11',
 'Varna': '2022-08-02',
 'Wien': '2023-04-11'}

In [45]:
date_today = datetime.datetime.now().strftime("%Y-%m-%d")
date_today

'2023-04-13'

In [46]:
start_of_cell = time.time()

df_eu_update = pd.DataFrame()
for city_name in last_dates_eu: 
    df_ = get_aqi_data_from_open_meteo(city_name=city_name,
                                       start_date=last_dates_eu[city_name],
                                       end_date=date_today)
    df_eu_update = pd.concat([df_eu_update, df_]).reset_index(drop=True)
    
    
end_of_cell = time.time()
print("-" * 64)
print(f"Parsed new PM2.5 data for EU cities up to {date_today}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

Processed PM2_5 for Amsterdam since 2023-04-11 till 2023-04-13.
Took 0.54 sec.

Processed PM2_5 for Athina since 2023-04-11 till 2023-04-13.
Took 1.3 sec.

Processed PM2_5 for Berlin since 2023-04-11 till 2023-04-13.
Took 0.54 sec.

Processed PM2_5 for Gdansk since 2023-04-11 till 2023-04-13.
Took 0.48 sec.

Processed PM2_5 for Kraków since 2023-04-11 till 2023-04-13.
Took 0.49 sec.

Processed PM2_5 for London since 2023-04-11 till 2023-04-13.
Took 0.72 sec.

Processed PM2_5 for Madrid since 2023-04-11 till 2023-04-13.
Took 0.74 sec.

Processed PM2_5 for Marseille since 2023-04-11 till 2023-04-13.
Took 0.52 sec.

Processed PM2_5 for Milano since 2023-04-11 till 2023-04-13.
Took 0.55 sec.

Processed PM2_5 for München since 2023-04-11 till 2023-04-13.
Took 0.48 sec.

Processed PM2_5 for Napoli since 2023-04-11 till 2023-04-13.
Took 0.48 sec.

Processed PM2_5 for Paris since 2023-04-11 till 2023-04-13.
Took 0.57 sec.

Processed PM2_5 for Sevilla since 2023-04-11 till 2023-04-13.
Took 0.47

In [48]:
start_of_cell = time.time()

df_us_update = pd.DataFrame()
for city_name in last_dates_us: 
    df_ = get_aqi_data_from_open_meteo(city_name=city_name,
                                       start_date=last_dates_us[city_name],
                                       end_date=date_today)
    df_us_update = pd.concat([df_us_update, df_]).reset_index(drop=True)
    
    
end_of_cell = time.time()
print("-" * 64)
print(f"Parsed new PM2.5 data for US cities up to {date_today}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

Processed PM2_5 for Albuquerque since 2022-12-31 till 2023-04-13.
Took 0.56 sec.

Processed PM2_5 for Atlanta since 2022-12-31 till 2023-04-13.
Took 0.55 sec.

Processed PM2_5 for Chicago since 2022-12-31 till 2023-04-13.
Took 0.54 sec.

Processed PM2_5 for Columbus since 2022-12-31 till 2023-04-13.
Took 0.5 sec.

Processed PM2_5 for Dallas since 2022-12-31 till 2023-04-13.
Took 0.6 sec.

Processed PM2_5 for Denver since 2022-12-31 till 2023-04-13.
Took 0.52 sec.

Processed PM2_5 for Houston since 2022-12-31 till 2023-04-13.
Took 0.55 sec.

Processed PM2_5 for Los Angeles since 2022-12-31 till 2023-04-13.
Took 0.6 sec.

Processed PM2_5 for New York since 2022-12-31 till 2023-04-13.
Took 0.57 sec.

Processed PM2_5 for Phoenix-Mesa since 2022-12-31 till 2023-04-13.
Took 0.62 sec.

Processed PM2_5 for Salt Lake City since 2022-12-31 till 2023-04-13.
Took 0.5 sec.

Processed PM2_5 for San Francisco since 2022-12-31 till 2023-04-13.
Took 0.64 sec.

Processed PM2_5 for Tampa since 2022-12-31

In [49]:
last_dates_seattle['Seattle - Bellevue-SE 12th St']

'2023-04-03'

In [50]:
start_of_cell = time.time()

df_seattle_update = pd.DataFrame()
for city_name in last_dates_seattle:
    coordinates = target_cities["Seattle"][city_name]
    df_ = get_aqi_data_from_open_meteo(city_name=city_name,
                                       coordinates=coordinates,
                                       start_date=last_dates_seattle[city_name],
                                       end_date=date_today)
    df_seattle_update = pd.concat([df_seattle_update, df_]).reset_index(drop=True)
    

end_of_cell = time.time()
print("-" * 64)
print(f"Parsed new PM2.5 data for Seattle and surrounding areas up to {date_today}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

Processed PM2_5 for Seattle - Bellevue-SE 12th St since 2023-04-03 till 2023-04-13.
Took 0.27 sec.

Processed PM2_5 for Seattle - DARRINGTON - FIR ST (Darrington High School) since 2023-04-03 till 2023-04-13.
Took 0.24 sec.

Processed PM2_5 for Seattle - KENT - JAMES & CENTRAL since 2023-04-03 till 2023-04-13.
Took 0.25 sec.

Processed PM2_5 for Seattle - LAKE FOREST PARK TOWNE CENTER since 2023-04-03 till 2023-04-13.
Took 0.26 sec.

Processed PM2_5 for Seattle - MARYSVILLE - 7TH AVE (Marysville Junior High) since 2023-04-03 till 2023-04-13.
Took 0.25 sec.

Processed PM2_5 for Seattle - NORTH BEND - NORTH BEND WAY since 2023-04-03 till 2023-04-13.
Took 0.25 sec.

Processed PM2_5 for Seattle - SEATTLE - BEACON HILL since 2023-04-03 till 2023-04-13.
Took 0.25 sec.

Processed PM2_5 for Seattle - SEATTLE - DUWAMISH since 2023-04-03 till 2023-04-13.
Took 0.26 sec.

Processed PM2_5 for Seattle - SEATTLE - SOUTH PARK #2 since 2023-04-03 till 2023-04-13.
Took 0.24 sec.

Processed PM2_5 for Sea

In [51]:
df_seattle_update

Unnamed: 0,city_name,date,pm2_5
0,Seattle - Bellevue-SE 12th St,2023-04-03,6.4
1,Seattle - Bellevue-SE 12th St,2023-04-04,9.9
2,Seattle - Bellevue-SE 12th St,2023-04-05,12.6
3,Seattle - Bellevue-SE 12th St,2023-04-06,12.2
4,Seattle - Bellevue-SE 12th St,2023-04-07,4.7
...,...,...,...
161,Seattle - Tulalip-Totem Beach Rd,2023-04-09,9.0
162,Seattle - Tulalip-Totem Beach Rd,2023-04-10,6.3
163,Seattle - Tulalip-Totem Beach Rd,2023-04-11,3.9
164,Seattle - Tulalip-Totem Beach Rd,2023-04-12,9.1


In [54]:
df_air_quality = pd.concat([
    df_eu, df_us, df_seattle, df_eu_update,
    df_us_update, df_seattle_update
]).reset_index(drop=True)

In [56]:
# how many observations (days) we have per one location
df_air_quality[['city_name', 'date']].groupby('city_name').count()

Unnamed: 0_level_0,date
city_name,Unnamed: 1_level_1
Albuquerque,3753
Amsterdam,3756
Athina,3756
Atlanta,3750
Berlin,3756
Chicago,3754
Columbus,2595
Dallas,3751
Denver,3754
Gdansk,3756
