In [1]:
import os
import datetime
import time
import requests
import pandas as pd
import json
import ast

from geopy.geocoders import Nominatim

In [2]:
def convert_date_to_unix(x):
    """
    Convert datetime to unix time in milliseconds.
    """
    dt_obj = datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
    dt_obj = int(dt_obj.timestamp() * 1000)
    return dt_obj

In [3]:
def get_city_coordinates(city_name: str):
    """
    Takes city name and returns its latitude and longitude (rounded to 2 digits after dot).
    """ 
    # Initialize Nominatim API (for getting lat and long of the city)
    geolocator = Nominatim(user_agent="MyApp")
    city = geolocator.geocode(city_name)

    latitude = round(city.latitude, 2)
    longitude = round(city.longitude, 2)
    
    return latitude, longitude

# Representing the Target cities 

In [4]:
with open('target_cities.json') as json_file:
    target_cities = json.load(json_file)

In [5]:
for i in target_cities:
    print(i)
    print(len(target_cities[i]))

EU
17
US
13
Seattle
15


In [6]:
from pprint import pprint

pprint(target_cities)

{'EU': ['Amsterdam',
        'Athina',
        'Berlin',
        'Gdansk',
        'Kraków',
        'London',
        'Madrid',
        'Marseille',
        'Milano',
        'München',
        'Napoli',
        'Paris',
        'Sevilla',
        'Stockholm',
        'Tallinn',
        'Varna',
        'Wien'],
 'Seattle': {'Seattle - Bellevue-SE 12th St': [47.60086, -122.1484],
             'Seattle - DARRINGTON - FIR ST (Darrington High School)': [48.2469,
                                                                        -121.6031],
             'Seattle - KENT - JAMES & CENTRAL': [47.38611, -122.23028],
             'Seattle - LAKE FOREST PARK TOWNE CENTER': [47.755, -122.2806],
             'Seattle - MARYSVILLE - 7TH AVE (Marysville Junior High)': [48.05432,
                                                                         -122.17153],
             'Seattle - NORTH BEND - NORTH BEND WAY': [47.49022, -121.77278],
             'Seattle - SEATTLE - BEACON HILL': [47.56

In [7]:
# with open("target_cities.json", "w") as json_file:
#     json.dump(target_cities, json_file)

### ALL target cities on the one map

In [139]:
# Create a folium map centered on the first location in the list
map = folium.Map(location=[42.57, -44.092], zoom_start=3)

for city in target_cities["EU"]:
    latitude, longitude = get_city_coordinates(city)
    folium.Marker(location=[latitude, longitude]).add_to(map)
    
for city in target_cities["US"]:
    latitude, longitude = get_city_coordinates(city)
    folium.Marker(location=[latitude, longitude]).add_to(map)

for city in target_cities["Seattle"]:
    latitude, longitude = target_cities["Seattle"][city]
    folium.Marker(location=[latitude, longitude]).add_to(map)

# Save the map to an HTML file
map.save("map_all_target_cities.html")

# Air Quality [Open Meteo](https://open-meteo.com/en/docs/air-quality-api)

### I will be using this functionality to fill gaps / parse recent data for all my target cities (locations).

#### Note that Open Meteo gives data from `2022-08-04` till `(today + 6 days)` time period.

In [8]:
def get_aqi_data_from_open_meteo(city_name: str,
                                 start_date: str,
                                 end_date: str,
                                 coordinates: list = None,
                                 pollutant: str = "pm2_5"):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with AQI data.
    
    Examples of arguments:
        ...
        coordinates=(47.755, -122.2806),
        start_date="2023-01-01",
        pollutant="no2"
        ...
    """
    start_of_cell = time.time()
    
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)
    
    pollutant = pollutant.lower()
    if pollutant == "pm2.5":
        pollutant = "pm2_5"
    
    # make it work with both "no2" and "nitrogen_dioxide" passed.
    if pollutant == "no2":
        pollutant = "nitrogen_dioxide"
        
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'hourly': [pollutant],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }
    
    # base endpoint
    base_url = "https://air-quality-api.open-meteo.com/v1/air-quality"
        
    response = requests.get(base_url, params=params)

    response_json = response.json()    
    res_df = pd.DataFrame(response_json["hourly"])       
    
    # convert dates
    res_df["time"] = pd.to_datetime(res_df["time"])
    
    # resample to days
    res_df = res_df.groupby(res_df['time'].dt.date).mean(numeric_only=True).reset_index()
    
    res_df[pollutant] = round(res_df[pollutant], 1)
    
    # rename columns
    res_df = res_df.rename(columns={
        "time": "date"
    })
    
    res_df["city_name"] = city_name
    
    # change columns order
    res_df = res_df[
        ['city_name', 'date', pollutant]
    ]
    
#     # create 'unix' columns
#     res_df["unix_time"] = res_df["base_time"].apply(convert_date_to_unix)

    end_of_cell = time.time()
    print(f"Processed {pollutant.upper()} for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
    
    return res_df


In [9]:
df1 = get_aqi_data_from_open_meteo(city_name="Milano",
                                   start_date="2022-08-04", end_date="2023-01-01")
df1.tail(3)

Processed PM2_5 for Milano since 2022-08-04 till 2023-01-01.
Took 0.8 sec.



Unnamed: 0,city_name,date,pm2_5
148,Milano,2022-12-30,60.1
149,Milano,2022-12-31,48.9
150,Milano,2023-01-01,50.9


In [10]:
coords_test = (47.60086, -122.14839)

In [11]:
df2 = get_aqi_data_from_open_meteo(city_name="somewhere near Seattle", coordinates=coords_test,
                                   start_date="2023-01-10", end_date="2023-01-19")
df2.tail(3)

Processed PM2_5 for somewhere near Seattle since 2023-01-10 till 2023-01-19.
Took 0.23 sec.



Unnamed: 0,city_name,date,pm2_5
7,somewhere near Seattle,2023-01-17,5.7
8,somewhere near Seattle,2023-01-18,3.7
9,somewhere near Seattle,2023-01-19,6.3


# Filling the gap in Air Quality data (PM2.5)

In [16]:
today = datetime.date.today()

In [12]:
df_eu = pd.read_csv("data/backfill_pm2_5_eu.csv")
df_us = pd.read_csv("data/backfill_pm2_5_us.csv")
df_seattle = pd.read_csv("data/backfill_pm2_5_seattle.csv")

In [13]:
df_aq_backfill = pd.concat([df_eu, df_us, df_seattle]).reset_index(drop=True)

In [14]:
last_dates_aq = df_aq_backfill[["date", "city_name"]].groupby("city_name").max()
last_dates_aq.date = last_dates_aq.date.astype(str)
last_dates_aq = last_dates_aq.to_dict()["date"]

In [19]:
start_of_cell = time.time()

coordinates = None
df_aq_update = pd.DataFrame()
for city_name in last_dates_aq: 
    if city_name in target_cities["Seattle"]:
        coordinates = target_cities["Seattle"][city_name]
    df_ = get_aqi_data_from_open_meteo(city_name=city_name,
                                       coordinates=coordinates,
                                       start_date=last_dates_aq[city_name],
                                       end_date=str(today))
    df_aq_update = pd.concat([df_aq_update, df_]).reset_index(drop=True)
    
    
end_of_cell = time.time()
print("-" * 64)
print(f"Parsed new PM2.5 data for ALL locations up to {str(today)}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

Processed PM2_5 for Albuquerque since 2022-12-31 till 2023-04-13.
Took 0.54 sec.

Processed PM2_5 for Amsterdam since 2023-04-11 till 2023-04-13.
Took 0.55 sec.

Processed PM2_5 for Athina since 2023-04-11 till 2023-04-13.
Took 0.47 sec.

Processed PM2_5 for Atlanta since 2022-12-31 till 2023-04-13.
Took 0.65 sec.

Processed PM2_5 for Berlin since 2023-04-11 till 2023-04-13.
Took 0.64 sec.

Processed PM2_5 for Chicago since 2022-12-31 till 2023-04-13.
Took 0.64 sec.

Processed PM2_5 for Columbus since 2022-12-31 till 2023-04-13.
Took 0.57 sec.

Processed PM2_5 for Dallas since 2022-12-31 till 2023-04-13.
Took 0.51 sec.

Processed PM2_5 for Denver since 2022-12-31 till 2023-04-13.
Took 0.72 sec.

Processed PM2_5 for Gdansk since 2023-04-11 till 2023-04-13.
Took 0.83 sec.

Processed PM2_5 for Houston since 2022-12-31 till 2023-04-13.
Took 0.61 sec.

Processed PM2_5 for Kraków since 2023-04-11 till 2023-04-13.
Took 0.82 sec.

Processed PM2_5 for London since 2023-04-11 till 2023-04-13.
To

In [20]:
df_air_quality = pd.concat([
    df_aq_backfill, df_aq_update
]).reset_index(drop=True)

In [22]:
df_air_quality.shape

(157885, 3)

In [21]:
# how many observations (days) we have per one location
df_air_quality[['city_name', 'date']].groupby('city_name').count()

Unnamed: 0_level_0,date
city_name,Unnamed: 1_level_1
Albuquerque,3753
Amsterdam,3756
Athina,3756
Atlanta,3750
Berlin,3756
Chicago,3754
Columbus,2595
Dallas,3751
Denver,3754
Gdansk,3756


# Weather data [Open Meteo](https://open-meteo.com/)

In [26]:
def get_weather_data_from_open_meteo(city_name: str,
                                     start_date: str,
                                     end_date: str,
                                     coordinates: list = None,
                                     forecast: bool = False):
    """
    Takes [city name OR coordinates] and returns pandas DataFrame with weather data.
    
    Examples of arguments:
        coordinates=(47.755, -122.2806), start_date="2023-01-01"
    """
    start_of_cell = time.time()
    
    if coordinates:
        latitude, longitude = coordinates
    else:
        latitude, longitude = get_city_coordinates(city_name=city_name)
    
    params = {
        'latitude': latitude,
        'longitude': longitude,
        'daily': ["temperature_2m_max", "temperature_2m_min",
                  "precipitation_sum", "rain_sum", "snowfall_sum",
                  "precipitation_hours", "windspeed_10m_max",
                  "windgusts_10m_max", "winddirection_10m_dominant"],
        'start_date': start_date,
        'end_date': end_date,
        'timezone': "Europe/London"
    }
    
    if forecast:
        # historical forecast endpoint
        base_url = 'https://api.open-meteo.com/v1/forecast' 
    else:
        # historical observations endpoint
        base_url = 'https://archive-api.open-meteo.com/v1/archive' 
        
    response = requests.get(base_url, params=params)

    response_json = response.json()    
    res_df = pd.DataFrame(response_json["daily"])
    
    res_df["city_name"] = city_name
    
    # rename columns
    res_df = res_df.rename(columns={
        "time": "date",
        "temperature_2m_max": "temperature_max",
        "temperature_2m_min": "temperature_min",
        "windspeed_10m_max": "wind_speed_max",
        "winddirection_10m_dominant": "wind_direction_dominant",
        "windgusts_10m_max": "wind_gusts_max"
    })
    
    # change columns order
    res_df = res_df[
        ['city_name', 'date', 'temperature_max', 'temperature_min',
         'precipitation_sum', 'rain_sum', 'snowfall_sum',
         'precipitation_hours', 'wind_speed_max',
         'wind_gusts_max', 'wind_direction_dominant']
    ]
    
    # convert dates in 'date' column
    res_df["date"] = pd.to_datetime(res_df["date"])
    
#     # create 'unix' columns
#     res_df["unix_time"] = res_df["base_time"].apply(convert_date_to_unix)
    end_of_cell = time.time()
    print(f"Parsed weather for {city_name} since {start_date} till {end_date}.")
    print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")
        
    return res_df

# Filling the gap in weather data

In [27]:
df_weather_backfill = pd.read_csv("data/backfill_weather.csv")

In [28]:
last_dates_weather = df_weather_backfill[["date", "city_name"]].groupby("city_name").max()
last_dates_weather.date = last_dates_weather.date.astype(str)
last_dates_weather = last_dates_weather.to_dict()["date"]

In [29]:
start_of_cell = time.time()

coordinates = None

df_weather_update = pd.DataFrame()
for city_name in last_dates_weather: 
    if city_name in target_cities["Seattle"]:
        coordinates = target_cities["Seattle"][city_name]
    df_ = get_weather_data_from_open_meteo(city_name=city_name,
                                           coordinates=coordinates,
                                           start_date=last_dates_weather[city_name],
                                           end_date=str(today),
                                           forecast=True)
    df_weather_update = pd.concat([df_weather_update, df_]).reset_index(drop=True)
    
    
end_of_cell = time.time()
print("-" * 64)
print(f"Parsed new weather data for ALL cities up to {str(today)}.")
print(f"Took {round(end_of_cell - start_of_cell, 2)} sec.\n")

Parsed weather for Albuquerque since 2023-04-13 till 2023-04-13.
Took 0.5 sec.

Parsed weather for Amsterdam since 2023-04-13 till 2023-04-13.
Took 0.53 sec.

Parsed weather for Athina since 2023-04-13 till 2023-04-13.
Took 0.49 sec.

Parsed weather for Atlanta since 2023-04-13 till 2023-04-13.
Took 0.52 sec.

Parsed weather for Berlin since 2023-04-13 till 2023-04-13.
Took 0.51 sec.

Parsed weather for Chicago since 2023-04-13 till 2023-04-13.
Took 0.47 sec.

Parsed weather for Columbus since 2023-04-13 till 2023-04-13.
Took 0.49 sec.

Parsed weather for Dallas since 2023-04-13 till 2023-04-13.
Took 0.49 sec.

Parsed weather for Denver since 2023-04-13 till 2023-04-13.
Took 0.5 sec.

Parsed weather for Gdansk since 2023-04-13 till 2023-04-13.
Took 0.51 sec.

Parsed weather for Houston since 2023-04-13 till 2023-04-13.
Took 0.52 sec.

Parsed weather for Kraków since 2023-04-13 till 2023-04-13.
Took 0.47 sec.

Parsed weather for London since 2023-04-13 till 2023-04-13.
Took 0.56 sec.

P

In [30]:
df_weather_update.sample()

Unnamed: 0,city_name,date,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
28,Seattle - MARYSVILLE - 7TH AVE (Marysville Jun...,2023-04-13,11.2,2.7,0.0,0.0,0.0,0.0,15.5,27.7,241


In [31]:
df_weather = pd.concat([df_weather_backfill, df_weather_update])

In [34]:
df_weather.shape

(169020, 11)