In [24]:
import os
import datetime
import time
import requests
import json
import ast

import pandas as pd
import numpy as np

from geopy.geocoders import Nominatim

In [6]:
# Read the target-city configuration. The encoding is passed explicitly so
# that decoding does not depend on the platform's locale default; non-ASCII
# city names would otherwise be mis-decoded on non-UTF-8 locales.
with open('target_cities.json', encoding='utf-8') as json_file:
    target_cities = json.load(json_file)

# Feature Engineering: Air Quality data

In [51]:
# Load the PM2.5 backfill; the path is relative to the current working directory.
# NOTE(review): the weather backfill further down is read from "data/" while
# this file is read from the working directory itself; confirm that is intended.
df_air_quality = pd.read_csv("backfill_pm2_5.csv")

In [52]:
def calculate_periodic_features(df):
    """Return a copy of ``df`` with calendar and cyclical date features added.

    The ``date`` column is parsed once with ``pd.to_datetime``; the input
    frame (including its ``date`` column) is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns: ``year``, ``day_of_month``,
        ``month``, ``day_of_week`` (pandas convention: Monday=0 ... Sunday=6),
        ``is_weekend`` (1 when ``day_of_week`` is 5 or 6, else 0),
        ``sin_day_of_year``, ``cos_day_of_year``, ``sin_day_of_week`` and
        ``cos_day_of_week``.
    """
    df_res = df.copy()

    # Parse the dates a single time instead of once per derived feature.
    dates = pd.to_datetime(df_res['date'])

    df_res['year'] = dates.dt.year
    df_res['day_of_month'] = dates.dt.day
    df_res['month'] = dates.dt.month
    df_res['day_of_week'] = dates.dt.dayofweek
    df_res['is_weekend'] = np.where(df_res['day_of_week'].isin([5, 6]), 1, 0)

    # Cyclical encodings place the first and last day of a period next to each
    # other. The year is treated as 365 days long, so day 366 of a leap year
    # lands slightly past one full cycle.
    day_of_year = dates.dt.dayofyear
    df_res['sin_day_of_year'] = np.sin(2 * np.pi * day_of_year / 365)
    df_res['cos_day_of_year'] = np.cos(2 * np.pi * day_of_year / 365)
    df_res['sin_day_of_week'] = np.sin(2 * np.pi * df_res['day_of_week'] / 7)
    df_res['cos_day_of_week'] = np.cos(2 * np.pi * df_res['day_of_week'] / 7)

    return df_res

In [53]:
def moving_average(df, window=7):
    """Add the per-city rolling mean of ``pm2_5`` over the previous ``window`` rows.

    The rolling mean is shifted by one row *within each city*, so the value
    on a row is built only from earlier rows of the same city; the current
    row's ``pm2_5`` and other cities' values are never included. The first
    ``window`` rows of every city are therefore NaN.

    The function does not sort: rows are used in the order they appear within
    each city (presumably one row per day in chronological order; confirm
    against the caller).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city_name`` and ``pm2_5`` columns.
    window : int, default 7
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``mean_{window}_days`` added
        in place.
    """
    # Shift inside the per-city transform: shifting the concatenated result
    # instead would hand each city's first row the previous city's last value.
    df[f'mean_{window}_days'] = df.groupby('city_name')['pm2_5'].transform(
        lambda s: s.rolling(window=window).mean().shift(1)
    )
    return df


def moving_std(df, window):
    """Add the per-city rolling standard deviation of ``pm2_5`` over the previous ``window`` rows.

    The rolling statistic is shifted by one row *within each city*, so the
    value on a row is built only from earlier rows of the same city. The first
    ``window`` rows of every city are therefore NaN.

    The function does not sort: rows are used in the order they appear within
    each city.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city_name`` and ``pm2_5`` columns.
    window : int
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``std_{window}_days`` added
        in place.
    """
    # Shift inside the per-city transform: shifting the concatenated result
    # instead would hand each city's first row the previous city's last value.
    df[f'std_{window}_days'] = df.groupby('city_name')['pm2_5'].transform(
        lambda s: s.rolling(window=window).std().shift(1)
    )
    return df


def exponential_moving_average(df, window):
    """Add the per-city exponentially weighted mean of ``pm2_5`` (``span=window``).

    The statistic is shifted by one row *within each city*, so the value on a
    row is built only from earlier rows of the same city. The first row of
    every city is therefore NaN.

    The function does not sort: rows are used in the order they appear within
    each city.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city_name`` and ``pm2_5`` columns.
    window : int
        Value passed as ``span`` to ``Series.ewm``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``exp_mean_{window}_days`` added
        in place.
    """
    # Shift inside the per-city transform: shifting the concatenated result
    # instead would hand each city's first row the previous city's last value.
    df[f'exp_mean_{window}_days'] = df.groupby('city_name')['pm2_5'].transform(
        lambda s: s.ewm(span=window).mean().shift(1)
    )
    return df


def exponential_moving_std(df, window):
    """Add the per-city exponentially weighted std of ``pm2_5`` (``span=window``).

    The statistic is shifted by one row *within each city*, so the value on a
    row is built only from earlier rows of the same city. The first two rows
    of every city are NaN (a standard deviation needs two observations, and
    the result is then shifted by one).

    The function does not sort: rows are used in the order they appear within
    each city.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``city_name`` and ``pm2_5`` columns.
    window : int
        Value passed as ``span`` to ``Series.ewm``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with column ``exp_std_{window}_days`` added
        in place.
    """
    # Shift inside the per-city transform: shifting the concatenated result
    # instead would hand each city's first row the previous city's last value.
    df[f'exp_std_{window}_days'] = df.groupby('city_name')['pm2_5'].transform(
        lambda s: s.ewm(span=window).std().shift(1)
    )
    return df


In [54]:
# The lag and rolling features below are positional within each city, so the
# rows must be in chronological order per city before anything is shifted.
# A temporary parsed-date key is used so that the ordering does not depend on
# how the `date` strings happen to be formatted.
df_air_quality = (
    df_air_quality
    .assign(_date_key=pd.to_datetime(df_air_quality['date']))
    .sort_values(by=['city_name', '_date_key'])
    .drop(columns='_date_key')
)

# Lag features: pm2_5 of the same city 1 to 5 rows earlier.
for lag in range(1, 6):
    df_air_quality[f'pm_2_5-{lag}'] = df_air_quality.groupby('city_name')['pm2_5'].shift(lag)

# Drop rows containing any NaN (including the first rows of each city, which
# have no complete lag history). The explicit copy makes the frame independent
# before the helper functions assign new columns to it.
df_air_quality = df_air_quality.dropna().copy()

# Rolling means first, then the remaining statistics per window; this keeps
# the resulting column order unchanged.
for window in [7, 14, 28]:
    df_air_quality = moving_average(df_air_quality, window)

for i in [7, 14, 28]:
    for func in [moving_std, exponential_moving_average,
                 exponential_moving_std
                 ]:
        df_air_quality = func(df_air_quality, i)

# Rows without a full rolling history contain NaN and are removed here.
df_air_quality = df_air_quality.sort_values(by=["date", "pm2_5"]).dropna()
df_air_quality = df_air_quality.reset_index(drop=True)

df_air_quality = calculate_periodic_features(df_air_quality)

In [62]:
# Display the column names of the engineered air quality frame.
df_air_quality.columns

Index(['city_name', 'date', 'pm2_5', 'pm_2_5-1', 'pm_2_5-2', 'pm_2_5-3',
       'pm_2_5-4', 'pm_2_5-5', 'mean_7_days', 'mean_14_days', 'mean_28_days',
       'std_7_days', 'exp_mean_7_days', 'exp_std_7_days', 'std_14_days',
       'exp_mean_14_days', 'exp_std_14_days', 'std_28_days',
       'exp_mean_28_days', 'exp_std_28_days', 'year', 'day_of_month', 'month',
       'day_of_week', 'is_weekend', 'sin_day_of_year', 'cos_day_of_year',
       'sin_day_of_week', 'cos_day_of_week'],
      dtype='object')

# Feature Engineering: Weather data

In [57]:
# Load the weather backfill; the path is relative to the current working directory.
df_weather = pd.read_csv("data/backfill_weather.csv")

In [59]:
# Show the last three rows of the weather frame.
df_weather.tail(3)

Unnamed: 0,city_name,date,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
168972,Seattle - Tulalip-Totem Beach Rd,2023-04-11,8.4,3.6,17.4,26.1,0.0,9.0,19.5,37.8,156
168973,Seattle - Tulalip-Totem Beach Rd,2023-04-12,10.7,1.6,0.0,0.0,0.0,0.0,17.2,24.1,44
168974,Seattle - Tulalip-Totem Beach Rd,2023-04-13,11.0,3.3,0.0,0.0,0.0,0.0,15.4,32.4,254


In [61]:
# Display the column names of the weather frame.
df_weather.columns

Index(['city_name', 'date', 'temperature_max', 'temperature_min',
       'precipitation_sum', 'rain_sum', 'snowfall_sum', 'precipitation_hours',
       'wind_speed_max', 'wind_gusts_max', 'wind_direction_dominant'],
      dtype='object')

## TODO: Decide whether the weather data also needs the same feature engineering (lags, rolling statistics, periodic features) as the air quality data.