In [1]:
import pandas as pd
import numpy as np

# Feature Engineering: Air Quality data

In [2]:
df_air_quality = pd.read_csv("data/backfill_pm2_5.csv")

In [3]:
def calculate_periodic_features(df):
    df_res = df.copy()
    # Extract year, month, and day of the week from the 'date' column
    df_res['year'] = pd.to_datetime(df_res['date']).dt.year
    df_res['day_of_month'] = pd.to_datetime(df_res['date']).dt.day
    df_res['month'] = pd.to_datetime(df_res['date']).dt.month
    df_res['day_of_week'] = pd.to_datetime(df_res['date']).dt.dayofweek
    df_res['is_weekend'] = np.where(df_res['day_of_week'].isin([5,6]), 1, 0)
    day_of_year = pd.to_datetime(df_res['date']).dt.dayofyear
       
    df_res['sin_day_of_year'] = np.sin(2 * np.pi * day_of_year / 365)
    df_res['cos_day_of_year'] = np.cos(2 * np.pi * day_of_year / 365)
    df_res['sin_day_of_week'] = np.sin(2 * np.pi * df_res['day_of_week'] / 7)
    df_res['cos_day_of_week'] = np.cos(2 * np.pi * df_res['day_of_week'] / 7)

    return df_res

In [4]:
def moving_average(df, window=7):
    df[f'mean_{window}_days'] = df.groupby('city_name')['pm2_5'] \
                                    .rolling(window=window).mean().reset_index(0,drop=True).shift(1)
    return df


def moving_std(df, window):
    df[f'std_{window}_days'] = df.groupby('city_name')['pm2_5'] \
                                    .rolling(window=window).std().reset_index(0,drop=True).shift(1)
    return df


def exponential_moving_average(df, window):
    df[f'exp_mean_{window}_days'] = df.groupby('city_name')['pm2_5'].ewm(span=window) \
                                        .mean().reset_index(0,drop=True).shift(1)
    return df


def exponential_moving_std(df, window):
    df[f'exp_std_{window}_days'] = df.groupby('city_name')['pm2_5'].ewm(span=window) \
                                        .std().reset_index(0,drop=True).shift(1)
    return df


In [5]:
df_air_quality_processed = df_air_quality.copy()

In [6]:
df_air_quality_processed['pm_2_5-1'] = df_air_quality_processed.groupby('city_name')['pm2_5'].shift(+1) # day before
df_air_quality_processed['pm_2_5-2'] = df_air_quality_processed.groupby('city_name')['pm2_5'].shift(+2) # two days before
df_air_quality_processed['pm_2_5-3'] = df_air_quality_processed.groupby('city_name')['pm2_5'].shift(+3)
df_air_quality_processed['pm_2_5-4'] = df_air_quality_processed.groupby('city_name')['pm2_5'].shift(+4)
df_air_quality_processed['pm_2_5-5'] = df_air_quality_processed.groupby('city_name')['pm2_5'].shift(+5) # five days before

df_air_quality_processed = df_air_quality_processed.dropna()

df_air_quality_processed = moving_average(df_air_quality_processed, 7)
df_air_quality_processed = moving_average(df_air_quality_processed, 14)
df_air_quality_processed = moving_average(df_air_quality_processed, 28)


for i in [7, 14, 28]:
    for func in [moving_std, exponential_moving_average,
                 exponential_moving_std
                 ]:
        df_air_quality_processed = func(df_air_quality_processed, i)
        

df_air_quality_processed = df_air_quality_processed.sort_values(by=["date", "pm2_5"]).dropna()
df_air_quality_processed = df_air_quality_processed.reset_index(drop=True)

df_air_quality_processed = calculate_periodic_features(df_air_quality_processed)

In [7]:
df_air_quality_processed.columns

Index(['city_name', 'date', 'pm2_5', 'pm_2_5-1', 'pm_2_5-2', 'pm_2_5-3',
       'pm_2_5-4', 'pm_2_5-5', 'mean_7_days', 'mean_14_days', 'mean_28_days',
       'std_7_days', 'exp_mean_7_days', 'exp_std_7_days', 'std_14_days',
       'exp_mean_14_days', 'exp_std_14_days', 'std_28_days',
       'exp_mean_28_days', 'exp_std_28_days', 'year', 'day_of_month', 'month',
       'day_of_week', 'is_weekend', 'sin_day_of_year', 'cos_day_of_year',
       'sin_day_of_week', 'cos_day_of_week'],
      dtype='object')

# Feature engineering: Weather data

In [8]:
df_weather = pd.read_csv("data/backfill_weather.csv")

In [9]:
df_weather.tail(3)

Unnamed: 0,city_name,date,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
168972,Seattle - Tulalip-Totem Beach Rd,2023-04-11,8.4,3.6,17.4,26.1,0.0,9.0,19.5,37.8,156
168973,Seattle - Tulalip-Totem Beach Rd,2023-04-12,10.7,1.6,0.0,0.0,0.0,0.0,17.2,24.1,44
168974,Seattle - Tulalip-Totem Beach Rd,2023-04-13,11.0,3.3,0.0,0.0,0.0,0.0,15.4,32.4,254


In [10]:
df_weather.columns

Index(['city_name', 'date', 'temperature_max', 'temperature_min',
       'precipitation_sum', 'rain_sum', 'snowfall_sum', 'precipitation_hours',
       'wind_speed_max', 'wind_gusts_max', 'wind_direction_dominant'],
      dtype='object')

## TBD: Do I really need to do it with weather data also? 

# Final dataset

In [14]:
dataset = df_air_quality_processed.merge(df_weather, how="inner", on=['city_name', 'date'])

In [15]:
dataset

Unnamed: 0,city_name,date,pm2_5,pm_2_5-1,pm_2_5-2,pm_2_5-3,pm_2_5-4,pm_2_5-5,mean_7_days,mean_14_days,...,cos_day_of_week,temperature_max,temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_max,wind_gusts_max,wind_direction_dominant
0,Seattle - TACOMA - L STREET,2013-01-01,32.8,17.4,34.0,7.3,21.1,18.3,18.957143,14.071429,...,0.623490,4.1,-1.7,0.0,0.0,0.0,0.0,9.5,22.7,27
1,Seattle - SEATTLE - BEACON HILL,2013-01-02,6.9,13.4,15.0,7.8,5.3,2.6,8.714286,9.528571,...,-0.222521,4.7,-3.1,0.0,0.0,0.0,0.0,10.4,24.1,80
2,Seattle - TACOMA - L STREET,2013-01-02,31.5,32.8,17.4,34.0,7.3,21.1,20.228571,15.792857,...,-0.222521,6.0,-2.1,0.0,0.0,0.0,0.0,10.5,22.0,56
3,Seattle - SEATTLE - BEACON HILL,2013-01-03,4.5,6.9,13.4,15.0,7.8,5.3,7.628571,9.600000,...,-0.900969,4.9,-3.6,0.0,0.0,0.0,0.0,10.9,24.5,100
4,Seattle - TACOMA - L STREET,2013-01-03,26.8,31.5,32.8,17.4,34.0,7.3,23.200000,17.764286,...,-0.900969,6.0,-1.6,0.0,0.0,0.0,0.0,8.0,17.3,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156432,Los Angeles,2023-04-13,15.2,33.9,45.3,50.6,37.4,45.0,39.585714,30.428571,...,-0.900969,15.7,12.0,0.4,0.6,0.0,1.0,10.1,13.3,225
156433,Athina,2023-04-13,17.1,14.4,10.0,10.0,12.0,21.0,14.342857,13.671429,...,-0.900969,21.2,10.1,0.0,0.0,0.0,0.0,15.5,43.9,194
156434,Houston,2023-04-13,17.3,14.3,16.5,13.9,10.1,7.2,10.742857,11.171429,...,-0.900969,24.3,13.4,0.0,0.0,0.0,0.0,10.0,22.3,32
156435,Gdansk,2023-04-13,20.3,21.4,24.5,27.0,11.0,9.0,17.557143,11.850000,...,-0.900969,14.6,5.5,0.0,0.0,0.0,0.0,10.6,25.2,106


In [16]:
dataset.shape

(156437, 38)

In [18]:
dataset.to_csv("data/dataset.csv", index=False)