# Data Handling

In [1]:
import sys
import numpy as np
import pandas as pd
# from datetime import datetime as dt
# from datetime import timedelta
from tqdm.notebook import tqdm
from pathlib import Path

proj_path = Path().absolute().parent
sys.path.append(str(proj_path))
data_path = proj_path / 'data' 

from src.dbengine import DBEngine
db = DBEngine(db_path=data_path / "airpollution.db")

In [2]:
# what columns to use?
columns = ['district', 'datetime', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']

sql = """
SELECT aq.measure_code, aq.datetime, aq.SO2, aq.CO, aq.O3, aq.NO2, aq.PM10, aq.PM25
FROM airquality AS aq
"""
res = db.query(sql)
df = pd.DataFrame(res).rename(columns=dict(enumerate(columns)))
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index(['district', 'datetime']).sort_index()
# should fill the null values using previous datetime's value

df = df.mask(df.isnull()).groupby([df.index.get_level_values(1).time]).fillna(method = 'ffill')
df.isnull().sum()

SO2     0
CO      0
O3      0
NO2     0
PM10    0
PM25    0
dtype: int64

In [3]:
# what columns to use?
# resample 

columns = [
    'datetime', 'PM10', 'PM25', 'SO2', 'CO', 'O3', 'NO2', 'temperature', 
    'precipitation', 'wind_speed', 'wind_direction', 'humidity', 'vapor_pressure', 
    'local_pressure', 'sea_level_pressure', 'sunshine', 'solar_radiation', 
    'ground_temperature', '5cm_soil_temperature', '10cm_soil_temperature', '20cm_soil_temperature', '30cm_soil_temperature'
]

sql = """
SELECT 
    a.datetime, a.PM10, a.PM25, a.SO2, a.CO, a.O3, a.NO2, w.temperature,
    w.precipitation, w.wind_speed, w.wind_direction, w.humidity, w.vapor_pressure, 
    w.local_pressure, w.sea_level_pressure, w.sunshine, w.solar_radiation,
    w."ground_temperature", w."5cm_soil_temperature", w."10cm_soil_temperature", w."20cm_soil_temperature", w."30cm_soil_temperature"
FROM (
    SELECT 
        aq.datetime, AVG(aq.SO2) SO2, AVG(aq.CO) CO, AVG(aq.O3) O3, AVG(aq.NO2) NO2, AVG(aq.PM10) PM10, AVG(aq.PM25) PM25
    FROM airquality AS aq
    GROUP BY aq.datetime
    ORDER BY aq.datetime
) AS a
JOIN weather AS w
ON a.datetime = w.datetime
"""
res = db.query(sql)
df = pd.DataFrame(res).rename(columns=dict(enumerate(columns))).set_index(['datetime']).sort_index()

In [4]:
df_null = len(df) - df.describe().loc['count']
print(df_null.index[(df_null > 0).values].values)
df_null

['temperature' 'precipitation' 'wind_speed' 'wind_direction'
 'vapor_pressure' 'local_pressure' 'sea_level_pressure' 'sunshine'
 'solar_radiation' 'ground_temperature' '5cm_soil_temperature'
 '10cm_soil_temperature' '20cm_soil_temperature' '30cm_soil_temperature']


PM10                         0.0
PM25                         0.0
SO2                          0.0
CO                           0.0
O3                           0.0
NO2                          0.0
temperature                  2.0
precipitation            28071.0
wind_speed                  11.0
wind_direction              11.0
humidity                     0.0
vapor_pressure               4.0
local_pressure              13.0
sea_level_pressure          13.0
sunshine                 14137.0
solar_radiation          14137.0
ground_temperature          23.0
5cm_soil_temperature        24.0
10cm_soil_temperature       23.0
20cm_soil_temperature       24.0
30cm_soil_temperature       24.0
Name: count, dtype: float64

In [5]:
check_columns = [
    'temperature', 'wind_speed', 'wind_direction', 'vapor_pressure', 'local_pressure', 'sea_level_pressure',
    'ground_temperature', '5cm_soil_temperature', '10cm_soil_temperature', '20cm_soil_temperature', '30cm_soil_temperature'
]

for c in check_columns:
    idx = df.loc[df[c].isnull(), c].index
    print(c)
    if len(idx) > 4:
        print(idx.values[:2], '...', idx.values[-2:])
    else:
        print(idx.values)

temperature
['2018-11-29 18:00:00' '2020-08-26 13:00:00']
wind_speed
['2018-12-16 05:00:00' '2018-12-16 06:00:00'] ... ['2021-01-07 11:00:00' '2021-01-07 12:00:00']
wind_direction
['2018-12-16 05:00:00' '2018-12-16 06:00:00'] ... ['2021-01-07 11:00:00' '2021-01-07 12:00:00']
vapor_pressure
['2018-01-26 12:00:00' '2018-01-26 13:00:00' '2020-08-26 09:00:00'
 '2020-08-26 10:00:00']
local_pressure
['2020-08-26 05:00:00' '2020-08-26 10:00:00'] ... ['2020-11-27 17:00:00' '2020-11-27 18:00:00']
sea_level_pressure
['2020-08-26 00:00:00' '2020-08-26 10:00:00'] ... ['2020-11-27 17:00:00' '2020-11-27 18:00:00']
ground_temperature
['2018-02-15 06:00:00' '2018-02-15 07:00:00'] ... ['2021-06-12 00:00:00' '2021-06-12 01:00:00']
5cm_soil_temperature
['2018-01-26 01:00:00' '2018-02-15 06:00:00'] ... ['2021-06-12 00:00:00' '2021-06-12 01:00:00']
10cm_soil_temperature
['2018-02-15 06:00:00' '2018-02-15 07:00:00'] ... ['2021-06-12 00:00:00' '2021-06-12 01:00:00']
20cm_soil_temperature
['2018-01-26 01:00:0