In [96]:
from bs4 import BeautifulSoup as Soup
import requests
from argparse import ArgumentParser
from datetime import datetime as dt
from datetime import timedelta
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
import time
from datetime import datetime as dt

def now_timestamp():
    """Return the current local time as a POSIX timestamp (float).

    The value is rebuilt from a ``struct_time``, which carries no sub-second
    field, so the fractional part of the result is always zero.
    """
    local_now = time.localtime()
    return time.mktime(local_now)

def now_milliseconds():
    """Return the current epoch time in whole milliseconds (int, truncated)."""
    seconds = time.time()
    return int(seconds * 1000)

def get_datetime_now():
    """Return the current local time as a naive ``datetime``.

    The value is derived from ``now_timestamp()``, so it inherits that
    function's precision.
    """
    seconds = now_timestamp()
    return dt.fromtimestamp(seconds)

def get_timestamp(dt):
    """Convert a ``datetime`` to a POSIX timestamp, interpreting it as local time.

    Note: the parameter name ``dt`` shadows the module-level ``datetime`` alias
    inside this function; here it is the datetime *instance* to convert.
    Sub-second precision is dropped because the conversion goes through
    ``timetuple()``.
    """
    time_fields = dt.timetuple()
    return time.mktime(time_fields)

# Zero-pad an hour/day number below 10 to two characters (e.g. 7 -> "07").
def format10(no):
    """Return ``str(no)``, prefixed with "0" when ``no`` is less than 10."""
    text = str(no)
    return "0" + text if no < 10 else text


In [3]:
class Property:
    """Container for constants shared across the notebook."""

    # strftime/strptime patterns: with and without the seconds field.
    fm = "%Y-%m-%d %H:%M:%S"
    fm2 = "%Y-%m-%d %H:%M"

    # Three-letter English month abbreviations, January first.
    months = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]

    # Numeric district identifiers (presumably one per district — TODO confirm
    # which code system these belong to).
    district_codes = [
        1171, 1170, 1167, 1168, 1164,
        1152, 1169, 1153, 1165, 1154,
        1163, 1172, 1160, 1155, 1162,
        1176, 1156, 1161, 1157, 1173,
        1174, 1158, 1159, 1166, 1175,
    ]
    
# Shared instance of the constants container.
pr = Property()
# Base directory for the files this notebook reads and writes (relative to the
# current working directory).
data_path = Path("./data")

# Airquality Data

## US AQI

Reference for the AQI equation: https://forum.airnowtech.org/t/the-aqi-equation/169


In [169]:
def get_aqi(AQIhigh, AQIlow, conchigh, conclow, concentration):
    """Linearly interpolate an AQI value inside one breakpoint interval.

    Maps ``concentration`` from the range ``[conclow, conchigh]`` onto the
    range ``[AQIlow, AQIhigh]``. The result is returned as an unrounded float.
    ``concentration`` may be anything accepted by ``float()``.
    """
    value = float(concentration)
    fraction = (value - conclow) / (conchigh - conclow)
    return fraction * (AQIhigh - AQIlow) + AQIlow

def get_AQIPM10(concentration):
    """Convert a PM10 concentration to a US AQI value.

    The concentration is truncated to a whole number, located in the
    breakpoint table below, and linearly interpolated with ``get_aqi``.
    The result is an unrounded float.

    Parameters
    ----------
    concentration : number or numeric string
        PM10 concentration; anything accepted by ``float()``.

    Returns
    -------
    float
        The interpolated AQI, or ``nan`` when the concentration is missing
        (NaN), negative, or above the last breakpoint (>= 605). These cases
        used to return 0, which is indistinguishable from perfectly clean air
        and silently turned missing measurements into "good" readings.
    """
    conc = float(concentration)
    if np.isnan(conc):
        return np.nan
    c = np.floor(conc)

    # (conc_low, conc_high, aqi_low, aqi_high); bounds are inclusive because
    # ``c`` is a whole number at this point.
    breakpoints = (
        (0, 54, 0, 50),
        (55, 154, 51, 100),
        (155, 254, 101, 150),
        (255, 354, 151, 200),
        (355, 424, 201, 300),
        (425, 504, 301, 400),
        (505, 604, 401, 500),
    )
    for conc_low, conc_high, aqi_low, aqi_high in breakpoints:
        if conc_low <= c <= conc_high:
            return get_aqi(aqi_high, aqi_low, conc_high, conc_low, c)

    # Outside the table: the index is undefined rather than zero.
    return np.nan


# Convert a PM2.5 concentration value to an AQI value.
def get_AQIPM25(concentration):
    """Convert a PM2.5 concentration to a US AQI value.

    The concentration is truncated to one decimal place, located in the
    breakpoint table below, and linearly interpolated with ``get_aqi``.
    The result is an unrounded float.

    Parameters
    ----------
    concentration : number or numeric string
        PM2.5 concentration; anything accepted by ``float()``.

    Returns
    -------
    float
        The interpolated AQI, or ``nan`` when the concentration is missing
        (NaN), negative, or above the last breakpoint (> 500.4). These cases
        used to return 0, which is indistinguishable from perfectly clean air
        and silently turned missing measurements into "good" readings.
    """
    conc = float(concentration)
    if np.isnan(conc):
        return np.nan
    c = (np.floor(10 * conc)) / 10

    # (conc_low, conc_high, aqi_low, aqi_high); bounds are inclusive because
    # ``c`` has been truncated to one decimal place.
    breakpoints = (
        (0, 12, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500),
    )
    for conc_low, conc_high, aqi_low, aqi_high in breakpoints:
        if conc_low <= c <= conc_high:
            return get_aqi(aqi_high, aqi_low, conc_high, conc_low, c)

    # Outside the table: the index is undefined rather than zero.
    return np.nan

In [170]:
# Maps the Korean column headers of the air-quality spreadsheets to the English
# column names used by the cells that follow. Inline comments give a literal
# translation of each Korean header.
columns = {
    '지역': 'district',  # "region"
    '측정소코드': 'measurecode',  # "measuring-station code"
    '측정소명': 'measurename',  # "measuring-station name"
    '측정일시': 'date',  # "measurement date/time"
    '주소': 'address',  # "address"
    '망': 'measurepoint'  # "network"
}

## 서울 (Seoul)

In [6]:
# Load every 2018 air-quality workbook, keep the Seoul rows and add AQI columns.
year = 2018
# glob() yields paths in arbitrary, filesystem-dependent order. Sorting by name
# makes the row order of the combined frame (and of any export) reproducible.
files = sorted((data_path / "airquality").glob(f"{year}*.xlsx"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"no air-quality workbooks matching '{year}*.xlsx' under {data_path / 'airquality'}")
data = []
for p in files:
    df = pd.read_excel(p).rename(columns=columns)
    # na=False: a row whose district is missing is excluded instead of putting
    # NaN into the boolean mask (which .loc rejects).
    # .copy(): add the AQI columns to an independent frame.
    df = df.loc[df["district"].str.contains("서울", na=False), :].copy()
    df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10)
    df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25)
    data.append(df)
df = pd.concat(data).reset_index(drop=True)

In [75]:
# The measurement point is null for 2018.01 ~ 2018.06: recover it from rows of
# the same station (measurename) where the value is present.
temp = df.loc[df["measurepoint"].notnull(), ["measurename", "measurepoint"]]
# One (measurename -> measurepoint) entry per distinct pair; if a station has
# several distinct values, the last pair encountered wins.
measurement_dict = dict(temp.drop_duplicates().values)
missing_mask = df["measurepoint"].isnull()
df.loc[missing_mask, ["measurepoint"]] = list(
    map(measurement_dict.get, df.loc[missing_mask, "measurename"].values)
)

In [86]:
# Write the combined 2018 frame as a tab-separated UTF-8 file without the index.
df.to_csv(data_path / "airq_2018.tsv", sep="\t", index=False, encoding="utf-8")

In [106]:
# For each year: load the monthly workbooks in month order, keep the Seoul rows,
# add AQI columns and export one TSV per year.
for year in [2019, 2020]:
    files = list((data_path / "airquality").glob(f"{year}*.xlsx"))
    if not files:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate".
        raise FileNotFoundError(f"no air-quality workbooks matching '{year}*.xlsx' under {data_path / 'airquality'}")
    # Sort by the number in the second space-separated token of the file name
    # (e.g. "1월" -> 1). Using .stem and rstrip("월") removes exactly the
    # extension and the "월" suffix; the previous strip("월.xlsx") stripped a
    # *set of characters* from both ends and only worked by accident.
    files = sorted(files, key=lambda x: int(x.stem.split(" ")[1].rstrip("월")))
    data = []
    for p in tqdm(files, total=len(files), desc=f"{year}"):
        df = pd.read_excel(p).rename(columns=columns)
        # na=False: a row whose district is missing is excluded instead of
        # putting NaN into the boolean mask (which .loc rejects).
        df = df.loc[df["district"].str.contains("서울", na=False), :].copy()
        df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10)
        df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25)
        data.append(df)
    df = pd.concat(data).reset_index(drop=True)
    df.to_csv(data_path / f"airq_{year}.tsv", encoding="utf-8", sep="\t", index=False)

2019:   0%|          | 0/12 [00:00<?, ?it/s]

2020:   0%|          | 0/12 [00:00<?, ?it/s]

## 백령도 (Baengnyeong Island)

In [182]:
# For each year: load the workbooks, keep the rows of the "백령도" station,
# add AQI columns and export one TSV per year.
for year in [2018, 2019, 2020]:
    files = list((data_path / "airquality").glob(f"{year}*.xlsx"))
    if not files:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate".
        raise FileNotFoundError(f"no air-quality workbooks matching '{year}*.xlsx' under {data_path / 'airquality'}")
    if year != 2018:
        # Sort by the number in the second space-separated token of the file
        # name (e.g. "1월" -> 1). .stem / rstrip("월") remove exactly the
        # extension and the "월" suffix, unlike strip("월.xlsx"), which strips a
        # set of characters from both ends.
        files = sorted(files, key=lambda x: int(x.stem.split(" ")[1].rstrip("월")))
    else:
        # The 2018 files are not parsed for a month number; sort them by name so
        # the row order no longer depends on the filesystem's glob order.
        files = sorted(files)
    data = []
    for p in tqdm(files, total=len(files), desc=f"{year}"):
        df = pd.read_excel(p).rename(columns=columns)
        # .copy(): add the AQI columns to an independent frame.
        df = df.loc[df["measurename"].isin(["백령도"]), :].copy()
        df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10)
        df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25)
        data.append(df)
    df = pd.concat(data).reset_index(drop=True)
    df.to_csv(data_path / f"airq_baek_{year}.tsv", encoding="utf-8", sep="\t", index=False)

2018:   0%|          | 0/4 [00:00<?, ?it/s]

2019:   0%|          | 0/12 [00:00<?, ?it/s]

2020:   0%|          | 0/12 [00:00<?, ?it/s]

# Weather Data

Column headers of the raw weather CSV files: '지점', '지점명', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '풍향(16방위)', '습도(%)', '현지기압(hPa)', '지면온도(°C)'

In [146]:
# Maps the Korean column headers of the weather CSV files to English names.
# Inline comments give a literal translation of each Korean header.
# NOTE(review): this rebinds `columns`, replacing the air-quality mapping defined
# earlier in the notebook; re-running the air-quality cells after this cell
# would rename with the wrong mapping.
columns = {
    '지점': 'measureCode',  # "station"
    '지점명': 'measureName',  # "station name"
    '일시': 'date',  # "date/time"
    '기온(°C)': 'temperature',  # "air temperature (°C)"
    '강수량(mm)': 'precipitation',  # "precipitation amount (mm)"
    '풍속(m/s)': 'windSpeed',  # "wind speed (m/s)"
    '풍향(16방위)': 'windDirection',  # "wind direction (16 compass points)"
    '습도(%)': 'humidity',  # "humidity (%)"
    '현지기압(hPa)': 'spotAtmosphericPressure',  # "local pressure (hPa)"
    '지면온도(°C)': 'groundTemperature'  # "ground-surface temperature (°C)"
}

# Read one EUC-KR encoded CSV per year, rename its headers and drop the first
# two columns (by position); the frames are kept in year order in `data`.
data = []
for year in range(2018, 2022):
    df = pd.read_csv(data_path / "weather" / f"weather_{year}.csv", encoding="euc-kr").rename(columns=columns)
    df = df.iloc[:, 2:]
    data.append(df)

In [165]:
# Combine the first three yearly frames (2018-2020) under a fresh 0..n-1 index,
# treat a missing precipitation value as 0.0 and export as TSV.
df = pd.concat(data[:3], ignore_index=True)
df["precipitation"] = df["precipitation"].fillna(0.0)
df.to_csv(data_path / "weather_2018-2020.tsv", sep="\t", index=False, encoding="utf-8")

In [168]:
# The last loaded frame (2021) is exported separately. `df` is the same object
# as data[-1], so the fill below also modifies that list element.
df = data[-1]
df.loc[df["precipitation"].isna(), "precipitation"] = 0.0
df.to_csv(data_path / "weather_2021.tsv", sep="\t", index=False, encoding="utf-8")

# Holiday Data

In [78]:
# Options for the holiday crawl. parse_known_args() is used so that arguments
# in sys.argv that this parser does not recognise are returned separately
# instead of causing an error.
parser = ArgumentParser()
parser.add_argument("-i", "--interval", type=int, default=1)
parser.add_argument("-s", "--start", type=int, default=2018)
parser.add_argument("-e", "--end", type=int, default=2021)
parser.add_argument("-c", "--country", default="south-korea")
args, _unknown_args = parser.parse_known_args()

In [79]:
# Display the parsed options to confirm the values in effect.
args

Namespace(country='south-korea', end=2021, interval=1, start=2018)

In [80]:
def get_holiday_data(html):
    """Extract the holiday rows from a parsed holidays page.

    Parameters
    ----------
    html : bs4.BeautifulSoup
        Parsed page that contains a ``<table id="holidays-table">``.

    Returns
    -------
    list
        The first element is the header tuple ``("date", "day", "name", "type")``.
        Each following element is a list made of the row's date as
        ``YYYY-MM-DD`` followed by the stripped text of every ``<td>`` in the row.

    Raises
    ------
    ValueError
        If the page has no ``holidays-table`` table with a ``<tbody>`` (for
        example when an error page was downloaded).
    """
    from datetime import timezone  # local import: only needed by this function

    table = html.find('table', attrs={"id": "holidays-table"})
    body = table.find("tbody") if table is not None else None
    if body is None:
        raise ValueError("holidays table (id='holidays-table') not found in the page")

    data = [("date", "day", "name", "type")]
    for r in body.find_all('tr'):
        tags = r.find_all("td")
        if len(tags) == 0:
            continue
        millis = r.attrs.get("data-date")
        if millis is None:
            # A row without a timestamp cannot be dated; skip it.
            continue
        # data-date holds an epoch timestamp in milliseconds. Decode it in UTC
        # so the calendar date does not depend on the time zone of the machine
        # running the notebook (local-time decoding shifts the date by one day
        # west of UTC).
        # NOTE(review): assumes the site encodes each date as midnight UTC — confirm.
        date = dt.fromtimestamp(int(millis) // 1000, tz=timezone.utc)
        new_row = [f"{date.year:04d}-{date.month:02d}-{date.day:02d}"] + [x.text.strip() for x in tags]
        data.append(new_row)
    return data

def craw_data(year, country="south-korea", timeout=30):
    """Download and parse the timeanddate.com holidays page for one year.

    Parameters
    ----------
    year : int or str
        Year placed in the URL.
    country : str
        Country slug placed in the URL.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``html5lib`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        not parsed as if it were the holidays table.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    url = f"https://www.timeanddate.com/holidays/{country}/{year}"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    html = Soup(r.text, "html5lib")
    return html

In [81]:
# Crawl one page per year in [args.start, args.end] and build one DataFrame per
# year; row 0 of the parsed data is the header, the remaining rows are records.
all_data = []
for y in range(args.start, args.end + 1):
    html = craw_data(y, args.country)
    data = get_holiday_data(html)
    header, records = data[0], data[1:]
    all_data.append(pd.DataFrame(records, columns=header))

df_holiday = pd.concat(all_data)

In [82]:
# Replace the per-year indices with one continuous index, then export as TSV.
df_holiday = df_holiday.reset_index(drop=True)
df_holiday.to_csv(data_path / f"holiday_{args.start}-{args.end}.tsv", index=False, sep="\t")