# Hand Book For Data Science 


## Air Pollution Dataset

https://www.airkorea.or.kr/web/last_amb_hour_data?pMENU_NO=123

In [1]:
from bs4 import BeautifulSoup as Soup
import requests
from argparse import ArgumentParser
from datetime import datetime as dt
from datetime import timedelta
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
import time
from datetime import datetime as dt

def now_timestamp():
    return time.mktime(time.localtime())

def now_milliseconds():
    return int(time.time() * 1000)

def get_datetime_now():
    return dt.fromtimestamp(now_timestamp())

def get_timestamp(dt):
    return time.mktime(dt.timetuple())

# format hour, day < 10 to 10 format
def format10(no):
    if no < 10:
        return "0" + str(no)
    else:
        return str(no)


In [3]:
class Property():
    fm = "%Y-%m-%d %H:%M:%S"
    fm2 = "%Y-%m-%d %H:%M"
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    district_codes = [1171,1170,1167,1168,1164,1152,1169,1153,1165,1154,1163,1172,1160,1155,1162,1176,1156,1161,1157,1173,1174,1158,1159,1166,1175]
    
pr = Property()
data_path = Path("./data")

# Airquality Data

## US AQI

https://forum.airnowtech.org/t/the-aqi-equation/169

In [4]:
def get_aqi(AQIhigh, AQIlow, conchigh, conclow, concentration):
    conc = float(concentration)
    a = ((conc - conclow) / (conchigh - conclow)) * (AQIhigh - AQIlow) + AQIlow
    return a

def get_AQIPM10(concentration):
    conc = float(concentration)
    c = np.floor(conc)
    if (c >= 0 and c < 55):
        AQI = get_aqi(50, 0, 54, 0, c)
    elif(c >= 55 and c < 155):
        AQI = get_aqi(100, 51, 154, 55, c)
    elif(c >= 155 and c < 255):
        AQI = get_aqi(150, 101, 254, 155, c)
    elif(c >= 255 and c < 355):
        AQI = get_aqi(200, 151, 354, 255, c)
    elif(c >= 355 and c < 425):
        AQI = get_aqi(300, 201, 424, 355, c)
    elif(c >= 425 and c < 505):
        AQI = get_aqi(400, 301, 504, 425, c)
    elif(c >= 505 and c < 605):
        AQI = get_aqi(500, 401, 604, 505, c)
    else:
        AQI = 0
    return AQI


# convert pm25 micro value to aqi value
def get_AQIPM25(concentration):
    conc = float(concentration)
    c = (np.floor(10 * conc)) / 10
    if (c >= 0 and c < 12.1):
        AQI = get_aqi(50, 0, 12, 0, c)
    elif (c >= 12.1 and c < 35.5):
        AQI = get_aqi(100, 51, 35.4, 12.1, c)
    elif (c >= 35.5 and c < 55.5):
        AQI = get_aqi(150, 101, 55.4, 35.5, c)
    elif (c >= 55.5 and c < 150.5):
        AQI = get_aqi(200, 151, 150.4, 55.5, c)
    elif (c >= 150.5 and c < 250.5):
        AQI = get_aqi(300, 201, 250.4, 150.5, c)
    elif (c >= 250.5 and c < 350.5):
        AQI = get_aqi(400, 301, 350.4, 250.5, c)
    elif (c >= 350.5 and c < 500.5):
        AQI = get_aqi(500, 401, 500.4, 350.5, c)
    else:
        AQI = 0
    return AQI

In [5]:
columns = {
    '지역': 'district', 
    '측정소코드': 'measurecode', 
    '측정소명': 'measurename', 
    '측정일시': 'date', 
    '주소': 'address',
    '망': 'measurepoint'
}

## 서울

In [6]:
year = 2018
files = list((data_path / "airquality").glob(f"{year}*.xlsx"))
data = []
for p in files:
    df = pd.read_excel(p).rename(columns=columns)
    df = df.loc[df["district"].str.contains("서울"), :]
    df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10)
    df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25)
    data.append(df)
df = pd.concat(data).reset_index(drop=True)

In [75]:
# fill in null value in measurement point from 2018.01 ~ 2018.06
temp = df.loc[~df["measurepoint"].isnull(), ["measurename", "measurepoint"]]
measurement_dict = {k: v for k, v in temp.loc[~temp.duplicated()].values}
df.loc[df["measurepoint"].isnull(), ["measurepoint"]] = [measurement_dict.get(x) for x in df.loc[df["measurepoint"].isnull(), "measurename"].values]

In [86]:
df.to_csv(data_path / "airq_2018.tsv", encoding="utf-8", sep="\t", index=False)

In [106]:
for year in [2019, 2020]:
    files = list((data_path / "airquality").glob(f"{year}*.xlsx"))
    files = sorted(files, key=lambda x: int(x.name.split(" ")[1].strip("월.xlsx")))
    data = []
    for p in tqdm(files, total=len(files), desc=f"{year}"):
        df = pd.read_excel(p).rename(columns=columns)
        df = df.loc[df["district"].str.contains("서울"), :]
        df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10)
        df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25)
        data.append(df)
    df = pd.concat(data).reset_index(drop=True)
    df.to_csv(data_path / f"airq_{year}.tsv", encoding="utf-8", sep="\t", index=False)

2019:   0%|          | 0/12 [00:00<?, ?it/s]

2020:   0%|          | 0/12 [00:00<?, ?it/s]

## 백령도

In [182]:
for year in [2018, 2019, 2020]:
    files = list((data_path / "airquality").glob(f"{year}*.xlsx"))
    if year != 2018:
        files = sorted(files, key=lambda x: int(x.name.split(" ")[1].strip("월.xlsx")))
    data = []
    for p in tqdm(files, total=len(files), desc=f"{year}"):
        df = pd.read_excel(p).rename(columns=columns)
        df = df.loc[df["measurename"].isin(["백령도"]), :]
        df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10)
        df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25)
        data.append(df)
    df = pd.concat(data).reset_index(drop=True)
    df.to_csv(data_path / f"airq_baek_{year}.tsv", encoding="utf-8", sep="\t", index=False)

2018:   0%|          | 0/4 [00:00<?, ?it/s]

2019:   0%|          | 0/12 [00:00<?, ?it/s]

2020:   0%|          | 0/12 [00:00<?, ?it/s]

## Preprocessing

* 각 timestamp 로 평균내서 해당 값을 채움

In [None]:
data = []
files = [data_path / f"airq_{year}.tsv" for year in [2018, 2019, 2020]]
for p in files:
    df = pd.read_csv(p, sep="\t")
    data.append(df)
df = pd.concat(data).reset_index(drop=True)
df["date"] = df["date"].astype(str)
df["date"] = df["date"].str.slice(0, 4) + "-" + df["date"].str.slice(4, 6) + "-" + df["date"].str.slice(6, 8) + \
    " " + (df["date"].str.slice(8, 10).astype(int) - 1).astype(str).apply(lambda x: f"0{x}" if len(x) == 1 else x)
df["date"] = pd.to_datetime(df["date"], format='%Y-%m-%d %H')
df_mean = df.groupby(["date"])[['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']].mean()
null_mask = df.loc[:, ['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']].isnull()
iter_idx = df.index[null_mask.sum(1)>0]
for i in tqdm(iter_idx, total=len(iter_idx)):
    row = df.loc[i]
    date = row["date"]
    row_mask = row.isnull()
    idx = np.arange(len(df.columns))[row_mask]
    df.iloc[i, idx] = df_mean.loc[date].iloc[idx-4].values
    
df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10).values
df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25).values
df["district"] = df.district.str.split(expand=True).iloc[:, 1]
df.to_csv(data_path / "airq_filled_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)


In [43]:
data = []
files = [data_path / f"airq_baek_{year}.tsv" for year in [2018, 2019, 2020]]
for p in files:
    df = pd.read_csv(p, sep="\t")
    data.append(df)
df = pd.concat(data).reset_index(drop=True)
df.loc[df["measurepoint"].isnull(), "measurepoint"] = "국가배경농도"

df["date"] = df["date"].astype(str)
df["date"] = df["date"].str.slice(0, 4) + "-" + df["date"].str.slice(4, 6) + "-" + df["date"].str.slice(6, 8) + \
    " " + (df["date"].str.slice(8, 10).astype(int) - 1).astype(str).apply(lambda x: f"0{x}" if len(x) == 1 else x)
df["date"] = pd.to_datetime(df["date"], format='%Y-%m-%d %H')
df_mean = df.groupby(["date"])[['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']].mean()
df_mean = df_mean.fillna(method="ffill")
null_mask = df.loc[:, ['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']].isnull()
iter_idx = df.index[null_mask.sum(1)>0]
for i in tqdm(iter_idx, total=len(iter_idx)):
    row = df.loc[i]
    date = row["date"]
    row_mask = row.isnull()
    idx = np.arange(len(df.columns))[row_mask]
    df.iloc[i, idx] = df_mean.loc[date].iloc[idx-4].values
    
df["PM10_AQI"] = df["PM10"].apply(get_AQIPM10).values
df["PM25_AQI"] = df["PM25"].apply(get_AQIPM25).values
df["district"] = df.district.str.split(expand=True).iloc[:, 1]
df.to_csv(data_path / "airq_baek_filled_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)

  0%|          | 0/2875 [00:00<?, ?it/s]

## pollution average

도시대기 및 도로변 측정 지역 같이 평균내기

In [35]:
df_airq = pd.read_csv(data_path / "airq_filled_2018-2020.tsv", encoding="utf-8", sep="\t")
df_temp = df_airq.groupby(['district', 'date'])[['SO2', 'CO', 'O3','NO2', 'PM10', 'PM25', 'address']].mean().reset_index()
df_temp["PM10_AQI"] = df_temp["PM10"].apply(get_AQIPM10).values
df_temp["PM25_AQI"] = df_temp["PM25"].apply(get_AQIPM25).values

In [36]:
# 임시방편
df_temp = df_temp.loc[df_temp["district"] != "강서로"].reset_index(drop=True)

In [39]:
df_temp.to_csv(data_path / "airq_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)

# Weather Data

'지점', '지점명', '일시', '기온(°C)', '강수량(mm)', '풍속(m/s)', '풍향(16방위)', '습도(%)', '현지기압(hPa)', '지면온도(°C)'

In [5]:
columns = {
    '지점': 'measureCode', 
    '지점명': 'measureName', 
    '일시': 'date', 
    '기온(°C)': 'temperature', 
    '강수량(mm)': 'precipitation', 
    '풍속(m/s)': 'windSpeed', 
    '풍향(16방위)': 'windDirection', 
    '습도(%)': 'humidity', 
    '현지기압(hPa)': 'spotAtmosphericPressure', 
    '지면온도(°C)': 'groundTemperature'
}

data = []
for year in [2018, 2019, 2020, 2021]:
    df = pd.read_csv(data_path / "weather" / f"weather_{year}.csv", encoding="euc-kr")
    df = df.rename(columns=columns).iloc[:, 2:]
    data.append(df)

In [6]:
df = pd.concat(data[:3]).reset_index(drop=True)
df.loc[df["precipitation"].isnull(), "precipitation"] = 0.0
df.to_csv(data_path / f"weather_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)

In [7]:
df = data[-1]
df.loc[df["precipitation"].isnull(), "precipitation"] = 0.0
df.to_csv(data_path / f"weather_2021.tsv", encoding="utf-8", sep="\t", index=False)

In [9]:
data = []
for year in [2018, 2019, 2020]:
    df = pd.read_csv(data_path / "weather" / f"weather_baek_{year}.csv", encoding="euc-kr")
    df = df.rename(columns=columns).iloc[:, 2:]
    data.append(df)

In [10]:
df = pd.concat(data[:3]).reset_index(drop=True)
df.loc[df["precipitation"].isnull(), "precipitation"] = 0.0
df.to_csv(data_path / f"weather_baek_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)

## Preprocessing

* 이전 시간대 값을 채움

In [27]:
df = pd.read_csv(data_path / f"weather_2018-2020.tsv", encoding="utf-8", sep="\t")
df = df.fillna(method="ffill")
df.to_csv(data_path / f"weather_filled_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)

In [514]:
df = pd.read_csv(data_path / f"weather_2021.tsv", encoding="utf-8", sep="\t")
df.iloc[:, 1:].fillna(method="pad", inplace=True)
df.to_csv(data_path / f"weather_filled_2021.tsv", encoding="utf-8", sep="\t", index=False)

In [28]:
df = pd.read_csv(data_path / f"weather_2018-2020.tsv", encoding="utf-8", sep="\t")
df = df.fillna(method="ffill")
df.to_csv(data_path / f"weather_baek_filled_2018-2020.tsv", encoding="utf-8", sep="\t", index=False)

# Holiday Data

In [78]:
parser = ArgumentParser()
parser.add_argument("-i", "--interval", default=1, type=int)
parser.add_argument("-s", "--start", default=2018, type=int)
parser.add_argument("-e", "--end", default=2021, type=int)
parser.add_argument("-c", "--country", default="south-korea")
args = parser.parse_known_args()[0]

In [79]:
args

Namespace(country='south-korea', end=2021, interval=1, start=2018)

In [80]:
def get_holiday_data(html):
    tables = html.find('table', attrs={"id": "holidays-table"}).find("tbody")
    rows = tables.find_all('tr')

    data = [("date", "day", "name", "type")]
    for r in rows:
        tags = r.find_all("td")
        if len(tags) == 0:
            continue
        date = r.attrs["data-date"]
        date = dt.fromtimestamp(int(int(date) / 1e3))
        new_row = [f"{date.year:04d}-{date.month:02d}-{date.day:02d}"] + [x.text.strip() for x in tags]
        data.append(new_row)
    return data

def craw_data(year, country="south-korea"):
    url = f"https://www.timeanddate.com/holidays/{country}/{year}"
    r = requests.get(url)
    html = Soup(r.text, "html5lib")
    return html

In [81]:
all_data = []
for y in range(args.start, args.end+1):
    html = craw_data(y, args.country)
    data = get_holiday_data(html)
    all_data.append(pd.DataFrame(data[1:], columns=data[0]))
    
df_holiday = pd.concat(all_data)

In [82]:
df_holiday = df_holiday.reset_index(drop=True)
df_holiday.to_csv(data_path / f"holiday_{args.start}-{args.end}.tsv", sep="\t", index=False)

## Traffic

* 포아송 분포에 따라서 fill in 하면 더 좋을듯

In [47]:
df_traffic_info = pd.read_excel(data_path / "traffic" / "traffic202001.xlsx", sheet_name=2)
df_traffic_info = df_traffic_info.iloc[:-2, :]
df_traffic_info["district"] = df_traffic_info["주소"].str.split(expand=True).iloc[:, 1].values
df_traffic_info.to_csv(data_path / "traffic_info.tsv", encoding="utf-8", sep="\t", index=False)

In [48]:
df_traffic_info = pd.read_csv(data_path / "traffic_info.tsv", encoding="utf-8", sep="\t")

In [49]:
df_traffic_info

Unnamed: 0,지점번호,지점명칭,위도,경도,주소,도로명 주소,유입 방향,유출방향,district
0,A-01,성산로(금화터널),37.568588,126.948436,서울시 서대문구 신촌동 1-142,,[성산로]봉원고가차도->독립문역,[성산로]독립문역->봉원고가차도,서대문구
1,A-02,사직로(사직터널),37.572298,126.962853,서울시 종로구 행촌동 1-186,,[사직로]독립문역->사직단,[사직로]사직단->독립문역,종로구
2,A-03,자하문로(자하문터널),37.588831,126.968548,서울시 종로구 청운동 24-6,,[자하문로]석파정->청운초등학교,[자하문로]청운초등학교->석파정,종로구
3,A-04,대사관로(삼청터널),37.596359,126.984209,서울시 성북구 성북동 산25-148,,[삼청로]삼청각->삼청공원입구,[삼청로]삼청공원입구->삼청각,성북구
4,A-05,율곡로(안국역),37.576000,126.984342,서울시 종로구 경운동 90-4,서울시 종로구 율곡로 46,[율곡로]안국역->안국동사거리,[율곡로]안국동사거리->안국역,종로구
...,...,...,...,...,...,...,...,...,...
130,F-05,동부간선도로,37.568685,127.076023,서울시 동대문구 장안동 19-7,,[동부간선도로]장안교->군자교,[동부간선도로]군자교->장안교,동대문구
131,F-06,경부고속도로,37.493135,127.022533,서울시 서초구 서초동 1748-26,,[경부고속도로]서초2교 -> 서초IC,[경부고속도로]서초IC -> 서초2교,서초구
132,F-07,분당수서로,37.497648,127.087195,서울시 강남구 일원동 467-1,,[분당수서로]수서IC->탄천1교,[분당수서로]탄천1교->수서IC,강남구
133,F-08,강남순환로(관악터널),37.449096,126.926169,서울시 금천구 시흥동 산 93-1,,[강남순환로]금천요금소->관악IC,[노들로]관악IC->금천요금소,금천구


공사,시스템개선,장애등으로 유효 데이터없는 기간이 있을 수 있음

In [51]:
files = sorted((data_path / "traffic").glob("*.xlsx"))

In [71]:
data = []
time_columns = [
    '0시', '1시', '2시', '3시', '4시', '5시', '6시', '7시', '8시', '9시','10시', '11시', '12시', '13시', '14시', 
    '15시', '16시', '17시', '18시', '19시','20시', '21시', '22시', '23시'
]

for f in tqdm(files, total=len(files)):
    num = 0 if f.name.strip("traffic")[:4] == "2017" else 1
    df = pd.read_excel(f, sheet_name=num)
    df = df.merge(df_traffic_info.loc[:, ["지점번호", "district"]], on="지점번호")
    if len(df["방향"].unique()) != 2:
        temp1 = df.loc[:, "구분"].values.copy()
        temp2 = df.loc[:, "방향"].values.copy()
        df.loc[:, "구분"] = temp2
        df.loc[:, "방향"] = temp1
    for c in tqdm(time_columns, total=len(time_columns), desc=f"checking column: {f.name}"):
        m = df.loc[:, time_columns].loc[:, c].apply(lambda x: isinstance(x, str))
        if m.sum() > 0:
            df.loc[m, c] = 0.0
    df.loc[:, time_columns] = df.loc[:, time_columns].astype(np.float)
    data.append(df)

  0%|          | 0/52 [00:00<?, ?it/s]

checking column: traffic201701.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201702.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201703.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201704.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201705.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201706.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201707.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201708.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201709.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201710.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201711.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201712.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201801.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201802.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201803.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201804.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201805.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201806.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201807.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201808.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201809.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201810.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201811.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201812.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201901.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201902.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201903.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201904.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201905.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201906.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201907.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201908.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201909.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201910.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201911.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic201912.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202001.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202002.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202003.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202004.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202005.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202006.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202007.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202008.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202009.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202010.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202011.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202012.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202101.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202102.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202103.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

checking column: traffic202104.xlsx:   0%|          | 0/24 [00:00<?, ?it/s]

In [72]:
df = pd.concat(data).iloc[:, :-3].reset_index(drop=True)
df["일자"] = pd.to_datetime(df["일자"], format="%Y%m%d")
print(df["방향"].unique())

['유입' '유출']


In [73]:
df.to_csv(data_path / "traffic.tsv", encoding="utf-8", index=False, sep="\t")

In [74]:
df_before_fill = pd.read_csv(data_path / "traffic.tsv", encoding="utf-8", sep="\t")
df_before_fill["일자"] = pd.to_datetime(df_before_fill["일자"], format="%Y-%m-%d")

In [75]:
start_idx = df_before_fill.loc[df_before_fill["일자"] == "2018-01-01"].index[0]
df_before_fill.iloc[start_idx:len(df_before_fill), 6:6+24].describe().loc["count"] / len(df_before_fill.iloc[start_idx:len(df_before_fill)])

0시     0.896930
1시     0.897469
2시     0.897594
3시     0.897749
4시     0.898897
5시     0.899580
6시     0.899598
7시     0.899607
8시     0.899342
9시     0.899199
10시    0.899047
11시    0.898678
12시    0.899150
13시    0.898687
14시    0.899543
15시    0.900009
16시    0.900171
17시    0.900043
18시    0.900612
19시    0.899939
20시    0.900567
21시    0.900643
22시    0.897646
23시    0.895626
Name: count, dtype: float64

전년도 대비 동기로 채우기엔 코로나 때문에 특성이 달라질 것 같음 (분포 필요), 따라서 같은 지난주 같은 요일 대비로 채우는게 맞을 듯

In [76]:
# weekday_dict = dict(enumerate(["월", "화", "수", "목", "금", "토", "일"]))
start_idx = df_before_fill.loc[df_before_fill["일자"] == "2018-01-01"].index[0]
null_mask = df_before_fill.iloc[start_idx:len(df_before_fill), 6:6+24].isnull()
iter_idx = df_before_fill.iloc[start_idx:len(df_before_fill)].index[null_mask.sum(1) > 0]
for i in tqdm(iter_idx, total=len(iter_idx)):
    row = df_before_fill.loc[i]
    row_mask = row.iloc[6:6+24].isnull()
    date = row.iloc[0]
    measure_point = row.iloc[3]
    direction = row.iloc[5]

    lastweek_date = date - timedelta(7)
    lastweek_data = df_before_fill.loc[(df_before_fill["일자"] == lastweek_date) & (df_before_fill["지점번호"] == measure_point) & (df_before_fill["방향"] == direction)]
    lastweek_masked = lastweek_data.iloc[0, 6:6+24][row_mask]
    
    idx = np.arange(6, 6+24)[row_mask]
    values = lastweek_data.iloc[0, idx].fillna(0).values.copy()
    df_before_fill.iloc[i, idx] = values
    
df_before_fill.to_csv(data_path / "traffic_filled.tsv", encoding="utf-8", index=False, sep="\t")

  0%|          | 0/41145 [00:00<?, ?it/s]

In [122]:
df = pd.read_csv(data_path / "traffic_filled.tsv", encoding="utf-8", sep="\t")

In [123]:
start_idx = df.loc[df["일자"] == "2018-01-01"].index[0]
df.iloc[start_idx:len(df), 6:6+24].describe().loc["count"] / len(df.iloc[start_idx:len(df)])

0시     1.0
1시     1.0
2시     1.0
3시     1.0
4시     1.0
5시     1.0
6시     1.0
7시     1.0
8시     1.0
9시     1.0
10시    1.0
11시    1.0
12시    1.0
13시    1.0
14시    1.0
15시    1.0
16시    1.0
17시    1.0
18시    1.0
19시    1.0
20시    1.0
21시    1.0
22시    1.0
23시    1.0
Name: count, dtype: float64

In [124]:
s = df.loc[(df["일자"] == "2018-01-01")].index[0]
e = df.loc[(df["일자"] == "2020-12-31")].index[-1]
df_temp = df.loc[s:e].reset_index(drop=True).loc[:, ['일자', '요일', 'district', '지점명', '지점번호', '구분', '방향', '0시', '1시', '2시', '3시', '4시',
       '5시', '6시', '7시', '8시', '9시', '10시', '11시', '12시', '13시', '14시', '15시',
       '16시', '17시', '18시', '19시', '20시', '21시', '22시', '23시']]
df_temp.groupby(["일자", "district", "지점번호"])[time_columns].sum().astype(int).to_csv(data_path/"traffic_filled_2018-2020.tsv", sep="\t", encoding="utf-8")

In [86]:
# df_temp = pd.read_csv(data_path / "traffic_filled_2018-2020.tsv", sep="\t")

In [87]:
s = df.loc[(df["일자"] == "2021-01-01")].index[0]
# e = df.loc[(df["일자"] == "2021-01-01")].index[-1]
df_temp = df.loc[s:len(df)].reset_index(drop=True).loc[:, ['일자', '요일', 'district', '지점명', '지점번호', '구분', '방향', '0시', '1시', '2시', '3시', '4시',
       '5시', '6시', '7시', '8시', '9시', '10시', '11시', '12시', '13시', '14시', '15시',
       '16시', '17시', '18시', '19시', '20시', '21시', '22시', '23시']]

In [88]:
df_temp.groupby(["일자", "district", "지점번호"])[time_columns].sum().astype(int).to_csv(data_path/"traffic_filled_2021.tsv", sep="\t", encoding="utf-8")

## 구별로 뽑아내기

In [135]:
df_airq = pd.read_csv(data_path / "airq_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t")
df_traffic = pd.read_csv(data_path / "traffic_filled_2018-2020.tsv", encoding="utf-8", sep="\t")

In [136]:
df_traffic.columns

Index(['일자', 'district', '지점번호', '0시', '1시', '2시', '3시', '4시', '5시', '6시',
       '7시', '8시', '9시', '10시', '11시', '12시', '13시', '14시', '15시', '16시',
       '17시', '18시', '19시', '20시', '21시', '22시', '23시'],
      dtype='object')

In [137]:
time_columns = [
    '0시', '1시', '2시', '3시', '4시', '5시', '6시', '7시', '8시', '9시','10시', '11시', '12시', '13시', '14시', 
    '15시', '16시', '17시', '18시', '19시','20시', '21시', '22시', '23시'
]

In [138]:
df_traffic

Unnamed: 0,일자,district,지점번호,0시,1시,2시,3시,4시,5시,6시,...,14시,15시,16시,17시,18시,19시,20시,21시,22시,23시
0,2018-01-01,강남구,B-13,218,260,130,72,75,101,167,...,897,961,895,832,653,621,562,425,273,189
1,2018-01-01,강남구,C-13,6031,6043,3629,2720,2270,2671,3565,...,8921,9270,8968,8436,7934,7536,7579,7283,6037,4490
2,2018-01-01,강남구,C-17,3446,3339,1525,946,913,1071,1492,...,6623,6322,6336,5992,5560,5403,5828,5332,3798,2364
3,2018-01-01,강남구,D-35,2590,2327,1968,1784,1572,1760,2043,...,3353,3457,3456,3451,3209,3121,2896,2856,2828,2623
4,2018-01-01,강남구,D-38,1564,1505,765,583,518,598,738,...,2426,2892,2953,2822,2381,2120,2139,1989,1597,1029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147955,2020-12-31,중구,A-24,69,51,31,32,21,36,88,...,693,655,746,794,853,521,396,345,229,166
147956,2020-12-31,중구,D-03,991,785,584,510,562,771,1539,...,3859,3394,3163,2913,2730,2543,2518,2152,1423,1027
147957,2020-12-31,중랑구,B-05,430,296,220,230,355,801,1651,...,2855,3016,3181,3422,3498,2894,2097,1833,1233,764
147958,2020-12-31,중랑구,B-07,1397,1077,851,923,1189,2572,4702,...,6883,7188,6911,6423,6195,6223,6038,5936,3996,2286


In [143]:
df_traffic = df_traffic.loc[df_traffic.district.isin(df_airq.district.unique()), :]

In [144]:
df_traffic.groupby(["일자", "district"])[time_columns].mean().to_csv(data_path/"traffic_filled_by_gu_2018-2020.tsv", sep="\t", encoding="utf-8")

Join 하기 좋은 형태로 전환

In [145]:
df_traffic = pd.read_csv(data_path / "traffic_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t")
df_traffic.shape

(26304, 26)

In [146]:
data = []
for date in tqdm(df_traffic["일자"].unique(), total=len(df_traffic["일자"].unique())):
    temp = []
    df_temp = df_traffic.loc[df_traffic["일자"] == date]#.set_index(["일자", "district"])
    for t in time_columns:
        a = (df_temp["일자"] + f" {t.strip('시') if len(t.strip('시')) == 2 else '0'+t.strip('시')}").values, df_temp["district"].values, df_temp[t].values
        temp.extend(list(zip(*a)))
    data.append(pd.DataFrame(temp, columns=["date", "district", "traffic"]))
df = pd.concat(data).reset_index(drop=True)

  0%|          | 0/1096 [00:00<?, ?it/s]

In [147]:
df = df.sort_values(by=["district", "date"])
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d %H").values
df.to_csv(data_path/"traffic_filled_by_gu_2018-2020.tsv", sep="\t", encoding="utf-8", index=False)

In [148]:
df = pd.read_csv(data_path/"traffic_filled_by_gu_2018-2020.tsv", sep="\t", encoding="utf-8")

In [149]:
df

Unnamed: 0,date,district,traffic
0,2018-01-01 00:00:00,강남구,2808.9
1,2018-01-01 01:00:00,강남구,2697.4
2,2018-01-01 02:00:00,강남구,1640.2
3,2018-01-01 03:00:00,강남구,1235.2
4,2018-01-01 04:00:00,강남구,1097.8
...,...,...,...
631291,2020-12-31 19:00:00,중랑구,4558.5
631292,2020-12-31 20:00:00,중랑구,4067.5
631293,2020-12-31 21:00:00,중랑구,3884.5
631294,2020-12-31 22:00:00,중랑구,2614.5


# Merge all data

In [None]:
df_airq = pd.read_csv(data_path / "airq_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t")
df_airq = df_airq.loc[(df_airq.district != "강북구")]
df_weather = pd.read_csv(data_path / "weather_filled_2018-2020.tsv", encoding="utf-8", sep="\t")

df_airq_baek = pd.read_csv(data_path / "airq_baek_filled_2018-2020.tsv", encoding="utf-8", sep="\t")
df_airq_baek = df_airq_baek.loc[:, ['measurename', 'date', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', 'PM10_AQI', 'PM25_AQI']].rename(columns={"measurename": "district"})
df_weather_baek = pd.read_csv(data_path / "weather_baek_filled_2018-2020.tsv", encoding="utf-8", sep="\t")

df_traffic = pd.read_csv(data_path / "traffic_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t")

df_airq["date"] = pd.to_datetime(df_airq["date"])
df_weather["date"] = pd.to_datetime(df_weather["date"])

df_airq_baek["date"] = pd.to_datetime(df_airq_baek["date"])
df_weather_baek["date"] = pd.to_datetime(df_weather_baek["date"])
df_traffic["date"] = pd.to_datetime(df_traffic["date"])

df_airq.merge(df_weather, on="date").to_csv(data_path / "df_seoul.tsv", encoding="utf-8", sep="\t", index=False)
df_airq_baek.merge(df_weather_baek, on="date").to_csv(data_path / "df_baek.tsv", encoding="utf-8", sep="\t", index=False)

In [50]:
df_seoul = pd.read_csv(data_path / "df_seoul.tsv", encoding="utf-8", sep="\t")
df_baek = pd.read_csv(data_path / "df_baek.tsv", encoding="utf-8", sep="\t")
df_traffic = pd.read_csv(data_path / "traffic_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t")

df_seoul["date"] = pd.to_datetime(df_seoul["date"])
df_baek["date"] = pd.to_datetime(df_baek["date"])
df_traffic["date"] = pd.to_datetime(df_traffic["date"])

In [51]:
df_seoul.isnull().sum()

district                     0
date                         0
SO2                          0
CO                           0
O3                           0
NO2                          0
PM10                         0
PM25                         0
PM10_AQI                     0
PM25_AQI                     0
temperature                 48
precipitation                0
windSpeed                  216
windDirection              216
humidity                     0
spotAtmosphericPressure    312
groundTemperature          432
dtype: int64

In [63]:
df = pd.read_csv(data_path / f"weather_baek_filled_2018-2020.tsv", encoding="utf-8", sep="\t")
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d %H")

In [66]:
df.isnull().sum()

date                       0
temperature                0
precipitation              0
windSpeed                  0
windDirection              0
humidity                   0
spotAtmosphericPressure    0
groundTemperature          0
dtype: int64

In [62]:
df_m = df_seoul.drop(columns=['temperature', 'precipitation', 'windSpeed','windDirection', 'humidity', 'spotAtmosphericPressure','groundTemperature']).merge(df, on=["date"])
df_m.to_csv(data_path / "df_seoul.tsv", encoding="utf-8", sep="\t", index=False)

In [67]:
df_m = df_baek.drop(columns=['temperature', 'precipitation', 'windSpeed','windDirection', 'humidity', 'spotAtmosphericPressure','groundTemperature']).merge(df, on=["date"])
df_m.to_csv(data_path / "df_baek.tsv", encoding="utf-8", sep="\t", index=False)

In [72]:
df_baek = pd.read_csv(data_path / "df_baek.tsv", encoding="utf-8", sep="\t")
df_baek["date"] = pd.to_datetime(df_baek["date"])
df_airq_baek = pd.read_csv(data_path / "airq_baek_filled_2018-2020.tsv", encoding="utf-8", sep="\t")
df_airq_baek["date"] = pd.to_datetime(df_airq_baek["date"])
df_airq_baek = df_airq_baek.loc[:, ['measurename', 'date', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', 'PM10_AQI', 'PM25_AQI']].rename(columns={"measurename": "district"})


In [76]:
df_baek

Unnamed: 0,district,date,SO2,CO,O3,NO2,PM10,PM25,PM10_AQI,PM25_AQI,temperature,precipitation,windSpeed,windDirection,humidity,spotAtmosphericPressure,groundTemperature
0,백령도,2018-01-01 00:00:00,0.0026,0.6,0.030,0.0026,40.0,15.0,37.037037,57.098712,-3.2,0.0,0.5,110.0,40.0,1015.4,-2.2
1,백령도,2018-01-01 01:00:00,0.0024,0.6,0.030,0.0021,23.0,16.0,21.296296,59.201717,-3.3,0.0,0.7,360.0,41.0,1015.1,-2.7
2,백령도,2018-01-01 02:00:00,0.0031,0.7,0.028,0.0038,23.0,10.0,21.296296,41.666667,-3.7,0.0,0.9,270.0,42.0,1015.2,-3.0
3,백령도,2018-01-01 03:00:00,0.0038,0.8,0.025,0.0061,32.0,11.0,29.629630,45.833333,-4.0,0.0,1.0,290.0,44.0,1015.5,-3.5
4,백령도,2018-01-01 04:00:00,0.0034,0.8,0.025,0.0055,47.0,21.0,43.518519,69.716738,-4.2,0.0,1.1,290.0,53.0,1015.5,-3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26299,백령도,2020-12-31 19:00:00,0.0018,0.4,0.040,0.0032,16.0,8.0,14.814815,33.333333,-7.1,0.0,2.4,250.0,58.0,1014.2,-4.3
26300,백령도,2020-12-31 20:00:00,0.0017,0.4,0.040,0.0028,21.0,10.0,19.444444,41.666667,-7.1,0.0,3.2,250.0,59.0,1014.8,-5.2
26301,백령도,2020-12-31 21:00:00,0.0013,0.4,0.040,0.0024,20.0,9.0,18.518519,37.500000,-7.2,0.0,2.7,250.0,61.0,1015.3,-5.7
26302,백령도,2020-12-31 22:00:00,0.0015,0.4,0.041,0.0022,16.0,10.0,14.814815,41.666667,-7.4,0.0,2.5,270.0,66.0,1016.0,-6.1


In [75]:
df_baek.columns

Index(['district', 'date', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25',
       'PM10_AQI', 'PM25_AQI', 'temperature', 'precipitation', 'windSpeed',
       'windDirection', 'humidity', 'spotAtmosphericPressure',
       'groundTemperature'],
      dtype='object')

In [81]:
df_m = df_baek.drop(columns=['SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25', 'PM10_AQI', 'PM25_AQI']).merge(df_airq_baek, on=['district', 'date'])
df_m.loc[:, ['district', 'date', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25',
       'PM10_AQI', 'PM25_AQI', 'temperature', 'precipitation', 'windSpeed',
       'windDirection', 'humidity', 'spotAtmosphericPressure',
       'groundTemperature']].to_csv(data_path / "df_baek.tsv", encoding="utf-8", sep="\t", index=False)

In [80]:
df_m.isnull().sum()

district                   0
date                       0
temperature                0
precipitation              0
windSpeed                  0
windDirection              0
humidity                   0
spotAtmosphericPressure    0
groundTemperature          0
SO2                        0
CO                         0
O3                         0
NO2                        0
PM10                       0
PM25                       0
PM10_AQI                   0
PM25_AQI                   0
dtype: int64

In [82]:
df_seoul = pd.read_csv(data_path / "df_seoul.tsv", encoding="utf-8", sep="\t")
df_baek = pd.read_csv(data_path / "df_baek.tsv", encoding="utf-8", sep="\t")
df_traffic = pd.read_csv(data_path / "traffic_filled_by_gu_2018-2020.tsv", encoding="utf-8", sep="\t")

df_seoul["date"] = pd.to_datetime(df_seoul["date"])
df_baek["date"] = pd.to_datetime(df_baek["date"])
df_traffic["date"] = pd.to_datetime(df_traffic["date"])

df_seoul = df_seoul.merge(df_traffic, on=["date", "district"])

In [86]:
df_seoul.shape

(631296, 18)