# Data Acquisition

In [2]:
import os
import sqlite3
import pandas as pd
from pathlib import Path

from collections import defaultdict
from tqdm.notebook import tqdm
from typing import Any, Union

data_path = Path() / 'data' 

# SQLite3 Database

In [3]:
conn = sqlite3.connect(data_path / "airpollution.db")

# Airquality Data

In [4]:
def load_data(path:Path) -> pd.DataFrame:
    """Load data function

    Args:
        path (Path): path of data with file name
        enc (str, optional): encoding. Defaults to 'utf-8'.
    Returns:
        pd.DataFrame 
    """    
    if path.name.split('.')[-1] == 'xlsx':
        df = pd.read_excel(path)
    else:
        try:
            df = pd.read_csv(path, encoding='cp949')
        except UnicodeDecodeError:
            df = pd.read_csv(path, encoding='utf-8')

    return df

def filter_seoul(df):
    return df.loc[df['지역'].str.contains('서울'), :]

In [None]:
datafiles = sorted([x for x in (data_path / 'airquality').glob("*") if x.is_dir()])
# concatnate all files
parts = []
for p_year in tqdm(datafiles, total=len(datafiles)):
    new_path = data_path / 'airquality' / f'air-seoul-{p_year.name}.csv'
    for p in p_year.glob('*'):
        
        df = load_data(p)
        df = filter_seoul(df)
        parts.append(df)
    
    df_all = pd.concat(parts).reset_index(drop=True)
    if p_year.name == '2018':
        # fillna for air-seoul-2018.csv
        # '망' column contains null value due to policy changed
        # create dictionary for measure point
        m_dict = dict(df_all.loc[~df_all['망'].isna(), ['측정소코드', '망']].drop_duplicates().values)
        df_all.loc[df_all['망'].isna(), '망'] = df_all.loc[df_all['망'].isna(), '망'].fillna(df_all['측정소코드'].map(m_dict)).values
    
    df_all.to_csv(new_path, encoding='utf-8', index=False)
    parts = []

In [33]:
# change column name and insert into database
column_m_dict = {
    '지역': 'district', 
    '측정소코드': 'measurecode', 
    '측정소명': 'measurename', 
    '측정일시': 'date', 
    '주소': 'address',
    '망': 'measurepoint'
}

check_miss_match = {}
for p in sorted((data_path / 'airquality').glob("*.csv")):
    df = pd.read_csv(p, encoding='utf-8').rename(columns=column_m_dict)
    c = df.loc[:, ['district', 'measurepoint', 'measurecode', 'measurename', 'address']].drop_duplicates()
    check_miss_match[int(p.name.rstrip('\.csv').split('-')[-1])] = c
    print(f"{p.name}, num-unique data: {len(c)}, measurecode: {len(c['measurecode'].unique())}, district: {len(c['district'].unique())}, address: {len(c['district'].unique())}")
    # saved changed columns
    df.to_csv(p, encoding='utf-8', index=False)

air-seoul-2018.csv, num-unique data: 80, measurecode: 40,         district: 48, address: 48
air-seoul-2019.csv, num-unique data: 40, measurecode: 40,         district: 25, address: 25
air-seoul-2020.csv, num-unique data: 40, measurecode: 40,         district: 25, address: 25
air-seoul-2021.csv, num-unique data: 40, measurecode: 40,         district: 25, address: 25


In [35]:
# fix the district name and address by 2021 version of measurecode
code2dist = dict(check_miss_match[2021].loc[:, ['measurecode', 'district']].values)
code2add = dict(check_miss_match[2021].loc[:, ['measurecode', 'address']].values)
df = pd.read_csv(data_path / 'airquality' / 'air-seoul-2018.csv', encoding='utf-8').rename(columns=column_m_dict)
df['district'] = df['measurecode'].map(code2dist)
df['address'] = df['measurecode'].map(code2add)
df = df.set_index(['measurecode', 'district', 'measurename', 'address', 'measurepoint']).sort_values(['measurecode', 'date']).reset_index()

# save 
# df.to_csv(data_path / 'airquality' / 'air-seoul-2018.csv', encoding='utf-8', index=False)

In [4]:
def drop_tables(conn):
    cur = conn.cursor()
    conn.execute("DROP TABLE IF EXISTS airquality;")
    conn.execute("DROP TABLE IF EXISTS airmeasure;")
    cur.close()

In [24]:
drop_tables(conn)

In [25]:
cur = conn.cursor()
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS airmeasure (
        sid INTEGER PRIMARY KEY,
        measurecode INTEGER NOT NULL UNIQUE,
        district TEXT, 
        measurename TEXT, 
        address TEXT, 
        measurepoint TEXT
    );
    """
)
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS airquality (
        airid INTEGER PRIMARY KEY,
        measurecode INTEGER, 
        date TEXT, 
        SO2 REAL, 
        CO REAL, 
        O3 REAL,
        NO2 REAL, 
        PM10 REAL, 
        PM25 REAL, 
        FOREIGN KEY (measurecode)
            REFERENCES airmeasure (measurecode)
            ON DELETE CASCADE 
            ON UPDATE NO ACTION
    );
    """
)

airmeasure_columns = ['measurecode', 'district', 'measurename', 'address', 'measurepoint']
airquality_columns = ['measurecode', 'date', 'SO2', 'CO', 'O3', 'NO2', 'PM10', 'PM25']
df_airmeasure = None

sql_airmeasure = """
INSERT INTO airmeasure (sid, measurecode, district, measurename, address, measurepoint)
VALUES (?, ?, ?, ?, ?, ?);
"""
sql_airquality = """
INSERT INTO airquality (airid, measurecode, date, SO2, CO, O3, NO2, PM10, PM25) 
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
idx = 0
for p in tqdm(sorted((data_path / 'airquality').glob("*.csv")), total=4):
    df = pd.read_csv(p, encoding='utf-8')
    df = df.set_index(['measurecode', 'district', 'measurename', 'address', 'measurepoint']).sort_values(['measurecode', 'date']).reset_index()

    if df_airmeasure is None:
        df_airmeasure = df.loc[:, airmeasure_columns].drop_duplicates().reset_index(drop=True)
        # insert query
        for i, x in df_airmeasure.iterrows():
            cur.execute(sql_airmeasure, [i+1] + [x[c] for c in airmeasure_columns])
    else:
        df_temp = df.loc[:, airmeasure_columns].drop_duplicates().reset_index(drop=True)
        if (df_temp != df_airmeasure).sum().sum():
            raise ValueError("not equal table")
    for m in df_airmeasure['measurecode'].values:
        df_airquality = df.loc[df['measurecode'] == m, airquality_columns]
        df_airquality['date'] = pd.to_datetime(df_airquality['date']-1, format='%Y%m%d%H').dt.strftime('%Y-%m-%d %H:%M:%S')
        
        # insert row
        for i, x in df_airquality.iterrows():
            idx += 1
            cur.execute(sql_airquality, [idx] + [x[c] for c in airquality_columns])
cur.close()

  0%|          | 0/4 [00:00<?, ?it/s]

# Traffic Data

In [5]:
datafiles = sorted([x for x in (data_path / 'traffic').glob("*") if x.is_dir()])
p = next(datafiles[-1].glob('*'))

df = pd.read_excel(p, sheet_name=1)

In [28]:
df_traffic_info = pd.read_excel(p, sheet_name=2, skipfooter=2)
df_traffic_info = df_traffic_info.loc[df_traffic_info['주소'].str.contains('서울')]
seoul_idx = df_traffic_info["주소"].str.split(expand=True).iloc[:, 0:2].drop_duplicates()[1].index
df_traffic_info.loc[seoul_idx]

Unnamed: 0,지점번호,지점명칭,검지기 유형,위도,경도,주소,도로명 주소,유입 방향,유출방향
0,A-01,성산로(금화터널),지자기,37.568588,126.948436,서울시 서대문구 신촌동 1-142,,[성산로]봉원고가차도->독립문역,[성산로]독립문역->봉원고가차도
1,A-02,사직로(사직터널),지자기,37.572298,126.962853,서울시 종로구 행촌동 1-186,,[사직로]독립문역->사직단,[사직로]사직단->독립문역
3,A-04,대사관로(삼청터널),지자기,37.596359,126.984209,서울시 성북구 성북동 산25-148,,[삼청로]삼청각->삼청공원입구,[삼청로]삼청공원입구->삼청각
8,A-09,퇴계로(신당역),지자기,37.565712,127.020912,서울시 중구 황학동 710,서울시 중구 퇴계로 443,[퇴계로]상왕십리역->신당역,[퇴계로]신당역->상왕십리역
19,A-20,남산1호터널,루프,37.542406,127.001356,서울시 용산구 한남동 산 10-33,서울시 용산구 이태원로 281,[삼일대로]한남1고가차도->남산1호터널북단,[삼일대로]남산1호터널북단->한남1고가차도
24,B-01,도봉로(도봉산역),지자기,37.691792,127.045087,서울시 도봉구 도봉동 378-1,서울시 도봉구 도봉로 969,[도봉로]도봉로의정부시계->도봉산역,[도봉로]도봉산역->도봉로의정부시계
25,B-02,동일로(의정부IC),지자기,37.688571,127.055376,서울시 노원구 상계동 1311-3,,[동일로]의정부IC->수락지하차도,[동일로]수락지하차도->의정부IC
26,B-03,아차산로(워커힐),지자기,37.550897,127.108955,서울시 광진구 광장동 145-2,,[아차산로]아차산로구리시계->광나루역,[아차산로]광나루역->아차산로구리시계
28,B-05,경춘북로(중랑경찰서),지자기,37.619942,127.105325,서울시 중랑구 신내동 271-1,,[경춘북로]경춘북로구리시계->신내IC,[경춘북로]신내IC->경춘북로구리시계
32,B-09,천호대로(상일IC),지자기,37.547459,127.175222,서울시 강동구 상일동 12-2,,[천호대로]상일IC입구->상일초교,[천호대로]상일초교->상일IC입구


In [27]:
df_traffic_info["주소"].str.split(expand=True).iloc[:, 0:2].drop_duplicates()[1]

0      서대문구
1       종로구
3       성북구
8        중구
19      용산구
24      도봉구
25      노원구
26      광진구
28      중랑구
32      강동구
34      서초구
35      송파구
36      강남구
41      금천구
42      구로구
49      강서구
50      양천구
55      강서구
57      은평구
59      마포구
65     영등포구
73      성동구
90      노원구
110     동작구
112     관악구
130    동대문구
Name: 1, dtype: object