In [1]:
import os
import sqlite3
import pandas as pd
from pathlib import Path

from collections import defaultdict
from tqdm.notebook import tqdm
from typing import Any, Union

import dask
import dask.dataframe as dd

# Root folder of the datasets, relative to the current working directory.
data_path = Path('data')


# SQLite3 Database

In [2]:
# Connection to the SQLite database file used by the cells below.
conn = sqlite3.connect(database="airpollution.db")

# Air Quality Data

In [3]:
def load_data(path: Path) -> pd.DataFrame:
    """Load one data file into a DataFrame.

    Files with an ``.xlsx`` extension (compared case-insensitively) are read
    with ``pd.read_excel``. Every other file is read as CSV, trying UTF-8
    first and falling back to CP949 when the bytes are not valid UTF-8.

    Args:
        path (Path): path of data with file name
    Returns:
        pd.DataFrame
    Raises:
        UnicodeDecodeError: if a CSV file is neither valid UTF-8 nor CP949.
    """
    if path.suffix.lower() == '.xlsx':
        return pd.read_excel(path)

    # UTF-8 is strict, so a CP949 file fails fast with UnicodeDecodeError and
    # we fall back. The reverse order can decode some UTF-8 byte sequences as
    # (wrong) CP949 characters without raising.
    try:
        return pd.read_csv(path, encoding='utf-8')
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding='cp949')

def filter_seoul(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows whose '지역' value contains the text '서울'.

    Args:
        df (pd.DataFrame): frame with a string column named '지역'.
    Returns:
        pd.DataFrame: the matching rows, original index labels preserved.
    """
    # na=False: rows with a missing '지역' are treated as non-matching; without
    # it the mask can contain NaN and .loc refuses to index with it.
    # regex=False: '서울' is a literal substring, not a pattern.
    in_seoul = df['지역'].str.contains('서울', regex=False, na=False)
    return df.loc[in_seoul, :]

In [5]:
datafiles = sorted([x for x in (data_path / 'airquality').glob("*") if x.is_dir()])

# Concatenate the Seoul rows of every file in a year folder and write them to
# one CSV per year, next to the year folders.
for p_year in tqdm(datafiles, total=len(datafiles)):
    new_path = data_path / 'airquality' / f'air-seoul-{p_year.name}.csv'

    # Start each year with a fresh list so rows never leak between years.
    parts = []
    # glob() yields files in filesystem-dependent order; sort so the row order
    # of the output CSV is reproducible.
    for p in sorted(p_year.glob('*')):
        df = filter_seoul(load_data(p))
        parts.append(df)

    if not parts:
        # pd.concat raises ValueError on an empty list; an empty year folder
        # simply produces no output file.
        continue

    df_all = pd.concat(parts).reset_index(drop=True)
    if p_year.name == '2018':
        # fillna for air-seoul-2018.csv
        # The '망' column contains null values because of a policy change.
        # Build a {측정소코드: 망} lookup from the rows that do have a value and
        # use it to fill the missing ones for the same station code.
        known = df_all.loc[df_all['망'].notna(), ['측정소코드', '망']].drop_duplicates()
        m_dict = dict(known.values)
        df_all['망'] = df_all['망'].fillna(df_all['측정소코드'].map(m_dict))

    df_all.to_csv(new_path, encoding='utf-8', index=False)

  0%|          | 0/4 [00:00<?, ?it/s]

In [32]:
# Create the database schema, then load one yearly CSV with English column
# names and a normalized date string.
column_m_dict = {
    '지역': 'district', 
    '측정소코드': 'measurecode', 
    '측정소명': 'measurename', 
    '측정일시': 'date', 
    '주소': 'address',
    '망': 'measurepoint'
}

cur = conn.cursor()

# SQLite only enforces FOREIGN KEY constraints (including the ON DELETE CASCADE
# declared below) when this pragma is enabled on the connection. It must be
# issued outside of an open transaction to take effect.
cur.execute("PRAGMA foreign_keys = ON;")

# One row per measuring station.
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS airmeasure (
        sid INTEGER PRIMARY KEY,
        measurecode INTEGER NOT NULL UNIQUE,
        district TEXT, 
        measurename TEXT, 
        address TEXT, 
        measurepoint TEXT
    );
    """
)

# One row per station and timestamp; references airmeasure via measurecode.
cur.execute(
    """
    CREATE TABLE IF NOT EXISTS airquality (
        airid INTEGER PRIMARY KEY,
        measurecode INTEGER, 
        date TEXT, 
        SO2 REAL, 
        CO REAL, 
        O3 REAL,
        NO2 REAL, 
        PM10 REAL, 
        PM25 REAL, 
        FOREIGN KEY (measurecode)
            REFERENCES airmeasure (measurecode)
            ON DELETE CASCADE 
            ON UPDATE NO ACTION
    );
    """
)
for p in (data_path / 'airquality').glob("*.csv"):
    df = pd.read_csv(p, encoding='utf-8').rename(columns=column_m_dict)

    # The raw value is an integer YYYYMMDDHH whose hour part runs 01..24 (see
    # the `df['date'] - 1` preview further down, which ends at ...23).
    # '%H' only accepts 00..23, so shift by one before parsing.
    df['date'] = pd.to_datetime(df['date']-1, format='%Y%m%d%H').dt.strftime('%Y-%m-%d %H')
    # NOTE(review): only the first CSV returned by glob() is processed and
    # nothing is inserted into the tables yet; remove this `break` once the
    # insert step is implemented.
    break

In [20]:
# Preview the frame indexed by the station-describing columns; the result is
# only displayed, `df` itself is not modified.
df.set_index(
    keys=['district', 'measurecode', 'measurename', 'address'],
)

Index(['district', 'measurecode', 'measurename', 'date', 'SO2', 'CO', 'O3',
       'NO2', 'PM10', 'PM25', 'address', 'measurepoint'],
      dtype='object')

In [33]:
# Show the first five rows of the renamed frame.
df.head(n=5)

Unnamed: 0,district,measurecode,measurename,date,SO2,CO,O3,NO2,PM10,PM25,address,measurepoint
0,서울 중구,111121,중구,2018-01-01 00,0.004,0.5,0.02,0.02,34.0,19.0,서울 중구 덕수궁길 15,도시대기
1,서울 중구,111121,중구,2018-01-01 01,0.004,0.4,0.024,0.016,27.0,14.0,서울 중구 덕수궁길 15,도시대기
2,서울 중구,111121,중구,2018-01-01 02,0.004,0.4,0.018,0.022,26.0,14.0,서울 중구 덕수궁길 15,도시대기
3,서울 중구,111121,중구,2018-01-01 03,0.004,0.5,0.01,0.03,26.0,15.0,서울 중구 덕수궁길 15,도시대기
4,서울 중구,111121,중구,2018-01-01 04,0.004,0.6,0.011,0.029,28.0,16.0,서울 중구 덕수궁길 15,도시대기


In [25]:
from datetime import datetime as dt

# strptime requires both the text and its format; calling it with no
# arguments raises TypeError. Parse one sample timestamp in the same
# '%Y%m%d%H' layout used for the air quality data.
dt.strptime('2018010100', '%Y%m%d%H')

In [28]:
# When the notebook runs top to bottom, df['date'] already holds
# 'YYYY-MM-DD HH' strings (formatted in the loading cell), and the .dt
# accessor only works on datetime values. Parse first so this preview does not
# depend on the order in which cells were executed.
pd.to_datetime(df['date'], format='%Y-%m-%d %H').dt.strftime('%Y-%m-%d %H')

0         2018-01-01 00
1         2018-01-01 01
2         2018-01-01 02
3         2018-01-01 03
4         2018-01-01 04
              ...      
343099    2018-12-31 19
343100    2018-12-31 20
343101    2018-12-31 21
343102    2018-12-31 22
343103    2018-12-31 23
Name: date, Length: 343104, dtype: object

In [15]:
# Preview of the raw integer timestamps (YYYYMMDDHH) shifted back by one.
# df['date'] has already been converted to strings by the loading cell, so
# subtracting from it would raise; re-read the raw column from the same CSV
# (`p` is the file chosen in the loading cell).
pd.read_csv(p, encoding='utf-8', usecols=['측정일시'])['측정일시'].rename('date') - 1

0         2018010100
1         2018010101
2         2018010102
3         2018010103
4         2018010104
             ...    
343099    2018123119
343100    2018123120
343101    2018123121
343102    2018123122
343103    2018123123
Name: date, Length: 343104, dtype: int64

  df = pd.read_csv(p, encoding='utf-8')


In [51]:
# Number of rows in the loaded frame.
df.shape[0]

343104

# Traffic Data

In [19]:
datafiles = sorted([x for x in (data_path / 'traffic').glob("*") if x.is_dir()])

# Pick one file from the last folder in sorted order. glob() yields entries in
# filesystem-dependent order, so sort to make the choice reproducible.
p = sorted(datafiles[-1].glob('*'))[0]

# NOTE(review): without `sheet_name`, read_excel loads only the first sheet.
# For this workbook that sheet is the column legend ('지점별 일자별 교통량 범례'),
# not the traffic counts; confirm the name of the data sheet and pass it here.
df = pd.read_excel(p)

In [20]:
# NOTE(review): this raises KeyError while `df` holds the legend sheet, which
# has no '지점명' column (see the frame displayed in the next cell).
df.loc[:, '지점명'].unique()

KeyError: '지점명'

In [21]:
# Display the frame read from the workbook. Its rows describe each field of
# the traffic data (name, description, example), i.e. it is the legend sheet.
df

Unnamed: 0,지점별 일자별 교통량 범례,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,구분,설명,표현 예시,예시 설명
1,일자,교통량 조사 일자,20181201,43435
2,요일,교통량 조사 요일,토,토요일
3,지점명,교통량 조사 도로명(조사지점명),성산로(금화터널),조사지점의 도로명(지점명)
4,지점번호,"조사지점을 5개 영역(A,B,C,D,F)으로 구분하고 일련번호를 부여함\n- [A(...",A-01,도심 1번 지점
5,방향,유입 : 외곽에서 서울시청으로 들어오는 방향\n유출 : 시울시청에서 외곽으로 나가는 방향,유입/유출,
6,구분,조사지점에서 가까운 교차로명으로 방향표시,봉원고가차도→독립문역,봉원고가차도에서 독립문역 방향의 교통량
7,시간대,1시간 단위를 표시,0시,0시~1시
8,교통량,1시간 교통량,809,809대/시
