# 미세먼지 데이터 크롤링

In [1]:
import requests
from bs4 import BeautifulSoup

url = 'https://air.jihe.go.kr/jnair/airInfo/byObserverDateIndex.do?menuCd=jnair001006&observerId=741&displayType=table&observeDate=2013-01-01&searchType=%EC%8B%9C&observeTerm=day1'
response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find_all('table')[0]  # 데이터가 있는 테이블 요소를 찾습니다.

data = []
for tr in table.find_all('tr'):  # 각 행을 반복하면서 데이터를 추출합니다.
    tds = tr.find_all('td')
    if len(tds) == 0:  # 첫 번째 행은 헤더이므로 데이터에 포함하지 않습니다.
        continue
    row = [td.text.strip() for td in tds]
    data.append(row)

print(data)  # 추출한 데이터를 출력합니다.


[['2013-01-01 01:00', '29', '21', '0.0040', '0.0220', '-', '0.0060', '', '', ''], ['2013-01-01 02:00', '45', '-', '0.0040', '0.0120', '-', '0.0060', '', '', ''], ['2013-01-01 03:00', '24', '19', '0.0050', '0.0120', '-', '0.0050', '', '', ''], ['2013-01-01 04:00', '24', '19', '0.0060', '0.0090', '-', '0.0050', '', '', ''], ['2013-01-01 05:00', '23', '20', '0.0060', '0.0050', '-', '0.0040', '', '', ''], ['2013-01-01 06:00', '26', '23', '0.0070', '0.0060', '-', '0.0040', '', '', ''], ['2013-01-01 07:00', '26', '-', '0.0060', '0.0100', '-', '0.0040', '', '', ''], ['2013-01-01 08:00', '30', '24', '0.0060', '0.0110', '-', '0.0040', '', '', ''], ['2013-01-01 09:00', '28', '23', '0.0050', '0.0100', '-', '0.0040', '', '', ''], ['2013-01-01 10:00', '29', '27', '0.0070', '0.0060', '-', '0.0050', '', '', ''], ['2013-01-01 11:00', '42', '31', '0.0110', '0.0040', '-', '0.0090', '', '', ''], ['2013-01-01 12:00', '58', '36', '0.0110', '0.0110', '-', '0.0090', '', '', ''], ['2013-01-01 13:00', '54', '3

In [6]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

url_template = 'https://air.jihe.go.kr/jnair/airInfo/byObserverDateIndex.do?menuCd=jnair001006&observerId=741&displayType=table&observeDate={date}&searchType=%EC%8B%9C&observeTerm=day1'

start_date = datetime(2013, 1, 1)
end_date = datetime(2022, 12, 31)

data = []
delta = timedelta(days=1)

while start_date <= end_date:
    date_str = start_date.strftime('%Y-%m-%d')
    url = url_template.format(date=date_str)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find_all('table')[0]
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) == 0:
            continue
        row = [td.text.strip() for td in tds]
        data.append(row)
    start_date += delta

print(data)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



.

# 여기부터 실행

In [2]:
import pandas as pd
import numpy as np

In [7]:
df = pd.DataFrame(data, columns=['날짜', 'PM10', 'PM2.5','O3','이산화질소','일산화질소','아황산가스','등급','지수','결정항목'])

In [8]:
df.to_csv('data.csv', index=False)

In [9]:
df.to_excel('data.xlsx', index=False)

In [3]:
df=pd.read_csv('data2.csv')
df

Unnamed: 0,날짜,PM10,PM2.5,O3,이산화질소,일산화질소,아황산가스
0,2013-01-01 1:00,29,21,0.004,0.022,-,0.006
1,2013-01-01 2:00,45,-,0.004,0.012,-,0.006
2,2013-01-01 3:00,24,19,0.005,0.012,-,0.005
3,2013-01-01 4:00,24,19,0.006,0.009,-,0.005
4,2013-01-01 5:00,23,20,0.006,0.005,-,0.004
...,...,...,...,...,...,...,...
87444,2022-12-31 20:00,32,22,0.018,0.022,0.65,0.002
87445,2022-12-31 21:00,44,33,0.006,0.032,0.61,0.002
87446,2022-12-31 22:00,46,37,0.005,0.03,0.7,0.002
87447,2022-12-31 23:00,47,35,0.002,0.032,0.83,0.003


1. df에서 날짜 따로 저장해두고 드롭하고
2. '-' 전부 ''으로 replace하고
3. 날짜 다시 붙임

In [5]:
tmp = df['날짜']

In [7]:
df.drop(['날짜'],axis=1,inplace=True)

In [8]:
df

Unnamed: 0,PM10,PM2.5,O3,이산화질소,일산화질소,아황산가스
0,29,21,0.004,0.022,-,0.006
1,45,-,0.004,0.012,-,0.006
2,24,19,0.005,0.012,-,0.005
3,24,19,0.006,0.009,-,0.005
4,23,20,0.006,0.005,-,0.004
...,...,...,...,...,...,...
87444,32,22,0.018,0.022,0.65,0.002
87445,44,33,0.006,0.032,0.61,0.002
87446,46,37,0.005,0.03,0.7,0.002
87447,47,35,0.002,0.032,0.83,0.003


In [14]:
df.replace(' ', np.nan, inplace=True)

In [15]:
df

Unnamed: 0,PM10,PM2.5,O3,이산화질소,일산화질소,아황산가스
0,29,21,0.004,0.022,,0.006
1,45,,0.004,0.012,,0.006
2,24,19,0.005,0.012,,0.005
3,24,19,0.006,0.009,,0.005
4,23,20,0.006,0.005,,0.004
...,...,...,...,...,...,...
87444,32,22,0.018,0.022,0.65,0.002
87445,44,33,0.006,0.032,0.61,0.002
87446,46,37,0.005,0.03,0.7,0.002
87447,47,35,0.002,0.032,0.83,0.003


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87449 entries, 0 to 87448
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   PM10    76434 non-null  object
 1   PM2.5   72456 non-null  object
 2   O3      83998 non-null  object
 3   이산화질소   83918 non-null  object
 4   일산화질소   15419 non-null  object
 5   아황산가스   83193 non-null  object
 6   날짜      87449 non-null  object
dtypes: object(7)
memory usage: 4.7+ MB


In [17]:
df['날짜']=tmp

In [18]:
df

Unnamed: 0,PM10,PM2.5,O3,이산화질소,일산화질소,아황산가스,날짜
0,29,21,0.004,0.022,,0.006,2013-01-01 1:00
1,45,,0.004,0.012,,0.006,2013-01-01 2:00
2,24,19,0.005,0.012,,0.005,2013-01-01 3:00
3,24,19,0.006,0.009,,0.005,2013-01-01 4:00
4,23,20,0.006,0.005,,0.004,2013-01-01 5:00
...,...,...,...,...,...,...,...
87444,32,22,0.018,0.022,0.65,0.002,2022-12-31 20:00
87445,44,33,0.006,0.032,0.61,0.002,2022-12-31 21:00
87446,46,37,0.005,0.03,0.7,0.002,2022-12-31 22:00
87447,47,35,0.002,0.032,0.83,0.003,2022-12-31 23:00


In [19]:
df.to_csv('미세먼지.csv',encoding='cp949')