#### 미세먼지 현황 분석

##### 데이터 수집 사이트

- 에어코리아 : https://www.airkorea.or.kr/web/pastSearch?pMENU_NO=123

- 기상자료 개방포털 : https://data.kma.go.kr/data/grnd/selectAwsRltmList.do?pgmNo=56

##### 에어코리아 데이터 수집, 전처리

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the March 2022 daily air-quality spreadsheet.
dust2203 = pd.read_excel('./raw/202203_data_past_day.xlsx')
# Discard the row labelled 0 (per the original note, the sheet's second row).
dust2203 = dust2203.drop(index=0)
dust2203

In [7]:
# Print a summary of the frame: index range, columns, non-null counts and dtypes.
dust2203.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 1 to 31
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   날짜      31 non-null     object
 1   PM10    31 non-null     object
 2   PM2.5   31 non-null     object
 3   오 존     31 non-null     object
 4   이산화질소   31 non-null     object
 5   일산화탄소   31 non-null     object
 6   아황산가스   31 non-null     object
dtypes: object(7)
memory usage: 1.8+ KB


In [None]:
# Rename the Korean column headers to short English names.
korean_to_english = {
    '날짜': 'date',
    'PM10': 'pm10',
    'PM2.5': 'pm25',
    '오 존': 'o3',
    '이산화질소': 'no2',
    '일산화탄소': 'co',
    '아황산가스': 'so2',
}
# rename() returns a new frame, so the result is bound back to the same name.
dust2203 = dust2203.rename(columns=korean_to_english)

dust2203

In [11]:
# Show each column's dtype after the rename.
dust2203.dtypes

date    object
pm10    object
pm25    object
o3      object
no2     object
co      object
so2     object
dtype: object

In [23]:
# Convert every column to its intended dtype: a datetime for the date,
# integers for the particulate columns and floats for the gas columns.
target_dtypes = {
    'date': 'datetime64[ns]',
    'pm10': 'int64',
    'pm25': 'int64',
    'o3': 'float64',
    'no2': 'float64',
    'co': 'float64',
    'so2': 'float64',
}
dust2203 = dust2203.astype(target_dtypes)
dust2203.tail()

Unnamed: 0,date,pm10,pm25,o3,no2,co,so2
27,2022-03-27,34,15,0.039,0.009,0.3,0.002
28,2022-03-28,24,12,0.034,0.013,0.2,0.002
29,2022-03-29,35,18,0.024,0.024,0.3,0.003
30,2022-03-30,44,25,0.016,0.027,0.3,0.004
31,2022-03-31,33,20,0.03,0.014,0.3,0.002


In [20]:
# Confirm the column dtypes after the conversion.
dust2203.dtypes

date    datetime64[ns]
pm10             int64
pm25             int64
o3             float64
no2            float64
co             float64
so2            float64
dtype: object

In [24]:
# Derive separate year / month / day columns from the date column.
date_accessor = dust2203['date'].dt
dust2203 = dust2203.assign(
    year=date_accessor.year,
    month=date_accessor.month,
    day=date_accessor.day,
)

In [25]:
# Preview the last rows, now including the year / month / day columns.
dust2203.tail()

Unnamed: 0,date,pm10,pm25,o3,no2,co,so2,year,month,day
27,2022-03-27,34,15,0.039,0.009,0.3,0.002,2022,3,27
28,2022-03-28,24,12,0.034,0.013,0.2,0.002,2022,3,28
29,2022-03-29,35,18,0.024,0.024,0.3,0.003,2022,3,29
30,2022-03-30,44,25,0.016,0.027,0.3,0.004,2022,3,30
31,2022-03-31,33,20,0.03,0.014,0.3,0.002,2022,3,31


In [29]:
# Reorder the columns: date parts first, then the pollutant readings.
ordered_columns = ['date', 'year', 'month', 'day', 'so2', 'co', 'o3', 'no2', 'pm10', 'pm25']
dust2203 = dust2203.loc[:, ordered_columns]
dust2203.head()

Unnamed: 0,date,year,month,day,so2,co,o3,no2,pm10,pm25
1,2022-03-01,2022,3,1,0.003,0.3,0.028,0.026,46,32
2,2022-03-02,2022,3,2,0.003,0.3,0.028,0.018,48,33
3,2022-03-03,2022,3,3,0.004,0.3,0.034,0.021,52,30
4,2022-03-04,2022,3,4,0.004,0.4,0.035,0.026,93,47
5,2022-03-05,2022,3,5,0.003,0.2,0.039,0.01,89,21


In [31]:
# Count missing values per column.
dust2203.isna().sum()

date     0
year     0
month    0
day      0
so2      0
co       0
o3       0
no2      0
pm10     0
pm25     0
dtype: int64

##### 기상청 데이터 확인 및 전처리

In [None]:
# Load the March 2022 hourly AWS weather observations.
weather_path = './raw/OBS_AWS_TIM_202203_data.xlsx'
weather_2203 = pd.read_excel(weather_path)
weather_2203

In [33]:
# Summarise the weather frame's columns, non-null counts and dtypes.
weather_2203.info()

# No dtype conversion is needed here; the columns already have suitable types.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   지점       744 non-null    int64         
 1   지점명      744 non-null    object        
 2   일시       744 non-null    datetime64[ns]
 3   기온(°C)   744 non-null    float64       
 4   풍속(m/s)  744 non-null    float64       
 5   강수량(mm)  731 non-null    float64       
 6   습도(%)    744 non-null    int64         
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 40.8+ KB


In [35]:
# Remove the '지점' and '지점명' (station) columns in a single call;
# `columns=` targets columns, whereas `index=` would target rows.
weather_2203 = weather_2203.drop(columns=['지점', '지점명'])

In [37]:
# Preview the first rows of the frame after the column drop.
weather_2203.head()

Unnamed: 0,일시,기온(°C),풍속(m/s),강수량(mm),습도(%)
0,2022-03-01 00:00:00,8.7,0.8,0.0,40
1,2022-03-01 01:00:00,8.2,0.5,0.0,48
2,2022-03-01 02:00:00,8.1,0.3,0.0,54
3,2022-03-01 03:00:00,7.2,0.7,0.0,59
4,2022-03-01 04:00:00,7.2,0.1,0.0,63


In [39]:
# Replace all column labels positionally (an alternative to rename()).
english_labels = ['date', 'temp', 'wind', 'rain', 'humid']
weather_2203 = weather_2203.set_axis(english_labels, axis='columns')
weather_2203.tail()

Unnamed: 0,date,temp,wind,rain,humid
739,2022-03-31 19:00:00,10.9,4.8,0.0,70
740,2022-03-31 20:00:00,10.5,2.7,0.0,74
741,2022-03-31 21:00:00,10.2,3.1,0.0,68
742,2022-03-31 22:00:00,10.0,2.9,0.0,66
743,2022-03-31 23:00:00,9.4,5.8,0.0,65


In [41]:
# Parse the timestamps and keep only the calendar date, discarding the hour.
parsed_times = pd.to_datetime(weather_2203['date'])
weather_2203['date'] = parsed_times.dt.date

In [42]:
# Check the dtypes; `.dt.date` produces Python date objects, so `date` is reported as object.
weather_2203.dtypes

date      object
temp     float64
wind     float64
rain     float64
humid      int64
dtype: object

In [46]:
# Cast the date column back to datetime64[ns].
weather_2203['date'] = weather_2203['date'].astype('datetime64[ns]')

In [47]:
# Verify the dtypes and non-null counts after the date cast.
weather_2203.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    744 non-null    datetime64[ns]
 1   temp    744 non-null    float64       
 2   wind    744 non-null    float64       
 3   rain    731 non-null    float64       
 4   humid   744 non-null    int64         
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 29.2 KB


In [51]:
# List the rows whose rainfall reading is missing.
rain_missing = weather_2203['rain'].isna()
weather_2203.loc[rain_missing]

Unnamed: 0,date,temp,wind,rain,humid
612,2022-03-26,16.2,5.5,,79
613,2022-03-26,17.0,6.1,,74
614,2022-03-26,17.8,6.3,,72
615,2022-03-26,17.5,7.0,,70
616,2022-03-26,20.2,5.0,,55
617,2022-03-26,19.2,4.5,,58
618,2022-03-26,18.4,4.0,,56
619,2022-03-26,16.6,5.6,,59
620,2022-03-26,15.2,2.7,,64
621,2022-03-26,14.4,3.0,,65


In [57]:
# Treat missing rainfall readings as 0.0.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on `weather_2203['rain']`: that chained in-place call
# operates on an intermediate object, raises a FutureWarning in pandas 2.x and
# leaves the frame unchanged when Copy-on-Write is enabled.
weather_2203['rain'] = weather_2203['rain'].fillna(0.0)

In [56]:
# Re-check the non-null counts to confirm `rain` has no missing values left.
weather_2203.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    744 non-null    datetime64[ns]
 1   temp    744 non-null    float64       
 2   wind    744 non-null    float64       
 3   rain    744 non-null    float64       
 4   humid   744 non-null    int64         
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 29.2 KB


In [60]:
# Per the original note, rainfall is measured in 0.1 units, so exact zeros
# are swapped for a 0.01 placeholder value.
rain_without_zeros = weather_2203['rain'].replace(0, 0.01)
weather_2203['rain'] = rain_without_zeros
weather_2203['rain'].value_counts()

rain
0.01    704
0.50     13
1.50      7
1.00      6
2.50      2
2.00      2
4.00      2
6.50      2
3.00      1
3.50      1
5.00      1
8.50      1
8.00      1
7.50      1
Name: count, dtype: int64

In [69]:
# Collapse the hourly observations into one row per date by averaging each column.
hourly_by_date = weather_2203.groupby('date')
weather_2203_new = hourly_by_date.mean()
weather_2203_new.head()

Unnamed: 0_level_0,temp,wind,rain,humid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03-01,9.7375,1.875,0.091667,71.25
2022-03-02,6.820833,2.920833,0.01,45.5
2022-03-03,6.595833,2.516667,0.01,51.291667
2022-03-04,7.3875,2.804167,0.01,68.583333
2022-03-05,9.104167,5.1375,0.01,35.375
