# 기상청 기상관측 데이터 크롤링

기상청_지상(종관, ASOS) 시간자료 조회서비스<br>https://www.data.go.kr/tcs/dss/selectApiDataDetailView.do?publicDataPk=15057210

- 발전량에 영향을 주는 변수들 이외의 변수 제외
- 강수량, 적설량, 일사량, 일조량 결측치 0.0값으로 대체
- 전운량 결측치 선형보간

In [1]:
from datetime import datetime
from time import time
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import urllib
from urllib.request import urlopen
import json
import pickle
from urllib.parse import quote_plus, urlencode

In [2]:
# Endpoint of the ASOS hourly-observation service on data.go.kr.
url = 'http://apis.data.go.kr/1360000/AsosHourlyInfoService/getWthrDataList'
# The same service key twice: raw (ServiceKey1) and percent-encoded (ServiceKey2).
# NOTE(review): the key is committed in plain text - consider loading it from
# configuration or an environment variable instead.
ServiceKey1 = 'tqpC08yZYskkrG+Mj0oJltcqQllRMp1DNStXHGZI5UiJ27ANO6F8hEr0uYp9i1Dza1CbxkbUvC3qfo/U/t0/HQ=='
ServiceKey2 = 'tqpC08yZYskkrG%2BMj0oJltcqQllRMp1DNStXHGZI5UiJ27ANO6F8hEr0uYp9i1Dza1CbxkbUvC3qfo%2FU%2Ft0%2FHQ%3D%3D'

# Probe request: ask for a single record so the response layout can be inspected.
_probe_fields = (
    ('ServiceKey', ServiceKey1),
    ('pageNo', '1'),
    ('numOfRows', '1'),
    ('dataType', 'JSON'),
    ('dataCd', 'ASOS'),
    ('dateCd', 'HR'),
    ('startDt', '20170101'),
    ('startHh', '01'),
    ('endDt', '20170701'),
    ('endHh', '02'),
    ('stnIds', '192'),
)
queryParams = '?' + urllib.parse.urlencode(
    {quote_plus(field): value for field, value in _probe_fields})
# Download the body and decode it from JSON in one step.
response = json.loads(urlopen(url + queryParams).read())

In [3]:
# Field names of the first record in the probe response (overwritten just below).
col_list = list(response['response']['body']['items']['item'][0])

# API field name -> readable column name.
col_map = {
    'tm': 'time',
    'stnNm': 'loc_name',
    'stnId': 'loc_num',
    'ta': 'temp',
    'rn': 'precipitation',
    'ws': 'wind_speed',
    'wd': 'wind_direction',
    'hm': 'humidity',
    'pv': 'pressure_vapor',
    'td': 'dew_point',
    'pa': 'pressure_local',
    'ps': 'pressure_sea',
    'ss': 'sunshine',
    'icsr': 'radiation',
    'dsnw': 'snow',
    'dc10Tca': 'cloud',
    'vs': 'air_opacity',
    'ts': 'temp_surf',
    'm005Te': 'temp_5cm',
    'm01Te': 'temp_10cm',
    'm02Te': 'temp_20cm',
    'm03Te': 'temp_30cm',
}

# Column names of the crawled table, in the order the retained fields are stored.
col_list = [
    'time', 'loc_num', 'loc_name',
    'temp', 'precipitation', 'wind_speed', 'wind_direction', 'humidity',
    'pressure_vapor', 'dew_point', 'pressure_local', 'pressure_sea',
    'sunshine', 'radiation', 'snow', 'cloud', 'air_opacity', 'temp_surf',
    'temp_5cm', 'temp_10cm', 'temp_20cm', 'temp_30cm',
]
# Every column after the three identifying ones is converted to float later on.
col_float_list = col_list[3:]

In [4]:
# The KMA API cannot return 1000 or more rows per call, so fetch 960 (24*40) rows at a time.
def make_sub_array(start_date, location):
    """Fetch one 960-hour window of hourly ASOS records as an array of strings.

    Args:
        start_date: First day of the window, formatted 'YYYYMMDD'.
        location: Station id passed to the API as ``stnIds`` (a string).

    Returns:
        The string 'NoData' when the API answers NO_DATA or sends an empty
        item list; otherwise a 2-D NumPy array with one row per record and
        one column per retained field.
    """
    # Positions, in the order the API lists a record's fields, that are
    # discarded. The 22 positions that remain must line up with col_list.
    drop_positions = [1, 5, 7, 9, 11, 13, 17, 19, 21, 24, 26, 27, 28, 30, 31, 33]

    # The window runs from 01:00 on start_date to 00:00 forty days later.
    window_end = pd.to_datetime(start_date) + pd.DateOffset(hours=960)
    query = '?' + urllib.parse.urlencode(
        {quote_plus('ServiceKey') : ServiceKey1,
         quote_plus('pageNo') : '1',
         quote_plus('numOfRows') : '960',
         quote_plus('dataType') : 'JSON',
         quote_plus('dataCd') : 'ASOS',
         quote_plus('dateCd') : 'HR',
         quote_plus('startDt') : start_date,
         quote_plus('startHh') : '01',
         quote_plus('endDt') : window_end.strftime('%Y%m%d'),
         quote_plus('endHh') : '00',
         quote_plus('stnIds') : location})

    # Close the HTTP response as soon as the body has been read.
    with urlopen(url + query) as http_response:
        response = json.loads(http_response.read())

    if response['response']['header']['resultMsg'] == 'NO_DATA':
        return 'NoData'

    items = response['response']['body']['items']['item']
    if not items:
        return 'NoData'

    # Build every row first and stack once instead of growing the array per record.
    rows = [np.delete(np.array(list(data.values())), drop_positions) for data in items]
    return np.vstack(rows)

In [5]:
# Start day ('YYYYMMDD') of every 960-hour request window in the crawl period.
# The step is given as a Timedelta because the upper-case 'H' frequency alias
# is deprecated in recent pandas releases.
start_date_list = [
    stamp.strftime('%Y%m%d')
    for stamp in pd.date_range(start='20170101 01:00:00',
                               end='20200701 00:00:00',
                               freq=pd.Timedelta(hours=960))
]

In [6]:
def make_df(location):
    """Crawl every request window for one station and return a cleaned DataFrame.

    Args:
        location: Station id as a string, forwarded to make_sub_array.

    Returns:
        The string 'NoData' when the first window has no data. Otherwise a
        DataFrame with the columns in col_list: 'time' parsed to datetime,
        the col_float_list columns cast to float16, and gaps in 'cloud'
        linearly interpolated. Later windows without data are skipped.
    """
    first_chunk = make_sub_array(start_date_list[0], location)
    if isinstance(first_chunk, str):
        return 'NoData'
    # Column 2 of a record is the station name (see col_list).
    sentence = ' ' + first_chunk[0][2] + ' 기상 데이터 크롤링 진행 '
    print(f'{sentence:=^56}')

    chunks = [first_chunk]
    for date in tqdm(start_date_list[1:]):
        chunk = make_sub_array(date, location)
        # A window without data comes back as the 'NoData' string; stacking it
        # would raise, so leave a gap instead of aborting the whole station.
        if isinstance(chunk, str):
            continue
        chunks.append(chunk)
    obs_array = np.vstack(chunks)

    # Columns 4, 12, 13, 14 are precipitation, sunshine, radiation and snow:
    # an empty reading there is replaced by zero. String fill values keep the
    # array purely textual instead of relying on number-to-string promotion.
    zero_fill_cols = [4, 12, 13, 14]
    zero_block = obs_array[:, zero_fill_cols]
    obs_array[:, zero_fill_cols] = np.where(zero_block == '', '0.0', zero_block)
    # Every other empty cell becomes 'nan', which parses to NaN / NaT below.
    obs_array = np.where(obs_array == '', 'nan', obs_array)

    obs_df = pd.DataFrame(obs_array, columns=col_list)
    # NOTE(review): float16 keeps only about three significant digits, so
    # values around 1000 (e.g. the pressure columns) get rounded - confirm
    # this precision is acceptable downstream.
    obs_df[col_float_list] = obs_df[col_float_list].astype('float16')
    obs_df['time'] = pd.to_datetime(obs_df['time'])
    obs_df[['cloud']] = obs_df[['cloud']].interpolate()

    return obs_df

In [7]:
# with open('../pkl/obs_dict.pickle', 'rb') as fw:
#     obs_dict_local = pickle.load(fw)

# 로컬 데이터 크롤링

In [8]:
# Results for the "local" stations: station id -> value returned by make_df.
obs_dict_local = {}
# Station ids that make up the local set.
obs_dict_local_list = [105, 129, 192]

In [9]:
# Crawl each local station; entries are stored one by one, so an interrupted
# run still keeps the stations that already finished.
for loc_num in obs_dict_local_list: # Gangneung, ... (station note left unfinished in the original)
    obs_dict_local[loc_num] = make_df(str(loc_num))



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [10]:
# Serialize the local-station results to ../pkl/obs_dict_local.pickle.
with open('../pkl/obs_dict_local.pickle','wb') as fw:
    pickle.dump(obs_dict_local, fw)

# 글로벌 데이터 크롤링

In [12]:
# Batch 1 of the remaining stations: ids 90-115, minus the excluded ids
# (the three local ids plus 152 and 175). make_df returns the string 'NoData'
# for ids without data, so values are either DataFrames or that string.
obs_dict_global_1 = {}
obs_dict_global_list_1 = np.setdiff1d(np.arange(90, 116), [105, 129, 152, 175, 192])
for loc_num in tqdm(obs_dict_global_list_1): # all stations in this batch
    obs_dict_global_1[loc_num] = make_df(str(loc_num))

  0%|          | 0/25 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [13]:
# Serialize batch 1 to ../pkl/obs_dict_global_1.pickle.
with open('../pkl/obs_dict_global_1.pickle','wb') as fw:
    pickle.dump(obs_dict_global_1, fw)

In [14]:
# Batch 2 of the remaining stations: ids 119-159, minus the excluded ids.
# NOTE(review): ids 116-118 sit between the previous batch (90-115) and this
# one and are never requested - confirm that is intentional.
obs_dict_global_2 = {}
obs_dict_global_list_2 = np.setdiff1d(np.arange(119, 160), [105, 129, 152, 175, 192])
for loc_num in tqdm(obs_dict_global_list_2): # all stations in this batch
    obs_dict_global_2[loc_num] = make_df(str(loc_num))

  0%|          | 0/39 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [15]:
# Serialize batch 2 to ../pkl/obs_dict_global_2.pickle.
with open('../pkl/obs_dict_global_2.pickle','wb') as fw:
    pickle.dump(obs_dict_global_2, fw)

In [16]:
# Batch 3 of the remaining stations: ids 160-211, minus the excluded ids.
obs_dict_global_3 = {}
obs_dict_global_list_3 = np.setdiff1d(np.arange(160, 212), [105, 129, 152, 175, 192])
for loc_num in tqdm(obs_dict_global_list_3): # all stations in this batch
    obs_dict_global_3[loc_num] = make_df(str(loc_num))

  0%|          | 0/50 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [17]:
# Serialize batch 3 to ../pkl/obs_dict_global_3.pickle.
with open('../pkl/obs_dict_global_3.pickle','wb') as fw:
    pickle.dump(obs_dict_global_3, fw)

In [18]:
# Batch 4 of the remaining stations: ids 212-249 (none of the excluded ids
# fall in this range, so the set difference removes nothing here).
obs_dict_global_4 = {}
obs_dict_global_list_4 = np.setdiff1d(np.arange(212, 250), [105, 129, 152, 175, 192])
for loc_num in tqdm(obs_dict_global_list_4): # all stations in this batch
    obs_dict_global_4[loc_num] = make_df(str(loc_num))

  0%|          | 0/38 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [19]:
# Serialize batch 4 to ../pkl/obs_dict_global_4.pickle.
with open('../pkl/obs_dict_global_4.pickle','wb') as fw:
    pickle.dump(obs_dict_global_4, fw)

In [25]:
# Batch 5 of the remaining stations: ids 250-269 (no excluded id in range).
obs_dict_global_5 = {}
obs_dict_global_list_5 = np.setdiff1d(np.arange(250, 270), [105, 129, 152, 175, 192])
for loc_num in tqdm(obs_dict_global_list_5): # all stations in this batch
    obs_dict_global_5[loc_num] = make_df(str(loc_num))

  0%|          | 0/20 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [26]:
# Serialize batch 5 to ../pkl/obs_dict_global_5.pickle.
with open('../pkl/obs_dict_global_5.pickle','wb') as fw:
    pickle.dump(obs_dict_global_5, fw)

In [28]:
# Batch 6 of the remaining stations: ids 270-299 (no excluded id in range).
obs_dict_global_6 = {}
obs_dict_global_list_6 = np.setdiff1d(np.arange(270, 300), [105, 129, 152, 175, 192])
for loc_num in tqdm(obs_dict_global_list_6): # all stations in this batch
    obs_dict_global_6[loc_num] = make_df(str(loc_num))

  0%|          | 0/30 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]



  0%|          | 0/31 [00:00<?, ?it/s]

In [30]:
# Serialize batch 6 to ../pkl/obs_dict_global_6.pickle.
with open('../pkl/obs_dict_global_6.pickle','wb') as fw:
    pickle.dump(obs_dict_global_6, fw)

# ===========================================================

# ===========================================================

# ===========================================================

# ===========================================================

# ===========================================================

# ===========================================================

# Removed Code

In [None]:
# The KMA API cannot return 1000 or more rows per call, so fetch 960 (24*40) rows at a time.
def make_sub_df(start_date, location):
    """Fetch up to 960 hourly ASOS records from *start_date* as a DataFrame.

    Args:
        start_date: First day to request, formatted 'YYYYMMDD'.
        location: Station id passed to the API as ``stnIds`` (a string).

    Returns:
        0 when the API answers NO_DATA. Otherwise a DataFrame whose 'time'
        column holds 960 hourly stamps starting at 01:00 on start_date,
        followed by one column per selected field; rows without a matching
        record stay NaN.
    """
    query = '?' + urllib.parse.urlencode({quote_plus('ServiceKey') : ServiceKey1,
                                          quote_plus('pageNo') : '1',
                                          quote_plus('numOfRows') : '960',
                                          quote_plus('dataType') : 'JSON',
                                          quote_plus('dataCd') : 'ASOS',
                                          quote_plus('dateCd') : 'HR',
                                          quote_plus('startDt') : start_date,
                                          quote_plus('startHh') : '01',
                                          quote_plus('endDt') : '20200701',
                                          quote_plus('endHh') : '00',
                                          quote_plus('stnIds') : location})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(url + query) as http_response:
        response = json.loads(http_response.read())
    if response['response']['header']['resultMsg'] == 'NO_DATA':
        return 0

    obs_df = pd.DataFrame()
    # Build the timestamp column up front.
    obs_df['time'] = pd.date_range(start=start_date+' 01:00:00', periods=960,
                                   freq=pd.Timedelta(hours=1))

    # Output column -> API field. Insertion order fixes the column order, and
    # the code below (and the caller) treats columns from position 4 onward as
    # numeric.
    field_by_column = {
        'location_number': 'stnId',
        'location_name': 'stnNm',
        'cloud_type': 'clfmAbbrCd',    # cloud form
        'temp': 'ta',                  # air temperature
        'temp_30cm': 'm03Te',          # soil temperature at 30 cm
        'temp_20cm': 'm02Te',          # soil temperature at 20 cm
        'temp_10cm': 'm01Te',          # soil temperature at 10 cm
        'temp_5cm': 'm005Te',          # soil temperature at 5 cm
        'temp_surf': 'ts',             # ground-surface temperature
        'dew_point': 'td',             # dew-point temperature
        'cloud_low': 'dc10LmcsCa',     # low/mid-level cloud amount
        'cloud_all': 'dc10Tca',        # total cloud amount (interpolated below)
        'snow': 'dsnw',                # snow depth
        'radiation': 'icsr',           # solar radiation (was read from the cloud_low field)
        'sunshine': 'ss',              # sunshine duration
        'pressure_sea': 'ps',          # sea-level pressure
        'pressure_local': 'pa',        # station pressure
        'pressure_vapor': 'pv',        # vapour pressure
        'humidity': 'hm',              # humidity
        'wind_direction': 'wd',        # wind direction
        'wind_speed': 'ws',            # wind speed
        'precipitation': 'rn',         # precipitation
    }
    items = response['response']['body']['items']['item']
    for i, data in enumerate(items):
        for column, field in field_by_column.items():
            obs_df.loc[i, column] = data[field]

    if not items:
        # Nothing was filled in, so there are no value columns to clean.
        return obs_df

    # Preprocessing runs once on the complete table rather than once per row.
    obs_df['cloud_type'] = obs_df['cloud_type'].replace('', 'Def')
    # Convert to numbers first so the linear interpolation has values to work with.
    obs_df['cloud_all'] = pd.to_numeric(
        obs_df['cloud_all'], errors='coerce').interpolate().fillna(0)
    zero_fill_cols = ['snow', 'sunshine', 'precipitation']
    obs_df[zero_fill_cols] = obs_df[zero_fill_cols].replace('', 0)
    # Remaining empty readings in the value columns become NaN.
    value_cols = obs_df.columns[4:]
    obs_df[value_cols] = obs_df[value_cols].replace('', np.nan)

    return obs_df

In [None]:
def make_df(location):
    """Crawl every request window for one station using make_sub_df.

    NOTE: this shares its name with the array-based make_df defined earlier in
    the notebook; running this cell rebinds the name.

    Args:
        location: Station id as a string, forwarded to make_sub_df.

    Returns:
        None when the first window has no data; otherwise the concatenated
        DataFrame restricted to rows that carry a station id, with the columns
        from position 4 onward cast to float16.
    """
    first_chunk = make_sub_df(start_date_list[0], location)
    if not isinstance(first_chunk, pd.DataFrame):
        return None

    chunks = [first_chunk]
    # Iterate over the dates themselves: indexing start_date_list with the
    # enumerate counter re-fetched the first window and skipped the last one.
    for date in tqdm(start_date_list[1:]):
        chunk = make_sub_df(date, location)
        # make_sub_df returns 0 for a window without data; skip it.
        if isinstance(chunk, pd.DataFrame):
            chunks.append(chunk)

    obs_df_total = pd.concat(chunks).reset_index(drop=True)
    # Keep only rows that were filled from an API record.
    obs_df_total = obs_df_total[obs_df_total['location_number'].notna()].reset_index(drop=True)
    #obs_df_total = obs_df_total.drop(
    #    list(obs_df_total.isna().sum()[obs_df_total.isna().sum()>=len(obs_df_total)/3].index), axis=1)
    #obs_df_total.dropna(inplace=True)
    # Assign by column label so the cast replaces the columns' dtypes.
    value_cols = obs_df_total.columns[4:]
    obs_df_total[value_cols] = obs_df_total[value_cols].astype('float16')
    return obs_df_total