In [78]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 데이터 크롤링(API)


## pandas 기초문법 - 시계열 데이터
- 일정 시간 간격으로 배치된 데이터 셋
- 시간 안의 패턴과 반복을 찾은 후 분석해야 한다.

- 시간을 다룰려면 **Timestamp**클래스를 사용하면 된다.
  + https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html


In [79]:
import pandas as pd

timestamp = pd.Timestamp('2023-11-10 12:30:00')
print(timestamp, '\t' ,type(timestamp))

2023-11-10 12:30:00 	 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


### 나라마다 다른 날짜 표기법
- format 활용 https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

In [80]:
# 미국식

timestamp = pd.Timestamp('06/07/20 12:30:00') # 월/일/년, 20년 6월 7일
print(timestamp, '\t' ,type(timestamp))

2020-06-07 12:30:00 	 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [81]:
# 영국식
timestamp = pd.to_datetime('07/06/20', format = '%d/%m/%y') #20년 6월 7일
print(timestamp, '\t' ,type(timestamp))
print(timestamp.year)

2020-06-07 00:00:00 	 <class 'pandas._libs.tslibs.timestamps.Timestamp'>
2020


In [82]:
# 영국식
timestamp = pd.to_datetime('07/06/20', format = '%d/%m/%y') #20년 6월 7일
print(timestamp, '\t' ,type(timestamp))

2020-06-07 00:00:00 	 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


- 활용해보기

In [83]:
ts = pd.to_datetime("2024-10-11 09:45:38")
print(ts)
print(ts.year)

2024-10-11 09:45:38
2024


## Timedelta
- 시계열을 계산할 수 있도록 만든 메서드
- 시간 연산을 가능하게 해 준다.

In [84]:
tmp = pd.Timedelta(days = 100, hours = 2, minutes = 30, seconds = 17)
print(type(tmp), tmp)

<class 'pandas._libs.tslibs.timedeltas.Timedelta'> 100 days 02:30:17


In [85]:
ts - tmp # ts에서 100일이 빠진다!

Timestamp('2024-07-03 07:15:21')

In [86]:
dates = ['2024-10-09', '2024-10-10', '2024-10-18']

idx = pd.to_datetime(dates)
idx

DatetimeIndex(['2024-10-09', '2024-10-10', '2024-10-18'], dtype='datetime64[ns]', freq=None)

In [87]:
idx[2:3]

DatetimeIndex(['2024-10-18'], dtype='datetime64[ns]', freq=None)

In [88]:
idx.year

Index([2024, 2024, 2024], dtype='int32')

## Unix Time

In [89]:
day = 1628899200 /60 /60 /24
year = day/365
month = day/30


In [90]:
print(year, month, day)

51.652054794520545 628.4333333333333 18853.0


In [91]:
dt = pd.to_datetime(1628899200, unit = 's')
print(dt)

2021-08-14 00:00:00


## 날짜 변환

In [92]:
data = [
    {'시가': 100, '고가': 110, '저가': 90, '종가': 105},
    {'시가': 100, '고가': 112, '저가': 80, '종가':  95},
    {'시가':  99, '고가': 115, '저가': 70, '종가':  85},
    {'시가':  70, '고가':  80, '저가': 60, '종가':  75},
]

df = pd.DataFrame(data, index=['20200615', '20200616', '20200717', '20200718'])
df

Unnamed: 0,시가,고가,저가,종가
20200615,100,110,90,105
20200616,100,112,80,95
20200717,99,115,70,85
20200718,70,80,60,75


In [93]:
cond = df.index.str[:6] == "202006"
df.loc[cond]

Unnamed: 0,시가,고가,저가,종가
20200615,100,110,90,105
20200616,100,112,80,95


In [94]:
df.index = pd.to_datetime(df.index)
df.loc['2020-06']

Unnamed: 0,시가,고가,저가,종가
2020-06-15,100,110,90,105
2020-06-16,100,112,80,95


# KRX API 크롤링

In [95]:
! pip install pykrx -q

In [96]:
import pykrx
print(pykrx.__version__)

1.0.48


In [97]:
from pykrx import stock

tickers = pykrx.stock.get_market_ticker_list(market = 'KOSDAQ')
print(len(tickers), tickers)

1754 ['060310', '054620', '265520', '211270', '109960', '139050', '126600', '013720', '083790', '035760', '311690', '051500', '058820', '023460', '056730', '065770', '083660', '456440', '477760', '099520', '060900', '290120', '025440', '068790', '241520', '180400', '245620', '037370', '050120', '214270', '130500', '900290', '083450', '297890', '440290', '078150', '195940', '028300', '278650', '067630', '024850', '047920', '115450', '046210', '403870', '036640', '439730', '442770', '448760', '467930', '469480', '095340', '950170', '067290', '035900', '318000', '024840', '024120', '455250', '458320', '464680', '476470', '478390', '486630', '021320', '036670', '044180', '046440', '151860', '035600', '226360', '111870', '060720', '058400', '101000', '122450', '052900', '376190', '061970', '309960', '225190', '060370', '417200', '078020', '476080', '086960', '038340', '160550', '053290', '060250', '104200', '222160', '024940', '218410', '327260', '091340', '419530', '019550', '950110', '036

In [98]:
tickers = pykrx.stock.get_market_ticker_list('20241010')
print(len(tickers), tickers)

958 ['095570', '006840', '027410', '282330', '138930', '001460', '001465', '001040', '079160', '00104K', '000120', '011150', '011155', '001045', '097950', '097955', '000480', '000590', '012030', '016610', '005830', '000990', '139130', '001530', '000210', '000215', '375500', '37550L', '37550K', '007340', '004840', '155660', '069730', '017860', '017940', '365550', '383220', '007700', '114090', '078930', '006360', '001250', '007070', '078935', '012630', '039570', '089470', '294870', '009540', '267250', '267270', '443060', '071970', '010620', '322000', '042670', '267260', '329180', '097230', '014790', '003580', '204320', '060980', '011200', '035000', '002460', '487570', '298050', '003560', '175330', '234080', '001060', '001067', '001065', '096760', '105560', '432320', '002380', '344820', '009070', '009440', '119650', '092220', '003620', '016380', '001390', '033180', '015590', '001940', '025000', '092230', '000040', '044450', '030200', '033780', '058850', '058860', '093050', '003550', '0342

In [99]:
tickers = stock.get_index_ticker_list("20241010")

for t in tickers:
  name = stock.get_index_ticker_name(t)
  print(t, name)

1001 코스피
1002 코스피 대형주
1003 코스피 중형주
1004 코스피 소형주
1005 음식료품
1006 섬유의복
1007 종이목재
1008 화학
1009 의약품
1010 비금속광물
1011 철강금속
1012 기계
1013 전기전자
1014 의료정밀
1015 운수장비
1016 유통업
1017 전기가스업
1018 건설업
1019 운수창고업
1020 통신업
1021 금융업
1024 증권
1025 보험
1026 서비스업
1027 제조업
1028 코스피 200
1034 코스피 100
1035 코스피 50
1150 코스피 200 커뮤니케이션서비스
1151 코스피 200 건설
1152 코스피 200 중공업
1153 코스피 200 철강/소재
1154 코스피 200 에너지/화학
1155 코스피 200 정보기술
1156 코스피 200 금융
1157 코스피 200 생활소비재
1158 코스피 200 경기소비재
1159 코스피 200 산업재
1160 코스피 200 헬스케어
1167 코스피 200 중소형주
1182 코스피 200 초대형제외 지수
1224 코스피 200 비중상한 30%
1227 코스피 200 비중상한 25%
1232 코스피 200 비중상한 20%
1244 코스피200제외 코스피지수
1894 코스피 200 TOP 10


In [100]:
df = stock.get_market_ohlcv("20240401","20241010","005930")
print(df.head())

               시가     고가     저가     종가       거래량       등락률
날짜                                                        
2024-04-01  83200  83300  82000  82000  20116513 -0.485437
2024-04-02  82900  85000  82900  85000  37077944  3.658537
2024-04-03  84300  85000  83500  84100  30493347 -1.058824
2024-04-04  85200  85500  84300  85300  25248934  1.426873
2024-04-05  84500  85000  83800  84500  18883752 -0.937866


### 과제
- 난수 생성으로 sleep 시키기
- 종목도 같이 출력

In [101]:
import time
import numpy as np
'''
for ticker in stock.get_market_ticker_list("20201010")[:5]:

  df = stock.get_market_ohlcv("20241009","20241010",ticker)
  df['종목명']
  randint = np.random.randint(1,10)
  time.sleep(randint)
  '''

'\nfor ticker in stock.get_market_ticker_list("20201010")[:5]:\n\n  df = stock.get_market_ohlcv("20241009","20241010",ticker)\n  df[\'종목명\']\n  randint = np.random.randint(1,10)\n  time.sleep(randint)\n  '

In [102]:
# 답안지

'''
import time
import random
for ticker in stock.get_market_ticker_list("20190225")[:5]:
    df = stock.get_market_ohlcv("20181210", "20181212", ticker)
    종목 = stock.get_market_ticker_name(ticker)
    df['종목명'] = 종목
    print(df.head())
    time.sleep(random.randint(10, 20))
'''





'\nimport time\nimport random\nfor ticker in stock.get_market_ticker_list("20190225")[:5]:\n    df = stock.get_market_ohlcv("20181210", "20181212", ticker)\n    종목 = stock.get_market_ticker_name(ticker)\n    df[\'종목명\'] = 종목\n    print(df.head())\n    time.sleep(random.randint(10, 20))\n'

# API 데이터 크롤링

1.   열린 데이터 광장
2.   공공데이터 포털



In [193]:
# 키, 데이터 url은 사이트에서 직접 따오기

SERVICE_KEY = '71527565663130313835635a4d5846'
url = f'http://openapi.seoul.go.kr:8088/{SERVICE_KEY}/json/tbLnOpendataRtmsV/1/5/'

print(url)

http://openapi.seoul.go.kr:8088/71527565663130313835635a4d5846/json/tbLnOpendataRtmsV/1/5/


In [194]:
import requests
import pandas as pd
import json

req = requests.get(url)
print(req)

<Response [200]>


In [195]:
type(req)

In [196]:
content = req.json()
content

{'tbLnOpendataRtmsV': {'list_total_count': 2622592,
  'RESULT': {'CODE': 'INFO-000', 'MESSAGE': '정상 처리되었습니다'},
  'row': [{'RCPT_YR': '2024',
    'CGG_CD': '11350',
    'CGG_NM': '노원구',
    'STDG_CD': '10600',
    'STDG_NM': '중계동',
    'LOTNO_SE': '1',
    'LOTNO_SE_NM': '대지',
    'MNO': '0505',
    'SNO': '0000',
    'BLDG_NM': '롯데(505)',
    'CTRT_DAY': '20241009',
    'THING_AMT': '72000',
    'ARCH_AREA': 72.49,
    'LAND_AREA': 0.0,
    'FLR': 14.0,
    'RGHT_SE': '',
    'RTRCN_DAY': '20241009',
    'ARCH_YR': '1989',
    'BLDG_USG': '아파트',
    'DCLR_SE': '중개거래',
    'OPBIZ_RESTAGNT_SGG_NM': '서울 노원구'},
   {'RCPT_YR': '2024',
    'CGG_CD': '11230',
    'CGG_NM': '동대문구',
    'STDG_CD': '10600',
    'STDG_NM': '장안동',
    'LOTNO_SE': '1',
    'LOTNO_SE_NM': '대지',
    'MNO': '0385',
    'SNO': '0002',
    'BLDG_NM': '리치',
    'CTRT_DAY': '20241008',
    'THING_AMT': '28100',
    'ARCH_AREA': 75.48,
    'LAND_AREA': 0.0,
    'FLR': 3.0,
    'RGHT_SE': '',
    'RTRCN_DAY': '',
    'ARCH_

In [126]:
res = content['tbLnOpendataRtmsV']['row']
len(res)

500

In [127]:
pd.DataFrame(res)

Unnamed: 0,RCPT_YR,CGG_CD,CGG_NM,STDG_CD,STDG_NM,LOTNO_SE,LOTNO_SE_NM,MNO,SNO,BLDG_NM,...,THING_AMT,ARCH_AREA,LAND_AREA,FLR,RGHT_SE,RTRCN_DAY,ARCH_YR,BLDG_USG,DCLR_SE,OPBIZ_RESTAGNT_SGG_NM
0,2024,11350,노원구,10600,중계동,1,대지,0505,0000,롯데(505),...,72000,72.49,0.00,14.0,,20241009,1989,아파트,중개거래,서울 노원구
1,2024,11230,동대문구,10600,장안동,1,대지,0385,0002,리치,...,28100,75.48,0.00,3.0,,,2008,아파트,직거래,
2,2024,11620,관악구,10100,봉천동,1,대지,0869,0006,서울대역 마에스트로,...,15000,20.50,31.64,10.0,,,2012,오피스텔,중개거래,서울 관악구
3,2024,11740,강동구,10200,고덕동,1,대지,0486,0000,아남1,...,114000,84.91,0.00,10.0,,,1996,아파트,중개거래,서울 강동구
4,2024,11530,구로구,10200,구로동,1,대지,1256,0000,구로현대,...,48000,56.49,0.00,5.0,,,1992,아파트,중개거래,서울 양천구
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2024,11215,광진구,10500,자양동,1,대지,0221,0008,신세계빌라,...,23000,26.66,14.00,4.0,,,2002,연립다세대,중개거래,서울 광진구
496,2024,11380,은평구,10700,응암동,1,대지,0759,0000,백련산힐스테이트1차,...,83000,84.21,0.00,2.0,,,2011,아파트,중개거래,서울 은평구
497,2024,11410,서대문구,11900,북가좌동,1,대지,0369,0020,미림주택,...,68500,43.96,21.00,4.0,,,1999,연립다세대,중개거래,서울 은평구
498,2024,11470,양천구,10300,신월동,1,대지,1076,0000,목동센트럴아이파크위브2단지,...,74000,52.85,0.00,7.0,,,2020,아파트,중개거래,서울 양천구


### 반복문을 써서 3000개까지의 데이터를 가져와보기.

In [154]:
import time
SERVICE_KEY = '71527565663130313835635a4d5846'
#url = f'http://openapi.seoul.go.kr:8088/{SERVICE_KEY}/json/tbLnOpendataRtmsV/{index_start}/{index_last}/'

complete_data = []
complete_data = pd.DataFrame(complete_data)
# 위  코드는 complete_data = None으로 대체가 가능하다.

start_time = time.time()

for idx in range(1,5001,1000):

  index_start = idx
  index_last = idx+999

  url = f'http://openapi.seoul.go.kr:8088/{SERVICE_KEY}/json/tbLnOpendataRtmsV/{index_start}/{index_last}/'
  content = requests.get(url).json()
  res = content['tbLnOpendataRtmsV']['row']
  res_df = pd.DataFrame(res)

  complete_data = pd.concat([complete_data, res_df], axis = 0)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"반복문 실행 시간: {elapsed_time:.2f} 초")

반복문 실행 시간: 84.26 초


In [153]:
complete_data.reset_index(drop = True)
complete_data

Unnamed: 0,RCPT_YR,CGG_CD,CGG_NM,STDG_CD,STDG_NM,LOTNO_SE,LOTNO_SE_NM,MNO,SNO,BLDG_NM,...,THING_AMT,ARCH_AREA,LAND_AREA,FLR,RGHT_SE,RTRCN_DAY,ARCH_YR,BLDG_USG,DCLR_SE,OPBIZ_RESTAGNT_SGG_NM
0,2024,11350,노원구,10600,중계동,1,대지,0505,0000,롯데(505),...,72000,72.49,0.00,14.0,,20241009,1989,아파트,중개거래,서울 노원구
1,2024,11230,동대문구,10600,장안동,1,대지,0385,0002,리치,...,28100,75.48,0.00,3.0,,,2008,아파트,직거래,
2,2024,11620,관악구,10100,봉천동,1,대지,0869,0006,서울대역 마에스트로,...,15000,20.50,31.64,10.0,,,2012,오피스텔,중개거래,서울 관악구
3,2024,11740,강동구,10200,고덕동,1,대지,0486,0000,아남1,...,114000,84.91,0.00,10.0,,,1996,아파트,중개거래,서울 강동구
4,2024,11530,구로구,10200,구로동,1,대지,1256,0000,구로현대,...,48000,56.49,0.00,5.0,,,1992,아파트,중개거래,서울 양천구
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2024,11260,중랑구,10400,묵동,1,대지,0233,0086,골드스타빌(233-86),...,25000,26.11,18.00,5.0,,,2017,연립다세대,중개거래,서울 중랑구
2996,2024,11560,영등포구,12600,양평동2가,1,대지,0045,0000,상록수,...,60000,59.67,0.00,16.0,,,1999,아파트,중개거래,서울 영등포구
2997,2024,11590,동작구,10200,상도동,1,대지,0279,0496,하나빌라,...,52000,56.04,33.00,3.0,,,2008,연립다세대,중개거래,서울 동작구
2998,2024,11680,강남구,10700,신사동,1,대지,0607,0009,"현대맨션(22,23동)",...,220000,120.35,110.00,2.0,,,1984,연립다세대,중개거래,"서울 강남구, 서울 용산구"


In [None]:
'''
# 모범답안

import requests
import pandas as pd

KEY = '78577a466b6a686a3539434662496e'
result = None
for i in range(1, 4):
  url = f'http://openapi.seoul.go.kr:8088/{KEY}/json/tbLnOpendataRtmsV/{(i-1) * 1000 + 1}/{i*1000}/'
  print(url)
  req = requests.get(url)
  content = req.json()
  data = pd.DataFrame(content['tbLnOpendataRtmsV']['row'])
  result = pd.concat([result, data])

result.info()

'''

## xml을 json대신 사용하면...?
- 원리를 먼저 파악하자!

In [155]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

SERVICE_KEY = '71527565663130313835635a4d5846'
url = f'http://openapi.seoul.go.kr:8088/{SERVICE_KEY}/xml/tbLnOpendataRtmsV/1/5/2023/11500/강서구/'
req = requests.get(url)
req

<Response [200]>

In [156]:
soup = BeautifulSoup(req.content, "lxml") # html.parser
print(soup.prettify())

<?xml version="1.0" encoding="UTF-8"?>
<html>
 <body>
  <tblnopendatartmsv>
   <list_total_count>
    5095
   </list_total_count>
   <result>
    <code>
     INFO-000
    </code>
    <message>
     정상 처리되었습니다
    </message>
   </result>
   <row>
    <rcpt_yr>
     2023
    </rcpt_yr>
    <cgg_cd>
     11500
    </cgg_cd>
    <cgg_nm>
     강서구
    </cgg_nm>
    <stdg_cd>
     10300
    </stdg_cd>
    <stdg_nm>
     화곡동
    </stdg_nm>
    <lotno_se>
     1
    </lotno_se>
    <lotno_se_nm>
     대지
    </lotno_se_nm>
    <mno>
     1081
    </mno>
    <sno>
     0006
    </sno>
    <bldg_nm>
     드림파크빌
    </bldg_nm>
    <ctrt_day>
     20231230
    </ctrt_day>
    <thing_amt>
     24000
    </thing_amt>
    <arch_area>
     58.26
    </arch_area>
    <land_area>
     33.280000
    </land_area>
    <flr>
     5
    </flr>
    <rght_se>
    </rght_se>
    <rtrcn_day>
    </rtrcn_day>
    <arch_yr>
     2011
    </arch_yr>
    <bldg_usg>
     연립다세대
    </bldg_usg>
    <dclr_se>
     중개거래
  

  soup = BeautifulSoup(req.content, "lxml") # html.parser


In [159]:
years = soup.find_all('rcpt_yr')         # 접수년월
CGG_CDs = soup.find_all('cgg_cd')
len(years), len(CGG_CDs)

(5, 5)

In [160]:
years = soup.find_all('rcpt_yr')         # 접수년월
CGG_CDs = soup.find_all('cgg_cd')

year_list           = []
sgg_cd_list         = []

for year, cgg_cd in zip(years, CGG_CDs):
  year_list.append(year.get_text())
  sgg_cd_list.append(cgg_cd.get_text())

df = pd.DataFrame({
    "rcpt_yr": year_list,
    "cgg_cd" : sgg_cd_list
})

df

Unnamed: 0,rcpt_yr,cgg_cd
0,2023,11500
1,2023,11500
2,2023,11500
3,2023,11500
4,2023,11500


In [163]:
SERVICE_KEY = '71527565663130313835635a4d5846'
url = f'http://apis.data.go.kr/1613000/RTMSDataSvcAptTrade/getRTMSDataSvcAptTrade'
req = requests.get(url)
req

<Response [200]>

In [164]:
soup = BeautifulSoup(req.content, "lxml") # html.parser
print(soup.prettify())

<html>
 <body>
  <openapi_serviceresponse>
   <cmmmsgheader>
    <errmsg>
     SERVICE ERROR
    </errmsg>
    <returnauthmsg>
     SERVICE_KEY_IS_NOT_REGISTERED_ERROR
    </returnauthmsg>
    <returnreasoncode>
     30
    </returnreasoncode>
   </cmmmsgheader>
  </openapi_serviceresponse>
 </body>
</html>



### 사전 제작 코드 활용

In [165]:
#!pip install xmltodict -qq

In [173]:
import requests
import json
import xmltodict
import pandas as pd
serviceKey = 'HRYWiTp0KEDDEYYyUQV9msoe%2B2YK6PPbdhjgspJt5p%2F3YskNrgY3nyiVew4VxbwA6m4F5kRUXNZaEa2Z0kPFVg%3D%3D'
pageNo = 1 # 페이지 번호
numOfRows = 500 # 한 페이지 결과수
LAWD_CD = 11500 # 지역코드
DEAL_YMD = 202101 # 계약월
url = f'http://openapi.molit.go.kr/OpenAPI_ToolInstallPackage/service/rest/RTMSOBJSvc/getRTMSDataSvcAptTradeDev?serviceKey={serviceKey}&pageNo={pageNo}&numOfRows={numOfRows}&LAWD_CD={LAWD_CD}&DEAL_YMD={DEAL_YMD}'
req = requests.get(url)
req

<Response [200]>

In [174]:
req.text

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><response><header><resultCode>99</resultCode><resultMsg>SERVICE KEY IS NOT REGISTERED ERROR.</resultMsg></header></response>'

In [175]:
contents = xmltodict.parse(req.text)
contents

{'response': {'header': {'resultCode': '99',
   'resultMsg': 'SERVICE KEY IS NOT REGISTERED ERROR.'}}}

In [None]:
data = pd.DataFrame(contents['response']['body']['items']['item'])
data.head()

In [176]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Columns: 1000 entries, 0 to 999
dtypes: object(1000)
memory usage: 23.6+ KB


In [177]:
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
count,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
unique,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
top,"{'RCPT_YR': '2024', 'CGG_CD': '11350', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11620', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11740', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11530', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11260', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11620', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11545', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11290', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...",...,"{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11230', 'CGG_NM...","{'RCPT_YR': '2024', 'CGG_CD': '11350', 'CGG_NM..."
freq,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


# 개인공부_오픈API데이터 가져오기

- 전국 휴게소 푸드메뉴 현황 조회 서비스
- https://data.ex.co.kr/openapi/basicinfo/openApiInfoM?apiId=0502
- ***맛있겠다*...**

In [218]:
import requests
import json
import xmltodict
import pandas as pd

SERVICE_KEY = '0170719555'
NumRows = 500
PageNum = 1
url = f"https://data.ex.co.kr/openapi/restinfo/restBestfoodList?key={SERVICE_KEY}&type=json&numOfRows={NumRows}&pageNo={PageNum}"

content2 = requests.get(url).json()

In [219]:
data = pd.DataFrame(content2['list'])
data.columns

Index(['pageNo', 'numOfRows', 'stdRestCd', 'stdRestNm', 'lsttmAltrUser',
       'lsttmAltrDttm', 'svarAddr', 'routeCd', 'routeNm', 'seq', 'foodNm',
       'foodCost', 'etc', 'recommendyn', 'seasonMenu', 'bestfoodyn',
       'premiumyn', 'app', 'restCd', 'foodMaterial', 'lastId', 'lastDtime'],
      dtype='object')

In [220]:
# drop unusing columns
no_need_cols = ['pageNo', 'numOfRows', 'lsttmAltrUser', 'lsttmAltrDttm', 'app', 'lastId','lastDtime']
data2 = data.copy()
data2 = data2.drop(columns=no_need_cols)
data2.tail(1)


Unnamed: 0,stdRestCd,stdRestNm,svarAddr,routeCd,routeNm,seq,foodNm,foodCost,etc,recommendyn,seasonMenu,bestfoodyn,premiumyn,restCd,foodMaterial
98,1,서울만남(부산)휴게소,서울 서초구 원지동10-16,10,경부선,13956,소고기짬뽕곱배기,14000,,N,4,N,N,S000001,


## 반복문을 통해 큰 데이터프레임으로 만들기

In [227]:
import time

Base_dat = None
no_need_cols = ['pageNo', 'numOfRows', 'lsttmAltrUser', 'lsttmAltrDttm', 'app', 'lastId','lastDtime', 'foodMaterial']

start_time = time.time()

for i in range(1, 10):
  SERVICE_KEY = '0170719555'
  NumRows = 100
  PageNum = i
  url = f"https://data.ex.co.kr/openapi/restinfo/restBestfoodList?key={SERVICE_KEY}&type=json&numOfRows={NumRows}&pageNo={PageNum}"

  content = requests.get(url).json()
  data = pd.DataFrame(content['list'])

  data = data.drop(columns = no_need_cols)
  Base_dat = pd.concat([Base_dat, data], axis = 0)

end_time = time.time()
total_time = end_time - start_time
print(f"반복문 실행 시간: {total_time:.2f} 초")

Base_dat = Base_dat.reset_index(drop = True)



반복문 실행 시간: 8.69 초


In [228]:
Base_dat.head(1)

Unnamed: 0,stdRestCd,stdRestNm,svarAddr,routeCd,routeNm,seq,foodNm,foodCost,etc,recommendyn,seasonMenu,bestfoodyn,premiumyn,restCd
0,1,서울만남(부산)휴게소,서울 서초구 원지동10-16,10,경부선,272,농심어묵우동,7000,부산어묵꼬치를 첨가하여 우동만의 시원하고 담백한 맛에 어묵의 식감과 향을 함께 즐길...,N,4,N,N,S000001
