기준금리 크롤링

In [1]:
import requests
import datetime
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

base_url= "https://www.bok.or.kr/portal/singl/baseRate/list.do?dataSeCd=01&menuNo=200643"
response = requests.get(base_url, 'html.parser')
soup= BeautifulSoup(response.text, 'lxml')

In [2]:
dict_list= []
# 53
for i in range(53):
    dict_list.append({
        'date': pd.to_datetime(datetime.datetime.strptime(soup.find_all('td')[3*i].text + soup.find_all('td')[3*i+1].text, '%Y%m월 %d일')),
        'RATE': float(soup.find_all('td')[3*i+2].text)
    })

In [3]:
pd_result= pd.DataFrame(dict_list)

# date range 일별 추가
idx= pd.date_range(pd_result.date.min(), pd_result.date.max())
idx= pd.Series(idx)

In [4]:
# 없는 행에 Nan을 넣어 병합한 후 Nan값을 이전 값으로 메꾸기
baserate= pd.concat([pd.DataFrame({'date': idx[~idx.isin(pd_result.date)], 'RATE': np.nan}),pd_result]).sort_values('date').reset_index(drop=True).ffill(axis=0)

In [5]:
baserate

Unnamed: 0,date,RATE
0,2000-02-10,5.00
1,2000-02-11,5.00
2,2000-02-12,5.00
3,2000-02-13,5.00
4,2000-02-14,5.00
...,...,...
8319,2022-11-20,3.00
8320,2022-11-21,3.00
8321,2022-11-22,3.00
8322,2022-11-23,3.00


In [6]:
baserate = baserate.merge(baserate.assign(date = baserate.date+pd.Timedelta(days=30)), 
               on='date',
               how='left', suffixes=['', '_30days'])


In [7]:
baserate.dropna(inplace=True)

In [8]:
baserate

Unnamed: 0,date,RATE,RATE_30days
30,2000-03-11,5.00,5.0
31,2000-03-12,5.00,5.0
32,2000-03-13,5.00,5.0
33,2000-03-14,5.00,5.0
34,2000-03-15,5.00,5.0
...,...,...,...
8319,2022-11-20,3.00,3.0
8320,2022-11-21,3.00,3.0
8321,2022-11-22,3.00,3.0
8322,2022-11-23,3.00,3.0


In [9]:
baserate['change'] = np.select(
    condlist=[
        (baserate.RATE > baserate.RATE_30days), 
        (baserate.RATE == baserate.RATE_30days),
    ], 
    choicelist=[
        1,
        0,
    ],
    default=-1
)

In [10]:
baserate

Unnamed: 0,date,RATE,RATE_30days,change
30,2000-03-11,5.00,5.0,0
31,2000-03-12,5.00,5.0,0
32,2000-03-13,5.00,5.0,0
33,2000-03-14,5.00,5.0,0
34,2000-03-15,5.00,5.0,0
...,...,...,...,...
8319,2022-11-20,3.00,3.0,0
8320,2022-11-21,3.00,3.0,0
8321,2022-11-22,3.00,3.0,0
8322,2022-11-23,3.00,3.0,0


In [11]:
baserate.to_csv('./baserate.csv')

콜금리

In [12]:
call_url= "https://finance.naver.com/marketindex/interestDailyQuote.nhn?marketindexCd=IRR_CALL&page={}"
page_num= 622 # 2005년 5월~

In [13]:
call_list= []

for i in range(1,page_num+1):
    call_list.append(BeautifulSoup(requests.get(call_url.format(i), 'html.parser').text,'lxml').find_all('td'))

In [14]:
result_list= []

for list in call_list:
    
    for i in range(len(list)//4):
        result_list.append({
            'date': list[4*i].text.strip(),
            'RATE': float(list[4*i+1].text)
        })

In [15]:
result_list

[{'date': '2022.11.23', 'RATE': 3.08},
 {'date': '2022.11.22', 'RATE': 3.07},
 {'date': '2022.11.21', 'RATE': 3.04},
 {'date': '2022.11.18', 'RATE': 3.02},
 {'date': '2022.11.17', 'RATE': 3.02},
 {'date': '2022.11.16', 'RATE': 3.03},
 {'date': '2022.11.15', 'RATE': 3.03},
 {'date': '2022.11.14', 'RATE': 3.07},
 {'date': '2022.11.11', 'RATE': 3.07},
 {'date': '2022.11.10', 'RATE': 3.09},
 {'date': '2022.11.09', 'RATE': 2.98},
 {'date': '2022.11.08', 'RATE': 2.99},
 {'date': '2022.11.07', 'RATE': 2.97},
 {'date': '2022.11.04', 'RATE': 2.99},
 {'date': '2022.11.03', 'RATE': 3.04},
 {'date': '2022.11.02', 'RATE': 3.03},
 {'date': '2022.11.01', 'RATE': 3.08},
 {'date': '2022.10.31', 'RATE': 3.19},
 {'date': '2022.10.28', 'RATE': 3.04},
 {'date': '2022.10.27', 'RATE': 3.07},
 {'date': '2022.10.26', 'RATE': 3.12},
 {'date': '2022.10.25', 'RATE': 3.14},
 {'date': '2022.10.24', 'RATE': 3.13},
 {'date': '2022.10.21', 'RATE': 3.1},
 {'date': '2022.10.20', 'RATE': 3.13},
 {'date': '2022.10.19', 'R

In [27]:
pd_result2= pd.DataFrame(result_list)
pd_result2.date= pd_result2.date.astype('datetime64')

In [28]:
# 원하는 전체 date range
idx2= pd.date_range(pd_result2.date.min(), pd_result2.date.max())
idx2= pd.Series(idx2)

In [29]:
# 없는 행에 Nan을 넣어 병합한 후 Nan값을 이전 값으로 메꾸기
callrate= pd.concat([pd.DataFrame({'date': idx2[~idx2.isin(pd_result2.date)], 'RATE': np.nan}),pd_result2]).sort_values('date').reset_index(drop=True).ffill(axis=0)

In [30]:
callrate

Unnamed: 0,date,RATE
0,2005-05-09,3.29
1,2005-05-10,3.30
2,2005-05-11,3.30
3,2005-05-12,3.30
4,2005-05-13,3.29
...,...,...
6403,2022-11-19,3.02
6404,2022-11-20,3.02
6405,2022-11-21,3.04
6406,2022-11-22,3.07


In [31]:
callrate = callrate.merge(callrate.assign(date = callrate.date+pd.Timedelta(days=31)), 
               on='date',
               how='left', suffixes=['', '_30days_ago'])


In [32]:
callrate.dropna(inplace=True)

In [33]:
callrate

Unnamed: 0,date,RATE,RATE_30days_ago
31,2005-06-09,3.28,3.29
32,2005-06-10,3.29,3.30
33,2005-06-11,3.29,3.30
34,2005-06-12,3.29,3.30
35,2005-06-13,3.28,3.29
...,...,...,...
6403,2022-11-19,3.02,3.07
6404,2022-11-20,3.02,3.13
6405,2022-11-21,3.04,3.10
6406,2022-11-22,3.07,3.10


In [34]:
# calc_rate_change(callrate)

In [35]:
callrate['change'] = np.select(
    condlist=[
        (callrate.RATE > callrate.RATE_30days_ago), 
        (callrate.RATE == callrate.RATE_30days_ago),
    ], 
    choicelist=[
        'up',
        'same',
    ],
    default='down'
)

In [36]:
callrate

Unnamed: 0,date,RATE,RATE_30days_ago,change
31,2005-06-09,3.28,3.29,down
32,2005-06-10,3.29,3.30,down
33,2005-06-11,3.29,3.30,down
34,2005-06-12,3.29,3.30,down
35,2005-06-13,3.28,3.29,down
...,...,...,...,...
6403,2022-11-19,3.02,3.07,down
6404,2022-11-20,3.02,3.13,down
6405,2022-11-21,3.04,3.10,down
6406,2022-11-22,3.07,3.10,down


In [37]:
callrate.to_csv('./callrate.csv',  index=False)

In [38]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
a=nltk
fdist = nltk.FreqDist(merge_file) #up_ngram_list, down_ngram_list
count_df = pd.DataFrame(list(zip(fdist.keys(), fdist.values())).set_index(0))



NameError: name 'nltk' is not defined