In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup as bs
from sklearn.model_selection import train_test_split
from urllib.request import urlopen

## 데이터 형 변환

In [2]:
def del_comma(st):
    """Return ``st`` with every ',' character removed."""
    # Dropping the separators directly gives the same text as splitting on
    # ',' and gluing the pieces back together.
    return st.replace(',', '')

def str2fl(st):
    """Convert ``st`` to a float using the built-in ``float`` constructor."""
    number = float(st)
    return number

## Naver에서 데이터 크롤링

In [3]:
def get_data(data_len, stock_num, filename = False):
    """Scrape per-date trading data for one stock from Naver Finance.

    Pages of ``frgn.nhn`` are downloaded one at a time. Every table row that
    contains a ``<span>`` element is parsed into five numeric columns keyed
    by the row's date (with '.' replaced by '-').

    Args:
        data_len: Requested amount of data. It is converted with ``int`` and
            floor-divided by 20 to obtain the number of pages to download,
            so any remainder is dropped and values below 20 download nothing.
            (Presumably each page lists 20 rows -- confirm against the site.)
        stock_num: Stock code inserted into the ``code`` query parameter.
        filename: When truthy, the result is also written to
            ``data/<filename>.csv``. This function does not create the
            ``data`` directory. When falsy (the default) nothing is written.

    Returns:
        pandas.DataFrame sorted by its date index with the columns
        '종가' (closing price), '거래량' (trading volume),
        '기관매매량' (institutional trading), '외국인매매량' (foreign trading)
        and '외국인보유율' (foreign holding ratio).

    Raises:
        ValueError: If a downloaded page does not contain the expected table.
    """
    page_count = int(data_len) // 20
    table_summary = '외국인 기관 순매매 거래량에 관한표이며 날짜별로 정보를 제공합니다.'

    date_list = []
    price_list = []
    trade_volume_list = []
    organization_trading_list = []
    foreign_trading_list = []
    foreign_pct_list = []
    for n in range(page_count):
        url = f'http://finance.naver.com/item/frgn.nhn?code={stock_num}&page={n+1}'
        # Close the HTTP response as soon as the page body has been read.
        with urlopen(url) as html:
            source = bs(html.read(), 'html.parser')
        section = source.find('table', summary = table_summary)
        if section is None:
            # Fail with a message naming the page instead of an
            # AttributeError on None further down.
            raise ValueError(f'trading table not found in {url}')
        sr_lists = section.find_all('tr')
        print(f'데이터 추출 중...({n+1}/{page_count})')
        for row in sr_lists:
            # Only rows that contain a <span> are treated as data rows.
            if row.span is None:
                continue
            date_list.append(row.td.text.replace('.', '-'))
            # Look up the numeric cells once per row instead of once per column.
            nums = row.find_all('td', class_ = 'num')
            price_list.append(nums[0].text)
            trade_volume_list.append(nums[3].text)
            organization_trading_list.append(nums[4].text)
            foreign_trading_list.append(nums[5].text)
            # Drop the last character of the cell text before conversion
            # (presumably a '%' sign -- confirm against the page).
            foreign_pct_list.append(nums[7].text[:-1])
    print('데이터 추출 완료')
    print('데이터 가공 중...')

    price_list = pd.Series(price_list, index = date_list).map(del_comma).map(str2fl)
    trade_volume_list = pd.Series(trade_volume_list, index = date_list).map(del_comma).map(str2fl)
    organization_trading_list = pd.Series(organization_trading_list, index = date_list).map(del_comma).map(str2fl)
    foreign_trading_list = pd.Series(foreign_trading_list, index = date_list).map(del_comma).map(str2fl)
    foreign_pct_list = pd.Series(foreign_pct_list, index = date_list).map(str2fl)

    print('데이터 병합 중...')
    result = pd.DataFrame([price_list, trade_volume_list,organization_trading_list, foreign_trading_list, foreign_pct_list], 
                          index = ['종가', '거래량','기관매매량', '외국인매매량', '외국인보유율']).T.sort_index()
    if not filename:
        print('데이터 로드 완료')
    else:
        csv_path = f'data/{filename}.csv'
        result.to_csv(csv_path, mode = 'w')
        print('데이터 저장 완료')
    return result

## 데이터 정규화
> 평균: 0, 분산: 1

In [4]:
def normalize(data, values = False):
    """Standardize ``data`` along axis 0: subtract the mean, divide by the std.

    Args:
        data: A list or tuple (converted to a NumPy array first), or any
            object that already provides ``mean(axis=0)``, ``std(axis=0)``
            and element-wise arithmetic; such objects are used as-is.
        values: When truthy, the statistics used are returned as well.

    Returns:
        The standardized data, or ``(standardized, [mean, std])`` when
        ``values`` is truthy. ``[mean, std]`` is the pair expected by
        ``denormalize``.

    Note:
        There is no guard against a zero standard deviation; constant input
        is divided by zero.
    """
    # Plain Python sequences have no .mean()/.std(); convert them up front.
    # isinstance also covers tuples and list subclasses.
    if isinstance(data, (list, tuple)):
        data = np.array(data)

    mean = data.mean(axis = 0)
    std = data.std(axis = 0)
    result = (data - mean) / std

    if values:
        return result, [mean, std]
    return result

## 데이터 비정규화

In [5]:
def denormalize(data, figs, ndigits = -1):
    """Map standardized values back to the original scale and round them.

    Args:
        data: Iterable of standardized values.
        figs: ``[mean, std]`` pair as returned by
            ``normalize(..., values=True)``.
        ndigits: Passed to the built-in ``round``. The default ``-1`` rounds
            to the nearest ten, matching the previous hard-coded behaviour;
            pass another value for series where that is too coarse.

    Returns:
        list of ``round(value * std + mean, ndigits)`` for each value.
    """
    mean, std = figs[0], figs[1]
    return [round(value * std + mean, ndigits) for value in data]

## 데이터 편집

In [6]:
def windowing(prices, values, window_size):
    """Build sliding windows of [price, value] pairs and their next-price targets.

    Window ``k`` holds the pairs at positions ``k .. k + window_size - 1``;
    its target is the price at position ``k + window_size``.

    Returns:
        (x, y): ``x`` is a list of windows (each a list of ``[price, value]``
        pairs) and ``y`` is the list of target prices, one per window.
    """
    n_windows = len(prices) - window_size
    x = [
        [[prices[start + offset], values[start + offset]] for offset in range(window_size)]
        for start in range(n_windows)
    ]
    y = [prices[start + window_size] for start in range(n_windows)]
    return x, y

In [7]:
def set_data(
    raw_df, prices_idx = 0, added_idx = -1, 
    window_sizer = 10, test_rate = 0.5, values = False
    ):
    """Turn two columns of ``raw_df`` into windowed train/test arrays.

    The columns at positions ``prices_idx`` and ``added_idx`` are normalized
    with ``normalize``, windowed with ``windowing`` and split in order
    (no shuffling) with ``train_test_split``.

    Returns:
        * ``values == False``: ``(x_train, x_test, y_train, y_test)``.
        * ``values == True``: ``(prices, added, p_figs, a_figs)`` -- the raw
          (non-normalized) column lists and the ``[mean, std]`` pair of each.
          The windowing and split are still computed in this mode.
        * any other ``values``: falls through and returns ``None``.
    """
    price_column = raw_df.columns[prices_idx]
    prices = list(raw_df[price_column])
    added_column = raw_df.columns[added_idx]
    added = list(raw_df[added_column])

    norm_prices, p_figs = normalize(prices, values = True)
    norm_added, a_figs = normalize(added, values = True)

    windows, targets = windowing(norm_prices, norm_added, window_size = window_sizer)
    features = np.array(windows)
    labels = np.array(targets)

    # shuffle = False keeps the chronological order of the windows.
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size = test_rate, shuffle = False)

    if values == False:
        return x_train, x_test, y_train, y_test
    elif values == True:
        return prices, added, p_figs, a_figs

In [8]:
def to_1d_list(data):
    """Return a list holding element ``[0]`` of each item ``data[0] .. data[len(data) - 1]``."""
    return [data[position][0] for position in range(len(data))]