In [1]:
import datetime as dt
from urllib.request import urlopen  
import bs4
import re
import pandas as pd

# Data Crawling

In [2]:
# 크롤링 차단 장치 우회를 위해 웹사이트 호출 시 headers 값을 추가
import urllib.request
headers = {
    'User-Agent': 'Mozilla/5.0',
    'X-Requested-With': 'XMLHttpRequest',
}

## 구성종목 기본정보

In [3]:
'''
한국거래소 시가총액 상위 10종목 (2019년1월 기준)
005930	삼성전자
000660	SK하이닉스
068270	셀트리온
005380	현대차
207940	삼성바이오로직스
051910	LG화학
005490	POSCO
035420	NAVER
017670	SK텔레콤
015760	한국전력
'''
k10_component = ['005930', '000660', '068270', '005380', '207940',\
                 '051910', '005490', '035420', '017670', '015760']

In [4]:
# 구성종목 기본정보
# stock_cd = stock_code
def stock_info(stock_cd):
#     url_float = 'http://companyinfo.stock.naver.com/v1/company/c1010001.aspx?cmp_cd=' + stock_cd
    url_float = 'https://navercomp.wisereport.co.kr/v2/company/c1010001.aspx?cmp_cd=' + stock_cd
    source = urlopen(url_float).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    
    tmp = soup.find(id='cTB11').find_all('tr')[6].td.text
    tmp = tmp.replace('\r', '')
    tmp = tmp.replace('\n', '')
    tmp = tmp.replace('\t', '')
    
    tmp = re.split('/', tmp)

    # 발행주식 수
    outstanding = tmp[0].replace(',', '')
    outstanding = outstanding.replace('주', '')
    outstanding = outstanding.replace(' ', '')
    outstanding = int(outstanding)

    # 유동비율
    floating = tmp[1].replace(' ', '')
    floating = floating.replace('%', '')
    floating = float(floating)
    
    # 종목명
    name = soup.find(id='pArea').find('div').find('div').find('tr').find('td').find('span').text
    
    k10_outstanding[stock_cd] = outstanding
    k10_floating[stock_cd] = floating
    k10_name[stock_cd] = name

In [5]:
k10_outstanding = dict()
k10_floating = dict()
k10_name = dict()

for stock_cd in k10_component:
    stock_info(stock_cd)

In [6]:
k10_outstanding

{'005930': 5969782550,
 '000660': 728002365,
 '068270': 146390862,
 '005380': 211531506,
 '207940': 71174000,
 '051910': 70592343,
 '005490': 84571230,
 '035420': 164049085,
 '017670': 218833144,
 '015760': 641964077}

In [7]:
k10_floating

{'005930': 75.83,
 '000660': 73.86,
 '068270': 74.3,
 '005380': 65.26,
 '207940': 24.76,
 '051910': 66.63,
 '005490': 72.35,
 '035420': 83.3,
 '017670': 63.35,
 '015760': 46.08}

In [8]:
k10_name

{'005930': '삼성전자',
 '000660': 'SK하이닉스',
 '068270': '셀트리온',
 '005380': '현대차',
 '207940': '삼성바이오로직스',
 '051910': 'LG화학',
 '005490': 'POSCO홀딩스',
 '035420': 'NAVER',
 '017670': 'SK텔레콤',
 '015760': '한국전력'}

In [9]:
# 과거 시세만 반영
tmp = {'Outstanding' : k10_outstanding,\
       'Floating' : k10_floating,\
       'Name' : k10_name}
k10_info = pd.DataFrame(tmp)
k10_info

Unnamed: 0,Outstanding,Floating,Name
5930,5969782550,75.83,삼성전자
660,728002365,73.86,SK하이닉스
68270,146390862,74.3,셀트리온
5380,211531506,65.26,현대차
207940,71174000,24.76,삼성바이오로직스
51910,70592343,66.63,LG화학
5490,84571230,72.35,POSCO홀딩스
35420,164049085,83.3,NAVER
17670,218833144,63.35,SK텔레콤
15760,641964077,46.08,한국전력


## 종목별 주가
* 2021-01-01 ~ 2021-03-01

In [10]:
def date_format(d):

    d = str(d)
    d = d.replace('/', '-')
    d = d.replace('.', '-')

    yyyy = int(d.split('-')[0])
    if yyyy < 50:
        yyyy = yyyy + 2000
    elif yyyy >=50 and yyyy < 100:
        yyyy = yyyy + 1900
    mm = int(d.split('-')[1])
    dd = int(d.split('-')[2])

    return dt.date(yyyy, mm, dd)

In [11]:
def historical_stock_naver(stock_cd, start_date='', end_date='', page_n=1, last_page=0):
    
    if start_date:
        start_date = date_format(start_date)
    else:
        start_date = dt.date.today()
    if end_date:
        end_date = date_format(end_date)
    else:
        end_date = dt.date.today()
        
    naver_stock = 'http://finance.naver.com/item/sise_day.nhn?code=' + stock_cd + '&page=' + str(page_n)
    
    # 기존 코드
    # source = urlopen(naver_stock).read()
    
    # 개정 코드 (1줄에서 2줄로 늘어남)
    url = urllib.request.Request(naver_stock, headers=headers)   # headers 정보 보내기
    source = urlopen(url).read()
    
    source = bs4.BeautifulSoup(source, 'lxml')
    
    dates = source.find_all('span', class_='tah p10 gray03')   # 날짜 수집   
    prices = source.find_all('td', class_='num')   # 종가 수집
    
    for n in range(len(dates)):
    
        if len(dates) > 0:
            
            # 날짜 처리
            this_date = dates[n].text
            this_date = date_format(this_date)
            
            if this_date <= end_date and this_date >= start_date:   
            # start_date와 end_date 사이에서 데이터 저장
                # 종가 처리
                this_close = prices[n*6].text
                this_close = this_close.replace(',', '')
                this_close = float(this_close)

                # 딕셔너리에 저장
                historical_prices[this_date] = this_close
                              
            elif this_date < start_date:   
            # start_date 이전이면 함수 종료
                return historical_prices              
            
    # 페이지 네비게이션
    if last_page == 0:
        last_page = source.find_all('table')[1].find('td', class_='pgRR').find('a')['href']
        last_page = last_page.split('&')[1]
        last_page = last_page.split('=')[1]
        last_page = float(last_page)
        
    # 다음 페이지 호출
    if page_n < last_page:
        page_n = page_n + 1
        historical_stock_naver(stock_cd, start_date, end_date, page_n, last_page)   
        
    return historical_prices  

In [12]:
k10_historical_prices = dict()

for stock_cd in k10_component:
    
    historical_prices = dict()
    start_date = '2021-01-01'   # 최신 날짜로 변경
    end_date = '2021-03-31'
    historical_prices = historical_stock_naver(stock_cd, start_date, end_date)
    
    k10_historical_prices[stock_cd] = historical_prices

In [13]:
k10_historical_price = pd.DataFrame(k10_historical_prices)
k10_historical_price.sort_index(axis=1, inplace=True)   # 컬럼 재정렬 (Python3.7 업데이트를 반영한 수정 패치)

In [14]:
k10_historical_price = k10_historical_price.fillna(method='ffill')
if k10_historical_price.isnull().values.any():
    k10_historical_price = k10_historical_price.fillna(method='bfill')
k10_historical_price.head(3)

Unnamed: 0,000660,005380,005490,005930,015760,017670,035420,051910,068270,207940
2021-03-31,132500.0,218000.0,320000.0,81400.0,23150.0,275000.0,377000.0,805000.0,324500.0,748000.0
2021-03-30,134500.0,219500.0,322500.0,82200.0,23250.0,268500.0,376500.0,808000.0,323500.0,747000.0
2021-03-29,132000.0,215500.0,327000.0,81600.0,23350.0,269000.0,372000.0,799000.0,330500.0,748000.0


In [15]:
k10_historical_price.sort_index(inplace=True)   # 데이터를 날짜순으로 정렬

In [16]:
k10_historical_price['005930'] = k10_historical_price['005930'] / 50   # 삼성전자 액면분할에 따른 수정주가 계산
k10_historical_price.head(3)

Unnamed: 0,000660,005380,005490,005930,015760,017670,035420,051910,068270,207940
2021-01-04,126000.0,207500.0,273000.0,1660.0,26900.0,237000.0,293000.0,889000.0,347500.0,829000.0
2021-01-05,130500.0,209500.0,285000.0,1678.0,26900.0,246000.0,292500.0,893000.0,354500.0,832000.0
2021-01-06,131000.0,203000.0,280000.0,1644.0,27150.0,250500.0,290000.0,890000.0,353500.0,817000.0


지수 산출을 위한 계산

In [17]:
#k10_historical_mc = 일별 시가총액(Market capitalization)
k10_historical_mc = k10_historical_price * k10_info['Outstanding'] * k10_info['Floating'] * 0.01
k10_historical_mc.tail(3)

Unnamed: 0,000660,005380,005490,005930,015760,017670,035420,051910,068270,207940
2021-03-29,70976740000000.0,29748800000000.0,20008240000000.0,7387878000000.0,6907328000000.0,37291680000000.0,50834870000000.0,37581510000000.0,35947960000000.0,13181770000000.0
2021-03-30,72320990000000.0,30300980000000.0,19732900000000.0,7442201000000.0,6877746000000.0,37222370000000.0,51449810000000.0,38004830000000.0,35186580000000.0,13164140000000.0
2021-03-31,71245590000000.0,30093910000000.0,19579930000000.0,7369771000000.0,6848165000000.0,38123470000000.0,51518140000000.0,37863720000000.0,35295350000000.0,13181770000000.0


In [18]:
k10 = pd.DataFrame()
k10['Market Cap'] = k10_historical_mc.sum(axis=1)
#k10.loc[dt.date.today()] = k10_info['f Market Cap'].sum()
k10.head(3)

Unnamed: 0,Market Cap
2021-01-04,295686900000000.0
2021-01-05,301380200000000.0
2021-01-06,300133800000000.0


In [19]:
# 기준시점
base_date = dt.date(2021, 1, 4)

In [20]:
k10['Market Cap'][base_date]

295686931452423.0

In [21]:
k10['Index'] = k10['Market Cap'] / k10['Market Cap'][base_date] * 100
k10.head(3)

Unnamed: 0,Market Cap,Index
2021-01-04,295686900000000.0,100.0
2021-01-05,301380200000000.0,101.925423
2021-01-06,300133800000000.0,101.503913


In [25]:
k10_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 005930 to 015760
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Outstanding  10 non-null     int64  
 1   Floating     10 non-null     float64
 2   Name         10 non-null     object 
dtypes: float64(1), int64(1), object(1)
memory usage: 620.0+ bytes


In [24]:
k10.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60 entries, 2021-01-04 to 2021-03-31
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Market Cap  60 non-null     float64
 1   Index       60 non-null     float64
dtypes: float64(2)
memory usage: 3.5+ KB


In [28]:
k10_historical_price.tail()

Unnamed: 0,000660,005380,005490,005930,015760,017670,035420,051910,068270,207940
2021-03-25,133000.0,218000.0,308500.0,1624.0,23100.0,253500.0,380000.0,788000.0,307500.0,718000.0
2021-03-26,135000.0,215500.0,318000.0,1630.0,23250.0,274000.0,383000.0,802000.0,314000.0,725000.0
2021-03-29,132000.0,215500.0,327000.0,1632.0,23350.0,269000.0,372000.0,799000.0,330500.0,748000.0
2021-03-30,134500.0,219500.0,322500.0,1644.0,23250.0,268500.0,376500.0,808000.0,323500.0,747000.0
2021-03-31,132500.0,218000.0,320000.0,1628.0,23150.0,275000.0,377000.0,805000.0,324500.0,748000.0


In [31]:
k10_historical_mc.head()

Unnamed: 0,000660,005380,005490,005930,015760,017670,035420,051910,068270,207940
2021-01-04,67750520000000.0,28644430000000.0,16704130000000.0,7514631000000.0,7957479000000.0,32855500000000.0,40039300000000.0,41814720000000.0,37797020000000.0,14609200000000.0
2021-01-05,70170180000000.0,28920520000000.0,17438380000000.0,7596115000000.0,7957479000000.0,34103180000000.0,39970970000000.0,42002860000000.0,38558400000000.0,14662070000000.0
2021-01-06,70439030000000.0,28023230000000.0,17132440000000.0,7442201000000.0,8031433000000.0,34727010000000.0,39629340000000.0,41861750000000.0,38449630000000.0,14397730000000.0
2021-01-07,72320990000000.0,28437360000000.0,17805500000000.0,7505577000000.0,8001851000000.0,37430320000000.0,39561010000000.0,45248320000000.0,38884710000000.0,14432980000000.0
2021-01-08,74202950000000.0,33959180000000.0,18019660000000.0,8039750000000.0,7987060000000.0,36737160000000.0,42635700000000.0,46988640000000.0,39918010000000.0,14750190000000.0


# 펀드 운용 모델★

In [22]:
import numpy as np

In [None]:
CU              = 50000 # 설정/환매단위
base_date       = dt.date(2021,1,4) # 설정 기준일, 2021-01-04
volume          = 1000000  # 최초설정수량
intereste_reate = 0.02   # 이자율

In [29]:
# 설정_환매
def creation_redemption(v):
    creation = np.random.randint(0,5)*CU   # 0~5사이의 임의의 정수
    if v >=50000:                          # 최소 50만좌 이상일 경우에만 환매 
        redemption = np.random.randint(0,5)*CU
    else:
        redemption = 0
    volume = v + creation - redemption     # 총좌수 = 보유좌수 + 설정 - 환매
    return(creation, redemption, volume)

## K10 stock_ratio