In [1]:
!pip install pykrx



In [2]:
from pykrx import stock
from pykrx import bond

import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

from datetime import datetime, timedelta

In [4]:
# ticker 조회
tickers = stock.get_market_ticker_list()
print(tickers)

['095570', '006840', '027410', '282330', '138930', '001460', '001465', '001040', '079160', '00104K', '000120', '011150', '011155', '001045', '097950', '097955', '000590', '012030', '016610', '005830', '000990', '139130', '001530', '000210', '001880', '000215', '375500', '37550L', '37550K', '007340', '004840', '155660', '069730', '017940', '365550', '383220', '007700', '114090', '078930', '006360', '001250', '007070', '078935', '012630', '039570', '089470', '294870', '009540', '267250', '267270', '322000', '042670', '267260', '329180', '097230', '014790', '003580', '204320', '060980', '011200', '082740', '035000', '003560', '175330', '234080', '001060', '001067', '001065', '096760', '105560', '432320', '002380', '344820', '009070', '009440', '119650', '092220', '003620', '016380', '001390', '033180', '001940', '025000', '092230', '000040', '044450', '030200', '033780', '058850', '058860', '093050', '003550', '034220', '051900', '051905', '373220', '003555', '032640', '011070', '066570',

In [5]:
# 특정 ticker에 해당하는 종목명 조회
for ticker in tickers:
    event_name = stock.get_market_ticker_name(ticker)
    if event_name in '삼성전자':
        print(f"ticker: {ticker}, 종목명: {event_name}")

ticker: 005930, 종목명: 삼성전자


In [6]:
# 특정 종목에 대한 1년치 주가 기록 추출 2018년~2023년
df = stock.get_market_ohlcv("20180711", "20230711", "005930")
len(df)

1234

In [7]:
df.reset_index(inplace=True)

In [8]:
# 타임 스탬프 확인
df['날짜'][0]

Timestamp('2018-07-11 00:00:00')

In [9]:
df.head(6)

Unnamed: 0,날짜,시가,고가,저가,종가,거래량,거래대금,등락률
0,2018-07-11,46400,46450,45400,46000,11224077,515223640592,-0.65
1,2018-07-12,45900,46250,45450,45500,11828104,541036686150,-1.09
2,2018-07-13,45800,46500,45750,46500,11543389,533402245850,2.2
3,2018-07-16,46800,46800,46000,46050,7678719,356079488741,-0.97
4,2018-07-17,46150,46200,45600,45850,8892953,408291768123,-0.43
5,2018-07-18,46700,47200,46450,46550,10952645,511712248050,1.53


In [10]:
df.tail(10)

Unnamed: 0,날짜,시가,고가,저가,종가,거래량,거래대금,등락률
1224,2023-06-28,72600,72700,72000,72700,8783093,635516679700,0.14
1225,2023-06-29,73100,73400,72400,72400,12229967,891731369446,-0.41
1226,2023-06-30,72500,72700,71700,72200,11694765,844353820176,-0.28
1227,2023-07-03,72700,73200,72600,73000,10722181,782251095800,1.11
1228,2023-07-04,73400,73600,72900,73000,10214350,747624440200,0.0
1229,2023-07-05,73000,73300,71900,72000,12310610,889637363400,-1.37
1230,2023-07-06,71900,72400,71500,71600,14777667,1061491980700,-0.56
1231,2023-07-07,71100,71400,69800,69900,17308877,1215404338500,-2.37
1232,2023-07-10,70000,70400,69200,69500,11713926,816772079400,-0.57
1233,2023-07-11,70200,71500,70100,71500,12177392,863673766650,2.88


# minmax scailing 하기

In [11]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

def scaling_standard(dataframe):
    scaler = StandardScaler()
    scale_cols = ['시가','고가', '저가', '종가', '거래량','거래대금','등락률']
    df_scaled = scaler.fit_transform(dataframe[scale_cols])
    df_scaled = pd.DataFrame(df_scaled)
    df_scaled.columns = scale_cols
    for i in df_scaled.columns:
        df[i]=df_scaled[i]
    return df

In [12]:
scaling_standard(df)

Unnamed: 0,날짜,시가,고가,저가,종가,거래량,거래대금,등락률
0,2018-07-11,-1.076539,-1.108372,-1.117096,-1.108039,-0.557094,-0.753284,-0.439072
1,2018-07-12,-1.114861,-1.123622,-1.113238,-1.146470,-0.481509,-0.711008,-0.715858
2,2018-07-13,-1.122525,-1.104560,-1.090093,-1.069608,-0.517137,-0.723511,1.353747
3,2018-07-16,-1.045882,-1.081686,-1.070806,-1.104196,-1.000743,-1.013932,-0.640371
4,2018-07-17,-1.095700,-1.127434,-1.101666,-1.119568,-0.848799,-0.928418,-0.300679
...,...,...,...,...,...,...,...,...
1229,2023-07-05,0.962147,0.938873,0.927361,0.890377,-0.421131,-0.140067,-0.891995
1230,2023-07-06,0.877841,0.870250,0.896501,0.859632,-0.112415,0.141398,-0.382457
1231,2023-07-07,0.816527,0.794003,0.765347,0.728966,0.204328,0.393476,-1.521054
1232,2023-07-10,0.732220,0.717755,0.719058,0.698222,-0.495797,-0.259406,-0.388747


In [13]:
df.drop(columns='날짜',inplace=True)

In [14]:
df

Unnamed: 0,시가,고가,저가,종가,거래량,거래대금,등락률
0,-1.076539,-1.108372,-1.117096,-1.108039,-0.557094,-0.753284,-0.439072
1,-1.114861,-1.123622,-1.113238,-1.146470,-0.481509,-0.711008,-0.715858
2,-1.122525,-1.104560,-1.090093,-1.069608,-0.517137,-0.723511,1.353747
3,-1.045882,-1.081686,-1.070806,-1.104196,-1.000743,-1.013932,-0.640371
4,-1.095700,-1.127434,-1.101666,-1.119568,-0.848799,-0.928418,-0.300679
...,...,...,...,...,...,...,...
1229,0.962147,0.938873,0.927361,0.890377,-0.421131,-0.140067,-0.891995
1230,0.877841,0.870250,0.896501,0.859632,-0.112415,0.141398,-0.382457
1231,0.816527,0.794003,0.765347,0.728966,0.204328,0.393476,-1.521054
1232,0.732220,0.717755,0.719058,0.698222,-0.495797,-0.259406,-0.388747


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   시가      1234 non-null   float64
 1   고가      1234 non-null   float64
 2   저가      1234 non-null   float64
 3   종가      1234 non-null   float64
 4   거래량     1234 non-null   float64
 5   거래대금    1234 non-null   float64
 6   등락률     1234 non-null   float64
dtypes: float64(7)
memory usage: 67.6 KB


# sequence data 만들기

In [16]:
def sequnce_data(dataset):
    feature=dataset.copy() 
    feature=feature[:-1] # 현재 data로 -> 다음날 종가를 예측 
    target=dataset.copy()
    target=target['종가'][1:]
    return feature.values,target.values
####
#     날짜	   시가	 고가	   저가     종가	 거래량	  거래대금	   등락률
#	2018-07-11	46400	46450	45400	46000	11224077	515223640592	-0.65 -> 7/12 일 종가 45500가 타겟이되게 지정
#	2018-07-12	45900	46250	45450	45500	11828104	541036686150	-1.09


In [17]:
feature, target=sequnce_data(df)

In [18]:
feature.shape, target.shape

((1233, 7), (1233,))

In [19]:
target[4]

-1.0657645664703577

In [20]:
feature[:2]

array([[-1.0765394 , -1.10837233, -1.11709555, -1.10803874, -0.55709397,
        -0.75328431, -0.43907212],
       [-1.11486058, -1.12362183, -1.11323809, -1.14646981, -0.48150901,
        -0.71100754, -0.71585823]])

# window size 만들기

In [21]:
len(feature)

1233

In [22]:
len(target)

1233

In [23]:
def window_data(feature,target,num):
    window=num
    seq_feature=[]
    seq_target=[]
    for i in range(len(target)-window+1):
            seq_feature.append(feature[i:i+window])
            seq_target.append(target[i+window-1])
    return seq_feature,seq_target

In [24]:
seq_feature,seq_target=window_data(feature,target,5)

In [25]:
seq_feature[0]

array([[-1.0765394 , -1.10837233, -1.11709555, -1.10803874, -0.55709397,
        -0.75328431, -0.43907212],
       [-1.11486058, -1.12362183, -1.11323809, -1.14646981, -0.48150901,
        -0.71100754, -0.71585823],
       [-1.12252481, -1.10455995, -1.0900933 , -1.06960767, -0.51713684,
        -0.72351127,  1.35374675],
       [-1.04588246, -1.0816857 , -1.07080597, -1.10419564, -1.00074255,
        -1.01393163, -0.64037112],
       [-1.09569999, -1.1274342 , -1.10166569, -1.11956806, -0.84879931,
        -0.92841802, -0.30067911]])

In [26]:
seq_target[0]

-1.0657645664703577

In [27]:
len(seq_feature)

1229

In [28]:
type(seq_feature)

list

In [29]:
seq_feature[0],seq_target[0]

(array([[-1.0765394 , -1.10837233, -1.11709555, -1.10803874, -0.55709397,
         -0.75328431, -0.43907212],
        [-1.11486058, -1.12362183, -1.11323809, -1.14646981, -0.48150901,
         -0.71100754, -0.71585823],
        [-1.12252481, -1.10455995, -1.0900933 , -1.06960767, -0.51713684,
         -0.72351127,  1.35374675],
        [-1.04588246, -1.0816857 , -1.07080597, -1.10419564, -1.00074255,
         -1.01393163, -0.64037112],
        [-1.09569999, -1.1274342 , -1.10166569, -1.11956806, -0.84879931,
         -0.92841802, -0.30067911]]),
 -1.0657645664703577)

# dataset 만들기 

In [30]:
import torch
from torch.utils.data import Dataset
class MakeDataset(Dataset):

    def __init__(self,feature,target):
        super().__init__()
        self.feature = feature
        self.target = target
        self.feature=torch.Tensor(self.feature)
        self.target=torch.Tensor(self.target)
    def __len__(self):
        return self.feature.shape[0]

    def __getitem__(self, idx:int):
        return self.feature[idx],self.target[idx]

In [31]:
dataset_dummy= MakeDataset(seq_feature,seq_target)

  self.feature=torch.Tensor(self.feature)


In [32]:
type(dataset_dummy.feature)

torch.Tensor

In [33]:
dataset_dummy.feature.shape,dataset_dummy.target.shape
# 전체 데이터 갯수,한개에 5개의 데이터가 있고 데이터안에 7개 컬럼이있음 

(torch.Size([1229, 5, 7]), torch.Size([1229]))

In [34]:
dataset_dummy.feature[0].shape

torch.Size([5, 7])

# Dataloader 만들기

In [35]:
from torch.utils.data import DataLoader

In [36]:
data_loader = torch.utils.data.DataLoader(dataset_dummy,batch_size=10,shuffle=False,drop_last=True)

In [37]:
# data set 나오는거 확인
next(iter(data_loader))[0].shape
next(iter(data_loader))[1].shape
# batch,data갯수,컬럼 갯수 

torch.Size([10])

In [None]:
7개를 embedding_size 에 넣어준다

In [38]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# LSTM 모델 만들기

In [46]:
input_dim

NameError: name 'input_dim' is not defined

In [52]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
class LSTM_Model(torch.nn.Module):
    def __init__(self,input_dim,device='cpu',hidden_dim=64,layer_dim=3,output_layer=1):
        super().__init__()
        self.device = device
        self.layer_dim=layer_dim
        self.hidden_dim=hidden_dim
        self.lstm = nn.LSTM(input_size=input_dim,hidden_size=hidden_dim,num_layers=self.layer_dim, dropout=0.3, bidirectional=True)
        self.linear = nn.Linear(hidden_dim, output_layer, bias=True)
        self.relu = nn.ReLU()
    def forward(self, x):
        scaler = 2 if self.lstm.bidirectional == True else 1 # Bi-LSTM(2), LSTM(1)
        # 초기화
        hidden_state = torch.zeros(self.layer_dim*scaler, x.size(0),
                                  self.hidden_dim, requires_grad=True).to(self.device)
        cell_state = torch.zeros(self.layer_dim*scaler, x.size(0),
                                  self.hidden_dim, requires_grad=True).to(self.device)
        out, (hidden_state, cell_state) =self.lstm(x,(hidden_state,cell_state))
        h = hidden_state[-1]
        h=self.relu(h)
        predict = self.linear(h)
        
        return predict

In [53]:
model = LSTM_Model(len(dataset_dummy)).to(device)

In [54]:
model

LSTM_Model(
  (lstm): LSTM(1229, 64, num_layers=3, dropout=0.3, bidirectional=True)
  (linear): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
)

In [55]:
model(next(iter(data_loader))[0].to(device)).shape

RuntimeError: input.size(-1) must be equal to input_size. Expected 1229, got 7

# 학습시 사용할 분리 셋 만들기

In [253]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(seq_feature, seq_target,test_size=0.3, random_state=0)