In [1]:
# 전처리 동안 사용할 라이브러리 종합
import pandas as pd
import numpy as np

import re
import copy
import datetime
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Plot configuration: keep the ASCII minus sign on axes and select a font for Korean labels.
# NOTE(review): 'AppleGothic' is presumably only installed on macOS — confirm before running elsewhere.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'AppleGothic'
# Apply the seaborn 'darkgrid' style while passing the same font / minus-sign settings again.
sns.set(font="AppleGothic", 
        rc={"axes.unicode_minus":False},
        style='darkgrid')

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the disclosure table and the price table.
# Both files are read as cp949 and use their first column as the index.
company_info = pd.read_csv(
    filepath_or_buffer='../data/preprocessed_data/samsung.csv',
    index_col=0,
    encoding='cp949',
)
stock_price = pd.read_csv(
    filepath_or_buffer='../data/preprocessed_data/samsung_price.csv',
    index_col=0,
    encoding='cp949',
)

In [3]:
# Convert the two date columns to datetimes; they are the join keys of the merge that follows.
for _frame, _date_col in ((company_info, '보고일자'), (stock_price, '날짜')):
    _frame[_date_col] = pd.to_datetime(_frame[_date_col])

In [4]:
# Combine the data: keep every disclosure row (right join) and attach the price row of the same date.
# Rows are ordered newest-first; values missing after the join are then back-filled, i.e. taken
# from the next row below (an older disclosure date).
# `.bfill()` replaces the deprecated `fillna(method='backfill')` spelling.
dataset = (
    pd.merge(stock_price, company_info, how='right', left_on=['날짜'], right_on=['보고일자'])
    .sort_values("보고일자", ascending=False)
    .bfill()
    .reset_index(drop=True)
    .drop(columns=["보고일자"])
)

# Split the date into year / month / day / weekday columns.
# The `.dt` accessors replace the strftime -> split -> int round trip; the int64 cast keeps the
# dtype the original `apply(int)` produced.
dataset["년"] = dataset["날짜"].dt.year.astype("int64")
dataset["월"] = dataset["날짜"].dt.month.astype("int64")
dataset["일"] = dataset["날짜"].dt.day.astype("int64")

# Weekday codes are 0 (Monday) .. 6 (Sunday). Only five categories are declared, so
# `from_codes` raises ValueError if any row falls on a Saturday or Sunday.
weekday_codes = dataset["날짜"].dt.weekday
dataset["요일"] = pd.Categorical.from_codes(weekday_codes, ["월", "화", "수", "목", "금"], ordered=True)

dataset.head()

Unnamed: 0,날짜,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,...,변동사유,특정증권등의 종류,발행주식 총수,변동전,증감,변동후,년,월,일,요일
0,2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,...,신규보고(+),우선주,6792669250,0,100,100,2022,10,24,월
1,2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,...,장내매수(+),우선주,6792669250,100,200,300,2022,10,24,월
2,2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,...,장내매수(+),보통주,5969782550,0,370,370,2022,10,18,화
3,2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,...,장내매수(+),우선주,6792669250,176,211,387,2022,10,18,화
4,2022-10-17,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,...,장내매수(+),보통주,6792669250,600,208,808,2022,10,17,월


In [5]:
# Shape check of the merged frame as (rows, columns).
dataset.shape

(691, 43)

In [6]:
# List the column labels of the merged frame.
dataset.columns

Index(['날짜', '1일 뒤 종가', '2일 뒤 종가', '3일 뒤 종가', '4일 뒤 종가', '5일 뒤 종가', '6일 뒤 종가',
       '7일 뒤 종가', '8일 뒤 종가', '9일 뒤 종가', '10일 뒤 종가', '시가', '고가', '저가', '종가',
       '전일비', '등락률', '거래량', '금액(백만)', '신용비', '개인', '기관', '외인수량', '외국계', '프로그램',
       '외인비', '회사명', '종목코드', '보고구분', '보고자 구분', '임원', '직위명', '주요주주', '변동사유',
       '특정증권등의 종류', '발행주식 총수', '변동전', '증감', '변동후', '년', '월', '일', '요일'],
      dtype='object')

In [7]:
# Preview the first five rows of the merged frame.
dataset.head()

Unnamed: 0,날짜,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,...,변동사유,특정증권등의 종류,발행주식 총수,변동전,증감,변동후,년,월,일,요일
0,2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,...,신규보고(+),우선주,6792669250,0,100,100,2022,10,24,월
1,2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,...,장내매수(+),우선주,6792669250,100,200,300,2022,10,24,월
2,2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,...,장내매수(+),보통주,5969782550,0,370,370,2022,10,18,화
3,2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,...,장내매수(+),우선주,6792669250,176,211,387,2022,10,18,화
4,2022-10-17,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,...,장내매수(+),보통주,6792669250,600,208,808,2022,10,17,월


In [8]:
# Collect the labels of every column whose dtype is the generic object dtype.
cols = [label for label, dtype in dataset.dtypes.items() if dtype == "object"]
cols

['회사명', '보고구분', '보고자 구분', '임원', '직위명', '주요주주', '변동사유', '특정증권등의 종류']

In [9]:
# For each object column, print its frequency table followed by the number of distinct values.
for col in cols:
    counts = dataset[col].value_counts()
    print("=" * 50, "\n")
    print(counts)
    print("\n카테고리 갯수 :", len(counts), "\n")


삼성전자    691
Name: 회사명, dtype: int64

카테고리 갯수 : 1 


변동    467
신규    224
Name: 보고구분, dtype: int64

카테고리 갯수 : 2 


개인(국내)        657
연기금등 전문투자자     23
개인(외국)         11
Name: 보고자 구분, dtype: int64

카테고리 갯수 : 3 


비등기임원    594
등기임원      72
-         25
Name: 임원, dtype: int64

카테고리 갯수 : 3 


상무          317
부사장         147
전무           90
사장           52
사외이사         42
-            25
대표이사 사장       5
부회장           4
대표이사 부회장      2
회장            2
대표이사부회장       2
대표이사회장        1
상임고문          1
 상무           1
Name: 직위명, dtype: int64

카테고리 갯수 : 14 


-          643
10%이상주주     44
사실상지배주주      4
Name: 주요주주, dtype: int64

카테고리 갯수 : 3 


장내매수(+)       235
장내매도(-)       141
주식분할          128
신규선임(+)        94
임원퇴임(-)        48
신규보고(+)        30
기타(+)           5
증여(-)           3
기타(-)           2
계열사편입(+)        1
시간외매매(-)        1
상속(+)           1
피상속(-)          1
주식매수선택권(+)      1
Name: 변동사유, dtype: int64

카테고리 갯수 : 14 


보통주    625
우선주     63
기타       3
Name: 특정증권등의 종류, dtype: int64

카테

In [10]:
# Tidy the job-title column: frequency table with one row per raw title.
# `rename_axis` + `reset_index(name=...)` names both output columns explicitly, so the result is
# ["직위명", "count"] regardless of how the installed pandas labels the value_counts() result
# (the previous column rename produced two "count" columns on pandas >= 2.0).
temp_pos = (
    dataset["직위명"]
    .value_counts()
    .rename_axis("직위명")
    .reset_index(name="count")
)
temp_pos

Unnamed: 0,직위명,count
0,상무,317
1,부사장,147
2,전무,90
3,사장,52
4,사외이사,42
5,-,25
6,대표이사 사장,5
7,부회장,4
8,대표이사 부회장,2
9,회장,2


In [11]:
# Raw job titles as a plain Python list, in frequency-table order.
names = temp_pos["직위명"].tolist()
names

['상무',
 '부사장',
 '전무',
 '사장',
 '사외이사',
 '-',
 '대표이사 사장',
 '부회장',
 '대표이사 부회장',
 '회장',
 '대표이사부회장',
 '대표이사회장',
 '상임고문',
 ' 상무']

In [12]:
# Lookup table: coarse position group (key) -> list of raw job titles (values) assigned to it.
# Titles are written without spaces because the lookup code strips edge whitespace and removes
# inner spaces from the raw title before using it as a key.
# NOTE(review): entries such as '상ㅁ' and '서울강남구도곡2동' look like typos / stray values
# taken from raw data — confirm they are intentional.
_my_dict = {
    '임원진':['상무','전무','이사','이사대우','사외이사','상무보','대표이사','상임감사','기획지원본부장','임원(실장)',
           '기타비상무이사','본부장','상임이사','이사회의장','임원(제철소부소장)','임원(그룹장)','상무이사','임원(연구소장)','에너지소재본부장',
           '사내이사','위원','분석솔루션센터장','IE소재연구센터장','친환경제품솔루션센터장','ValueCreationCenter장','비상임이사','상무(연구위원)',
           '상ㅁ','임원(국장)','수석연구위원(전무)','의장','신성장동력본부장','홍보실장/SUPEX추구협의회파견/상무','본부임원','전력계통본부장',
           '전문위원','임원(제철소장)','임원(부장)','임원(철강솔루션&cr연구소장)','임원(본부장)','Communication부문장','센터장','FIG리더','부문장',
           '배터리마케팅본부장','ValueCreation&crCenter장','Portfolio부문장','부사장(대행)'],
    
    '대표':['사장','부회장','대표이사사장','회장','대표이사회장','수석부회장','대표이사/CEO','수석부사장','사장(대행)','부사장',
          '사내이사부사장','대표이사부회장','대표이사부사장','수석부사장(대행)','E&P사업대표','대표이사(사장)','대표이사/부회장',
          'E&P사장','기획재무실장','배터리사업대표(사장)','총괄사장'],
    
    '실무진':['담당','실장','연구위원','수석연구위원','소장','PR담당','기술연구소장','연구소장','Compliance담당','Comm.기획담당',
           '기술위원','법무담당','재무실장','IP전략담당','구매담당','사업부장','DT부문장','양극소재실장','내화물생산실장','글로벌로재사업실장',
           'BMR추진담당','세무전략담당','기반기술연구소장'],
    
    '고문':['명예회장','비상근고문','상임고문','상근고문','고문'],
    
    '기타':['자문역','보좌역','직무대행'],
    
    '없음':['-','서울강남구도곡2동'],
}

In [13]:
# Observed titles with edge whitespace stripped and inner spaces removed.
_names = [title.strip().replace(' ', '') for title in names]

# Flatten the group -> titles table into (title, group) pairs, in table order.
temp = [(title, group) for group, titles in _my_dict.items() for title in titles]

# title -> group lookup; if a title were listed twice, the later pair would win.
mapper = dict(temp)

In [14]:
def _to_position_group(raw_title):
    """Return the position group for a raw title (raises KeyError for an unlisted title)."""
    return mapper[raw_title.strip().replace(' ', "")]


# Replace each raw title with its group. The column is overwritten, so re-running this cell
# looks up group names instead of raw titles.
dataset["직위명"] = dataset["직위명"].apply(_to_position_group)

In [15]:
# Distinct position groups present after the mapping.
dataset["직위명"].unique()

array(['대표', '임원진', '없음', '고문'], dtype=object)

In [16]:
# Save the frame for the EDA analysis (cp949-encoded CSV, index included).
dataset.to_csv(
    path_or_buf="../data/preprocessed_data/forEDA.csv",
    encoding='cp949',
)

In [17]:
# Choose the columns to one-hot encode: every object column plus the weekday categorical.
cols = list(dataset.dtypes[dataset.dtypes == "object"].index)
cols.append("요일")

# Encode all of them in a single call. With `columns=` given, get_dummies keeps the untouched
# columns first and appends "<column>_<value>" indicator columns in the order of `cols`,
# which is the same layout the previous concat-in-a-loop produced without growing a frame
# one column group at a time.
dataset = pd.get_dummies(dataset, columns=cols)

# Derived features (all expressed in %).
# 1. Daily volume / total shares issued
dataset["거래량/발행주식 총수"] = dataset["거래량"] / dataset["발행주식 총수"] * 100
# 2. Individual net trading / daily volume
dataset["당일개인거래비율"] = dataset["개인"]/dataset["거래량"] * 100
# 3. Institutional net trading / daily volume
dataset["당일기관거래비율"] = dataset["기관"]/dataset["거래량"] * 100
# 4. Program trading / daily volume
#    A zero volume makes ratios 2-4 NaN (or inf); such rows are removed in a later cell.
dataset["당일프로그램거래비율"] = dataset["프로그램"]/dataset["거래량"] * 100

In [18]:
# Check missing values: total count over the whole frame.
dataset.isnull().sum().sum()

3

In [19]:
# Missing-value count per column.
dataset.isnull().sum()

날짜             0
1일 뒤 종가        0
2일 뒤 종가        0
3일 뒤 종가        0
4일 뒤 종가        0
              ..
요일_금           0
거래량/발행주식 총수    0
당일개인거래비율       1
당일기관거래비율       1
당일프로그램거래비율     1
Length: 76, dtype: int64

In [20]:
# Show the trading volume of the rows whose individual-trading ratio is missing.
dataset.loc[dataset["당일개인거래비율"].isna(), "거래량"]

531    0.0
Name: 거래량, dtype: float64

In [21]:
# Remove the missing values by dropping every row whose trading volume is zero
# (the ratio features divide by the volume).
zero_volume_idx = dataset.index[dataset["거래량"] == 0]
dataset.drop(index=zero_volume_idx, inplace=True)

In [22]:
# Re-count missing values over the whole frame.
dataset.isnull().sum().sum()

0

In [23]:
# Preview the first five rows of the encoded frame.
dataset.head()

Unnamed: 0,날짜,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,...,특정증권등의 종류_우선주,요일_월,요일_화,요일_수,요일_목,요일_금,거래량/발행주식 총수,당일개인거래비율,당일기관거래비율,당일프로그램거래비율
0,2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,...,1,1,0,0,0,0,0.249453,-1.744342,0.532279,0.450636
1,2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,...,1,1,0,0,0,0,0.249453,-1.744342,0.532279,0.450636
2,2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,...,0,0,1,0,0,0,0.248262,0.613291,-0.146579,-0.79534
3,2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,...,1,0,1,0,0,0,0.218187,0.613291,-0.146579,-0.79534
4,2022-10-17,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,...,0,1,0,0,0,0,0.200832,-0.52838,-0.516879,0.845001


In [24]:
# Separate the dependent variables (targets) from the independent variables (features).
target_cols = [f"{k}일 뒤 종가" for k in range(1, 11)]

# Targets: one row per date (the first row of each date is kept), indexed by date.
y_data = (
    dataset[["날짜"] + target_cols]
    .drop_duplicates("날짜")
    .set_index("날짜")
)

# Features: everything except the ten target columns; the date column stays in.
X_data = dataset.drop(columns=target_cols)

In [25]:
# Preview the first five target rows.
y_data.head()

Unnamed: 0_level_0,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,10일 뒤 종가
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,2.0
2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0
2022-10-17,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0
2022-09-29,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2022-09-28,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [26]:
# Preview the first five feature rows.
X_data.head()

Unnamed: 0,날짜,시가,고가,저가,종가,전일비,등락률,거래량,금액(백만),신용비,...,특정증권등의 종류_우선주,요일_월,요일_화,요일_수,요일_목,요일_금,거래량/발행주식 총수,당일개인거래비율,당일기관거래비율,당일프로그램거래비율
0,2022-10-24,57000.0,57500.0,56700.0,57500.0,1600.0,2.86,16944503.0,968892.0,0.15,...,1,1,0,0,0,0,0.249453,-1.744342,0.532279,0.450636
1,2022-10-24,57000.0,57500.0,56700.0,57500.0,1600.0,2.86,16944503.0,968892.0,0.15,...,1,1,0,0,0,0,0.249453,-1.744342,0.532279,0.450636
2,2022-10-18,56700.0,57100.0,55800.0,56500.0,-100.0,-0.18,14820702.0,835745.0,0.14,...,0,0,1,0,0,0,0.248262,0.613291,-0.146579,-0.79534
3,2022-10-18,56700.0,57100.0,55800.0,56500.0,-100.0,-0.18,14820702.0,835745.0,0.14,...,1,0,1,0,0,0,0.218187,0.613291,-0.146579,-0.79534
4,2022-10-17,55800.0,57000.0,55700.0,56600.0,300.0,0.53,13641878.0,769638.0,0.14,...,0,1,0,0,0,0,0.200832,-0.52838,-0.516879,0.845001


In [27]:
# Print the feature and target shapes, one per line.
print(X_data.shape, y_data.shape, sep="\n")

(690, 66)
(399, 10)


In [28]:
# Collapse to one row per date by averaging every feature, then turn the date back into a column.
temp = X_data.groupby("날짜").mean().reset_index()

In [29]:
# Number of rows (disclosures) per date, most frequent date first.
# The count column is named on the Series before it becomes a frame, so it is called
# "당일공시갯수" whether the installed pandas labels the counts 0 or "count"
# (the previous `rename(columns={0: ...})` silently did nothing on pandas >= 2.0).
cnt = X_data.value_counts("날짜").rename("당일공시갯수").reset_index()
cnt

Unnamed: 0,날짜,당일공시갯수
0,2018-06-11,22
1,2018-06-01,18
2,2021-12-16,12
3,2018-06-07,12
4,2018-06-08,11
...,...,...
394,2019-05-16,1
395,2019-04-26,1
396,2019-03-25,1
397,2019-03-11,1


In [30]:
# Attach the per-date disclosure count to the per-date feature means and order newest-first.
X_data = (
    temp.merge(cnt, how='inner', on=['날짜'])
    .sort_values("날짜", ascending=False)
    .reset_index(drop=True)
)

In [31]:
# Shape check of the per-date feature frame as (rows, columns).
X_data.shape

(399, 67)

In [32]:
# Save the independent variables (X) and the dependent variables (y).
# Each frame now goes to the file carrying its own name; previously the two file names were
# swapped, so X_data.csv held the targets and y_data.csv held the features.
X_data.to_csv("../data/preprocessed_data/X_data.csv", encoding='cp949')
y_data.to_csv("../data/preprocessed_data/y_data.csv", encoding='cp949')

In [33]:
# Save the full row-level dataset (cp949-encoded CSV, index included).
dataset.to_csv(
    path_or_buf="../data/preprocessed_data/data_(main).csv",
    encoding='cp949',
)

### 이전 기간에 따른 가중치 적용하기

In [34]:
# Display the per-date feature frame.
X_data

Unnamed: 0,날짜,시가,고가,저가,종가,전일비,등락률,거래량,금액(백만),신용비,...,요일_월,요일_화,요일_수,요일_목,요일_금,거래량/발행주식 총수,당일개인거래비율,당일기관거래비율,당일프로그램거래비율,당일공시갯수
0,2022-10-24,57000.0,57500.0,56700.0,57500.0,1600.0,2.86,16944503.0,968892.0,0.15,...,1.0,0.0,0.0,0.0,0.0,0.249453,-1.744342,0.532279,0.450636,2
1,2022-10-18,56700.0,57100.0,55800.0,56500.0,-100.0,-0.18,14820702.0,835745.0,0.14,...,0.0,1.0,0.0,0.0,0.0,0.233224,0.613291,-0.146579,-0.795340,2
2,2022-10-17,55800.0,57000.0,55700.0,56600.0,300.0,0.53,13641878.0,769638.0,0.14,...,1.0,0.0,0.0,0.0,0.0,0.200832,-0.528380,-0.516879,0.845001,1
3,2022-09-29,53300.0,53700.0,52600.0,52600.0,-300.0,-0.57,13882080.0,737731.0,0.17,...,0.0,0.0,0.0,1.0,0.0,0.204369,0.139576,-0.067576,-0.109602,1
4,2022-09-28,53900.0,54400.0,52500.0,52900.0,-1300.0,-2.40,19991129.0,1064666.0,0.17,...,0.0,0.0,1.0,0.0,0.0,0.294304,0.716198,-0.085093,-0.408921,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,2015-01-22,27960.0,27980.0,27560.0,27560.0,-17000.0,-1.22,17957700.0,499074.0,0.00,...,0.0,0.0,0.0,1.0,0.0,10.555110,-0.134739,0.468774,0.182718,2
395,2015-01-20,27000.0,27440.0,26900.0,27440.0,29000.0,2.16,13278050.0,360703.0,0.00,...,0.0,1.0,0.0,0.0,0.0,7.804523,-0.812725,0.598815,0.529739,1
396,2015-01-15,26900.0,26980.0,26580.0,26680.0,-11000.0,-0.82,14103900.0,378298.0,0.00,...,0.0,0.0,0.0,1.0,0.0,8.289938,-0.111182,0.073419,0.150668,2
397,2015-01-14,26780.0,27100.0,26700.0,26900.0,6000.0,0.45,14332250.0,385455.0,0.00,...,0.0,0.0,1.0,0.0,0.0,8.424156,-0.461114,0.071782,-0.079025,1


In [35]:
# Display the per-date target frame.
y_data

Unnamed: 0_level_0,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,10일 뒤 종가
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,2.0
2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0
2022-10-17,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0
2022-09-29,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2022-09-28,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...
2015-01-22,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-01-20,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2015-01-15,0.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
2015-01-14,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0


In [36]:
# Build a table of calendar dates: for each report date, the date 1..10 days later.
horizons = range(1, 11)

temp_y = y_data.reset_index(["날짜"]).copy()
# Shorten the ten target column names ("N일 뒤 종가" -> "N일 뒤") ...
temp_y = temp_y.rename(columns={f"{h}일 뒤 종가": f"{h}일 뒤" for h in horizons})

# ... then overwrite their values with the report date shifted by N calendar days.
# NOTE(review): the shift counts calendar days, weekends included — confirm this matches
# how the "N일 뒤 종가" targets were defined.
for h in horizons:
    temp_y[f"{h}일 뒤"] = temp_y["날짜"] + datetime.timedelta(days=h)
temp_y.head(3)

Unnamed: 0,날짜,1일 뒤,2일 뒤,3일 뒤,4일 뒤,5일 뒤,6일 뒤,7일 뒤,8일 뒤,9일 뒤,10일 뒤
0,2022-10-24,2022-10-25,2022-10-26,2022-10-27,2022-10-28,2022-10-29,2022-10-30,2022-10-31,2022-11-01,2022-11-02,2022-11-03
1,2022-10-18,2022-10-19,2022-10-20,2022-10-21,2022-10-22,2022-10-23,2022-10-24,2022-10-25,2022-10-26,2022-10-27,2022-10-28
2,2022-10-17,2022-10-18,2022-10-19,2022-10-20,2022-10-21,2022-10-22,2022-10-23,2022-10-24,2022-10-25,2022-10-26,2022-10-27


In [37]:
# Build the data frame that holds the weight vectors.

# Date column of the per-date feature frame; used as the column axis of the weight matrix.
# NOTE: this rebinds `cols`, which earlier cells used for a list of column names.
cols = X_data["날짜"]
dy = 30 # window, in days, that decides which dates receive a weight

def weight_arr(n, cols, dy, target_dates=None):
    """Build the inverse-day-gap weight matrix for the ``n``-day-ahead horizon.

    Entry (r, c) is ``1 / gap`` where ``gap`` is the whole number of days from
    feature date ``c`` to target date ``r``, provided ``0 < gap < dy``; every
    other entry (same day, future dates, dates ``dy`` or more days back,
    missing dates) is 0.

    Parameters
    ----------
    n : int
        Horizon. Selects the ``"{n}일 뒤"`` column of the notebook-level
        ``temp_y`` when ``target_dates`` is not given, and names the row index.
    cols : sequence of datetimes (e.g. the ``X_data["날짜"]`` Series)
        Feature dates; they become the column labels. Any index carried by
        ``cols`` is ignored.
    dy : int or float
        Exclusive upper bound, in days, of the look-back window.
    target_dates : sequence of datetimes, optional
        Target dates for the rows. Defaults to ``temp_y[f"{n}일 뒤"]``.

    Returns
    -------
    pandas.DataFrame
        Float matrix of shape ``(len(target_dates), len(cols))`` indexed by
        target date (index name ``"{n}일 뒤"``) with feature dates as columns.
    """
    if target_dates is None:
        # Default preserves the original behaviour of reading the notebook-level table.
        target_dates = temp_y[f"{n}일 뒤"]

    row_index = pd.DatetimeIndex(target_dates, name=f"{n}일 뒤")
    col_index = pd.DatetimeIndex(cols, name=getattr(cols, "name", None))

    # All pairwise gaps at once: rows are targets, columns are feature dates.
    gap = row_index.values[:, None] - col_index.values[None, :]
    # Whole days, floored like `.dt.days`; missing dates turn into NaN and fail both tests below.
    gap_days = np.floor(gap / np.timedelta64(1, "D"))

    in_window = (gap_days > 0) & (gap_days < dy)
    weights = np.zeros(gap_days.shape, dtype=float)
    weights[in_window] = 1.0 / gap_days[in_window]

    return pd.DataFrame(weights, index=row_index, columns=col_index)

def weight_func(weight_df, temp_df):
    """Return the matrix product ``weight_df @ temp_df`` as a new DataFrame.

    Both inputs are cast to float and multiplied on their raw values only:
    row/column labels are ignored, so the number of columns of ``weight_df``
    must equal the number of rows of ``temp_df`` and the two must be in the
    same order. The result carries default integer row and column labels.
    """
    weights = weight_df.astype(float).to_numpy()
    features = temp_df.astype(float).to_numpy()
    return pd.DataFrame(data=np.dot(weights, features))

In [38]:
# `cols` is ordered newest-first, so its last entry is the earliest date; dates within `dy`
# days of it are cut off further down.
cut_date = cols.iloc[-1] + datetime.timedelta(days=dy)

# Weight matrix for the 1-day-ahead horizon (kept untrimmed here).
weight_df = weight_arr(n=1, cols=cols, dy=dy)
weight_df

날짜,2022-10-24,2022-10-18,2022-10-17,2022-09-29,2022-09-28,2022-09-21,2022-09-16,2022-09-08,2022-09-06,2022-08-17,...,2015-02-24,2015-02-17,2015-01-29,2015-01-28,2015-01-27,2015-01-22,2015-01-20,2015-01-15,2015-01-14,2015-01-13
1일 뒤,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-25,1.0,0.142857,0.125,0.038462,0.037037,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2022-10-19,0.0,1.000000,0.500,0.050000,0.047619,0.035714,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2022-10-18,0.0,0.000000,1.000,0.052632,0.050000,0.037037,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2022-09-30,0.0,0.000000,0.000,1.000000,0.500000,0.111111,0.071429,0.045455,0.041667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
2022-09-29,0.0,0.000000,0.000,0.000000,1.000000,0.125000,0.076923,0.047619,0.043478,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-01-23,0.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.125000,0.111111,0.100000
2015-01-21,0.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.166667,0.142857,0.125000
2015-01-16,0.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.000000,0.500000,0.333333
2015-01-15,0.0,0.000000,0.000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.000000,0.500000


In [39]:
# Date-indexed copy of the feature frame; this is the right-hand operand of the weighting product.
temp_df = X_data.set_index("날짜")
temp_df

Unnamed: 0_level_0,시가,고가,저가,종가,전일비,등락률,거래량,금액(백만),신용비,개인,...,요일_월,요일_화,요일_수,요일_목,요일_금,거래량/발행주식 총수,당일개인거래비율,당일기관거래비율,당일프로그램거래비율,당일공시갯수
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-24,57000.0,57500.0,56700.0,57500.0,1600.0,2.86,16944503.0,968892.0,0.15,-295570.0,...,1.0,0.0,0.0,0.0,0.0,0.249453,-1.744342,0.532279,0.450636,2
2022-10-18,56700.0,57100.0,55800.0,56500.0,-100.0,-0.18,14820702.0,835745.0,0.14,90894.0,...,0.0,1.0,0.0,0.0,0.0,0.233224,0.613291,-0.146579,-0.795340,2
2022-10-17,55800.0,57000.0,55700.0,56600.0,300.0,0.53,13641878.0,769638.0,0.14,-72081.0,...,1.0,0.0,0.0,0.0,0.0,0.200832,-0.528380,-0.516879,0.845001,1
2022-09-29,53300.0,53700.0,52600.0,52600.0,-300.0,-0.57,13882080.0,737731.0,0.17,19376.0,...,0.0,0.0,0.0,1.0,0.0,0.204369,0.139576,-0.067576,-0.109602,1
2022-09-28,53900.0,54400.0,52500.0,52900.0,-1300.0,-2.40,19991129.0,1064666.0,0.17,143176.0,...,0.0,0.0,1.0,0.0,0.0,0.294304,0.716198,-0.085093,-0.408921,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-01-22,27960.0,27980.0,27560.0,27560.0,-17000.0,-1.22,17957700.0,499074.0,0.00,-24196.0,...,0.0,0.0,0.0,1.0,0.0,10.555110,-0.134739,0.468774,0.182718,2
2015-01-20,27000.0,27440.0,26900.0,27440.0,29000.0,2.16,13278050.0,360703.0,0.00,-107914.0,...,0.0,1.0,0.0,0.0,0.0,7.804523,-0.812725,0.598815,0.529739,1
2015-01-15,26900.0,26980.0,26580.0,26680.0,-11000.0,-0.82,14103900.0,378298.0,0.00,-15681.0,...,0.0,0.0,0.0,1.0,0.0,8.289938,-0.111182,0.073419,0.150668,2
2015-01-14,26780.0,27100.0,26700.0,26900.0,6000.0,0.45,14332250.0,385455.0,0.00,-66088.0,...,0.0,0.0,1.0,0.0,0.0,8.424156,-0.461114,0.071782,-0.079025,1


In [40]:
# Apply the weights, then put the feature names back on the columns and the
# target dates back on the rows (weight_func returns unlabeled values).
weighted_data = pd.DataFrame(
    data=weight_func(weight_df, temp_df).to_numpy(),
    index=weight_df.index,
    columns=temp_df.columns,
)

In [41]:
# Keep only the rows whose report date is later than `cut_date`.
# The rows of `weighted_data` follow the same newest-first order as `X_data`, and the next cell
# relies on that when it attaches X_data["날짜"]. Filtering on the report date rather than on
# the shifted target date in the index uses the same rule as the `y_data` filter, so features
# and targets keep exactly the same dates.
keep_rows = (X_data["날짜"] > cut_date).to_numpy()
weighted_data = weighted_data[keep_rows].reset_index(drop=True)

In [42]:
# Label each remaining row with its report date (matched on the 0..k-1 row labels) and
# make that date the index.
weighted_data = weighted_data.assign(**{"날짜": X_data["날짜"]}).set_index(["날짜"])

In [43]:

# Save the weighted features (cp949-encoded CSV, date index included).
weighted_data.to_csv(
    path_or_buf="../data/preprocessed_data/weighted_data.csv",
    encoding='cp949',
)

In [44]:
# Keep only the targets whose date is later than `cut_date`.
y_data = y_data.loc[y_data.index > cut_date]
y_data

Unnamed: 0_level_0,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,10일 뒤 종가
날짜,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-10-24,1.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,2.0,2.0
2022-10-18,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0
2022-10-17,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0
2022-09-29,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
2022-09-28,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...
2015-03-20,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2015-03-12,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0
2015-03-03,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0
2015-02-24,1.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [45]:
# Save the trimmed targets (cp949-encoded CSV, date index included).
y_data.to_csv(
    path_or_buf="../data/preprocessed_data/y_data.csv",
    encoding='cp949',
)