In [7]:
# 전처리 동안 사용할 라이브러리 종합
import copy
import datetime
import glob
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Matplotlib defaults: draw negative numbers with an ASCII hyphen and use the
# AppleGothic font family.
# NOTE(review): AppleGothic is presumably chosen so Korean labels render; on a
# machine without that font another installed Korean font must be substituted.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.family': 'AppleGothic',
})

# Apply the seaborn theme with the same font and minus-sign settings.
sns.set(
    style='darkgrid',
    font="AppleGothic",
    rc={"axes.unicode_minus": False},
)

import warnings
# Every warning category is silenced for the rest of the session, so
# deprecation and pandas copy warnings will not be shown either.
warnings.filterwarnings(action='ignore')

### 주식 가격데이터 전처리

In [8]:
# Load the per-stock price CSV files and merge them into one DataFrame.

# Sorted so the row order does not depend on how the filesystem lists files.
file_path = sorted(glob.glob("../data/raw_data/stock_code/*.csv"))
if not file_path:
    # Fail with an actionable message instead of the KeyError that selecting
    # columns from an empty frame would raise further down.
    raise FileNotFoundError(
        "No price CSV files found under ../data/raw_data/stock_code/"
    )

# Read every file first and concatenate once: calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration.
frames = []
for f in tqdm(file_path):
    # The file name up to its first "." is used as the stock code.
    code = os.path.basename(f).split(".")[0]
    df_temp = pd.read_csv(f, encoding='cp949', index_col=0)
    df_temp["종목코드"] = code
    frames.append(df_temp)

# ignore_index keeps row labels unique even when several files are merged.
stock_price = pd.concat(frames, ignore_index=True)

# Columns kept for the analysis.
# NOTE(review): "종목코드" is not listed, so the stock code attached above is
# dropped by this selection. Confirm that is intended before loading more than
# one stock, because rows of different stocks become indistinguishable.
use_cols = [
    "날짜", "시가", "고가", "저가", "종가", "전일비", "등락률", "거래량", '금액(백만)',
    '신용비', "개인", "기관", "외인수량", "외국계", "프로그램", "외인비",
]
# .copy() detaches the selection so later column assignments modify an
# independent frame rather than a slice of the merged one.
stock_price = stock_price[use_cols].copy()

stock_price.head()

100%|██████████| 1/1 [00:00<00:00, 119.54it/s]


Unnamed: 0,날짜,시가,고가,저가,종가,전일비,등락률,거래량,금액(백만),신용비,개인,기관,외인수량,외국계,프로그램,외인비
0,20221115,62200,62500,-61600,62400,500,0.81,12310986,764117,0.11,--49462,+10844,+637551,+25182,+31864,49.93
1,20221114,62900,62900,-61700,-61900,-1000,-1.59,15973416,993352,0.11,+63509,--50244,+145835,+17643,--28580,49.92
2,20221111,63100,63200,62300,62900,2500,4.14,20037163,1257130,0.11,--385057,+207573,+2703635,+180103,+240193,49.92
3,20221110,-61400,-61500,-60400,-60400,-1600,-2.58,21087633,1283005,0.11,+183153,+55077,--3558072,--227573,--153148,49.87
4,20221109,62000,62200,-61300,62000,200,0.32,14045592,869645,0.12,--129749,+110819,+1099507,+47708,--38982,49.93


In [9]:
# Show the dimensions of the loaded frame as (rows, columns).
stock_price.shape

(2000, 16)

In [10]:
def _parse_signed_count(column):
    """Convert one net-buy column such as "+10844" / "--49462" to signed numbers.

    A value is negative only when its text starts with "-"; a "+" prefix or no
    prefix at all means positive. Columns that pandas already parsed as numeric
    are returned unchanged, and cells that contain no digits become NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        # Already numeric: the sign was parsed by read_csv, nothing to strip.
        return column
    text = column.astype(str).str.strip()
    digits = text.str.replace(r"[^0-9]", "", regex=True)
    magnitude = pd.to_numeric(digits, errors="coerce")
    return magnitude.where(~text.str.startswith("-"), -magnitude)


# Start from a fresh frame with unique 0..N-1 row labels so the assignments
# below never write into a slice of another DataFrame.
stock_price = stock_price.reset_index(drop=True)

# Remove the direction markers embedded in the raw numbers.
# Prices carry a leading "-" in the raw data; only the magnitude is kept.
price_cols = stock_price.loc[:, "시가":"종가"].columns.tolist()
stock_price[price_cols] = stock_price[price_cols].abs()

# Net-buy columns use "+" / "--" prefixes. Assigning by column label (not
# .loc[:, a:b] = ...) lets the columns take the numeric dtype of the result.
flow_cols = stock_price.loc[:, "개인":"프로그램"].columns.tolist()
stock_price[flow_cols] = stock_price[flow_cols].apply(_parse_signed_count)

# Convert the YYYYMMDD date values to datetime.
stock_price["날짜"] = pd.to_datetime(stock_price["날짜"].astype(str), format="%Y%m%d")

# shift(i) below reads the close from i rows ABOVE the current row. That is the
# close i trading days later only when rows are ordered newest-first and all
# belong to one stock, so refuse to build labels from any other layout.
if not stock_price["날짜"].is_monotonic_decreasing:
    raise ValueError(
        "stock_price must hold a single stock sorted by date in descending "
        "order; otherwise the shifted future closing prices would be wrong."
    )

# Add the target columns and move them right after the date column.
n = 10  # predict the closing price up to n trading days ahead
for i in range(1, n + 1):
    stock_price[f"{i}일 뒤 종가"] = stock_price["종가"].shift(i)

col1 = stock_price.columns[1:-n].to_list()
col2 = stock_price.columns[-n:].to_list()
new_col = stock_price.columns[0:1].to_list() + col2 + col1
stock_price = stock_price[new_col]

# The newest n rows have no complete set of future closes; drop them.
# .copy() makes the result independent so the label assignment below is safe.
stock_price = stock_price.iloc[n:].copy()

###########################
# Target classes, relative to the current day's close:
#   2: return of at least `profit`
#   1: return of at least 0% (unchanged or up)
#   0: negative return
# The comparisons are inclusive (>=) to match this specification; an unchanged
# close is therefore class 1, not class 0.
# Note that the "{i}일 뒤 종가" columns hold these class labels from here on,
# not prices.
profit = 0.03
close = stock_price["종가"]
for i in range(1, n + 1):
    target = f"{i}일 뒤 종가"
    future_close = stock_price[target]
    future_return = (future_close - close) / close
    # np.select takes the first matching condition, so class 2 wins over 1.
    stock_price[target] = np.select(
        [future_return >= profit, future_close >= close],
        [2, 1],
        default=0,
    )

stock_price = stock_price.reset_index(drop=True)
stock_price.head()

Unnamed: 0,날짜,1일 뒤 종가,2일 뒤 종가,3일 뒤 종가,4일 뒤 종가,5일 뒤 종가,6일 뒤 종가,7일 뒤 종가,8일 뒤 종가,9일 뒤 종가,...,등락률,거래량,금액(백만),신용비,개인,기관,외인수량,외국계,프로그램,외인비
0,2022-11-01,0,0,0,1,1,2,1,2,2,...,1.01,17201647,1030381,0.13,-106230,58346,782640,74211,22233,49.82
1,2022-10-31,1,1,0,0,1,2,2,1,2,...,3.66,18999514,1126468,0.13,-256500,15311,4094053,234328,119366,49.8
2,2022-10-28,2,2,2,2,2,2,2,2,2,...,-3.7,20924937,1213293,0.13,98850,49391,-1140116,-175509,-131488,49.74
3,2022-10-27,0,0,1,1,0,0,1,2,2,...,0.17,21756712,1296031,0.13,-99934,114494,-28429,36819,-4487,49.75
4,2022-10-26,1,0,0,1,1,0,0,1,2,...,2.95,20824967,1227076,0.13,-358278,48387,5717254,255893,269276,49.76


In [11]:
# Show the dimensions of the preprocessed frame as (rows, columns).
stock_price.shape

(1990, 26)

In [12]:
# Write the preprocessed frame to disk.
file_name = "../data/preprocessed_data/samsung_price.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a full run fails only at this last step.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# The row index is written as the first (unnamed) column of the file.
stock_price.to_csv(file_name, encoding='cp949')