In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('mode.chained_assignment',  None)
data = pd.read_csv('train.csv')
sub = pd.read_csv('sample_submission.csv')

data['일자'] = pd.to_datetime(data['일자'], format='%Y%m%d')
data = data.rename(columns={'일자' : 'ds', '종가' : 'y'})
data

Unnamed: 0,ds,종목코드,종목명,거래량,시가,고가,저가,y
0,2021-06-01,A060310,3S,166690,2890,2970,2885,2920
1,2021-06-01,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,2021-06-01,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,2021-06-01,A054620,APS,462544,14600,14950,13800,14950
4,2021-06-01,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
987995,2023-05-30,A189980,흥국에프엔비,272284,3005,3035,2955,2980
987996,2023-05-30,A000540,흥국화재,50218,3250,3255,3195,3215
987997,2023-05-30,A003280,흥아해운,130664,1344,1395,1340,1370
987998,2023-05-30,A037440,희림,141932,9170,9260,9170,9200


In [81]:
code = [str(i).replace('A', '') for i in data['종목코드'].unique()]
ymd = data['ds'].unique()

In [85]:
from pykrx import stock
from tqdm.auto import tqdm
import time

# 수정 종가를 얻기 위해 pykrx 라이브러리 사용
# time.sleep은 크롤링이 막히는 것을 방지
start_date = str(ymd[:1][0])
last_date = str(ymd[-1:][0])
modified_df = pd.DataFrame()

for i in tqdm(code):
    st = stock.get_market_ohlcv_by_date(start_date, last_date, i, adjusted=True).reset_index(drop=False)
    # 티커명을 넣어야지 구분 가능
    st['ticker'] = str(i)
    modified_df = pd.concat([modified_df,st], axis = 0)
    time.sleep(1)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [86]:
modified_df.to_csv("modified_df.csv")

In [101]:
modified_df = pd.read_csv("modified_df.csv")

In [102]:
modified_df['ticker'] = [str(i).zfill(6) for i in modified_df['ticker']]

In [103]:
modified_df

Unnamed: 0.1,Unnamed: 0,날짜,시가,고가,저가,종가,거래량,거래대금,등락률,ticker
0,0,2021-06-01,2890,2970,2885,2920,166690,487455970,1.04,060310
1,1,2021-06-02,2915,2975,2830,2900,134833,388795625,-0.68,060310
2,2,2021-06-03,2900,2925,2875,2900,144470,419668300,0.00,060310
3,3,2021-06-04,2930,3120,2920,2950,934224,2840373820,1.72,060310
4,4,2021-06-07,3000,3150,2955,3150,946560,2929678225,6.78,060310
...,...,...,...,...,...,...,...,...,...,...
987995,489,2023-05-23,6710,6710,6420,6430,40981,266207710,-4.17,238490
987996,490,2023-05-24,6400,6490,6210,6320,19392,123635550,-1.71,238490
987997,491,2023-05-25,6340,6400,6270,6330,6457,40862130,0.16,238490
987998,492,2023-05-26,6330,6410,6300,6330,8905,56621540,0.00,238490


In [104]:
modified_df = modified_df.drop(columns = ['Unnamed: 0'])

In [105]:
diff_1 = modified_df.set_index(['ticker', '날짜'])\
                    .groupby(level='ticker')['종가']\
                    .transform(lambda x: x.sort_index().diff())\
                    .reset_index()\
                    .rename(columns = {'종가':'d1c'})

In [106]:
modified_df = pd.merge(modified_df, diff_1, on = ['ticker' , '날짜'])
modified_df

Unnamed: 0,날짜,시가,고가,저가,종가,거래량,거래대금,등락률,ticker,d1c
0,2021-06-01,2890,2970,2885,2920,166690,487455970,1.04,060310,
1,2021-06-02,2915,2975,2830,2900,134833,388795625,-0.68,060310,-20.0
2,2021-06-03,2900,2925,2875,2900,144470,419668300,0.00,060310,0.0
3,2021-06-04,2930,3120,2920,2950,934224,2840373820,1.72,060310,50.0
4,2021-06-07,3000,3150,2955,3150,946560,2929678225,6.78,060310,200.0
...,...,...,...,...,...,...,...,...,...,...
987995,2023-05-23,6710,6710,6420,6430,40981,266207710,-4.17,238490,-280.0
987996,2023-05-24,6400,6490,6210,6320,19392,123635550,-1.71,238490,-110.0
987997,2023-05-25,6340,6400,6270,6330,6457,40862130,0.16,238490,10.0
987998,2023-05-26,6330,6410,6300,6330,8905,56621540,0.00,238490,0.0


In [107]:
modified_df = modified_df.rename(columns={'날짜': 'ds', 'd1c' : 'y'})
modified_df

Unnamed: 0,ds,시가,고가,저가,종가,거래량,거래대금,등락률,ticker,y
0,2021-06-01,2890,2970,2885,2920,166690,487455970,1.04,060310,
1,2021-06-02,2915,2975,2830,2900,134833,388795625,-0.68,060310,-20.0
2,2021-06-03,2900,2925,2875,2900,144470,419668300,0.00,060310,0.0
3,2021-06-04,2930,3120,2920,2950,934224,2840373820,1.72,060310,50.0
4,2021-06-07,3000,3150,2955,3150,946560,2929678225,6.78,060310,200.0
...,...,...,...,...,...,...,...,...,...,...
987995,2023-05-23,6710,6710,6420,6430,40981,266207710,-4.17,238490,-280.0
987996,2023-05-24,6400,6490,6210,6320,19392,123635550,-1.71,238490,-110.0
987997,2023-05-25,6340,6400,6270,6330,6457,40862130,0.16,238490,10.0
987998,2023-05-26,6330,6410,6300,6330,8905,56621540,0.00,238490,0.0


In [None]:
!pip install prophet

In [108]:
from prophet import Prophet
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np

RANDOM_SEED = 113
np.random.seed(RANDOM_SEED)

def ph_train(df):
    pred = pd.DataFrame()
    pred['y'] = None
    pred['종목코드'] = None

    pred_y = []
    pred_code = []
    for code in df['ticker'].unique():
        d = df[df['ticker'] == code].reset_index().drop(['index','ticker'], axis=1).sort_values('ds')
        
        model = Prophet(growth = 'linear',
                        seasonality_mode = 'additive',
                        yearly_seasonality = 'auto',
                        weekly_seasonality = 'auto',
                        daily_seasonality = 'auto',
                        holidays = None,
                        changepoint_prior_scale = 0.01
                       )
        model.fit(d)
        bday = CustomBusinessDay(weekmask='Mon Tue Wed Thu Fri')
        future = pd.DataFrame()
        future['ds'] = pd.date_range(start='2023-05-31', periods=16, freq=bday)
        future = future[future['ds'] != '2023-06-06']
        
        forecast = model.predict(future)
        pred_y.append(forecast['yhat'].diff().sum())
        pred_code.append(str(code))
    
    pred['y'] = pred_y
    pred['종목코드'] = pred_code
    
    return pred

pred = ph_train(modified_df)

17:31:38 - cmdstanpy - INFO - Chain [1] start processing
17:31:38 - cmdstanpy - INFO - Chain [1] done processing
17:31:38 - cmdstanpy - INFO - Chain [1] start processing
17:31:38 - cmdstanpy - INFO - Chain [1] done processing
17:31:38 - cmdstanpy - INFO - Chain [1] start processing
17:31:38 - cmdstanpy - INFO - Chain [1] done processing
17:31:39 - cmdstanpy - INFO - Chain [1] start processing
17:31:39 - cmdstanpy - INFO - Chain [1] done processing
17:31:39 - cmdstanpy - INFO - Chain [1] start processing
17:31:39 - cmdstanpy - INFO - Chain [1] done processing
17:31:39 - cmdstanpy - INFO - Chain [1] start processing
17:31:39 - cmdstanpy - INFO - Chain [1] done processing
17:31:39 - cmdstanpy - INFO - Chain [1] start processing
17:31:39 - cmdstanpy - INFO - Chain [1] done processing
17:31:39 - cmdstanpy - INFO - Chain [1] start processing
17:31:39 - cmdstanpy - INFO - Chain [1] done processing
17:31:39 - cmdstanpy - INFO - Chain [1] start processing
17:31:39 - cmdstanpy - INFO - Chain [1]

In [109]:
pred['종목코드'] = 'A' + pred['종목코드']

In [110]:
pred = pred.sort_values('y', ascending=False)
save_pred = pred.reset_index().drop(['index', 'y'], axis=1).reset_index().rename(columns={'index' : '순위'})

In [111]:
save_pred['종목코드'] = save_pred['종목코드'].astype(str)

In [112]:
pred

Unnamed: 0,y,종목코드
1127,150.379039,A086520
136,114.297481,A051910
1142,96.775241,A036570
319,91.153326,A003920
131,65.357579,A051900
...,...,...
97,-55.427744,A002380
777,-69.349254,A004690
808,-111.946369,A017390
252,-129.261262,A010130


In [113]:
merged = pd.merge(sub, save_pred, on='종목코드').drop('순위_x', axis=1).rename(columns={'순위_y' : '순위'})
merged['순위'] = merged['순위'] + 1
merged.to_csv('prophet_230710+add_diff_adjust(1).csv', index=False)

In [114]:
merged

Unnamed: 0,종목코드,순위
0,A000020,1536
1,A000040,1221
2,A000050,1450
3,A000070,563
4,A000080,1816
...,...,...
1995,A375500,950
1996,A378850,672
1997,A383220,1974
1998,A383310,66
