# Load S&P500 ticker list

In [12]:
pip install yfinance

Collecting yfinance
  Using cached yfinance-0.2.30-py2.py3-none-any.whl (65 kB)
Collecting peewee>=3.16.2
  Using cached peewee-3.16.3.tar.gz (928 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting multitasking>=0.0.7
  Using cached multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Collecting requests>=2.31
  Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Collecting html5lib>=1.1
  Using cached html5lib-1.1-py2.py3-none-any.whl (112 kB)
Collecting appdirs>=1.4.4
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting frozendict>=2.3.4
  Downloading frozendict-2.3.8-cp310-cp310-win_amd64.whl (35 kB)
Building wheels for collected packages: peewee
  Building wheel for peewee (pyprojec


[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# data library load
import os
import numpy as np
import pandas as pd
import yfinance as yf # Yahoo Finance(금융 및 주식)의 데이터를 쉽게 가져올 수 있도록 도와주는 라이브러리
import time
from glob import glob
from collections import defaultdict

In [26]:
#'../data' 경로에 디렉토리를 생성하는 함수 : 디렉토리가 존재하지 않을 시 디렉토리 형성
 
def create_dir(dir_path):
    if os.path.isdir(dir_path) is False: os.makedirs(dir_path)

create_dir('./data')

In [27]:
# 두 가지 다른 버전의 S&P 500 종목 목록을 비교하여 공통된 종목을 찾는 것

# S&P 500 지수 구성 종목의 목록을 웹에서 가져옴
#해당 목록을 데이터프레임에 저장하는 작업을 수행

# get sp500 list in valid
# s&p 500 지수의 유효한(valid) 및 테스트(test) 버전의 url 저장
valid_start_sp500_url = 'https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=1095558369'

# get sp500 list in test
test_end_sp500_url = 'https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=1173149676'

# get ticker list for modeling
# html 데이터 -> 테이터 프레임으로 저장 
# flavor='bs4'는 HTML 파서로 Beautiful Soup 4를 사용
# pd.read_html 함수는 웹 페이지에서 테이블을 스크래핑하여 데이터프레임으로 반환 -> 첫번째 테이블 선택
df_valid_start = pd.read_html(valid_start_sp500_url, flavor='bs4')[0] # get sp500 page table
df_test_end = pd.read_html(valid_start_sp500_url, flavor='bs4')[0] # get sp500 page table
wiki_set = set(df_valid_start['Symbol'].values) & set(df_test_end['Symbol'].values) # remain tickers
    # df_valid_start& df_test_end 데이터프레임에서 'Symbol' 열의 값 -> (set)의 교집합 찾음
    # df_valid_start와 df_test_end에 모두 포함된 S&P 500 종목의 심볼(Symbol)을 구함.


# Load price data

In [30]:
# load tickers prices
# Yahoo Finance API를 사용하여 주식 종목의 가격 데이터를 가져오고 이를 CSV 파일로 저장하는 함수

def load_tickers_prices():
  
    apple_god = yf.download('AAPL', start='2021-07-01', end='2023-09-01') # get absolute date index
        # 'AAPL' 심볼에 대한 주가 데이터를 가져와서 'apple_god' 변수에 저장
        # 이 데이터는 2021년 7월 1일부터 2023년 9월 1일까지의 주가 데이터를 포함 
    tickers_dir = 'data/ticker'#디렉토리 경로지정
    create_dir(tickers_dir) #위에서 설정한 디렉토리 경로 지정하는 함수 사용
    for ticker in list(wiki_set): # 'wiki_set'에 있는 S&P 500 종목의 심볼(Symbol) 목록을 반복
        if '.' in ticker: ticker = ticker.replace('.', '-') # processing BF.B & BRK.B 
            
        try:
            df = yf.download(ticker, start='2021-07-01', end='2023-09-01', timeout=15) # get OHLCV dataframe of tickers
                # Yahoo Finance API를 사용하여 'ticker'에 해당하는 주식의 OHLCV(Open, High, Low, Close, Volume) 데이터를 가져옴 
                # 'timeout' 매개변수는 데이터를 다운로드하는 데 최대 15초까지 대기하도록 설정
            df.dropna(inplace=True)
            if len(df) != len(apple_god): continue # worship
                #  현재 종목의 데이터 길이가 'apple_god' 변수에 저장된 'AAPL' 데이터 길이와 다르다면, 
                #  이 종목은 건너뛰고 다음 종목으로 진행합니다. 
                #  이렇게 함으로써, 'AAPL'과 동일한 기간 동안 데이터를 갖고 있는 종목만을 선택
            df.to_csv(f'{tickers_dir}/{ticker}.csv')
            time.sleep(1)
                #time.sleep(1): 데이터를 다운로드한 후 1초의 딜레이를 추가
                #이것은 너무 빠른 요청을 방지
        except:
            continue
                # 어떤 예외가 발생하더라도 해당 종목을 건너뛰고 다음 종목으로 계속 진행 

load_tickers_prices()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******


1 Failed download:
['NLOK']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['FRC']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['PKI']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['DRE']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['NLSN']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['CTXS']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['SIVB']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['RE']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['TWTR']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******


1 Failed download:
['FISV']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


1 Failed download:
['FBHS']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%******

In [31]:
#  주식 종목의 가격 데이터를 검사하고 데이터의 무결성을 확인하는 함수

# data check (OHLCV)
def get_tickers_paths():
    '''
    이 함수는 주식 데이터가 저장된 디렉토리에서 모든 CSV 파일의 경로를 가져옴
    '''
    tickers_dir = 'data/ticker'
    return sorted(glob(f'{tickers_dir}/*'))

# save tickers to dataframe
def save_tickers_to_df():
    '''
    이 함수는 주식 데이터를 DataFrame으로 저장하는 역할을 합니다. 
    모든 CSV 파일의 경로를 가져와서 각 파일을 읽어들인 후, 
    0 값을 1e-8로 대체하고, NaN 값을 가지는 행이 있는 경우 해당 파일의 경로를 출력합니다. 
    그런 다음 각 주식의 심볼을 추출하고, 
    이를 DataFrame으로 저장한 뒤, 
    'tickers.csv' 파일로 저장합니다.'''
    tickers = []
    for ticker_path in get_tickers_paths():
        df = pd.read_csv(ticker_path)
        df = df.replace(0, 1e-8)
        if df.isnull().sum().sum() > 0: print(ticker_path)
        tickers.append(ticker_path.split('/')[-1].split('.')[0])
    pd.DataFrame(np.array(tickers)).to_csv('data/tickers.csv', header='ticker', index=False)

# get ticker code list
def get_tickers():
    '''이 함수는 'tickers.csv' 파일에서 
    주식 종목의 심볼을 읽어와 정렬된 리스트로 반환'''
    return sorted(list(pd.read_csv('data/tickers.csv').values.flatten()))

save_tickers_to_df()
get_tickers()


['ticker\\A',
 'ticker\\AAL',
 'ticker\\AAP',
 'ticker\\AAPL',
 'ticker\\ABBV',
 'ticker\\ABC',
 'ticker\\ABT',
 'ticker\\ACN',
 'ticker\\ADBE',
 'ticker\\ADI',
 'ticker\\ADM',
 'ticker\\ADP',
 'ticker\\ADSK',
 'ticker\\AEE',
 'ticker\\AEP',
 'ticker\\AES',
 'ticker\\AFL',
 'ticker\\AIG',
 'ticker\\AIZ',
 'ticker\\AJG',
 'ticker\\AKAM',
 'ticker\\ALB',
 'ticker\\ALGN',
 'ticker\\ALK',
 'ticker\\ALL',
 'ticker\\ALLE',
 'ticker\\AMAT',
 'ticker\\AMCR',
 'ticker\\AMD',
 'ticker\\AME',
 'ticker\\AMGN',
 'ticker\\AMP',
 'ticker\\AMT',
 'ticker\\AMZN',
 'ticker\\ANET',
 'ticker\\ANSS',
 'ticker\\AON',
 'ticker\\AOS',
 'ticker\\APA',
 'ticker\\APD',
 'ticker\\APH',
 'ticker\\APTV',
 'ticker\\ARE',
 'ticker\\ATO',
 'ticker\\ATVI',
 'ticker\\AVB',
 'ticker\\AVGO',
 'ticker\\AVY',
 'ticker\\AWK',
 'ticker\\AXP',
 'ticker\\AZO',
 'ticker\\BA',
 'ticker\\BAC',
 'ticker\\BALL',
 'ticker\\BAX',
 'ticker\\BBWI',
 'ticker\\BBY',
 'ticker\\BDX',
 'ticker\\BEN',
 'ticker\\BF-B',
 'ticker\\BIIB',
 'ticke

# Create features based on the price data

가격데이터에 기반해서 특징들을 형성

In [36]:
# get train, valid, test period
def get_apple(date=False):
    if date:
        return pd.read_csv('data/ticker/AAPL.csv').set_index('Date')
    else:
        return pd.read_csv('data/ticker/AAPL.csv')

def get_periods(split=False):
    # get date index
    df_date = get_apple(True)
    
    # set experiment periods: train, valid, test set
    periods = {'train_start':'2021-07-01',
               'valid_start':'2022-07-01',
               'test_start':'2023-01-03',
               'test_end':'2023-08-31'}
    
    train_start = df_date.index.get_loc(periods['train_start'])
    valid_start = df_date.index.get_loc(periods['valid_start'])
    test_start = df_date.index.get_loc(periods['test_start'])
    test_end = df_date.index.get_loc(periods['test_end'])

    # get length of dates
    train_period = valid_start - train_start
    valid_period = test_start - valid_start
    test_period = test_end - test_start + 1
    
    if split:
        return train_period, valid_period, test_period
    else:
        return train_period + valid_period + test_period

period = get_periods()


In [42]:
pip install sklearn.preprocessing

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement sklearn.preprocessing (from versions: none)
ERROR: No matching distribution found for sklearn.preprocessing

[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
from sklearn.preprocessing import MinMaxScaler

# create eod, mask, base price, ground truth for each stock
def transform(df, target = 'Close', delay = 3): # default prediction target is the close price after three days
    # calculate return for ground truth before normalization
    df['prev'] = df[target].shift(delay, fill_value=1e-8)
    df['return'] = (df[target] - df['prev']) / df['prev']
    df = df.applymap(lambda x: 1e-8 if x > 1e+8 else x) # return value of delay length is divided 1e-8

    # normalize by min-max scaler except return
    df = df.tail(period) # double check length
    df = df.reset_index().drop(columns = ['Date'])
    columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    single_gt = df['return'].to_numpy().copy()
    df[columns] = MinMaxScaler().fit_transform(df[columns])

    # masking
    single_mk = np.ones(len(df))
    single_mk[df[df[target] < 1e-8].index.values] = 0
    df[columns] = df[columns].applymap(lambda x: 1.1 if x < 1e-8 else x)

    single_eod = df[columns].to_numpy()
    single_bp = df[target].to_numpy()

    return single_eod, single_mk, single_bp, single_gt


ModuleNotFoundError: No module named 'sklearn'

In [34]:
# set feature shape
tickers = get_tickers()
stock_num = len(tickers)
feature_size = 5

'''
    eod: used features for prediction (stock_num, period, feature_size)
    mask: zero masking for target (stock_num, period)
    base price: target (stock_num, period)
    ground truth: return ratio (stock_num, period)
'''
def create_features():
    # init eod, mask, base price, ground truth for all stocks
    eod_data = np.zeros((stock_num, period, feature_size))
    mask = np.zeros((stock_num, period))
    base_price = np.zeros((stock_num, period))
    ground_truth = np.zeros((stock_num, period))

    # create eod, mask, base price, ground truth for all stocks
    tickers_dir = 'data/ticker'
    for i, ticker in enumerate(tickers):
        df = pd.read_csv(f'{tickers_dir}/{ticker}.csv', index_col = 'Date', parse_dates = True, usecols = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], na_values=['nan'])
        single_eod, single_mk, single_bp, single_gt = transform(df)
        eod_data[i] = single_eod
        mask[i] = single_mk
        base_price[i] = single_bp
        ground_truth[i] = single_gt

    # save features
    feature_dir = 'data/feature'
    create_dir(feature_dir)
    np.save(f'{feature_dir}/eod.npy', eod_data)
    np.save(f'{feature_dir}/mk.npy', mask)
    np.save(f'{feature_dir}/bp.npy', base_price)
    np.save(f'{feature_dir}/gt.npy', ground_truth)

def load_features():
    feature_dir = 'data/feature'
    eod = np.load(f'{feature_dir}/eod.npy')
    mk = np.load(f'{feature_dir}/mk.npy')
    bp = np.load(f'{feature_dir}/bp.npy')
    gt = np.load(f'{feature_dir}/gt.npy')
    return eod, mk, bp, gt


# create_features()
# load_features()

In [44]:
create_features()
load_features()

FileNotFoundError: [Errno 2] No such file or directory: 'data/ticker/ticker\\A.csv'

# Create hypergraphs based on the sector and industry

In [None]:
# preprocessing the dataframe
tickers = get_tickers()
df = df_valid_start.copy()
df.replace('BF.B','BF-B', inplace=True) # remove 'dot'
df.replace('BRK.B','BRK-B', inplace=True) # remove 'dot'
df = df[df['Symbol'].isin(tickers)] # extract a sub-set of the tickers that have price data
df.set_index('Symbol', drop=True, inplace=True) # set index
df.sort_index(inplace=True)

# dir setting
hyper_dir = 'data/graph/hyper'
create_dir(hyper_dir)

# save sector & industry
df[['GICS Sector']].to_csv(f'{hyper_dir}/sector.csv', index=True)

# split industries for each company
df['GICS Sub-Industry'].str.split(' & ', expand=True).rename(columns={0:'industry1', 1:'industry2', 2:'industry3'}).to_csv(f'{hyper_dir}/industry.csv', index=True)


In [None]:
# get sector hypergraph
df = pd.read_csv(f'{hyper_dir}/sector.csv')
pivot_table = pd.crosstab(df['Symbol'], df['GICS Sector'])
pivot_table.to_csv(f'{hyper_dir}/sector_hyper.csv')


In [None]:
# get industry hypergraph
df = pd.read_csv(f'{hyper_dir}/industry.csv')
industry_ticker_dict = defaultdict(list)
for index, row in df.iterrows():
    ticker = row['Symbol']
    industries = row[1:]
    for industry in industries:
        if pd.notna(industry) and industry != "Other Services": # the other services is not a industry
            industry_ticker_dict[industry].append(ticker)
industry_ticker_dict = {k: v for k, v in industry_ticker_dict.items() if len(v) > 1}
all_tickers = df['Symbol'].unique()
result_df = pd.DataFrame(index=all_tickers, columns=industry_ticker_dict.keys())
result_df.index.name = 'Symbol'
for industry, tickers in industry_ticker_dict.items():
    result_df[industry] = result_df.index.isin(tickers).astype(int)
result_df.to_csv(f'{hyper_dir}/industry_hyper.csv')


In [None]:
# get complete hypergraph
df_sector = pd.read_csv(f'{hyper_dir}/sector_hyper.csv', index_col='Symbol')
df_industry = pd.read_csv(f'{hyper_dir}/industry_hyper.csv', index_col='Symbol')
result_df = pd.merge(df_sector, df_industry, on='Symbol', how='inner')
result_df.to_csv(f'{hyper_dir}/hyper.csv')

# load hypergraph
def load_hyper_graph():
    return pd.read_csv('data/graph/hyper/hyper.csv').set_index('Symbol').values


# Create simple graphs based on the DTW distance
1. Dynamic time warping (DTW)
    - DTW efficiently finds the minimum alignment cost between two sequences
2. Paper
    - https://www.sciencedirect.com/science/article/pii/S003132031000484X
3. Open source
    - https://pypi.org/project/dtaidistance/

In [None]:
# graph library load
import scipy.stats as stats
from dtaidistance import dtw_ndim
import pickle
import networkx as nx
from tqdm import tqdm

# set directory
simple_dir = 'data/graph/simple'
create_dir(simple_dir)

distance_dir = 'data/graph/distance'
create_dir(distance_dir)

# get tickers code and apple
tickers_dir = 'data/ticker'
tickers = get_tickers() # get ticker code list
apple = get_apple()

# set hyperparameters
window_size = 16
sparsity = 0.9

# simple merge tickers for distance matrix
def merge_multiDim(head):
    matrix = []
    for ticker in tickers:
        df = pd.read_csv(f'{tickers_dir}/{ticker}.csv', index_col = 'Date', usecols=['Date', 'Open', 'High', 'Low', 'Close', 'Volume'])
        df = df.head(head).tail(window_size)
        normalized = stats.zscore(df[['Open','High','Low','Close','Volume']])
        matrix.append(normalized)
    return np.array(matrix)

# get single distance on date
def _create_distance(matrix):
    distance = []
    for s, d1 in enumerate(matrix):
        for t, d2 in enumerate(matrix):
            dst = dtw_ndim.distance_fast(d1, d2, window = window_size)
            distance.append([s, t, dst])
    return pd.DataFrame(distance, columns=['source', 'target', 'distance'])

# save distance matrix
def save_distance(distance, name):
    distance.to_csv(f'{distance_dir}/{name}.csv', index=False)

# load distance matrix
def load_distance(path):
    return pd.read_csv(path)

# get all distances for sliding window
def create_distance():
    local_apple = apple.tail(period)
    print('Create distance matrices...')
    for head in tqdm(range(window_size, period + 1)):
        date = local_apple.head(head).iloc[-1,].Date
        matrix = merge_multiDim(head)
        distance = _create_distance(matrix)
        save_distance(distance, date)

# from distance to adjacency
def get_adjacency(path):
    distance = load_distance(path)
    min_dst = distance['distance'].min()
    max_dst = distance['distance'].max()
    distance['distance'] = (distance['distance'] - min_dst) / (max_dst - min_dst)
    return nx.from_pandas_edgelist(distance, 'source', 'target', 'distance')

# dense to sparsity by cutting edges
def _create_simple(A): # get threshold
    distances = sorted([d for _, _, d in A.edges(data = "distance")])
    threshold = distances[int(len(distances) * (1 - sparsity))]
    A.remove_edges_from([(n1, n2) for n1, n2, d in A.edges(data = "distance") if d > threshold])
    return A

# save graph
def save_simple(G, path):
    pickle.dump(G, open(path,'wb'))

# load graph
def load_simple(path):
    return pickle.load(open(path, 'rb'))

# create simple graphs from the distance matrix
def create_simple():
    distances = sorted(glob(f'{distance_dir}/*'))
    print("Create graphs...")
    for dstpath in tqdm(distances):
        name = dstpath.split('/')[-1].split('.')[0]
        path = f'{simple_dir}/{name}.pickle'
        A = get_adjacency(dstpath)
        G = _create_simple(A)
        save_simple(G, path)

# create_distance()
# create_simple()


# Train setting

In [None]:
# model library load
import torch
import torch.nn.functional as F
from torch.nn import GRU

import torch_geometric
from torch_geometric.nn import GATConv
from torch_geometric.nn import HypergraphConv

from scipy import sparse
from torch_scatter import scatter_add

from torch.utils.data import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# data split for offset
train_period, valid_period, test_period = get_periods(True)
window_size = 16
delay = 3

def get_data_loader(train = False, valid = False, test = False):
    # train
    if train:
        return DataLoader(list(range(train_period - window_size - delay + 1)),
                          batch_size=1, shuffle=True)
    
    # valid
    elif valid:
        return DataLoader(list(range(train_period - window_size - delay + 1,
                          train_period + valid_period - window_size - delay + 1)),
                          batch_size=1, shuffle=False)
    
    # test
    elif test:
        return DataLoader(list(range(train_period + valid_period - window_size - delay + 1,
                          train_period + valid_period + test_period - window_size - delay + 1)),
                          batch_size=1, shuffle=False)
    
    else: return None

# set result of test
result_dir = '../result'
create_dir(f'{result_dir}')

model_dir = f'{result_dir}/model'
create_dir(f'{model_dir}')

ranking_dir = f'{result_dir}/ranking'
create_dir(f'{ranking_dir}')

test_date = get_apple().tail(test_period)['Date']

def save_model(model, name):
    torch.save(model, f'{model_dir}/{name}.pt')

def load_model(name):
    return torch.load(f'{model_dir}/{name}.pt')

In [None]:
# get path of simple graphs
def get_simple_paths():
    simple_dir = '../data/graph/simple'
    return sorted(glob(f'{simple_dir}/*'))

# get edge index of the simple graph
def get_simple_edges(G):
    return np.array(list(G.edges)).T

# get edge index of the hypergraph
def get_hyper_edges():
    hypergraph = load_hyper_graph()
    inci_sparse = sparse.coo_matrix(hypergraph)
    inci_edge = torch_geometric.utils.from_scipy_sparse_matrix(inci_sparse)
    return inci_edge[0]


In [None]:
# get the input of model given offset
eod, mk, bp, gt = load_features()
def get_batch(offset, window_size = 16, delay = 3, simple = False, hyper = False):
    # batch features
    eod_batch = eod[:, offset : offset + window_size, :]
    mk_batch = mk[:,  offset : offset + window_size + delay]
    mk_batch = np.min(mk_batch, axis=1)
    mk_batch = np.expand_dims(mk_batch, axis=1)
    bp_batch = np.expand_dims(bp[:, offset + window_size - 1], axis=1)
    gt_batch = np.expand_dims(gt[:, offset + window_size + delay - 1], axis=1)
    
    eod_batch = torch.tensor(eod_batch).float().to(device)
    mk_batch = torch.tensor(mk_batch).float().to(device)
    bp_batch = torch.tensor(bp_batch).float().to(device)
    gt_batch = torch.tensor(gt_batch).float().to(device)
    
    # batch graphs
    if simple:
        graph_path = get_simple_paths()[offset]
        graph = load_simple(graph_path)
        ei_batch = get_simple_edges(graph) # edge of index
        ei_batch = ei_batch.clone().detach().long().to(device)
        
    elif hyper:
        ei_batch = get_hyper_edges()
        ei_batch = ei_batch.clone().detach().long().to(device)
    
    else:
        ei_batch = None
        
    return eod_batch, mk_batch, bp_batch, gt_batch, ei_batch


In [None]:
# get ranking loss
def get_loss(pred, mask, base_price, ground_truth, rel_loss_alpha = 1):
    # get absolute loss
    return_ratio = torch.div((pred - base_price), base_price)
    reg_loss = torch.mean(mask * (return_ratio - ground_truth) ** 2)

    # get relative loss
    all_ones = torch.ones(pred.size(0), 1).to(device)
    pred_rel = torch.matmul(return_ratio, torch.transpose(all_ones, 0, 1)) - torch.matmul(all_ones, torch.transpose(return_ratio, 0, 1))
    real_rel = torch.matmul(all_ones, torch.transpose(ground_truth, 0, 1)) - torch.matmul(ground_truth, torch.transpose(all_ones, 0, 1))
    mask_rel = torch.matmul(mask, torch.transpose(mask, 0, 1))
    rel_loss = torch.mean(F.relu(((pred_rel * real_rel) * mask_rel)))

    # get ranking loss
    rank_loss = reg_loss + rel_loss_alpha * rel_loss
    del mask_rel, real_rel, pred_rel, all_ones
    return rank_loss, reg_loss, rel_loss, return_ratio
    

In [None]:
# get optimizer for model
def get_optimizer(model, lr = 1e-3, wd = 5e-4):
    if wd is not None:
        return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    else:
        return torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
# train the model and select the model that has lower loss in validation period
def train(model, optim, name, simple = False, hyper = False, epochs = 10):
    best = np.inf
        
    for epoch in range(epochs):
        print(f'\n{epoch+1}/{epochs} epoch')
        
        # model train
        model.train()
        tra_loss = 0
        data_loader = get_data_loader(train=True)
        for offset in tqdm(data_loader): # into batch & each batch offset
            optim.zero_grad()
            
            eod_batch, mk_batch, bp_batch, gt_batch, ei_batch = get_batch(offset, simple = simple, hyper = hyper)
            out = model(eod_batch, ei_batch)
            batch_loss,_,_,_ = get_loss(out, mk_batch, bp_batch, gt_batch)
            
            tra_loss += batch_loss.item()
            batch_loss.backward()
            optim.step()
        print('\tTrain Loss :', round(tra_loss/len(data_loader),6))
        
        # model validation
        model.eval()
        with torch.no_grad():
            val_loss = 0
            data_loader = get_data_loader(valid=True)
            for offset in tqdm(data_loader): # into batch & each batch offset
                eod_batch, mk_batch, bp_batch, gt_batch, ei_batch = get_batch(offset, simple = simple, hyper = hyper)
                out = model(eod_batch, ei_batch)
                batch_loss,_,_,_ = get_loss(out, mk_batch, bp_batch, gt_batch)
                val_loss += batch_loss.detach().cpu().item()
            print('\tValid Loss :', round(val_loss/len(data_loader),6))
            if best > val_loss: # model save to test
                best = val_loss
                save_model(model, name)
                print('model saved')

                

In [None]:
# test the model
def test(model, name, simple = False, hyper = False):
    result=[]
    model.eval()
    with torch.no_grad():
        for offset in tqdm(get_data_loader(test=True)):
            eod_batch, mk_batch, bp_batch, gt_batch, ei_batch = get_batch(offset, simple = simple, hyper = hyper)
            out = model(eod_batch, ei_batch)
            _,_,_,rr = get_loss(out, mk_batch, bp_batch, gt_batch)
            rr = rr.squeeze(0).squeeze(-1).cpu().detach().numpy()
            result.append(rr)

    result = np.array(result)
    pd.DataFrame(result, index=test_date).to_csv(f'{ranking_dir}/{name}.csv')
    del result

# Simple graph-based ranking model
1. Graph attention network (GAT)
    - GAT is an advanced graph neural network to learn the node repesentations using the relations between nodes in the simple graph
    - GAT uses the self-attention mechanism to assign different weights based on the importance of the neighboring nodes
2. Paper
    - P. Velickovic, G. Cucurull, A. Casanova, A. Romero, P. Lio, and Y. Bengio, ‘‘Graph attention networks,’’ in Proc. ICLR, 2018, pp. 1–12.
    - https://arxiv.org/abs/1710.10903
3. Open source
    - https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.GATConv.html#torch-geometric-nn-conv-gatconv

In [None]:
# model architecture define for the simple graph
class SGM(torch.nn.Module):
    def __init__(self, window_size = 16, feature_size = 5, hidden_size = 32, out_size = 16, head_size = 4):
        super().__init__()
        # temporal modeling
        self.gru = GRU(feature_size, hidden_size, 1, batch_first = True)
        
        # relational modeling
        self.conv1 = GATConv(window_size * hidden_size, window_size * hidden_size, head_size, dropout=0.5)
        self.conv2 = GATConv(window_size * hidden_size * head_size, out_size, heads=1, concat=False, dropout=0.5)
        
        # dense layer for ranking score
        self.linear = torch.nn.Linear(out_size, 1)

    def forward(self, src, edge_index): # stock_num, window_size, feature_size
        out, _ = self.gru(src) # stock_num, window_size, hidden_size
        out = out.reshape(src.size(0), -1) # stock_num, window_size * hidden_size
        out = F.elu(self.conv1(out, edge_index)) # stock_num, window_size * hidden_size * head_size
        out = F.elu(self.conv2(out, edge_index)) # stock_num, out_size
        out = F.leaky_relu(self.linear(out))
        return out

In [None]:
# train the simple graph-based ranking models
for t in range(10):
    model = SGM(16, 5, 32, 16, 4).to(device)
    optim = get_optimizer(model)
    train(model, optim, f'simple{t}', simple = True)
    test(load_model(f'simple{t}'), f'simple{t}', simple = True)

# HConv model
1. Hypergraph Convolution and Hypergraph Atterntion (HConv)
    - HConv can extract the relational features from hypergraphs.
    - The hypergraph represents collective relations among stocks.
2. Paper
    - S. Bai, F. Zhang, and P. H. S. Torr, ‘‘Hypergraph convolution and hypergraph attention,’’ Pattern Recognit., vol. 110, Feb. 2021, Art. no. 107637
    - https://arxiv.org/pdf/1901.08150.pdf
3. Open source
    - https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.HypergraphConv.html

In [None]:
# model architecture define for the hypergraph
class HGM(torch.nn.Module):
    def __init__(self, window_size = 16, feature_size = 5, hidden_size = 32, out_size = 16, head_size = 4):
        super().__init__()
        # temporal modeling
        self.gru = GRU(feature_size, hidden_size, 1, batch_first = True)
        
        # relational modeling
        self.conv1 = HypergraphConv(window_size * hidden_size, hidden_size,
                                    use_attention=True, heads=head_size, concat=True, dropout=0.5)
        self.conv2 = HypergraphConv(hidden_size * head_size, out_size,
                                    heads=1, concat=False, dropout=0.5)
        
        # dense layer for ranking score
        self.linear = torch.nn.Linear(out_size, 1)
        
    def get_hyperedge_attr(self, x, edge_index):
        src = x[edge_index[0]].T
        idx = edge_index[1]
        return scatter_add(src, idx).T
    
    def forward(self, src, edge_index):
        
        out, _ = self.gru(src) # stock_num, window_size, hidden_size
        out = out.reshape(out.size(0), -1) # stock_num, window_size * hidden_size
        out = F.elu(self.conv1(out, edge_index,
                               hyperedge_weight = None,
                               hyperedge_attr = self.get_hyperedge_attr(out, edge_index)))
        # stock_num, out_size
        out = F.elu(self.conv2(out, edge_index,
                               hyperedge_weight = None,
                               hyperedge_attr = self.get_hyperedge_attr(out, edge_index)))
        out = F.leaky_relu(self.linear(out))
        return out

In [None]:
# train the hypergraph-based ranking models
for t in range(10):
    model = HGM(16, 5, 32, 16, 4).to(device)
    optim = get_optimizer(model)
    train(model, optim, f'hyper{t}', hyper = True)
    test(load_model(f'hyper{t}'), f'hyper{t}', hyper = True)
