In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import joblib
import yfinance as yf

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [2]:
import requests
from bs4 import BeautifulSoup

def read_code():
    res = requests.get('https://search.sbisec.co.jp/v2/popwin/info/stock/pop6040_usequity_list.html')
    bs = BeautifulSoup(res.content, "html.parser")
    
    table = bs.find_all(class_="md-l-table-01")[0]
    tbody = table.find_all('tbody')[0]
    rows = []
    for tr in tbody.find_all('tr')[1:]:
        rows.append({
            'symbol': tr.find_all('th')[0].getText(),
            'symbol_name': tr.find_all('td')[0].getText(),
            'description': tr.find_all('td')[1].getText(),
            'exchange': tr.find_all('td')[2].getText(),
        })
    df = pd.DataFrame(rows)
    
    df['symbol'] = df['symbol'].str.normalize('NFKC')
    df['symbol_name'] = df['symbol_name'].str.normalize('NFKC')
    df['description'] = df['description'].str.normalize('NFKC')
    df['exchange'] = df['exchange'].str.normalize('NFKC')
    return df

display(read_code())

Unnamed: 0,symbol,symbol_name,description,exchange
0,A,Agilent Technologiesアジレント テクノロジーズ,環境、食品の品質・安全性等の化学分析を行うツールを開発、提供,NYSE
1,AA,Alcoaアルコア,アルミ原料を掘削する鉱山事業及び精練事業に従事,NYSE
2,AACI,Armada Acquisition Corp1アルマダ アクイジション1,ブランクチェックカンパニー。,NASDAQ
3,AADI,Aadi Biosciences Incアーディ バイオサイエンシズ,眼疾患の治療薬を開発をしている米国のバイオ医薬品企業,NASDAQ
4,AAL,American Airlines Groupアメリカン航空,米国の航空会社,NASDAQ
...,...,...,...,...
4744,ZVSA,ZyVersa Therapeutics Inc ザイバーサ セラピューティクス,炎症や腎疾患等の治療薬を開発する会社,NASDAQ
4745,ZWS,Zurn Elkay Water Solutions Corpザーン エルケイ ウォーター,マルチ・プラットフォームの工業会社,NYSE
4746,ZYME,Zymeworks Incジムワークス,臨床段階のバイオ医薬品会社,NASDAQ
4747,ZYNE,Zynerba Pharmaceuticals Incジナーバ ファーマシューティカルズ,特殊医薬品企業,NASDAQ


In [3]:
df_code = read_code()
# display(df_code)

df = yf.download(df_code['symbol'].iloc[:100].tolist())
df = df.stack()
df = df.reset_index()
display(df)

[*********************100%***********************]  100 of 100 completed


Unnamed: 0,Date,level_1,Adj Close,Close,High,Low,Open,Volume
0,1962-01-02,AA,1.518816,6.545672,6.583219,6.545672,0.000000,55930.0
1,1962-01-03,AA,1.542048,6.645797,6.645797,6.533156,6.545672,74906.0
2,1962-01-04,AA,1.542048,6.645797,6.695859,6.645797,6.645797,80899.0
3,1962-01-05,AA,1.539144,6.633281,6.683344,6.620766,6.645797,70911.0
4,1962-01-08,AA,1.486871,6.408000,6.608250,6.357938,6.608250,93883.0
...,...,...,...,...,...,...,...,...
425775,2023-08-09,AEHL,0.651100,0.651100,0.678200,0.622000,0.678200,70505.0
425776,2023-08-09,AEHR,47.748402,47.748402,48.549999,45.810001,48.000000,654827.0
425777,2023-08-09,AEI,1.488600,1.488600,1.520000,1.470000,1.490000,8341.0
425778,2023-08-09,AEIS,113.730003,113.730003,114.489998,112.790001,114.379997,63796.0


In [9]:
import time
from datetime import datetime
import gc


class Fetcher:
    def __init__(self):
        self.keys = {}
        self.data_id = 'stock_us_ohlcv2'
        self.replace_mode = True
        
    def fetch(self, last_timestamp=None):
        if last_timestamp is not None and time.time() < last_timestamp + (24 + 24) * 60 * 60:
            print('no new data')
            return pd.DataFrame()
        
        df_code = read_code()
        df_code = df_code.set_index('symbol')
        
        dfs = []
        for symbols in np.array_split(df_code.index, 10):
            df = yf.download(symbols.tolist(), period='max', actions=True)
            df = df.stack()

            # for error
            df2 = yf.download(symbols.tolist(), period='max', actions=True)
            df2 = df2.stack()
            df = pd.concat([df, df2])
            df = df.loc[~df.index.duplicated()]

            if df.shape[0] == 0:
                return pd.DataFrame()

            df.reset_index(inplace=True)
            df.rename(columns={
                'Date': 'timestamp',
                'level_1': 'symbol',
                'Open': 'op',
                'High': 'hi',
                'Low': 'lo',
                'Close': 'cl',
                'Adj Close': 'adj_cl',
                'Volume': 'volume',
                'Dividends': 'dividends',
                'Stock Splits': 'splits',
                'Capital Gains': 'capital_gains',
            }, inplace=True)
            df['timestamp'] = df['timestamp'].astype(int) // 10 ** 9
            df.set_index(['timestamp', 'symbol'], inplace=True)

            df = df.join(df_code[['symbol_name', 'description', 'exchange']], on='symbol', how='inner')
            
            dfs.append(df)
            gc.collect()

        df = pd.concat(dfs)
        df['capital_gains'] = df['capital_gains'].fillna(0)
        df = df[[
            'op',
            'hi',
            'lo',
            'cl',
            'adj_cl',
            'volume',
            'dividends',
            'splits',
            'capital_gains',
            'symbol_name',
            'description',
            'exchange',
        ]]
        df.sort_index(inplace=True)
        
        return df


In [8]:
fetcher = Fetcher()
# df = fetcher.fetch(last_timestamp=None)
# display(df)
df = fetcher.fetch(last_timestamp=1690848000)
display(df)

[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%********

KeyError: 'capital_gains'

In [10]:
fetchers = [Fetcher()]

data = cloudpickle.dumps(fetchers)
data = lzma.compress(data)
with open('/home/jovyan/data/20230810_stock_us2.xz', 'wb') as f:
    f.write(data)