In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import joblib
import yfinance as yf

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [2]:
import requests
from bs4 import BeautifulSoup

def read_code(url='https://search.sbisec.co.jp/v2/popwin/info/stock/pop6040_usequity_list.html', timeout=30):
    """Scrape the US-equity ticker table(s) at ``url`` into a DataFrame.

    Every ``<table>`` whose first header cell reads 'ティッカー' ("ticker") is
    parsed. The first ``<tr>`` of its ``<tbody>`` is skipped and each remaining
    row contributes one record.

    Parameters
    ----------
    url : str, optional
        Page to scrape. Defaults to the sbisec.co.jp US-equity list.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection cannot
        block the caller forever.

    Returns
    -------
    pandas.DataFrame
        Columns ``symbol``, ``symbol_name``, ``description`` and ``exchange``,
        NFKC-normalised, sorted by ``symbol`` with a fresh integer index.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no ticker row could be parsed from the page.
    """
    res = requests.get(url, timeout=timeout)
    # Fail here on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    bs = BeautifulSoup(res.content, "html.parser")

    rows = []
    for table in bs.find_all('table'):
        thead = table.find('thead')
        if thead is None:
            continue
        header_row = thead.find('tr')
        first_header = header_row.find('th') if header_row is not None else None
        # Only tables whose first header cell is 'ティッカー' hold tickers.
        if first_header is None or first_header.getText() != 'ティッカー':
            continue
        tbody = table.find('tbody')
        if tbody is None:
            continue
        # The first body row is skipped, as in the original implementation.
        for tr in tbody.find_all('tr')[1:]:
            ths = tr.find_all('th')
            tds = tr.find_all('td')
            if not ths or not tds:
                # Malformed row: nothing to read the symbol or exchange from.
                continue
            rows.append({
                'symbol': ths[0].getText(),
                'symbol_name': tds[0].getText(),
                # Rows with fewer than three <td> cells carry no description.
                'description': tds[1].getText() if len(tds) >= 3 else '',
                'exchange': tds[-1].getText(),
            })

    if not rows:
        # Without this guard the column access below fails with an opaque
        # KeyError: 'symbol'.
        raise ValueError('no ticker rows found at {}'.format(url))

    df = pd.DataFrame(rows)

    # NFKC folds full-width characters into their canonical equivalents.
    for column in ('symbol', 'symbol_name', 'description', 'exchange'):
        df[column] = df[column].str.normalize('NFKC')
    return df.sort_values('symbol').reset_index(drop=True)

# Scrape the symbol table and render it in full.
df_code = read_code()
display(df_code)

# Spot-check three known tickers (per the output: a stock, an ADR and a bond ETF).
display(df_code.loc[df_code['symbol'].isin(['AAPL', 'SHY', 'EC'])])

Unnamed: 0,symbol,symbol_name,description,exchange
0,AA,Alcoaアルコア,アルミ原料を掘削する鉱山事業及び精練事業に従事,NYSE
1,AACI,Armada Acquisition Corp1アルマダ アクイジション1,ブランクチェックカンパニー。,NASDAQ
2,AADI,Aadi Biosciences Incアーディ バイオサイエンシズ,眼疾患の治療薬を開発をしている米国のバイオ医薬品企業,NASDAQ
3,AAGR,African Agriculture Holdings Incアフリカン・アグリカルチャーHD,アフリカの食糧安全保障に特化した会社。アルファルファ農場を運営。,NASDAQ
4,AAL,American Airlines Groupアメリカン航空,米国の航空会社,NASDAQ
...,...,...,...,...
5211,ZVSA,ZyVersa Therapeutics Inc ザイバーサ セラピューティクス,炎症や腎疾患等の治療薬を開発する会社,NASDAQ
5212,ZWS,Zurn Elkay Water Solutions Corpザーン エルケイ ウォーター,マルチ・プラットフォームの工業会社,NYSE
5213,ZYME,Zymeworks Incジムワークス,臨床段階のバイオ医薬品会社,NASDAQ
5214,ZYXI,Zynex Incジネックス,米国の医療機器メーカー,NASDAQ


Unnamed: 0,symbol,symbol_name,description,exchange
10,AAPL,Appleアップル,パソコン、コミュニケーション機器等の製造販売,NASDAQ
1458,EC,Ecopetrol SA ADRエコペトロール ADR,商業開発及び炭化水素とその副産物の探査、採掘、精製、輸送、保管と販売など関連業務に従事。,NYSE
4128,SHY,iShares 1-3 Year Treasury Bond ETFiシェアーズ 米国国債 ...,,NASDAQ


In [3]:
# Smoke test: download history for the first 100 scraped symbols and flatten
# the stacked (date, symbol) index into ordinary columns.
df_code = read_code()

df = (
    yf.download(df_code['symbol'].head(100).tolist())
    .stack()
    .reset_index()
)
display(df)

[*********************100%%**********************]  100 of 100 completed

1 Failed download:
['ACGN']: Exception('%ticker%: No timezone found, symbol may be delisted')


Unnamed: 0,Date,level_1,Adj Close,Close,High,Low,Open,Volume
0,1962-01-02,AA,1.512354,6.545672,6.583219,6.545672,0.000000,55930.0
1,1962-01-03,AA,1.535490,6.645797,6.645797,6.533156,6.545672,74906.0
2,1962-01-04,AA,1.535490,6.645797,6.695859,6.645797,6.645797,80899.0
3,1962-01-05,AA,1.532597,6.633281,6.683344,6.620766,6.645797,70911.0
4,1962-01-08,AA,1.480547,6.408000,6.608250,6.357938,6.608250,93883.0
...,...,...,...,...,...,...,...,...
427956,2024-01-09,AEI,1.255000,1.255000,1.340000,1.220100,1.340000,69573.0
427957,2024-01-09,AEIS,103.029999,103.029999,103.190002,101.279999,101.279999,96108.0
427958,2024-01-09,AEL,55.520000,55.520000,55.549999,55.209999,55.209999,297210.0
427959,2024-01-09,AEM,51.570000,51.570000,52.599998,51.509998,52.599998,1964078.0


In [4]:
# Download one symbol with corporate actions and keep only the rows on which
# a dividend was recorded.
df = yf.download('MO', actions=True)
display(df[df['Dividends'].ne(0)])

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1962-03-14,0.000000,0.172743,0.169271,0.171007,0.002573,1497600,0.001563,0.0
1962-06-14,0.000000,0.123264,0.119792,0.122396,0.001866,1843200,0.001563,0.0
1962-09-14,0.000000,0.120226,0.118490,0.119792,0.001850,806400,0.001563,0.0
1962-12-14,0.000000,0.128906,0.127387,0.128906,0.002015,576000,0.001563,0.0
1963-03-14,0.000000,0.137153,0.135417,0.135417,0.002141,1267200,0.001563,0.0
...,...,...,...,...,...,...,...,...
2022-12-21,45.610001,45.759998,45.400002,45.590000,41.778763,8372400,0.940000,0.0
2023-03-23,44.389999,44.400002,43.250000,43.459999,40.673630,15549800,0.940000,0.0
2023-06-14,44.900002,44.980000,43.860001,43.950001,41.994873,11941300,0.940000,0.0
2023-09-14,43.820000,44.150002,43.610001,44.099998,43.081665,9764000,0.980000,0.0


In [5]:
import time
from datetime import datetime
import gc


class Fetcher:
    """Download daily OHLCV history plus corporate actions for every symbol
    returned by ``read_code()``.

    Attributes
    ----------
    keys : dict
        Empty by default; not read inside this class.
        NOTE(review): presumably filled/consumed by the hosting pipeline - confirm.
    data_id : str
        Identifier of the produced data set ('stock_us_ohlcv2').
    replace_mode : bool
        Always True here; not read inside this class.
        NOTE(review): presumably tells the consumer to replace stored data
        with each fetch result - confirm.
    """

    # A new fetch is attempted only when the previous one is at least 48 h old.
    _MIN_REFRESH_SECONDS = (24 + 24) * 60 * 60
    # The symbol list is downloaded in this many batches.
    _N_CHUNKS = 10
    # yfinance column name -> output column name.
    _COLUMN_NAMES = {
        'Open': 'op',
        'High': 'hi',
        'Low': 'lo',
        'Close': 'cl',
        'Adj Close': 'adj_cl',
        'Volume': 'volume',
        'Dividends': 'dividends',
        'Stock Splits': 'splits',
        'Capital Gains': 'capital_gains',
    }
    # Columns (and their order) of the frame returned by fetch().
    _OUTPUT_COLUMNS = [
        'op',
        'hi',
        'lo',
        'cl',
        'adj_cl',
        'volume',
        'dividends',
        'splits',
        'capital_gains',
    ]

    def __init__(self):
        self.keys = {}
        self.data_id = 'stock_us_ohlcv2'
        self.replace_mode = True

    def fetch(self, last_timestamp=None):
        """Fetch the full price history of all listed symbols.

        Parameters
        ----------
        last_timestamp : int or float, optional
            Unix time (seconds) of the previous fetch. When it is less than
            48 hours in the past nothing is downloaded.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``(timestamp, symbol)`` with ``timestamp`` in Unix
            seconds, sorted by index, holding the columns ``op, hi, lo, cl,
            adj_cl, volume, dividends, splits, capital_gains``. An empty
            ``DataFrame`` is returned when the data is still fresh or when
            any batch yields no rows.
        """
        if last_timestamp is not None and time.time() < last_timestamp + self._MIN_REFRESH_SECONDS:
            print('no new data')
            return pd.DataFrame()

        df_code = read_code().set_index('symbol')

        dfs = []
        for symbols in np.array_split(df_code.index, self._N_CHUNKS):
            chunk = self._download_chunk(symbols.tolist())

            if chunk.shape[0] == 0:
                # One empty batch aborts the whole fetch.
                # NOTE(review): presumably so that a partial result is never
                # handed to the consumer - confirm.
                return pd.DataFrame()

            dfs.append(self._normalize_chunk(chunk))
            gc.collect()

        df = pd.concat(dfs)
        # 'Capital Gains' may be missing from some or all batches; treat a
        # missing value as zero instead of failing with a KeyError.
        if 'capital_gains' not in df.columns:
            df['capital_gains'] = 0.0
        df['capital_gains'] = df['capital_gains'].fillna(0)

        return df[self._OUTPUT_COLUMNS].sort_index()

    @staticmethod
    def _download_chunk(symbols):
        """Download one batch of symbols twice and merge the two results."""
        # The second pass is a blunt retry: rows missing from the first
        # download (e.g. a transient per-symbol failure) are taken from the
        # second one; for duplicated (date, symbol) keys the first wins.
        frames = [
            # auto_adjust is pinned so 'Adj Close' is returned regardless of
            # the library's default for this argument.
            yf.download(symbols, period='max', actions=True, auto_adjust=False).stack()
            for _ in range(2)
        ]
        merged = pd.concat(frames)
        return merged.loc[~merged.index.duplicated()]

    @classmethod
    def _normalize_chunk(cls, stacked):
        """Rename columns and re-index a stacked batch by (timestamp, symbol)."""
        # Name the two index levels by position rather than relying on the
        # names 'Date' and the auto-generated 'level_1' after reset_index().
        flat = (
            stacked
            .rename_axis(index=['timestamp', 'symbol'])
            .reset_index()
            .rename(columns=cls._COLUMN_NAMES)
        )
        # Nanoseconds since the epoch -> whole seconds. int64 is spelled out
        # so the result does not depend on the platform's default int width.
        flat['timestamp'] = flat['timestamp'].astype('int64') // 10 ** 9
        return flat.set_index(['timestamp', 'symbol'])


In [None]:
# Exercise the fetcher as if the previous fetch happened at 1690848000
# (2023-08-01 00:00:00 UTC). Passing last_timestamp=None instead skips the
# freshness check and always downloads.
fetcher = Fetcher()
df = fetcher.fetch(last_timestamp=1_690_848_000)
display(df)

In [6]:
# Serialise a one-element list of fetchers with cloudpickle, xz-compress the
# bytes and write them to disk.
fetchers = [Fetcher()]

data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20230810_stock_us2.xz', mode='wb') as f:
    f.write(data)