In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import joblib
import yfinance as yf

import src
# Have cloudpickle serialize objects from the local `src` package by value
# rather than by reference, for model portability (presumably so the pickle
# can be loaded where `src` is not importable -- confirm against the consumer).
cloudpickle.register_pickle_by_value(src)

In [2]:
def read_code(url=None):
    """Load the symbol master table and give its key columns English names.

    Args:
        url: Source of the CSV; anything ``pd.read_csv`` accepts (URL, path or
            file-like object). When omitted, the pinned gist revision used by
            this notebook is read.

    Returns:
        DataFrame in which the ticker, name and industry columns are renamed to
        ``symbol``, ``symbol_name`` and ``industry``. All other columns are
        passed through untouched. ``symbol_name`` is NFKC-normalized.
    """
    if url is None:
        url = 'https://gist.githubusercontent.com/richmanbtc/16740d82ef2d71df8d28b7a46f156b0e/raw/092211c3fd3a1260ba12bb67eea04ff87f1b2b61/au_fr.csv'

    english_names = {
        'ティッカー': 'symbol',
        '銘柄名': 'symbol_name',
        '業種': 'industry',
    }
    codes = pd.read_csv(url).rename(columns=english_names)
    # NFKC folds compatibility forms (e.g. full-width letters) into their
    # canonical equivalents so names compare consistently.
    codes['symbol_name'] = codes['symbol_name'].str.normalize('NFKC')
    return codes

# Preview the symbol master table returned by read_code().
display(read_code())

Unnamed: 0,symbol,symbol_name,銘柄名カナ(銘柄名カナ略称),industry
0,A,AgilentTec,アジレントテクノロジー(アジレントテクノロジー),精密機器
1,AA,Alcoa Corporation,アルコア(アルコア),非鉄金属
2,AAL,American Airlines Group Inc.,アメリカン　エアラインズ　グループ(アメリカンエアラインズグループ),空運業
3,AAON,"AAON, Inc.",エイエイオーエヌ(エイエイオーエヌ),機械
4,AAP,"Advance Auto Parts, Inc.",アドバンス オート パーツ(アドバンスオートパーツ),小売業
...,...,...,...,...
1798,ZS,"Zscaler, Inc.",Ｚスケーラー(Ｚスケーラー),サービス業
1799,ZTO,ZTO Express (Cayman) Inc. ADR,ZTO エクスプレス (ケイマン) ADR(ZTOエクスプレスケイマン),陸運業
1800,ZTS,Zoetis,ゾエティス(ゾエティス),医薬品
1801,ZUO,"Zuora, Inc.",ズオラ(ズオラ),サービス業


In [6]:
df_code = read_code()

# First 100 listed symbols only: download, move the ticker column level into
# the row index, then flatten the index back into ordinary columns.
df = (
    yf.download(df_code['symbol'].iloc[:100].tolist())
    .stack()
    .reset_index()
)
display(df)

[*********************100%***********************]  100 of 100 completed


Unnamed: 0,Date,level_1,Adj Close,Close,High,Low,Open,Volume
0,1962-01-02,AA,1.527622,6.545672,6.583219,6.545672,0.000000,55930.0
1,1962-01-02,AEP,1.000812,34.312500,35.125000,34.312500,0.000000,5800.0
2,1962-01-03,AA,1.550988,6.645797,6.645797,6.533156,6.545672,74906.0
3,1962-01-03,AEP,0.998988,34.250000,34.750000,34.062500,0.000000,10200.0
4,1962-01-04,AA,1.550988,6.645797,6.695859,6.645797,6.645797,80899.0
...,...,...,...,...,...,...,...,...
613759,2023-03-17,AON,296.589996,296.589996,298.000000,294.690002,297.519989,2935300.0
613760,2023-03-17,AOS,65.470001,65.470001,67.330002,64.980003,67.330002,2484200.0
613761,2023-03-17,AOSL,24.980000,24.980000,25.950001,24.940001,25.650000,689500.0
613762,2023-03-17,APA,31.740000,31.740000,32.790001,31.430000,32.689999,15840200.0


In [2]:
# Single `.T`-suffixed symbol with the longest available history (period='max')
# and corporate actions (actions=True); the recorded output gains
# 'Dividends' and 'Stock Splits' columns.
# Multi-symbol variant tried earlier: yf.download(['1301.T', '1312.T'])
df = yf.download(['1301.T'], actions=True, period='max')
display(df)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,1400.0,1400.0,1400.0,1400.0,1339.794922,0,0.0,0.0
2001-01-02,1400.0,1400.0,1400.0,1400.0,1339.794922,0,0.0,0.0
2001-01-03,1400.0,1400.0,1400.0,1400.0,1339.794922,0,0.0,0.0
2001-01-04,1420.0,1420.0,1370.0,1390.0,1330.225098,19300,0.0,0.0
2001-01-05,1390.0,1400.0,1330.0,1330.0,1272.805298,19700,0.0,0.0
...,...,...,...,...,...,...,...,...
2023-03-13,3610.0,3610.0,3530.0,3550.0,3550.000000,90800,0.0,0.0
2023-03-14,3530.0,3530.0,3455.0,3460.0,3460.000000,81000,0.0,0.0
2023-03-15,3505.0,3505.0,3475.0,3490.0,3490.000000,47600,0.0,0.0
2023-03-16,3455.0,3475.0,3415.0,3475.0,3475.000000,42500,0.0,0.0


In [4]:
# Check how yfinance reports a symbol it cannot resolve: per the recorded
# output it logs a failed download and returns an empty frame instead of raising.
df = yf.download(['1001.T'])
display(df)

[*********************100%***********************]  1 of 1 completed

1 Failed download:
- 1001.T: No timezone found, symbol may be delisted


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [8]:
import time
from datetime import datetime
import gc


class Fetcher:
    """Fetch price history and corporate actions for every symbol from ``read_code``.

    ``keys``, ``data_id`` and ``replace_mode`` are plain attributes that nothing
    in this class reads; presumably they are consumed by whatever loads the
    pickled fetcher (TODO confirm).
    """

    # fetch() does nothing until this many seconds have passed since last_timestamp.
    _MIN_REFETCH_INTERVAL_SEC = (24 + 24) * 60 * 60
    # The symbol list is downloaded in this many batches.
    _N_BATCHES = 10
    # yfinance field name -> output column name.
    _FIELD_NAMES = {
        'Open': 'op',
        'High': 'hi',
        'Low': 'lo',
        'Close': 'cl',
        'Adj Close': 'adj_cl',
        'Volume': 'volume',
        'Dividends': 'dividends',
        'Stock Splits': 'splits',
        'Capital Gains': 'capital_gains',
    }
    # Columns of the returned frame, in order.
    _OUTPUT_COLUMNS = [
        'op',
        'hi',
        'lo',
        'cl',
        'adj_cl',
        'volume',
        'dividends',
        'splits',
        'capital_gains',
        'symbol_name',
        'industry',
    ]

    def __init__(self):
        self.keys = {}
        self.data_id = 'stock_us_ohlcv'
        self.replace_mode = True

    @staticmethod
    def _download_batch(symbols):
        """Download one batch of symbols in long format, indexed by (date, ticker).

        The batch is requested twice and the two responses are merged, keeping
        the first occurrence of each (date, ticker) row. The original code
        labels the second request "for error" -- presumably it fills in symbols
        whose first request failed.
        """
        attempts = [
            yf.download(symbols, period='max', actions=True).stack()
            for _ in range(2)
        ]
        merged = pd.concat(attempts)
        return merged.loc[~merged.index.duplicated()]

    def fetch(self, last_timestamp=None):
        """Download the full history for all symbols.

        Args:
            last_timestamp: Epoch seconds of the previous fetch, or None. When
                given and less than 48 hours old (per ``time.time()``), nothing
                is downloaded.

        Returns:
            An empty DataFrame when the fetch is skipped or when any batch
            yields no rows (partial results are discarded). Otherwise a frame
            indexed by (``timestamp``, ``symbol``), sorted by index, with the
            columns op, hi, lo, cl, adj_cl, volume, dividends, splits,
            capital_gains, symbol_name, industry. ``timestamp`` is the row date
            as integer epoch seconds (assumes nanosecond-resolution datetimes
            from the download).
        """
        if last_timestamp is not None and time.time() < last_timestamp + self._MIN_REFETCH_INTERVAL_SEC:
            print('no new data')
            return pd.DataFrame()

        df_code = read_code().set_index('symbol')

        dfs = []
        # Split a plain ndarray rather than the pandas Index itself.
        for symbols in np.array_split(df_code.index.to_numpy(), self._N_BATCHES):
            batch = self._download_batch(symbols.tolist())

            if batch.shape[0] == 0:
                return pd.DataFrame()

            # Name the index levels explicitly instead of relying on the
            # auto-generated 'level_1' label, which only appears when the
            # ticker column level is unnamed.
            batch.index = batch.index.set_names(['timestamp', 'symbol'])
            batch = batch.reset_index().rename(columns=self._FIELD_NAMES)
            # Explicit int64 so the result does not depend on the platform's
            # default integer width; nanoseconds -> seconds.
            batch['timestamp'] = batch['timestamp'].astype('int64') // 10 ** 9
            batch = batch.set_index(['timestamp', 'symbol'])

            batch = batch.join(df_code[['symbol_name', 'industry']], on='symbol', how='left')

            dfs.append(batch)
            gc.collect()

        df = pd.concat(dfs)
        # 'Capital Gains' is not guaranteed to be in the download; without this
        # guard the lookup raises KeyError when no batch contained it.
        if 'capital_gains' not in df.columns:
            df['capital_gains'] = 0.0
        df['capital_gains'] = df['capital_gains'].fillna(0)

        return df[self._OUTPUT_COLUMNS].sort_index()


In [10]:
# NOTE(review): each fetch() call downloads every batch twice, so this cell is
# slow; the recorded run was interrupted by hand (KeyboardInterrupt).
fetcher = Fetcher()
# No last_timestamp: the freshness check in fetch() is skipped and everything is downloaded.
df = fetcher.fetch(last_timestamp=None)
display(df)
# 1600041600 is 2020-09-14 00:00:00 UTC, older than fetch()'s 48-hour window,
# so this call downloads everything again instead of printing 'no new data'.
df = fetcher.fetch(last_timestamp=1600041600)
display(df)

[*********************100%***********************]  181 of 181 completed
[*********************100%***********************]  181 of 181 completed
[*********************100%***********************]  181 of 181 completed
[*********************100%***********************]  181 of 181 completed
[**********************60%****                   ]  108 of 181 completed

KeyboardInterrupt: 

[**********************74%***********            ]  134 of 181 completed

In [9]:
# Serialize the fetcher list with cloudpickle, xz-compress the bytes, and
# write the archive to disk.
fetchers = [Fetcher()]

data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20230319_stock_us.xz', 'wb') as out_file:
    out_file.write(data)