In [6]:
import sys
sys.path.append('..')  # put the parent directory on the import path so `import src` below resolves

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import joblib
import yfinance as yf

import src
# Register `src` with cloudpickle for pickling by value, so objects pickled in
# this notebook stay portable (original note: "for model portability").
cloudpickle.register_pickle_by_value(src)

In [7]:
def read_code(url='https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls'):
    """Load the JPX listed-issues spreadsheet and normalize the columns used downstream.

    Args:
        url: Location of the Excel file, passed straight to ``pd.read_excel``.
            Defaults to the JPX ``data_j.xls`` listing.

    Returns:
        pandas.DataFrame: the spreadsheet with three columns renamed to
        ``symbol``, ``symbol_name`` and ``industry_code``. ``symbol_name`` is
        NFKC-normalized and ``industry_code`` is an integer column in which
        every missing or non-numeric entry (e.g. the ``'-'`` placeholder)
        is 0. All other columns are returned untouched.
    """
    df = pd.read_excel(url)
    df = df.rename(columns={
        'コード': 'symbol',
        '銘柄名': 'symbol_name',
        '33業種コード': 'industry_code',
    })
    # NFKC folds full-width / half-width variants into one canonical form.
    df['symbol_name'] = df['symbol_name'].str.normalize('NFKC')
    # Coerce through to_numeric instead of a str round-trip: a float-typed
    # column (which is what pandas produces when NaN is present) would render
    # as '50.0' and make a plain int() conversion raise.
    df['industry_code'] = (
        pd.to_numeric(df['industry_code'], errors='coerce')
        .fillna(0)
        .astype(int)
    )
    return df

In [5]:
# Smoke test: download price history for the first 100 listed codes and
# flatten the result to one row per (date, ticker).
df_code = read_code()

df = (
    yf.download([f'{code}.T' for code in df_code['symbol'].iloc[:100]])
    .stack()
    .reset_index()
)
display(df)

[*********************100%***********************]  100 of 100 completed


Unnamed: 0,Date,level_1,Adj Close,Close,High,Low,Open,Volume
0,2000-01-04,1332.T,100.753326,157.0,165.0,157.0,158.0,296000.0
1,2000-01-04,1380.T,469.197021,605.0,625.0,605.0,605.0,0.0
2,2000-01-04,1418.T,1031.356079,1250.0,1250.0,1250.0,1250.0,5000.0
3,2000-01-05,1332.T,100.753326,157.0,170.0,156.0,164.0,637000.0
4,2000-01-05,1380.T,469.197021,605.0,615.0,605.0,605.0,0.0
...,...,...,...,...,...,...,...,...
277563,2023-07-07,1476.T,1919.000000,1919.0,1924.0,1916.0,1919.0,23506.0
277564,2023-07-07,1477.T,2174.000000,2174.0,2177.0,2161.0,2176.0,32.0
277565,2023-07-07,1478.T,2920.000000,2920.0,2928.0,2896.0,2913.0,25434.0
277566,2023-07-07,1479.T,28200.000000,28200.0,28200.0,28200.0,28200.0,2.0


In [3]:
# Single-symbol download over the full available period, with the corporate
# action columns (dividends / stock splits) included in the result.
df = yf.download(['1301.T'], period='max', actions=True)
display(df)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,1400.0,1400.0,1400.0,1400.0,953.506348,0,0.0,0.0
2001-01-02,1400.0,1400.0,1400.0,1400.0,953.506348,0,0.0,0.0
2001-01-03,1400.0,1400.0,1400.0,1400.0,953.506348,0,0.0,0.0
2001-01-04,1420.0,1420.0,1370.0,1390.0,946.695740,19300,0.0,0.0
2001-01-05,1390.0,1400.0,1330.0,1330.0,905.831116,19700,0.0,0.0
...,...,...,...,...,...,...,...,...
2023-07-03,3620.0,3655.0,3620.0,3650.0,3650.000000,12500,0.0,0.0
2023-07-04,3640.0,3650.0,3635.0,3635.0,3635.000000,14300,0.0,0.0
2023-07-05,3615.0,3620.0,3600.0,3610.0,3610.000000,16500,0.0,0.0
2023-07-06,3610.0,3630.0,3595.0,3600.0,3600.000000,22300,0.0,0.0


In [4]:
# Probe a symbol whose download fails: yfinance reports the failure in its log
# and the returned frame has the usual columns but no rows (see output below).
df = yf.download(['1001.T'])
display(df)

[*********************100%***********************]  1 of 1 completed

1 Failed download:
- 1001.T: No timezone found, symbol may be delisted


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [8]:
import time
from datetime import datetime
import gc


class Fetcher:
    """Downloads daily price history for every code returned by ``read_code``.

    The instance carries only configuration attributes; ``fetch`` does all the
    work and keeps no state between calls.
    """

    def __init__(self):
        # NOTE(review): these three attributes are not read inside this class;
        # presumably they are consumed by whatever loads the pickled fetcher -
        # confirm their meaning against that caller.
        self.keys = {}
        self.data_id = 'stock_ohlcv'
        self.replace_mode = True

    def fetch(self, last_timestamp=None):
        """Download the full history for all symbols and return it as one frame.

        Args:
            last_timestamp: Optional epoch seconds of the newest row already
                held by the caller. When given and less than 33 hours old, no
                download happens.

        Returns:
            pandas.DataFrame: indexed by ``(timestamp, symbol)`` where
            ``timestamp`` is epoch seconds and ``symbol`` is ``'<code>.T'``,
            sorted by index, with columns ``op, hi, lo, cl, adj_cl, volume,
            dividends, splits, capital_gains, symbol_name, industry_code``.
            An empty frame is returned when the data is considered fresh,
            when any chunk yields no rows, or when there are no symbols.
        """
        # (24 + 9) hours: presumably one day plus the UTC+9 offset of the
        # Tokyo market - TODO confirm the intended freshness window.
        if last_timestamp is not None and time.time() < last_timestamp + (24 + 9) * 60 * 60:
            print('no new data')
            return pd.DataFrame()

        df_code = read_code()
        df_code['symbol'] = df_code['symbol'].astype('str') + '.T'
        df_code = df_code.set_index('symbol')

        epoch = pd.Timestamp('1970-01-01', tz='UTC')
        one_second = pd.Timedelta(seconds=1)

        dfs = []
        # Split a plain array rather than the Index itself so the chunks are
        # ordinary ndarrays regardless of the pandas / numpy versions in use.
        for symbols in np.array_split(df_code.index.to_numpy(), 10):
            if len(symbols) == 0:
                # Fewer than 10 symbols leaves trailing chunks empty; there is
                # nothing to download for them.
                continue
            tickers = symbols.tolist()

            # Every chunk is downloaded twice and the two results merged, so a
            # symbol that failed in one pass can still come from the other.
            # auto_adjust=False pins the layout that includes 'Adj Close'
            # instead of relying on the library default.
            passes = [
                yf.download(tickers, period='max', actions=True, auto_adjust=False).stack()
                for _ in range(2)
            ]
            # Drop all-NaN rows explicitly (stack() does not do so in every
            # pandas version) BEFORE de-duplicating, so an empty row from one
            # pass cannot shadow real data from the other.
            df = pd.concat(passes).dropna(how='all')
            df = df.loc[~df.index.duplicated()]

            if df.shape[0] == 0:
                # A chunk with no rows aborts the whole fetch; presumably this
                # keeps a partial download from being treated as complete -
                # confirm with the consumer of this class.
                return pd.DataFrame()

            # Name the index levels explicitly instead of relying on the
            # positional 'level_1' default that reset_index() would invent.
            df.index = df.index.set_names(['timestamp', 'symbol'])
            df = df.reset_index().rename(columns={
                'Open': 'op',
                'High': 'hi',
                'Low': 'lo',
                'Close': 'cl',
                'Adj Close': 'adj_cl',
                'Volume': 'volume',
                'Dividends': 'dividends',
                'Stock Splits': 'splits',
                'Capital Gains': 'capital_gains',
            })

            # Epoch seconds, independent of the datetime resolution and of
            # whether the dates arrive tz-aware or naive (naive is read as UTC).
            stamps = pd.to_datetime(df['timestamp'], utc=True)
            df['timestamp'] = (stamps - epoch) // one_second

            df = df.join(df_code[['symbol_name', 'industry_code']], on='symbol', how='left')
            df = df.set_index(['timestamp', 'symbol'])

            dfs.append(df)
            gc.collect()

        if not dfs:
            return pd.DataFrame()

        df = pd.concat(dfs)
        # 'Capital Gains' is not present in every download; treat a missing
        # column the same way as missing values.
        if 'capital_gains' not in df.columns:
            df['capital_gains'] = 0.0
        df['capital_gains'] = df['capital_gains'].fillna(0)
        df = df[[
            'op',
            'hi',
            'lo',
            'cl',
            'adj_cl',
            'volume',
            'dividends',
            'splits',
            'capital_gains',
            'symbol_name',
            'industry_code',
        ]]
        return df.sort_index()


In [6]:
fetcher = Fetcher()
# Exercise both entry conditions: no previous timestamp, then a previous
# timestamp old enough (2020-09-14 UTC) that a full download still happens.
for previous in (None, 1600041600):
    df = fetcher.fetch(last_timestamp=previous)
    display(df)

[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 2073.T: 1d data not available for startTime=-2208994789 and endTime=1688703457. Only 100 years worth of day granularity data are allowed to be fetched per request.
[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 2073.T: 1d data not available for startTime=-2208994789 and endTime=1688703483. Only 100 years worth of day granularity data are allowed to be fetched per request.
[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 25935.T: No timezone found, symbol may be delisted
[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 25935.T: No timezone found, symbol may be delisted
[*********************100%***********************]  428 of 428 completed
[*********************100%***********************]  428 of 428 completed
[*********************100%******************

Unnamed: 0_level_0,Unnamed: 1_level_0,op,hi,lo,cl,adj_cl,volume,dividends,splits,capital_gains,symbol_name,industry_code
timestamp,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
925948800,6806.T,10571.428711,10666.666992,10180.952148,10380.952148,7998.520020,65310.0,0.0,0.0,0.0,ヒロセ電機,3650
925948800,7203.T,698.000000,722.000000,690.000000,722.000000,75.560051,15575000.0,0.0,0.0,0.0,トヨタ自動車,3700
925948800,7752.T,1170.000000,1190.000000,1160.000000,1170.000000,767.880615,1608000.0,0.0,0.0,0.0,リコー,3650
926035200,6806.T,10371.428711,10371.428711,9714.286133,10238.095703,7888.444824,122010.0,0.0,0.0,0.0,ヒロセ電機,3650
926035200,7203.T,718.000000,720.000000,690.000000,696.000000,72.839073,15165000.0,0.0,0.0,0.0,トヨタ自動車,3700
...,...,...,...,...,...,...,...,...,...,...,...,...
1688688000,9993.T,1278.000000,1282.000000,1277.000000,1280.000000,1280.000000,1800.0,0.0,0.0,0.0,ヤマザワ,6100
1688688000,9994.T,2700.000000,2710.000000,2680.000000,2710.000000,2710.000000,4600.0,0.0,0.0,0.0,やまや,6100
1688688000,9995.T,416.000000,419.000000,415.000000,419.000000,419.000000,47200.0,0.0,0.0,0.0,グローセル,6050
1688688000,9996.T,1227.000000,1227.000000,1222.000000,1227.000000,1227.000000,1500.0,0.0,0.0,0.0,サトー商会,6050


[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 2073.T: 1d data not available for startTime=-2208994789 and endTime=1688703920. Only 100 years worth of day granularity data are allowed to be fetched per request.
[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 2073.T: 1d data not available for startTime=-2208994789 and endTime=1688703948. Only 100 years worth of day granularity data are allowed to be fetched per request.
[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 25935.T: No timezone found, symbol may be delisted
[*********************100%***********************]  428 of 428 completed

1 Failed download:
- 25935.T: No timezone found, symbol may be delisted
[*********************100%***********************]  428 of 428 completed
[*********************100%***********************]  428 of 428 completed
[*********************100%******************

Unnamed: 0_level_0,Unnamed: 1_level_0,op,hi,lo,cl,adj_cl,volume,dividends,splits,capital_gains,symbol_name,industry_code
timestamp,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
925948800,6806.T,10571.428711,10666.666992,10180.952148,10380.952148,7998.520020,65310.0,0.0,0.0,0.0,ヒロセ電機,3650
925948800,7203.T,698.000000,722.000000,690.000000,722.000000,75.560051,15575000.0,0.0,0.0,0.0,トヨタ自動車,3700
925948800,7752.T,1170.000000,1190.000000,1160.000000,1170.000000,767.880615,1608000.0,0.0,0.0,0.0,リコー,3650
926035200,6806.T,10371.428711,10371.428711,9714.286133,10238.095703,7888.444824,122010.0,0.0,0.0,0.0,ヒロセ電機,3650
926035200,7203.T,718.000000,720.000000,690.000000,696.000000,72.839073,15165000.0,0.0,0.0,0.0,トヨタ自動車,3700
...,...,...,...,...,...,...,...,...,...,...,...,...
1688688000,9993.T,1278.000000,1282.000000,1277.000000,1280.000000,1280.000000,1800.0,0.0,0.0,0.0,ヤマザワ,6100
1688688000,9994.T,2700.000000,2710.000000,2680.000000,2709.000000,2709.000000,4700.0,0.0,0.0,0.0,やまや,6100
1688688000,9995.T,416.000000,419.000000,415.000000,419.000000,419.000000,49000.0,0.0,0.0,0.0,グローセル,6050
1688688000,9996.T,1227.000000,1227.000000,1222.000000,1227.000000,1227.000000,1500.0,0.0,0.0,0.0,サトー商会,6050


In [9]:
# Pickle the fetcher list with cloudpickle, xz-compress the bytes, and write
# the result to disk.
fetchers = [Fetcher()]

data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20230313_stock.xz', 'wb') as f:
    f.write(data)