In [3]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import joblib
import yfinance as yf

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [9]:
df_code = read_code()

# Request history for the first 100 codes (each suffixed with ".T"), move the
# ticker column level into the rows with stack(), and flatten the index so
# every row is one (Date, ticker) observation.
df = (
    yf.download([f'{code}.T' for code in df_code['symbol'].iloc[:100]])
    .stack()
    .reset_index()
)
display(df)

[*********************100%***********************]  100 of 100 completed


Unnamed: 0,Date,level_1,Adj Close,Close,High,Low,Open,Volume
0,2000-01-04,1332.T,102.602020,157.0,165.0,157.0,158.0,296000.0
1,2000-01-04,1380.T,473.627441,605.0,625.0,605.0,605.0,0.0
2,2000-01-04,1418.T,1031.356079,1250.0,1250.0,1250.0,1250.0,5000.0
3,2000-01-05,1332.T,102.602020,157.0,170.0,156.0,164.0,637000.0
4,2000-01-05,1380.T,473.627441,605.0,615.0,605.0,605.0,0.0
...,...,...,...,...,...,...,...,...
273714,2023-03-14,1473.T,1967.000000,1967.0,1988.5,1958.0,1988.5,107890.0
273715,2023-03-14,1474.T,17660.000000,17660.0,17845.0,17600.0,17845.0,2029.0
273716,2023-03-14,1475.T,1987.000000,1987.0,2004.0,1973.0,2004.0,690042.0
273717,2023-03-14,1476.T,1884.000000,1884.0,1894.0,1856.0,1871.0,92916.0


In [2]:
# Single-ticker request over the full available history; actions=True adds the
# Dividends and Stock Splits columns to the result.
df = yf.download(tickers=['1301.T'], period='max', actions=True)
display(df)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,1400.0,1400.0,1400.0,1400.0,1339.794922,0,0.0,0.0
2001-01-02,1400.0,1400.0,1400.0,1400.0,1339.794922,0,0.0,0.0
2001-01-03,1400.0,1400.0,1400.0,1400.0,1339.794922,0,0.0,0.0
2001-01-04,1420.0,1420.0,1370.0,1390.0,1330.225098,19300,0.0,0.0
2001-01-05,1390.0,1400.0,1330.0,1330.0,1272.805298,19700,0.0,0.0
...,...,...,...,...,...,...,...,...
2023-03-13,3610.0,3610.0,3530.0,3550.0,3550.000000,90800,0.0,0.0
2023-03-14,3530.0,3530.0,3455.0,3460.0,3460.000000,81000,0.0,0.0
2023-03-15,3505.0,3505.0,3475.0,3490.0,3490.000000,47600,0.0,0.0
2023-03-16,3455.0,3475.0,3415.0,3475.0,3475.000000,42500,0.0,0.0


In [4]:
# Probe a ticker that cannot be resolved: yfinance reports the failed download
# and hands back a frame with the usual columns but no rows.
df = yf.download(tickers=['1001.T'])
display(df)

[*********************100%***********************]  1 of 1 completed

1 Failed download:
- 1001.T: No timezone found, symbol may be delisted


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [25]:
import time
from datetime import datetime
import gc


def read_code():
    """Read the JPX listed-issues spreadsheet and return it as a DataFrame.

    Three Japanese column headers are renamed to ``symbol``, ``symbol_name``
    and ``industry_code``; every other column is passed through untouched.

    Returns:
        pandas.DataFrame: the sheet with ``symbol_name`` NFKC-normalized and
        ``industry_code`` converted to ``int`` (a ``-`` is replaced by ``0``
        before the conversion).
    """
    url = 'https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls'
    column_map = {
        'コード': 'symbol',
        '銘柄名': 'symbol_name',
        '33業種コード': 'industry_code',
    }
    listing = pd.read_excel(url).rename(columns=column_map)

    # Fold full-width / compatibility characters into their canonical form.
    listing['symbol_name'] = listing['symbol_name'].str.normalize('NFKC')

    # Rows without an industry carry '-'; turn that into '0' so the cast succeeds.
    industry_text = listing['industry_code'].astype(str)
    listing['industry_code'] = industry_text.str.replace('-', '0').astype(int)
    return listing

class Fetcher:
    """Download daily price history for every code returned by ``read_code``.

    The symbol list is split into batches, each batch is requested from
    yfinance, and the results are combined into one frame indexed by
    ``(timestamp, symbol)``.
    """

    # A new download is attempted only once ``last_timestamp`` is at least this
    # many seconds old (24h + 9h).
    # NOTE(review): the extra 9h looks like the JST offset from UTC - confirm.
    MIN_REFRESH_SECONDS = (24 + 9) * 60 * 60

    # Number of batches the symbol list is split into for yf.download.
    N_CHUNKS = 10

    # yfinance column label -> output column label.
    COLUMN_NAMES = {
        'Open': 'op',
        'High': 'hi',
        'Low': 'lo',
        'Close': 'cl',
        'Adj Close': 'adj_cl',
        'Volume': 'volume',
        'Dividends': 'dividends',
        'Stock Splits': 'splits',
        'Capital Gains': 'capital_gains',
    }

    # Columns (and their order) of the frame returned by fetch().
    OUTPUT_COLUMNS = [
        'op',
        'hi',
        'lo',
        'cl',
        'adj_cl',
        'volume',
        'dividends',
        'splits',
        'capital_gains',
        'symbol_name',
        'industry_code',
    ]

    def __init__(self):
        # NOTE(review): these attributes are not read inside this class;
        # presumably the consumer of the pickled fetcher uses them - confirm.
        self.keys = {}
        self.data_id = 'stock_ohlcv'
        self.replace_mode = True

    def fetch(self, last_timestamp=None):
        """Download the full history for all listed symbols.

        Args:
            last_timestamp: epoch seconds of the newest data already held, or
                ``None``. When given and less than ``MIN_REFRESH_SECONDS`` old,
                nothing is downloaded.

        Returns:
            pandas.DataFrame: indexed by ``(timestamp, symbol)`` and sorted,
            with the columns listed in ``OUTPUT_COLUMNS``. ``timestamp`` is the
            integer value of the date divided by 10**9 (epoch seconds when the
            dates have nanosecond resolution). An empty frame is returned when
            the data is still fresh, when there are no symbols, or when any
            batch comes back without rows.
        """
        if last_timestamp is not None and time.time() < last_timestamp + self.MIN_REFRESH_SECONDS:
            print('no new data')
            return pd.DataFrame()

        df_code = read_code()
        df_code['symbol'] = df_code['symbol'].astype('str') + '.T'
        df_code = df_code.set_index('symbol')
        listing_info = df_code[['symbol_name', 'industry_code']]

        dfs = []
        for symbols in np.array_split(df_code.index.to_numpy(), self.N_CHUNKS):
            if len(symbols) == 0:
                # array_split pads with empty batches when there are fewer
                # symbols than N_CHUNKS; there is nothing to request for them.
                continue

            stacked = self._download_stacked(symbols.tolist())
            if stacked.shape[0] == 0:
                # A batch without any rows aborts the whole fetch so that a
                # partial result is never returned.
                return pd.DataFrame()

            df = self._normalize(stacked)
            df = df.join(listing_info, on='symbol', how='left')

            dfs.append(df)
            gc.collect()

        if not dfs:
            return pd.DataFrame()

        df = pd.concat(dfs)
        # 'Capital Gains' is only present when at least one batch returned it;
        # create the column when it is absent instead of raising KeyError.
        if 'capital_gains' in df.columns:
            df['capital_gains'] = df['capital_gains'].fillna(0)
        else:
            df['capital_gains'] = 0.0

        return df[self.OUTPUT_COLUMNS].sort_index()

    @staticmethod
    def _download_stacked(symbols):
        """Download one batch twice and return the de-duplicated stacked rows."""
        first = yf.download(symbols, period='max', actions=True).stack()
        # The request is issued a second time so that rows missing from the
        # first response because of a download error can still be picked up.
        second = yf.download(symbols, period='max', actions=True).stack()
        combined = pd.concat([first, second])
        return combined.loc[~combined.index.duplicated()]

    @classmethod
    def _normalize(cls, stacked):
        """Rename columns and re-index a stacked batch by (timestamp, symbol)."""
        # Name both index levels explicitly rather than depending on the labels
        # reset_index() would generate (e.g. 'level_1' for an unnamed level).
        named_index = stacked.index.set_names(['timestamp', 'symbol'])
        flat = stacked.set_axis(named_index, axis=0).reset_index()
        flat = flat.rename(columns=cls.COLUMN_NAMES)
        # int64 is spelled out because plain ``int`` is platform dependent.
        flat['timestamp'] = flat['timestamp'].astype('int64') // 10 ** 9
        return flat.set_index(['timestamp', 'symbol'])


In [22]:
fetcher = Fetcher()

# Run once with no previous timestamp and once with an old one, showing the
# frame returned by each call.
for since in (None, 1600041600):
    df = fetcher.fetch(last_timestamp=since)
    display(df)

[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*****

Unnamed: 0_level_0,Unnamed: 1_level_0,op,hi,lo,cl,adj_cl,volume,dividends,splits,capital_gains,symbol_name,industry_code
timestamp,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
946944000,1332.T,158.0,165.0,157.0,157.0,102.602005,296000.0,0.0,0.0,0.0,ニッスイ,50
946944000,1380.T,605.0,625.0,605.0,605.0,473.627441,0.0,0.0,0.0,0.0,秋川牧園,50
946944000,1418.T,1250.0,1250.0,1250.0,1250.0,1031.356201,5000.0,0.0,0.0,0.0,インターライフホールディングス,2050
947030400,1332.T,164.0,170.0,156.0,157.0,102.602005,637000.0,0.0,0.0,0.0,ニッスイ,50
947030400,1380.T,605.0,615.0,605.0,605.0,473.627441,0.0,0.0,0.0,0.0,秋川牧園,50
...,...,...,...,...,...,...,...,...,...,...,...,...
1679011200,1473.T,1974.5,1985.0,1973.5,1982.0,1982.000000,14050.0,0.0,0.0,0.0,One ETF トピックス,0
1679011200,1474.T,17600.0,17795.0,17600.0,17795.0,17795.000000,26.0,0.0,0.0,0.0,One ETF JPX日経400,0
1679011200,1475.T,1990.0,2000.0,1986.0,1999.0,1999.000000,352334.0,0.0,0.0,0.0,iシェアーズ・コア TOPIX ETF,0
1679011200,1476.T,1847.0,1861.0,1820.0,1828.0,1828.000000,285099.0,0.0,0.0,0.0,iシェアーズ・コア Jリート ETF,0


[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*********************100%***********************]  10 of 10 completed
[*****

Unnamed: 0_level_0,Unnamed: 1_level_0,op,hi,lo,cl,adj_cl,volume,dividends,splits,capital_gains,symbol_name,industry_code
timestamp,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1600128000,1301.T,2738.0,2740.0,2712.0,2727.0,2598.544678,14500.0,0.0,0.0,0.0,極洋,50
1600128000,1305.T,1714.0,1714.0,1703.0,1708.0,1708.000000,236280.0,0.0,0.0,0.0,ダイワ上場投信-トピックス,0
1600128000,1306.T,1694.0,1694.0,1683.0,1688.0,1688.000000,1110520.0,0.0,0.0,0.0,NEXT FUNDS TOPIX連動型上場投信,0
1600128000,1308.T,1674.0,1675.0,1665.0,1672.0,1672.000000,135900.0,0.0,0.0,0.0,上場インデックスファンドTOPIX,0
1600128000,1309.T,38500.0,38800.0,38150.0,38800.0,38800.000000,273.0,0.0,0.0,0.0,NEXT FUNDS ChinaAMC・中国株式・上証50連動型上場投信,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1679011200,1473.T,1974.5,1985.0,1973.5,1982.0,1982.000000,14050.0,0.0,0.0,0.0,One ETF トピックス,0
1679011200,1474.T,17600.0,17795.0,17600.0,17795.0,17795.000000,26.0,0.0,0.0,0.0,One ETF JPX日経400,0
1679011200,1475.T,1990.0,2000.0,1986.0,1999.0,1999.000000,352334.0,0.0,0.0,0.0,iシェアーズ・コア TOPIX ETF,0
1679011200,1476.T,1847.0,1861.0,1820.0,1828.0,1828.000000,285099.0,0.0,0.0,0.0,iシェアーズ・コア Jリート ETF,0


In [26]:
fetchers = [Fetcher()]

# Serialize the fetcher list with cloudpickle, xz-compress the payload, and
# write the compressed bytes to disk.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20230313_stock.xz', 'wb') as f:
    f.write(data)