In [1]:
import sys
sys.path.append('..') # for import src

import os
import cloudpickle
import lzma
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import joblib
import yfinance as yf

import src
cloudpickle.register_pickle_by_value(src) # for model portability

In [31]:

# ticker = yf.Ticker("1301.T")
# # print(ticker.info)
# display(ticker.history(period="max"))

# display(ticker.actions)
# display(ticker.dividends)
# display(ticker.splits)
# display(ticker.financials)
# display(ticker.quarterly_financials)
# display(ticker.major_holders)
# display(ticker.institutional_holders)
# display(ticker.balance_sheet)
# display(ticker.quarterly_balance_sheet)

# display(ticker.cashflow)
# display(ticker.quarterly_cashflow)
# display(ticker.earnings)
# display(ticker.quarterly_earnings)

# display(ticker.sustainability)
# display(ticker.recommendations)
# display(ticker.calendar)
# display(ticker.earnings_dates)
# display(ticker.isin)
# display(ticker.options)
# display(ticker.news)

# # get option chain for specific expiration
# # opt = ticker.option_chain('YYYY-MM-DD')
# # data available via: opt.calls, opt.puts

In [76]:
# Exploratory pull: download the price history yfinance returns for 1312.T
# (no start/end given) and render it, to inspect the raw column layout.
df = yf.download('1312.T')
display(df)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-01-05,0.0,8700.0,8500.0,8500.0,8500.0,72
2009-01-06,0.0,8600.0,8450.0,8500.0,8500.0,182
2009-01-07,0.0,8650.0,8540.0,8540.0,8540.0,89
2009-01-08,0.0,8500.0,8260.0,8260.0,8260.0,150
2009-01-09,0.0,8250.0,8160.0,8200.0,8200.0,3
...,...,...,...,...,...,...
2023-03-03,21350.0,21535.0,21350.0,21525.0,21525.0,110
2023-03-06,21605.0,21625.0,21585.0,21625.0,21625.0,459
2023-03-07,21605.0,21660.0,21605.0,21615.0,21615.0,1082
2023-03-08,21615.0,21615.0,21615.0,21615.0,21615.0,0


In [21]:
import time
from datetime import datetime

# Disk-backed memoization store rooted at /tmp/joblib_memory; functions
# decorated with @memory.cache (read_code below) reuse results saved there.
memory = joblib.Memory('/tmp/joblib_memory')

@memory.cache
def read_code():
    """Load the JPX issue list spreadsheet and return it as a DataFrame.

    The three Japanese headers for code, issue name and 33-sector code are
    renamed to ``symbol``, ``symbol_name`` and ``industry_code``; all other
    columns are returned unchanged. Results are memoized through
    ``memory.cache``.

    Returns:
        pandas.DataFrame: one row per spreadsheet row, with ``symbol_name``
        NFKC-normalized and ``industry_code`` converted to ``int``.
    """
    url = 'https://www.jpx.co.jp/markets/statistics-equities/misc/tvdivq0000001vg2-att/data_j.xls'
    renamed_columns = {
        'コード': 'symbol',
        '銘柄名': 'symbol_name',
        '33業種コード': 'industry_code',
    }
    listing = pd.read_excel(url).rename(columns=renamed_columns)

    # NFKC folds full-width / compatibility characters in issue names.
    normalized_names = listing['symbol_name'].str.normalize('NFKC')
    listing['symbol_name'] = normalized_names

    # A '-' placeholder in the sector column becomes '0' so the column can be
    # cast to int.
    industry_text = listing['industry_code'].astype(str)
    listing['industry_code'] = industry_text.str.replace('-', '0').astype(int)
    return listing

class Fetcher:
    """Fetch daily OHLCV plus corporate-action columns for one ticker symbol.

    Attributes set in ``__init__``:
        symbol: ticker string as passed in (e.g. ``'1301.T'``).
        keys: ``{'symbol': symbol}``.
        data_id: the constant string ``'stock_ohlcv'``.

    NOTE(review): ``keys`` and ``data_id`` are not read inside this class;
    presumably an external data store consumes them -- confirm with the caller.
    """

    # One calendar day, in seconds.
    _DAY_SECONDS = 24 * 60 * 60
    # Minimum age of the last stored row before a refetch is attempted.
    # NOTE(review): 24h + 9h presumably leaves time for the next Tokyo session
    # to finish and be published -- confirm.
    _REFETCH_DELAY_SECONDS = (24 + 9) * 60 * 60

    def __init__(self, symbol=None):
        """Store the symbol and the identifying metadata derived from it."""
        self.symbol = symbol
        self.keys = {
            'symbol': symbol
        }
        self.data_id = 'stock_ohlcv'

    @staticmethod
    def _to_epoch_seconds(dates):
        """Convert a Series of trading dates to epoch seconds at 00:00 UTC.

        Any timezone is dropped while keeping the wall-clock value, so a
        tz-aware ``2023-03-08 00:00+09:00`` and a tz-naive ``2023-03-08`` both
        map to the same integer. This keeps the two yfinance frames joinable
        whether or not either of them carries a timezone, and it does not
        depend on the datetime resolution or the platform ``int`` width.

        Args:
            dates: pandas Series of datetime-like values.

        Returns:
            pandas.Series of integer seconds since 1970-01-01.
        """
        dates = pd.to_datetime(dates)
        if dates.dt.tz is not None:
            # Keep the exchange-local calendar date; discard only the offset.
            dates = dates.dt.tz_localize(None)
        return (dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

    def fetch(self, last_timestamp=None):
        """Return rows newer than ``last_timestamp`` for ``self.symbol``.

        Args:
            last_timestamp: epoch seconds of the newest row already held, or
                ``None`` to fetch the full available history.

        Returns:
            pandas.DataFrame indexed by ``timestamp`` (epoch seconds of the
            trading date at 00:00 UTC) with columns ``op``, ``hi``, ``lo``,
            ``cl``, ``adj_cl``, ``volume``, ``dividends``, ``splits``,
            ``capital_gains``, ``symbol_name`` and ``industry_code``.
            An empty DataFrame is returned when it is too early to refetch,
            when the symbol is not in the ``read_code()`` listing, when either
            yfinance call yields nothing, or when no row is newer than
            ``last_timestamp``.
        """
        too_early = (
            last_timestamp is not None
            and time.time() < last_timestamp + self._REFETCH_DELAY_SECONDS
        )
        if too_early:
            print('no data')
            return pd.DataFrame()

        # Only symbols present in the listing are queried; the listing also
        # supplies the name / sector columns attached below.
        df_code = read_code()
        df_code = df_code[df_code['symbol'].astype(str) + '.T' == self.symbol]

        if df_code.shape[0] == 0:
            return pd.DataFrame()

        ticker = yf.Ticker(self.symbol)

        # Two sources: history() is used for dividends / splits / capital
        # gains, download() for the price columns including 'Adj Close'.
        if last_timestamp is None:
            actions = ticker.history(period='max')
            prices = yf.download(self.symbol, start='1900-01-01')
        else:
            # Derive the start date in UTC so the request does not depend on
            # the machine's local timezone; rows at or before last_timestamp
            # are filtered out again after the join.
            start = pd.Timestamp(
                last_timestamp + self._DAY_SECONDS, unit='s'
            ).strftime('%Y-%m-%d')
            actions = ticker.history(start=start)
            prices = yf.download(self.symbol, start=start)

        if actions.shape[0] == 0 or prices.shape[0] == 0:
            return pd.DataFrame()

        actions = actions.reset_index()
        # Price columns come from download(); keep only the action columns.
        actions = actions.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'])
        actions = actions.rename(columns={
            'Date': 'timestamp',
            'Dividends': 'dividends',
            'Capital Gains': 'capital_gains',
            'Stock Splits': 'splits',
        })
        actions['timestamp'] = self._to_epoch_seconds(actions['timestamp'])
        actions = actions.set_index('timestamp')

        prices = prices.reset_index()
        prices = prices.rename(columns={
            'Date': 'timestamp',
            'Open': 'op',
            'High': 'hi',
            'Low': 'lo',
            'Close': 'cl',
            'Adj Close': 'adj_cl',
            'Volume': 'volume',
        })
        prices['timestamp'] = self._to_epoch_seconds(prices['timestamp'])
        prices = prices.set_index('timestamp')

        # Inner join: keep only dates present in both sources.
        df = prices.join(actions, how='inner')

        # history() does not always include a 'Capital Gains' column.
        if 'capital_gains' not in df.columns:
            df['capital_gains'] = 0.0

        df['symbol_name'] = df_code['symbol_name'].iloc[0]
        df['industry_code'] = df_code['industry_code'].iloc[0]

        if last_timestamp is not None:
            df = df.loc[df.index > last_timestamp]
        if df.shape[0] == 0:
            return pd.DataFrame()

        return df


In [22]:
# Smoke-test Fetcher on 1301.T.
fetcher = Fetcher(symbol='1301.T')
# No checkpoint: request the full available history.
df = fetcher.fetch(last_timestamp=None)
display(df)
# With a checkpoint of 1600041600 (2020-09-14 00:00 UTC): only rows whose
# timestamp is strictly greater than the checkpoint are returned.
df = fetcher.fetch(last_timestamp=1600041600)
display(df)

________________________________________________________________________________
[Memory] Calling __main__--tmp-ipykernel-3916541640.read_code...
read_code()
________________________________________________________read_code - 0.4s, 0.0min
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,op,hi,lo,cl,adj_cl,volume,dividends,splits,capital_gains,symbol_name,industry_code
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
978307200,1400.0,1400.0,1400.0,1400.0,956.234619,0,0.0,0.0,0.0,極洋,50
978393600,1400.0,1400.0,1400.0,1400.0,956.234619,0,0.0,0.0,0.0,極洋,50
978480000,1400.0,1400.0,1400.0,1400.0,956.234619,0,0.0,0.0,0.0,極洋,50
978566400,1420.0,1420.0,1370.0,1390.0,949.404236,19300,0.0,0.0,0.0,極洋,50
978652800,1390.0,1400.0,1330.0,1330.0,908.422913,19700,0.0,0.0,0.0,極洋,50
...,...,...,...,...,...,...,...,...,...,...,...
1678147200,3650.0,3660.0,3640.0,3645.0,3645.000000,31900,0.0,0.0,0.0,極洋,50
1678233600,3640.0,3665.0,3640.0,3660.0,3660.000000,22000,0.0,0.0,0.0,極洋,50
1678320000,3660.0,3675.0,3655.0,3665.0,3665.000000,33200,0.0,0.0,0.0,極洋,50
1678406400,3650.0,3665.0,3615.0,3625.0,3625.000000,74000,0.0,0.0,0.0,極洋,50


[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,op,hi,lo,cl,adj_cl,volume,dividends,splits,capital_gains,symbol_name,industry_code
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1600128000,2738.0,2740.0,2712.0,2727.0,2598.544678,14500,0.0,0.0,0.0,極洋,50
1600214400,2720.0,2747.0,2719.0,2747.0,2617.602783,10500,0.0,0.0,0.0,極洋,50
1600300800,2749.0,2752.0,2732.0,2740.0,2610.932373,16200,0.0,0.0,0.0,極洋,50
1600387200,2741.0,2748.0,2720.0,2748.0,2618.555420,18600,0.0,0.0,0.0,極洋,50
1600819200,2739.0,2749.0,2733.0,2749.0,2619.508545,21400,0.0,0.0,0.0,極洋,50
...,...,...,...,...,...,...,...,...,...,...,...
1678147200,3650.0,3660.0,3640.0,3645.0,3645.000000,31900,0.0,0.0,0.0,極洋,50
1678233600,3640.0,3665.0,3640.0,3660.0,3660.000000,22000,0.0,0.0,0.0,極洋,50
1678320000,3660.0,3675.0,3655.0,3665.0,3665.000000,33200,0.0,0.0,0.0,極洋,50
1678406400,3650.0,3665.0,3615.0,3625.0,3625.000000,74000,0.0,0.0,0.0,極洋,50


In [23]:
# Candidate tickers: every integer code from 1301 through 9999 with the '.T'
# suffix. Codes that are not in the listing are rejected inside Fetcher.fetch.
symbols = [f'{code}.T' for code in range(1301, 10000)]
print(symbols)

# One Fetcher per candidate symbol.
fetchers = []
for symbol in symbols:
    fetchers += [Fetcher(symbol=symbol)]

# Pickle the whole list with cloudpickle, xz-compress it, and write it out.
data = lzma.compress(cloudpickle.dumps(fetchers))
with open('/home/jovyan/data/20230313_stock.xz', 'wb') as f:
    f.write(data)

['1301.T', '1302.T', '1303.T', '1304.T', '1305.T', '1306.T', '1307.T', '1308.T', '1309.T', '1310.T', '1311.T', '1312.T', '1313.T', '1314.T', '1315.T', '1316.T', '1317.T', '1318.T', '1319.T', '1320.T', '1321.T', '1322.T', '1323.T', '1324.T', '1325.T', '1326.T', '1327.T', '1328.T', '1329.T', '1330.T', '1331.T', '1332.T', '1333.T', '1334.T', '1335.T', '1336.T', '1337.T', '1338.T', '1339.T', '1340.T', '1341.T', '1342.T', '1343.T', '1344.T', '1345.T', '1346.T', '1347.T', '1348.T', '1349.T', '1350.T', '1351.T', '1352.T', '1353.T', '1354.T', '1355.T', '1356.T', '1357.T', '1358.T', '1359.T', '1360.T', '1361.T', '1362.T', '1363.T', '1364.T', '1365.T', '1366.T', '1367.T', '1368.T', '1369.T', '1370.T', '1371.T', '1372.T', '1373.T', '1374.T', '1375.T', '1376.T', '1377.T', '1378.T', '1379.T', '1380.T', '1381.T', '1382.T', '1383.T', '1384.T', '1385.T', '1386.T', '1387.T', '1388.T', '1389.T', '1390.T', '1391.T', '1392.T', '1393.T', '1394.T', '1395.T', '1396.T', '1397.T', '1398.T', '1399.T', '1400.T',