In [None]:
import json
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib

In [None]:
# Directory containing the input JSON files.
data_dir = 'jquants_data'

# os.listdir() returns entries in arbitrary order; sort the paths so that the row order of the
# resulting feature table is reproducible across runs and machines.
files = sorted(
    os.path.join(data_dir, name)
    for name in os.listdir(data_dir)
    if name.endswith('.json')
)

In [None]:
# Statement fields used both as raw copies and as lagged numeric features:
# realised dividend fields followed by the forecast fields.
_DIVIDEND_AND_FORECAST_COLUMNS = [
    'ResultDividendPerShare1stQuarter', 'ResultDividendPerShare2ndQuarter', 'ResultDividendPerShare3rdQuarter',
    'ResultDividendPerShareFiscalYearEnd', 'ResultDividendPerShareAnnual',
    'DistributionsPerUnit(REIT)', 'ResultTotalDividendPaidAnnual', 'ResultPayoutRatioAnnual',
    'ForecastDividendPerShare1stQuarter', 'ForecastDividendPerShare2ndQuarter',
    'ForecastDividendPerShare3rdQuarter', 'ForecastDividendPerShareFiscalYearEnd',
    'ForecastDividendPerShareAnnual', 'ForecastDistributionsPerUnit(REIT)',
    'ForecastTotalDividendPaidAnnual', 'ForecastPayoutRatioAnnual',
    'NextYearForecastDividendPerShare1stQuarter', 'NextYearForecastDividendPerShare2ndQuarter',
    'NextYearForecastDividendPerShare3rdQuarter', 'NextYearForecastDividendPerShareFiscalYearEnd',
    'NextYearForecastDividendPerShareAnnual', 'NextYearForecastDistributionsPerUnit(REIT)',
    'NextYearForecastPayoutRatioAnnual',
    'ForecastNetSales2ndQuarter', 'ForecastOperatingProfit2ndQuarter', 'ForecastOrdinaryProfit2ndQuarter',
    'ForecastProfit2ndQuarter', 'ForecastEarningsPerShare2ndQuarter',
    'NextYearForecastNetSales2ndQuarter', 'NextYearForecastOperatingProfit2ndQuarter',
    'NextYearForecastOrdinaryProfit2ndQuarter', 'NextYearForecastProfit2ndQuarter',
    'NextYearForecastEarningsPerShare2ndQuarter',
    'ForecastNetSales', 'ForecastOperatingProfit', 'ForecastOrdinaryProfit', 'ForecastProfit',
    'ForecastEarningsPerShare',
    'NextYearForecastNetSales', 'NextYearForecastOperatingProfit', 'NextYearForecastOrdinaryProfit',
    'NextYearForecastProfit', 'NextYearForecastEarningsPerShare',
]

# Accounting-change fields; treated as categorical values.
_ACCOUNTING_CHANGE_COLUMNS = [
    'MaterialChangesInSubsidiaries', 'SignificantChangesInTheScopeOfConsolidation',
    'ChangesBasedOnRevisionsOfAccountingStandard',
    'ChangesOtherThanOnesBasedOnRevisionsOfAccountingStandard',
    'ChangesInAccountingEstimates', 'RetrospectiveRestatement',
]

_SHARE_COUNT_COLUMNS = [
    'NumberOfIssuedAndOutstandingSharesAtTheEndOfFiscalYearIncludingTreasuryStock',
    'NumberOfTreasuryStockAtTheEndOfFiscalYear', 'AverageNumberOfShares',
]

# Statement fields copied unchanged into the output (every one of them is required).
STATEMENT_COPY_COLUMNS = [
    'LocalCode', 'DisclosedDate', 'TypeOfDocument', 'TypeOfCurrentPeriod',
    'NetSales', 'OperatingProfit', 'OrdinaryProfit', 'Profit', 'EarningsPerShare',
    'DilutedEarningsPerShare', 'TotalAssets', 'Equity', 'EquityToAssetRatio', 'BookValuePerShare',
    'CashFlowsFromOperatingActivities', 'CashFlowsFromInvestingActivities',
    'CashFlowsFromFinancingActivities', 'CashAndEquivalents',
] + _DIVIDEND_AND_FORECAST_COLUMNS + _ACCOUNTING_CHANGE_COLUMNS + _SHARE_COUNT_COLUMNS

# Amounts that are additionally divided by the length of the reporting period ("daily.*").
_FLOW_COLUMNS = [
    'NetSales', 'OperatingProfit', 'OrdinaryProfit', 'Profit', 'EarningsPerShare', 'DilutedEarningsPerShare',
    'OperatingCosts', 'NonOperatingIncome', 'ExtraordinaryItems',
    'CashFlowsFromOperatingActivities', 'CashFlowsFromInvestingActivities', 'CashFlowsFromFinancingActivities',
]

# Categorical features (emitted with lags, prefix "ftc.").
_CATEGORICAL_LAG_COLUMNS = ['TypeOfDocument', 'TypeOfCurrentPeriod'] + _ACCOUNTING_CHANGE_COLUMNS

# Numeric features emitted without lags (prefix "ftn.").
_PRICE_FEATURE_COLUMNS = [
    'DisclosedDateOpen', 'DisclosedDateHigh', 'DisclosedDateLow', 'DisclosedDateClose',
    'DisclosedDatePriceChange',
]

# Numeric features emitted with lags and lagged differences (prefix "ftn.").
_NUMERIC_LAG_COLUMNS = [
    'CurrentPeriodLength', 'DisclosureLagDays', 'TotalAssets', 'Equity', 'EquityToAssetRatio',
    'BookValuePerShare', 'CashAndEquivalents',
    'PER', 'ForecastPER', 'NextYearForecastPER', 'PBR',
    'MarketCap', 'PSR', 'SalesPerShare', 'PSR_PerShare', 'OperatingCashFlowPerShare', 'PCFR',
    'EBITDA_Proxy', 'EV_EBITDA_Proxy', 'OperatingMargin', 'NetMargin', 'ROE', 'ROA',
    'DebtToEquityRatio', 'CurrentRatio_Proxy', 'NetSalesGrowth', 'OperatingProfitGrowth', 'ProfitGrowth',
    'EPSGrowth',
    'DividendYield', 'PayoutRatio', 'ForecastROE', 'ForecastOperatingMargin', 'ForecastNetMargin',
    'PriceToSales', 'PriceToBook', 'PriceToCash', 'EarningsYield', 'BookToMarket',
    'EPSForecastAccuracy', 'NetSalesForecastAccuracy',
] + _DIVIDEND_AND_FORECAST_COLUMNS + _SHARE_COUNT_COLUMNS + _FLOW_COLUMNS + [
    f'daily.{name}' for name in _FLOW_COLUMNS
]

_QUOTE_COLUMNS = ['Date', 'Open', 'High', 'Low', 'Close']

# Growth rates compare a statement with the one four rows earlier.
# NOTE(review): this is "year over year" only if there are four statements per year in
# chronological order -- confirm against the data source.
_GROWTH_LAG = 4

# The label is the return from the next row's open to the close this many rows ahead.
_LABEL_HORIZON = 20


def create_feature_df(data, max_lag=10):
    """Build the feature/label table for one input record (one row per financial statement).

    Args:
        data: Mapping with the keys ``'prices_daily_quotes'`` and ``'fins_statements'``, each
            holding something ``pd.DataFrame`` can consume (e.g. a list of dicts). Quotes need
            the columns ``Date``, ``Open``, ``High``, ``Low`` and ``Close``; statements need
            ``CurrentPeriodStartDate``, ``CurrentPeriodEndDate`` and every name in
            ``STATEMENT_COPY_COLUMNS``. Missing numeric values may be given as ``''``.
        max_lag: Number of lags generated for every lagged feature (``shift1`` .. ``shift{max_lag}``
            and, for numeric features, ``diff1`` .. ``diff{max_lag}``).

    Returns:
        A DataFrame indexed like the statements table with, in this order: the raw statement
        columns, derived accounting / valuation ratios, ``ftc.*`` categorical features
        (``''`` replaced by ``None``), ``ftn.*`` numeric features, and ``label``.
        An empty DataFrame is returned when there are no statements. When there are no quotes,
        all price-based columns and ``label`` are NaN.

    Raises:
        KeyError: If a required key or column is missing.

    Notes:
        Lags and growth rates are positional (``Series.shift``), so statements are expected in
        chronological order; their order is left untouched. Quotes are sorted by ``Date`` here
        because the label also relies on row position.
    """
    quotes = pd.DataFrame(data['prices_daily_quotes'])
    statements = pd.DataFrame(data['fins_statements'])

    # Nothing to build features from.
    if len(statements) == 0:
        return pd.DataFrame()

    # No price history: use an empty quote table so every date lookup yields NaN instead of
    # failing on the missing 'Date' column.
    if quotes.empty:
        quotes = pd.DataFrame({name: pd.Series(dtype='float64') for name in _QUOTE_COLUMNS})

    # shift() below is positional, so make sure rows are in date order (stable sort: a no-op for
    # data that is already ordered, and duplicate dates keep their relative order).
    quotes = quotes.sort_values('Date', kind='mergesort')

    # Label: (close _LABEL_HORIZON rows ahead - next row's open) / next row's open.
    next_open = quotes['Open'].shift(-1)
    quotes = quotes.assign(label=(quotes['Close'].shift(-_LABEL_HORIZON) - next_open) / next_open)
    quotes_by_date = quotes.set_index('Date')

    # Columns are collected in a dict and turned into a DataFrame once at the end; inserting
    # thousands of columns one by one fragments the frame and is slow.
    cols = {name: statements[name] for name in STATEMENT_COPY_COLUMNS}
    numeric_cache = {}

    def num(name):
        """Return column `name` converted with pd.to_numeric (converted once, then cached)."""
        if name not in numeric_cache:
            numeric_cache[name] = pd.to_numeric(cols[name])
        return numeric_cache[name]

    # --- Reporting period and simple income-statement differences -------------------------
    disclosed_on = statements['DisclosedDate']
    period_start = pd.to_datetime(statements['CurrentPeriodStartDate'])
    period_end = pd.to_datetime(statements['CurrentPeriodEndDate'])
    cols['CurrentPeriodLength'] = (period_end - period_start).dt.days
    cols['DisclosureLagDays'] = (pd.to_datetime(disclosed_on) - period_end).dt.days
    cols['OperatingCosts'] = num('NetSales') - num('OperatingProfit')
    cols['NonOperatingIncome'] = num('OrdinaryProfit') - num('OperatingProfit')
    cols['ExtraordinaryItems'] = num('Profit') - num('OrdinaryProfit')

    # --- Per-day amounts -------------------------------------------------------------------
    for name in _FLOW_COLUMNS:
        cols[f'daily.{name}'] = num(name) / cols['CurrentPeriodLength']

    # --- OHLC prices on the disclosure date ------------------------------------------------
    for field in ('Open', 'High', 'Low', 'Close'):
        cols[f'DisclosedDate{field}'] = disclosed_on.map(quotes_by_date[field].to_dict())
    close = num('DisclosedDateClose')
    cols['DisclosedDatePriceChange'] = close - num('DisclosedDateOpen')

    shares = num('NumberOfIssuedAndOutstandingSharesAtTheEndOfFiscalYearIncludingTreasuryStock')
    liabilities = num('TotalAssets') - num('Equity')

    def growth(name):
        """Relative change of column `name` versus _GROWTH_LAG rows earlier."""
        previous = num(name).shift(_GROWTH_LAG)
        return (num(name) - previous) / previous

    # --- Valuation multiples (based on the disclosure-date close) --------------------------
    cols['PER'] = close / num('EarningsPerShare')                        # price / EPS
    cols['ForecastPER'] = close / num('ForecastEarningsPerShare')        # price / forecast EPS
    cols['NextYearForecastPER'] = close / num('NextYearForecastEarningsPerShare')
    cols['PBR'] = close / num('BookValuePerShare')                       # price / BPS
    cols['MarketCap'] = close * shares
    cols['PSR'] = cols['MarketCap'] / num('NetSales')                    # market cap / sales
    cols['SalesPerShare'] = num('NetSales') / shares
    cols['PSR_PerShare'] = close / cols['SalesPerShare']
    cols['OperatingCashFlowPerShare'] = num('CashFlowsFromOperatingActivities') / shares
    cols['PCFR'] = close / cols['OperatingCashFlowPerShare']             # price / cash flow per share
    # No depreciation data is available, so operating profit stands in for EBITDA.
    cols['EBITDA_Proxy'] = num('OperatingProfit')
    cols['EV_EBITDA_Proxy'] = cols['MarketCap'] / cols['EBITDA_Proxy']

    # --- Profitability ---------------------------------------------------------------------
    cols['OperatingMargin'] = num('OperatingProfit') / num('NetSales')
    cols['NetMargin'] = num('Profit') / num('NetSales')
    cols['ROE'] = num('Profit') / num('Equity')
    cols['ROA'] = num('Profit') / num('TotalAssets')

    # --- Financial soundness ---------------------------------------------------------------
    cols['DebtToEquityRatio'] = liabilities / num('Equity')
    cols['CurrentRatio_Proxy'] = num('CashAndEquivalents') / liabilities  # simplified current ratio

    # --- Growth ----------------------------------------------------------------------------
    cols['NetSalesGrowth'] = growth('NetSales')
    cols['OperatingProfitGrowth'] = growth('OperatingProfit')
    cols['ProfitGrowth'] = growth('Profit')
    cols['EPSGrowth'] = growth('EarningsPerShare')

    # --- Dividends -------------------------------------------------------------------------
    cols['DividendYield'] = num('ResultDividendPerShareAnnual') / close
    cols['PayoutRatio'] = num('ResultDividendPerShareAnnual') / num('EarningsPerShare')

    # --- Forecast-based ratios -------------------------------------------------------------
    cols['ForecastROE'] = num('ForecastProfit') / num('Equity')
    cols['ForecastOperatingMargin'] = num('ForecastOperatingProfit') / num('ForecastNetSales')
    cols['ForecastNetMargin'] = num('ForecastProfit') / num('ForecastNetSales')

    # --- Price ratios ----------------------------------------------------------------------
    cols['PriceToSales'] = close / cols['SalesPerShare']
    cols['PriceToBook'] = close / num('BookValuePerShare')               # same value as PBR
    cols['PriceToCash'] = close / (num('CashAndEquivalents') / shares)

    # --- Inverse multiples -----------------------------------------------------------------
    cols['EarningsYield'] = num('EarningsPerShare') / close              # inverse of PER
    cols['BookToMarket'] = num('BookValuePerShare') / close              # inverse of PBR

    # --- Deviation of results from the forecast fields -------------------------------------
    cols['EPSForecastAccuracy'] = (
        (num('EarningsPerShare') - num('ForecastEarningsPerShare')) / num('ForecastEarningsPerShare')
    )
    cols['NetSalesForecastAccuracy'] = (num('NetSales') - num('ForecastNetSales')) / num('ForecastNetSales')

    features = {}

    # --- Categorical features with lags ----------------------------------------------------
    # The dict form of replace() is used on purpose: before pandas 1.4, replace('', None) ignored
    # the explicit None and forward-filled the previous value instead of marking it missing.
    for name in _CATEGORICAL_LAG_COLUMNS:
        category = cols[name].replace({'': None})
        features[f'ftc.{name}'] = category
        for lag in range(1, max_lag + 1):
            features[f'ftc.shift{lag}.{name}'] = category.shift(lag)

    # --- Numeric features without lags -----------------------------------------------------
    # All "ftn." features are numeric (pd.to_numeric); missing values are NaN.
    for name in _PRICE_FEATURE_COLUMNS:
        features[f'ftn.{name}'] = num(name)

    # --- Numeric features with lags and lagged differences ---------------------------------
    for name in _NUMERIC_LAG_COLUMNS:
        current = num(name)
        features[f'ftn.{name}'] = current
        for lag in range(1, max_lag + 1):
            lagged = current.shift(lag)
            features[f'ftn.shift{lag}.{name}'] = lagged
            features[f'ftn.diff{lag}.{name}'] = current - lagged

    # --- Label -----------------------------------------------------------------------------
    features['label'] = disclosed_on.map(quotes_by_date['label'].to_dict())

    return pd.DataFrame({**cols, **features}, index=statements.index)

In [None]:
# Build one feature frame per input file, then stack them into a single table.
per_file_frames = []

for file in tqdm(files):
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as f:
        data = json.load(f)

    frame = create_feature_df(data)

    # create_feature_df returns an empty frame when a file has no statements. Skip those:
    # they add no rows, and recent pandas versions deprecate the way empty inputs are treated
    # when pd.concat determines the result dtypes.
    if not frame.empty:
        per_file_frames.append(frame)

# pd.concat raises ValueError on an empty list, so fall back to an empty table.
if per_file_frames:
    feature_df = pd.concat(per_file_frames, axis=0, ignore_index=True)
else:
    feature_df = pd.DataFrame()

In [None]:
# Persist the combined feature table to disk.
features_pickle_path = 'jq_features.pkl'
feature_df.to_pickle(features_pickle_path)