In [1]:
import pandas as pd
import glob
import os
import sys
import re
from arelle import Cntlr, ModelManager, XbrlConst
from arelle.ModelValue import qname
from sqlalchemy import create_engine, Column, String, Integer, MetaData, Table
from sqlalchemy.dialects.postgresql import NUMERIC, BIGINT
import psycopg2
from tqdm import tqdm
import numpy
import json

# Load run-time settings (the database connection string) from
# parameters.json in the current working directory. JSON is UTF-8 by
# specification, so decode it explicitly rather than relying on the
# platform's locale-dependent default encoding.
with open('parameters.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

#from utils import extract_files_from_zip

# Japanese column header; translates to "consolidated / non-consolidated".
CONSOLIDATED_OR_NONCONSOLIDATED_COL = "連結/個別"
# Captures the single path component that follows "/content/".
pattern = r'/content/([^/]+)/'
DATABASE_URI = data["DATABASE_URI"]

# SQLAlchemy engine and metadata shared by the rest of the notebook.
engine = create_engine(DATABASE_URI)
metadata = MetaData()

# Lists every distinct docID currently stored in t_financials.
query = """
select distinct "docID" from t_financials;
"""

In [8]:
# Fetch the docIDs that are already present in the Postgres table so that
# documents ingested by an earlier run can be recognised later.
df_docid = pd.read_sql(sql=query, con=engine)
ingested_docids = df_docid["docID"].values

In [10]:
# Collect the XBRL files whose document has not been ingested yet.
# Each entry is a [docID, path] pair.
xbrl_file_dirs = list()

# Hash-based lookup instead of scanning the docID array once per file.
already_ingested = set(ingested_docids)

for xbrl_file_dir in glob.glob('./content/**/**/**/*.xbrl'):
    # glob returns paths with the platform separator (backslashes on
    # Windows) while `pattern` is written with forward slashes; normalise
    # a copy for matching and keep the original path for loading.
    normalized_path = xbrl_file_dir.replace(os.sep, '/')
    extracted_string = re.search(pattern, normalized_path).group(1)
    if extracted_string not in already_ingested:
        xbrl_file_dirs.append([extracted_string, xbrl_file_dir])

In [11]:
# Parse every pending XBRL document and gather its JPY-denominated facts.
# Each row is
# [docID, Japanese label, value, period start, period end, contextID].
#
# The accumulator is created once, before the loop: creating it inside the
# loop discarded the rows of every document except the last one, and left
# the name undefined when there was nothing to process.
fact_datas = list()

for docID, xbrl_file in tqdm(xbrl_file_dirs):
    ctrl = Cntlr.Cntlr()
    try:
        model_xbrl = ctrl.modelManager.load(xbrl_file)

        for fact in model_xbrl.facts:
            # Keep only facts that carry a unit and whose unit is JPY.
            if fact.unit is None or str(fact.unit.value) != 'JPY':
                continue

            label_ja = fact.concept.label(preferredLabel=None, lang='ja', linkroleHint=None)
            x_value = fact.xValue
            context = fact.context

            fact_datas.append([
                docID,
                label_ja,
                x_value,
                # Falsy start/end values are stored as None.
                context.startDatetime or None,
                context.endDatetime or None,
                fact.contextID,
            ])
    finally:
        # A new controller is created per file; close the loaded model and
        # the controller so they do not pile up over a long run.
        ctrl.modelManager.close()
        ctrl.close()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.43s/it]


In [14]:
# Turn the collected fact rows into a frame, keep the rows whose categoryID
# is 'CurrentYearDuration' or 'CurrentYearInstant', and append them to the
# t_financials table.
fact_columns = ['docID', 'itemName', 'amount', 'periodStart', 'periodEnd', 'categoryID']
df = pd.DataFrame(data=fact_datas, columns=fact_columns)

# Duration rows come first, instant rows second.
df_d = df.loc[df['categoryID'].eq('CurrentYearDuration')]
df_i = df.loc[df['categoryID'].eq('CurrentYearInstant')]
df_m = pd.concat(objs=[df_d, df_i])

# NOTE(review): the original author flagged this call as erroring at times;
# the cause is not visible from this cell.
df_m.to_sql(name='t_financials', con=engine, if_exists='append', index=False)

155