#### Extract "documents" from each history, from Shiji 史記 through Xin Tang shu 新唐書
#### Instructions: https://hackmd.io/DgWmLHo_SvGBVjUs57O70g?both

In [1]:
import json
from Book2 import Book
import re
import pandas as pd
import types

In [2]:
# Names of the histories to process, given as one whitespace-separated string
# and split into a list. Each name is also used as the directory and book name
# when the HTML files are loaded.
Histories = '''BeiQishu	Beishi	Chenshu	Hanshu	HouHanShu	Jinshu	JiuTangshu	LiangShu	Nanqishu	Nanshi	Sanguozhi	Shiji	Songshu	SuiShu	Weishu	XinTangshu	Zhoushu'''.split()
# Sanity checks for interactive use: the full list with its length, then one entry.
Histories, len(Histories)
Histories[12]

'Songshu'

In [3]:
# Build one Book per history, in the same order as Histories, so that
# books[j] always corresponds to Histories[j].
books = []
for history in Histories:
    book = Book(bookname=history, date="2019-06-02", creator="RGT")
    book.load_htmls(f"./{history}/")   # HTML files live in a directory named after the history
    book.extract_paths()
    book.strip_all_irrelevant_tags(connect_the_broken_lines=True, html_cutoff=False)
    books.append(book)


INFO:root:Stop at loading ./BeiQishu/BeiQishu_0342.html.
INFO:root:Total length of the data is 342.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Stop at loading ./Beishi/Beishi_1435.html.
INFO:root:Total length of the data is 1435.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Stop at loading ./Chenshu/Chenshu_0255.html.
INFO:root:Total length of the data is 255.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Stop at loading ./Hanshu/Hanshu_0897.html.
INFO:root:Total length of the data is 897.
INFO:root:Remove the new lines added

In [None]:
# Peek at the first flat body of the first book (books[0] was built from Histories[0]).
books[0].flat_bodies[0]

In [11]:
#%%time
# Walk through every history and, for each biography (列傳) HTML file, measure
# how many characters fall into each of three kinds of text:
# (1) all indented ("2-2") documents
# (2) all quoted (「」) documents found inside the "normal" text
# (3) all remaining "normal" texts ("2-0")
#
# Variable dictionary
#    idocs: indented ("2-2") documents
#    qdocs: quoted documents (i.e., texts enclosed between 「」)
#    regs:  regular text blocks ("2-0"); the quoted parts are subtracted from
#           their total length when R is computed
#    B, iD, qD, R, T: character counts of the bookmark, the indented documents,
#           the quoted documents, the remaining regular text, and the whole
#           unit (newlines removed)
#    diff:  T - iD - qD - R - B, i.e. characters not attributed to any category

DEBUG = False  # set to True to print the length breakdown of every file

DataTable = []  # one (history, idx, T, iD, qD, R, B, diff) tuple per biography file

regex_bio = re.compile(r"^史／正史／(\w+)／列傳")  # only biographies are included in this analysis

# Non-greedy match of at least 200 characters enclosed in 「」; the lookbehind
# and lookahead keep the brackets themselves out of the matched text.
regex_quoted = re.compile(r"(?<=「).{200,}?(?=」)")

# Dictionary indexed by idx (HTML file #); value is a list of quoted texts.
# NOTE(review): idx restarts at 0 for every history, so the entries of a later
# history replace those of an earlier one -- confirm whether that is intended.
quoted_texts = {}

for j, book in enumerate(books):
    history = Histories[j]
    print(f"History = {history}")
    for idx, unit in enumerate(book.flat_bodies):
        bookmark = book.paths[idx]
        if not regex_bio.search(bookmark):
            continue  # the bookmark of this flat_body is not a 列傳: skip it
        B = len(bookmark)  # length of bookmark

        # documents
        idocs = unit.find_all('div', {'style': 'text-indent:2em;padding-left:2em;'})
        # "regular" text
        regs = unit.find_all('div', {'style': 'text-indent:2em;padding-left:0em;'})

        iD = sum(len(d.text) for d in idocs)  # documents from indent "2-2"

        # Extract quoted docs from ALL "regular" blocks of this unit. Collecting
        # into a fresh list (instead of overwriting the dictionary entry once
        # per block) keeps the quotes of every block, and also gives a correct
        # empty result when the unit has no regular blocks at all.
        qdocs = [q for r in regs for q in regex_quoted.findall(r.text)]
        quoted_texts[idx] = qdocs
        qD = sum(len(q) for q in qdocs)

        R = sum(len(r.text) for r in regs) - qD  # "regular" or normal texts (excluding quoted texts)
        unit_text = re.sub(r"\n", "", unit.text)
        T = len(unit_text)  # total length of HTML text (including bookmark)
        diff = T - iD - qD - R - B  # 0 when every character has been accounted for

        if DEBUG:
            print(f"{idx}: diff = {diff}")
            print(f"  Length of bookmark = {B}")
            print(f"  Length of indented docs = {iD}")
            print(f"  Length of quoted docs = {qD}")
            print(f"  Length of regs = {R}")
            print(f"  Length of entire HTML = {T}")
            print('-'*30)
        DataTable.append((history, idx, T, iD, qD, R, B, diff))


History = BeiQishu
History = Beishi
History = Chenshu
History = Hanshu
History = HouHanShu
History = Jinshu
History = JiuTangshu
History = LiangShu
History = Nanqishu
History = Nanshi
History = Sanguozhi
History = Shiji
History = Songshu
History = SuiShu
History = Weishu
History = XinTangshu
History = Zhoushu


In [12]:
# Turn the collected tuples into a table; the column labels follow the field
# order of each DataTable entry (history, idx, T, iD, qD, R, B, diff).
df_Histories = pd.DataFrame(
    data=DataTable,
    columns=[
        'History', 'Index', 'Total', 'Indent\nDocs', 'Quoted\nDocs',
        'Normal', 'Bookmark', 'Diff-Check',
    ],
)

In [13]:
# Show the table of per-file length measurements.
df_Histories

Unnamed: 0,History,Index,Total,Indent Docs,Quoted Docs,Normal,Bookmark,Diff-Check
0,BeiQishu,40,877,0,0,778,47,52
1,BeiQishu,41,285,0,0,238,47,0
2,BeiQishu,42,389,0,0,342,47,0
3,BeiQishu,43,198,0,0,151,47,0
4,BeiQishu,44,529,0,0,482,47,0
5,BeiQishu,45,881,0,330,497,54,0
6,BeiQishu,46,278,0,0,132,48,98
7,BeiQishu,47,852,0,0,797,55,0
8,BeiQishu,48,199,0,0,144,55,0
9,BeiQishu,49,1083,0,0,1028,55,0


In [14]:
# Save the table as an Excel workbook (relative path, so it lands in the current working directory).
df_Histories.to_excel("medievalHistories17.xlsx")