#### Extract "documents" from each history, from Shiji 史記 through Xin Tang shu 新唐書
#### Instructions: https://hackmd.io/DgWmLHo_SvGBVjUs57O70g?both

In [1]:
import json
from Book2 import Book
import re
import pandas as pd
import types

In [2]:
# Folder/book names of the histories to process. The names are kept as one
# whitespace-separated string and split into a list in a single step.
Histories = '''BeiQishu	Beishi	Chenshu	Hanshu	HouHanShu	Jinshu	JiuTangshu	LiangShu	Nanqishu	Nanshi	Sanguozhi	Shiji	Songshu	SuiShu	Weishu	XinTangshu	Zhoushu'''.split()
# Sanity check while working interactively: the full list and its size.
Histories, len(Histories)
# The entry selected for this run (shown as the cell output).
Histories[12]

'Songshu'

In [3]:
# Pick the history to analyse; index 12 is the entry displayed by the
# previous cell.
history = Histories[12]

# Create the Book container for this history and load its HTML pages from
# the folder named after the history (relative to the working directory).
book = Book(
    creator="RGT",
    date="2019-06-02",
    bookname=history,
)
book.load_htmls(f"./{history}/")

INFO:root:Stop at loading ./Songshu/Songshu_0969.html.
INFO:root:Total length of the data is 969.


In [4]:
# Build the per-page bookmark paths (the summary below reports them under
# the `paths` variable).
book.extract_paths()

# Clean the loaded pages. Going by the log messages this emits, it joins
# paragraphs split by page dividers and removes annotations (標註), page
# numbers and page dividers from the tree.
book.strip_all_irrelevant_tags(
    html_cutoff=False,
    connect_the_broken_lines=True,
)

# Show the book's summary table as the cell output.
book

INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.


       type       variable                 method current_length
0      meta      flat_meta      self.extract_meta              0
1      path          paths     self.extract_paths            969
2  passages  flat_passages  self.extract_passages              0

In [87]:
# Walk through the entire book and, for every biography (列傳) unit, measure
# how many characters fall into each of these categories:
# (1) indented ("2-2") documents
# (2) quoted (「」) documents found inside the "normal" text
# (3) "normal" text ("2-0"), with the quoted documents subtracted
#
# Variable dictionary
#    idocs: indented ("2-2") documents
#    qdocs: quoted documents (i.e., texts enclosed between 「」)
#    regs:  regular text (the above two types and the bookmark excluded)
#
# Each processed unit appends one row to DataTable:
#    (history, idx, T, iD, qD, R, B, diff)

DEBUG = False  # set to True to print the per-unit length breakdown

DataTable = []

# Only biographies are included in this analysis: the bookmark path must
# start with 史／正史／<title>／列傳.
regex_bio = re.compile(r"^史／正史／(\w+)／列傳")

# Lookaround: look behind for 「 and look ahead for 」, so the brackets
# themselves are not part of the match; only runs of 200+ characters count.
# NOTE(review): `.` may also match 」, so a match that starts inside a short
# quotation can run on into a later one — confirm whether `[^」]{200,}` was
# intended. Left unchanged here to keep the existing methodology.
regex_quoted = re.compile(r"(?<=「).{200,}?(?=」)")

# Indexed by idx (HTML file #); the value is the list of ALL quoted texts
# found in that unit's regular-text divs.
quoted_texts = {}

for idx, unit in enumerate(book.flat_bodies):
    bookmark = book.paths[idx]
    if not regex_bio.search(bookmark):
        continue  # not a 列傳 unit

    B = len(bookmark)  # length of bookmark

    # Indented documents and "regular" text are told apart by the inline
    # style of their <div>.
    idocs = unit.find_all('div', {'style': 'text-indent:2em;padding-left:2em;'})
    regs = unit.find_all('div', {'style': 'text-indent:2em;padding-left:0em;'})

    iD = sum(len(d.text) for d in idocs)  # documents from indent "2-2"

    # Collect quoted docs from every regular div. Assigning the combined
    # list once keeps the quotations of all divs (not just the last one)
    # and also gives units without regular divs an empty list.
    quoted_texts[idx] = [
        q for r in regs for q in regex_quoted.findall(r.text)
    ]
    qD = sum(len(q) for q in quoted_texts[idx])

    # "Regular" or normal texts (excluding quoted texts).
    R = sum(len(r.text) for r in regs) - qD

    unit_text = re.sub(r"\n", "", unit.text)
    T = len(unit_text)  # total length of HTML text (including bookmark)

    # Consistency check: characters of the unit not accounted for by any
    # category. qD cancels out against R, so this is independent of the
    # quoted/normal split.
    diff = T - iD - qD - R - B

    if DEBUG:
        print(idx, end='')
        print(f": diff = {diff}")
        print(f"  Length of bookmark = {B}")
        print(f"  Length of indented docs = {iD}")
        print(f"  Length of quoted docs = {qD}")
        print(f"  Length of regs = {R}")
        print(f"  Length of entire HTML = {T}")
        print('-'*30)

    DataTable.append((history, idx, T, iD, qD, R, B, diff))
            

In [88]:
# Turn the collected per-unit tuples into a table. The column order matches
# the order of the values in each DataTable row.
df_Histories = pd.DataFrame(
    data=DataTable,
    columns=[
        'History',
        'Index',
        'Total',
        'Indent\nDocs',
        'Quoted\nDocs',
        'Normal',
        'Bookmark',
        'Diff-Check',
    ],
)

In [89]:
# Display the assembled table as the cell output.
df_Histories

Unnamed: 0,History,Index,Total,Indent Docs,Quoted Docs,Normal,Bookmark,Diff-Check
0,Songshu,577,2069,0,0,1923,51,95
1,Songshu,578,444,0,0,387,57,0
2,Songshu,579,563,0,0,506,57,0
3,Songshu,580,250,0,0,193,57,0
4,Songshu,581,244,0,0,187,57,0
5,Songshu,582,144,0,0,86,58,0
6,Songshu,583,253,0,0,196,57,0
7,Songshu,584,1260,499,0,661,57,43
8,Songshu,585,350,0,0,293,57,0
9,Songshu,586,1222,0,0,1154,68,0


In [None]:
# Save the table as an Excel workbook in the current working directory.
df_Histories.to_excel(excel_writer="medievalHistories.xlsx")