#### Extract "documents" from each history, from Shiji 史記 through Xin Tang shu 新唐書
##### Instructions: https://hackmd.io/DgWmLHo_SvGBVjUs57O70g?both

In [1]:
import json
from Book2 import Book
import re
import pandas as pd
import types

In [4]:
# Names of the seventeen histories to process, in processing order.
# Each name is also used as the directory that holds that history's HTML files.
Histories = [
    "Shiji",
    "Hanshu",
    "Sanguozhi",
    "HouHanShu",
    "Songshu",
    "Nanqishu",
    "Weishu",
    "LiangShu",
    "Chenshu",
    "BeiQishu",
    "Zhoushu",
    "SuiShu",
    "Jinshu",
    "Nanshi",
    "Beishi",
    "JiuTangshu",
    "XinTangshu",
]

In [5]:
# One placeholder slot per history; each slot is filled with a Book below.
books = [None for _ in Histories]
len(books)

17

In [6]:
%%time
# Build one Book per history: load its HTML files from ./<history>/,
# extract the bookmark paths, then strip the irrelevant tags.
for j, history in enumerate(Histories):
    loaded = Book(bookname=history, date="2019-06-03", creator="RGT")
    books[j] = loaded  # stored first, so the slot is filled even if a later step fails
    loaded.load_htmls(f"./{history}/")
    loaded.extract_paths()
    loaded.strip_all_irrelevant_tags(
        connect_the_broken_lines=True,
        html_cutoff=False,
    )


INFO:root:Stop at loading ./Shiji/Shiji_0392.html.
INFO:root:Total length of the data is 392.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Stop at loading ./Hanshu/Hanshu_0897.html.
INFO:root:Total length of the data is 897.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Stop at loading ./Sanguozhi/Sanguozhi_0495.html.
INFO:root:Total length of the data is 495.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.
INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Stop at loading ./HouHanShu/HouHanShu_1141.html.
INFO:root:Total length of the data is 1141.
INFO:root:Remove the new lines a

Wall time: 1min 53s


In [15]:
# Display the Book for Zhoushu (position 10 in Histories).
books[Histories.index('Zhoushu')]

       type       variable                 method current_length
0      meta      flat_meta      self.extract_meta              0
1      path          paths     self.extract_paths            358
2  passages  flat_passages  self.extract_passages              0

In [27]:
#%%time
# The code below goes through every book and, for each biography unit
# (one HTML file), measures the length in characters of:
# (1) all indented ("2-2") documents
# (2) all quoted (「」) documents
# (3) all "normal" texts ("2-0")
#
# Variable dictionary
#    idocs: indented ("2-2") documents
#    qdocs: quoted documents (i.e., texts enclosed between 「」)
#    regs:  regular text (the above two types and the bookmark excluded)
#
# One row per unit is appended to DataTable:
#    (history, idx, T, iD, qD, R, B, diff)
# where diff = T - iD - qD - R - B is whatever text is not accounted for by
# the bookmark, the indented documents, or the regular paragraphs.

DEBUG = False

DataTable = []

# only biographies are included in this analysis
regex_bio = re.compile(r"^史／正史／(\w+)／列傳")

# lookaround - look behind for 「, look ahead for 」; the enclosed run must be
# at least 200 characters long and must not contain another 「
regex_quoted = re.compile(r"(?<=「)[^「]{200,}?(?=」)")

# style of "regular" paragraphs: text-indent of 2em or 0em, no left padding
# (compiled once here instead of once per unit)
regex_regular_style = re.compile(r"text-indent:[20]em;padding-left:0em;")

# dictionary indexed by idx (HTML file #), value is a list of quoted texts
# NOTE(review): idx restarts at 0 for every book, so entries from an earlier
# book are replaced by a later book that reaches the same idx.
quoted_texts = {}

for j, book in enumerate(books):
    history = Histories[j]
    print(f"History = {history}")
    for idx, unit in enumerate(book.flat_bodies):
        # 列傳 in bookmark or 三國志
        if not (regex_bio.search(book.paths[idx]) or history == 'Sanguozhi'):
            continue

        B = len(book.paths[idx])  # length of bookmark
        # documents
        idocs = unit.find_all('div', {'style': 'text-indent:2em;padding-left:2em;'})
        # "regular" text
        regs = unit.find_all('div', {'style': regex_regular_style})

        iD = sum(len(d.text) for d in idocs)  # documents from indent "2-2"

        # Extract quoted docs from ALL "regular" paragraphs of this unit.
        # The list is rebuilt from scratch for every unit, so a unit without
        # regular paragraphs gets an empty list rather than a KeyError or a
        # stale list left over from an earlier book with the same idx.
        quoted_texts[idx] = [
            quoted
            for paragraph in regs
            for quoted in regex_quoted.findall(paragraph.text)
        ]
        qD = sum(len(q) for q in quoted_texts[idx])

        # "regular" or normal texts (excluding quoted texts)
        R = sum(len(r.text) for r in regs) - qD
        unit_text = unit.text.replace("\n", "")
        T = len(unit_text)  # total length of HTML text (including bookmark)
        diff = T - iD - qD - R - B

        if DEBUG:
            print(idx, end='')
            print(f": diff = {diff}")
            print(f"  Length of bookmark = {B}")
            print(f"  Length of indented docs = {iD}")
            print(f"  Length of quoted docs = {qD}")
            print(f"  Length of regs = {R}")
            print(f"  Length of entire HTML = {T}")
            print('-'*30)

        DataTable.append((history, idx, T, iD, qD, R, B, diff))


History = Shiji
History = Hanshu
History = Sanguozhi
History = HouHanShu
History = Songshu
History = Nanqishu
History = Weishu
History = LiangShu
History = Chenshu
History = BeiQishu
History = Zhoushu
History = SuiShu
History = Jinshu
History = Nanshi
History = Beishi
History = JiuTangshu
History = XinTangshu


In [28]:
# Column labels, in the same order as the tuples appended to DataTable.
df_columns = [
    "History",
    "Index",
    "Total",
    "iDocs",
    "qDocs",
    "NormalText",
    "Bookmark",
    "DiffCheck",
]
# One row per analysed HTML unit.
df_Histories = pd.DataFrame(DataTable, columns=df_columns)

In [36]:
# Show the first 15 rows that belong to SuiShu.
df_Histories.loc[df_Histories["History"] == "SuiShu"].head(15)

Unnamed: 0,History,Index,Total,iDocs,qDocs,NormalText,Bookmark,DiffCheck
4146,SuiShu,600,1597,0,0,1532,48,17
4147,SuiShu,601,1049,0,0,994,55,0
4148,SuiShu,602,585,0,0,530,55,0
4149,SuiShu,603,157,0,0,102,55,0
4150,SuiShu,604,1207,546,0,607,54,0
4151,SuiShu,605,2535,496,0,1959,48,32
4152,SuiShu,606,1121,0,0,1070,51,0
4153,SuiShu,607,1707,0,0,1646,61,0
4154,SuiShu,608,2198,0,0,2145,48,5
4155,SuiShu,609,2167,1114,0,988,48,17


In [33]:
# Write the full table to an Excel workbook in the working directory.
df_Histories.to_excel(excel_writer="medievalHistories17d.xlsx")

In [None]:
# Short alias for the results table.
df = df_Histories

# Show every row that belongs to Sanguozhi.
is_sanguozhi = df["History"] == "Sanguozhi"
df_Histories.loc[is_sanguozhi]

In [35]:
# List every <div> in unit 612 of SuiShu for manual inspection.
suishu_book = books[Histories.index('SuiShu')]
suishu_book.flat_bodies[612].find_all('div')

[<div style="text-indent:2em;padding-left:0em;">皇甫績字功明，安定朝那人也。祖穆，魏隴東太守。父道，周湖州刺史、雍州都督。績三歲而孤，為外祖韋孝寬所鞠養。嘗與諸外兄博奕，孝寬以其惰業，督以嚴訓，愍績孤幼，特捨之。績歎曰：「我無庭訓，養於外氏，不能剋躬勵己，何以成立？」深自感激，命左右自杖三十。孝寬聞而對之流涕。於是精心好學，略涉經史。</div>,
 <div style="text-indent:2em;padding-left:0em;">周武帝為魯公時，引為侍讀。建德初，轉宮尹中士。武帝嘗避暑雲陽宮，時宣帝為太子監國。衞剌王作亂，城門已閉，百僚多有遁者。績聞難赴之，於玄武門遇皇太子，太子下樓執績手，悲喜交集。帝聞而嘉之，遷小宮尹。宣政初，錄前後功，封義陽縣男，拜畿伯下大夫，累轉御正下大夫。</div>,
 <div style="text-indent:2em;padding-left:0em;">宣帝崩，高祖總己，績有力焉，語在鄭譯傳。加位上開府，轉內史中大夫，進封郡公，邑千戶。尋拜大將軍。</div>,
 <div style="text-indent:2em;padding-left:0em;">開皇元年，出為豫州刺史，增邑通前二千五百戶。尋拜都官尚書。後數載，轉晉州刺史，將之官，稽首而言曰：「臣實庸鄙，無益於國，每思犯難以報國恩。今偽陳尚存，以臣度之，有三可滅。」上問其故。績答曰：「大吞小，一也；以有道伐無道，二也；納叛臣蕭巖，於我有詞，三也。陛下若命鷹揚之將，臣請預戎行，展絲髮之効。」上嘉其壯志，勞而遣之。及陳平，拜蘇州刺史。</div>,
 <div style="text-indent:2em;padding-left:0em;">高智慧等作亂江南，州民顧子元發兵應之，因以攻績，相持八旬。子元素感績恩，於冬至日遣使奉牛酒。績遺子元書曰：「皇帝握符受籙，合極通靈，受揖讓於唐、虞，棄干戈於湯、武。東踰蟠木，方朔所未窮，西盡流沙，張騫所不至。玄漠黃龍之外，交臂來王，葱嶺、榆關之表，屈膝請吏。曩者偽陳獨阻聲教，江東士民困於荼毒。皇天輔仁，假手朝廷，聊申薄伐，應時瓦解。金陵百姓，死而復生，吳、會臣民，白骨還肉。唯當懷音感德，行歌擊壤，豈宜自同吠主，翻成反噬。卿非吾民，何須酒禮？吾是隋將，何容外交？易子析骸，未