### **Digital SongShu Project**
#### Last updated 2018-03-05 by Ruben G. Tsui

In [1]:
import json
from SongShu import SongShu
import re
import pandas as pd
from bs4 import BeautifulSoup
import altair

## SongShu -- tagging names, offices, places
##### <font color='red'>*Change your Songshu html data folder below*</font>

In [2]:
songshu = SongShu("2019-03-05", "RBT")
songshu.load_htmls(r'C:\NLP\Raft\Song shu-20181231T032348Z-001\data - Song shu')
songshu.extract_all()

INFO:root:Stop at loading C:\NLP\Raft\Song shu-20181231T032348Z-001\data - Song shu\SongShu_0851.html.
INFO:root:Total length of the data is 851.
ERROR:root:[Error] SongShu_rare_char.json does not exist

            try to run these lines: 
            	>> self.extract_rare_chars()
            	>> self.write_rare_chars()

INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.


In [3]:
len(songshu.flat_bodies)

851

### Let's just deal with Scroll 001 for now <font color='red'>(skip this for now)</font>

In [33]:
scroll001 = songshu.flat_bodies[0]  # 卷001

In [34]:
len(scroll001.html.body.span.find_all('div'))  # each <div> tag contains one paragraph 

67

In [35]:
paragraphs = [] # save each paragraph as an element in this list
for c in scroll001.html.body.span.find_all('div'):
    paragraphs.append(c.text)
    
sep = f"\n{'-'*60}\n"    ## paragraph separator (newline, -----, newline)
print(sep.join(paragraphs[:5]))

高祖武皇帝諱裕，字德輿，小名寄奴，彭城縣綏輿里人，漢高帝弟楚元王交之後也。交生紅懿侯富，富生宗正辟彊，辟彊生陽城繆侯德，德生陽城節侯安民，安民生陽城釐侯慶忌，慶忌生陽城肅侯岑，岑生宗正平，平生東武城令某，某生東萊太守景，景生明經洽，洽生博士弘，弘生瑯邪都尉悝，悝生魏定襄太守某，某生邪城令亮，亮生晉北平太守膺，膺生相國掾熙，熙生開封令旭孫。旭孫生混，始過江，居晉陵郡丹徒縣之京口里，官至武原令。混生東安太守靖，靖生郡功曹翹，是為皇考。高祖以晉哀帝興寧元年歲次癸亥三月壬寅夜生。及長，身長七尺六寸，風骨奇特。家貧，有大志，不治廉隅。事繼母以孝謹稱。
------------------------------------------------------------
初為冠軍孫無終司馬。安帝隆安三年十一月，妖賊孫恩作亂於會稽，晉朝衞將軍謝琰、前將軍劉牢之東討。牢之請高祖參府軍事。十二月，牢之至吳，而賊緣道屯結，牢之命高祖與數十人覘賊遠近。會遇賊至，眾數千人，高祖便進與戰。所將人多死，而戰意方厲，手奮長刀，所殺傷甚眾。牢之子敬宣疑高祖淹久，恐為賊所困，乃輕騎尋之。既而眾騎並至，賊乃奔退，斬獲千餘人，推鋒而進，平山陰，恩遁還入海。
------------------------------------------------------------
四年五月，恩復入會稽，殺衞將軍謝琰。十一月，劉牢之復率眾東征，恩退走。牢之屯上虞，使高祖戍句章城。句章城既卑小，戰士不盈數百人，高祖常被堅執銳，為士卒先，每戰輒摧鋒陷陣，賊乃退還浹口。于時東伐諸帥，御軍無律，士卒暴掠，甚為百姓所苦。唯高祖法令明整，所至莫不親賴焉。
------------------------------------------------------------
五年春，孫恩頻攻句章，高祖屢摧破之，恩復走入海。三月，恩北出海鹽，高祖追而翼之，築城于海鹽故治。賊日來攻城，城內兵力甚弱，高祖乃選敢死之士數百人，咸脫甲冑，執短兵，並鼓噪而出，賊震懼奪氣，因其懼而奔之，並棄甲散走，斬其大帥姚盛。雖連戰剋勝，然眾寡不敵，高祖獨深慮之。一夜，偃旗匿眾，若已遁者。明晨開門，使羸疾數人登城。賊遙問劉裕所在。曰：「夜已走矣。」賊信之，乃率眾大上。高祖乘其懈怠，奮擊，大破之。恩知城不可下，乃進向滬瀆。高祖復棄城追之。海鹽令鮑陋

#### Define tagging functions

In [4]:
def annotate_name(tagging_unit, df, col):
    '''
    This function annotates 'tagging_unit' (which is a paragraph for now) by matching the
    values from column 'col' in the dataframe 'df'; it then creates the tag
    <name id=''></name>, where the attribute id is from the "id" column of 'df'. 
    
    Inputs:
        tagging_unit = e.g. text of a paragraph
        df           = Pandas dataframe containing strings to be matched (e.g. names)
        col          = column name for the values to be matched
    Output:
        The function returns a string containing the tagged text
    
    Example usage:
       annotate_name(paragraph[0], bio_df, '姓名')
    '''
    tag = 'n' # 'name'
    tagged = tagging_unit
    for idx, row in df.iterrows():
        id_  = row['id'] # this is interpreted as the unique key to the dataframe 
        name = row[col]  # e.g. 姓名
        if name in tagged:
            tagged = tagged.replace(name, f"<{tag} id='{id_}'>{name}</{tag}>")
    return tagged

In [5]:
def annotate_office(tagging_unit, df, col):
    '''
    This function works similarly as annotate_name().
    
    Inputs:
       tagging_unit = e.g. text of a paragraph
       df           = Pandas dataframe containing strings to be matched (e.g. office)
       col          = column name for the values to be matched
       
    Example usage:
       annotate_name(paragraph[0], bio_df, 'Name') # 'Name' here corresponds to the column containing official tutles
    '''
    tag = 'o' # 'office'
    tagged = tagging_unit
    for idx, row in df.iterrows():
        id_  = row['ID'] # this is interpreted as the unique key to the dataframe 
        office = row[col]
        if office in tagged:
            tagged = tagged.replace(office, f"<{tag} id='{id_}'>{office}</{tag}>")
    return tagged

#### Read names, offices and places data from Excel worksheets

In [24]:
%%time
bio_df    = pd.read_excel("BioCombinedSorted.xlsx")  # This file has been processed previously
                                                              # requires no further actions
office_df = pd.read_excel("劉宋地名與官名 2017-11-14.xlsx", sheet_name="Offices")  # this will be sorted later
place_df  = pd.read_excel("劉宋地名與官名 2017-11-14.xlsx", sheet_name="Places")   # this will be sorted later

Wall time: 712 ms


#### Define function to sort a dataframe by the length of value in a column, in descending order, so that longer string will match first.

In [16]:
def sort_by_length_of_indicated_column(df, col, ascending=False):
    s = df[col].str.len().sort_values(ascending=ascending).index
    df_out = df.reindex(s)
    return df_out

In [17]:
office_df = sort_by_length_of_indicated_column(office_df, 'Name')
place_df  = sort_by_length_of_indicated_column(place_df, 'Name')

In [18]:
office_df = office_df[:-7]  # remove last 7 entries, which are single-character official titles

In [19]:
office_df.tail()

Unnamed: 0,ID,Name
306,lso307,衛尉
309,lso310,廷尉
331,lso332,少府
330,lso331,甸師
319,lso320,太府


### Deal with variant characters

In [60]:
#all_paragraphs = all_paragraphs.replace('衞','衛')

### Apply tagging on entire SongShu

In [25]:
%%time
sep = '|#|#|#|#|'  # paragraph separator

fo = open('songshu.all.names.offices.tagged.000-100.txt', 'w', encoding='utf-8', newline='\n')

i = 0
for section in songshu.flat_bodies[i:i+100]: # start with a smaller subset
    #
    paragraphs = [] # for each section, save each paragraph as an element in this list
    for c in section.html.body.span.find_all('div'):
        paragraphs.append(c.text)
    all_paragraphs = sep.join(paragraphs) # collapse list into a single string for faster processing
    all_paragraphs = all_paragraphs.replace('衞','衛')
    annotated = annotate_name(all_paragraphs, bio_df, 'name')
    annotated = annotate_office(annotated, office_df, 'Name')
    fo.write(f"Flat_bodies section {i}\n")
    for line in annotated.split(sep):
        fo.write(line + '\n')
    fo.write(f"{'-'*60}\n")
    i += 1
    if i % 5 == 0:
        print(f"processed {i} sections")
fo.close()

processed 5 sections
processed 10 sections
processed 15 sections
processed 20 sections
processed 25 sections
processed 30 sections
processed 35 sections
processed 40 sections
processed 45 sections
processed 50 sections
processed 55 sections
processed 60 sections
processed 65 sections
processed 70 sections
processed 75 sections
processed 80 sections
processed 85 sections
processed 90 sections
processed 95 sections
processed 100 sections
Wall time: 2min 30s


In [22]:
fo.close()

In [16]:
songshu.flat_bodies[566]

<html>
<body>
<a class="gobookmark" href="hanji?@96^1115717931^70^^^^@@1047099258" title="開啟書籤管理">史／正史／宋書／志　凡三十卷／卷三十九　志第二十九／百官上(P.1217)..[底本：宋元明三朝遞修本]</a>
<span id="fontstyle" style="FONT-SIZE: 12pt;letter-spacing:1pt; LINE-HEIGHT: 18pt;width:99%;word-break:break-all">
<b><h3>宋書卷三十九</h3></b><b><h3>　　志第二十九</h3></b><b><h3>　　　百官上</h3></b><div style="text-indent:2em;padding-left:0em;">太宰，一人。周武王時，周公旦始居之，掌邦治，為六卿之首。秦、漢、魏不常置。晉初依周禮，備置三公。三公之職，太師居首，景帝名師，故置太宰以代之。太宰，蓋古之太師也。殷紂之時，箕子為太師。周武王時，太公為太師。周成王時，周公為太師。周公薨，畢公代之。漢西京初不置，平帝始復置太師官，而孔光居焉。漢東京又廢。獻帝初，董卓為太師，卓誅又廢。魏世不置。晉既因太師而置太宰，以安平王孚居焉。</div>
<div style="text-indent:2em;padding-left:0em;">太傅，一人。周成王時，畢公為太傅。漢高后元年，初用王陵。</div>
<div style="text-indent:2em;padding-left:0em;">太保，一人。殷太甲時，伊尹為太保。周武王時，召公為太保。漢平帝元始元年，始用王舜。後漢至魏不置，晉初復置焉。自太師至太保，是為三公。論道經邦，燮理陰陽，無其人則闕，所以訓護人主，導以德義者。</div>
<div style="text-indent:2em;padding-left:0em;">相國，一人。漢高帝十一年始置，以蕭何居之，罷丞相；何薨，曹參代之；參薨，罷。魏齊王以晉景帝為相國。晉惠帝時趙王倫，愍帝時南陽王保，安帝時宋高祖，順帝時齊王，並為相國。自魏、晉以來，非復人臣之位矣。</div>
<div style="text-indent:2em;padding-l