### **Digital SongShu Project**
#### Last updated 2019-03-05 by Ruben G. Tsui

In [2]:
import json
from SongShu import SongShu
import re
import pandas as pd
from bs4 import BeautifulSoup
import altair

## SongShu -- tagging names, offices, places
##### <font color='red'>*Change your Songshu html data folder below*</font>

In [3]:
# Folder holding the exported SongShu_XXXX.html pages -- edit this path for your machine.
html_dir = r'C:\NLP\Raft\Song shu-20181231T032348Z-001\data - Song shu'

songshu = SongShu("2019-03-05", "RBT")
songshu.load_htmls(html_dir)
songshu.extract_all()

INFO:root:Stop at loading C:\NLP\Raft\Song shu-20181231T032348Z-001\data - Song shu\SongShu_0851.html.
INFO:root:Total length of the data is 851.
ERROR:root:[Error] SongShu_rare_char.json does not exist

            try to run these lines: 
            	>> self.extract_rare_chars()
            	>> self.write_rare_chars()

INFO:root:Remove 標註, page number, and page dividers from the tree structure.
INFO:root:Remove the new lines added by the page dividers, connect the paragraphs before and after the new lines.


In [4]:
# Count the entries in flat_bodies (the loader log above reports 851 HTML files).
len(songshu.flat_bodies)

851

### Let's just deal with Scroll 001 for now

In [33]:
scroll001 = songshu.flat_bodies[0]  # first entry = scroll (juan) 001

In [34]:
# Count the paragraphs of scroll 001; each <div> tag contains one paragraph.
len(scroll001.html.body.span.find_all('div'))

67

In [35]:
# Collect the text of every <div> of scroll 001 (one paragraph per <div>).
paragraphs = [div.text for div in scroll001.html.body.span.find_all('div')]

# Display separator placed between paragraphs: newline, 60 dashes, newline.
sep = f"\n{'-'*60}\n"
preview = sep.join(paragraphs[:5])  # first five paragraphs only
print(preview)

高祖武皇帝諱裕，字德輿，小名寄奴，彭城縣綏輿里人，漢高帝弟楚元王交之後也。交生紅懿侯富，富生宗正辟彊，辟彊生陽城繆侯德，德生陽城節侯安民，安民生陽城釐侯慶忌，慶忌生陽城肅侯岑，岑生宗正平，平生東武城令某，某生東萊太守景，景生明經洽，洽生博士弘，弘生瑯邪都尉悝，悝生魏定襄太守某，某生邪城令亮，亮生晉北平太守膺，膺生相國掾熙，熙生開封令旭孫。旭孫生混，始過江，居晉陵郡丹徒縣之京口里，官至武原令。混生東安太守靖，靖生郡功曹翹，是為皇考。高祖以晉哀帝興寧元年歲次癸亥三月壬寅夜生。及長，身長七尺六寸，風骨奇特。家貧，有大志，不治廉隅。事繼母以孝謹稱。
------------------------------------------------------------
初為冠軍孫無終司馬。安帝隆安三年十一月，妖賊孫恩作亂於會稽，晉朝衞將軍謝琰、前將軍劉牢之東討。牢之請高祖參府軍事。十二月，牢之至吳，而賊緣道屯結，牢之命高祖與數十人覘賊遠近。會遇賊至，眾數千人，高祖便進與戰。所將人多死，而戰意方厲，手奮長刀，所殺傷甚眾。牢之子敬宣疑高祖淹久，恐為賊所困，乃輕騎尋之。既而眾騎並至，賊乃奔退，斬獲千餘人，推鋒而進，平山陰，恩遁還入海。
------------------------------------------------------------
四年五月，恩復入會稽，殺衞將軍謝琰。十一月，劉牢之復率眾東征，恩退走。牢之屯上虞，使高祖戍句章城。句章城既卑小，戰士不盈數百人，高祖常被堅執銳，為士卒先，每戰輒摧鋒陷陣，賊乃退還浹口。于時東伐諸帥，御軍無律，士卒暴掠，甚為百姓所苦。唯高祖法令明整，所至莫不親賴焉。
------------------------------------------------------------
五年春，孫恩頻攻句章，高祖屢摧破之，恩復走入海。三月，恩北出海鹽，高祖追而翼之，築城于海鹽故治。賊日來攻城，城內兵力甚弱，高祖乃選敢死之士數百人，咸脫甲冑，執短兵，並鼓噪而出，賊震懼奪氣，因其懼而奔之，並棄甲散走，斬其大帥姚盛。雖連戰剋勝，然眾寡不敵，高祖獨深慮之。一夜，偃旗匿眾，若已遁者。明晨開門，使羸疾數人登城。賊遙問劉裕所在。曰：「夜已走矣。」賊信之，乃率眾大上。高祖乘其懈怠，奮擊，大破之。恩知城不可下，乃進向滬瀆。高祖復棄城追之。海鹽令鮑陋

#### Define tagging functions

In [5]:
def annotate_name(tagging_unit, df, col):
    '''
    Wrap every occurrence of a known name in 'tagging_unit' with a <name> tag.

    Each value of column 'col' in 'df' is searched for in the text; every occurrence
    is replaced by <name id='...'>value</name>, where the id attribute is taken from
    the "id" column of the same row.

    Inputs:
        tagging_unit = text to annotate (e.g. the text of a paragraph)
        df           = Pandas dataframe containing strings to be matched (e.g. names);
                       it must have an "id" column
        col          = column name for the values to be matched
    Output:
        The function returns a string containing the tagged text

    Notes:
        * Rows are applied one after another in dataframe order, each on the output
          of the previous row. A value that is a substring of an earlier, already
          tagged value is therefore tagged again inside the earlier tag.
        * Cells that are not non-empty strings (NaN, None, '') are skipped.

    Example usage:
       annotate_name(paragraphs[0], bio_df, '姓名')
    '''
    tagged = tagging_unit
    # Walk the two needed columns directly; df.iterrows() would build a Series per row.
    for id_, name in zip(df['id'], df[col]):
        # A blank cell is NaN/None, for which `name in tagged` raises TypeError, and an
        # empty string is contained in every text, so replace() would insert a tag
        # between all characters. Neither can be a real match: skip them.
        if not isinstance(name, str) or not name:
            continue
        if name in tagged:
            tagged = tagged.replace(name, f"<name id='{id_}'>{name}</name>")
    return tagged

In [6]:
def annotate_office(tagging_unit, df, col):
    '''
    Wrap every occurrence of a known official title in 'tagging_unit' with an <office> tag.

    Each value of column 'col' in 'df' is searched for in the text; every occurrence
    is replaced by <office id='...'>value</office>, where the id attribute is taken
    from the "ID" column of the same row.

    Inputs:
       tagging_unit = text to annotate (e.g. the text of a paragraph)
       df           = Pandas dataframe containing strings to be matched (e.g. office);
                      it must have an "ID" column
       col          = column name for the values to be matched
    Output:
       The function returns a string containing the tagged text

    Notes:
       * Rows are applied one after another in dataframe order, each on the output
         of the previous row, so a value that is a substring of an earlier, already
         tagged value is tagged again inside the earlier tag.
       * Cells that are not non-empty strings (NaN, None, '') are skipped.

    Example usage:
       annotate_office(paragraphs[0], office_df, 'Name') # 'Name' is the column containing official titles
    '''
    tagged = tagging_unit
    # Walk the two needed columns directly; df.iterrows() would build a Series per row.
    for id_, office in zip(df['ID'], df[col]):
        # A blank cell is NaN/None, for which `office in tagged` raises TypeError, and an
        # empty string is contained in every text, so replace() would insert a tag
        # between all characters. Neither can be a real match: skip them.
        if not isinstance(office, str) or not office:
            continue
        if office in tagged:
            tagged = tagged.replace(office, f"<office id='{id_}'>{office}</office>")
    return tagged

#### Read names, offices and places data from Excel worksheets

In [7]:
%%time
bio_df = pd.read_excel("BioCombinedSortedTruncated.xlsx")  # This file has been processed previously
                                                           # and requires no further actions

# Offices and places are two sheets of the same workbook: request both in a single
# read_excel call (a list for sheet_name returns a dict keyed by sheet name) so the
# workbook is opened and parsed once instead of twice.
gazetteer_sheets = pd.read_excel("劉宋地名與官名 2017-11-14.xlsx", sheet_name=["Offices", "Places"])
office_df = gazetteer_sheets["Offices"]  # this will be sorted later
place_df  = gazetteer_sheets["Places"]   # this will be sorted later

Wall time: 3.57 s


#### Define a function to sort a dataframe by the length of the values in a column, in descending order by default, so that longer strings are matched first.

In [8]:
def sort_by_length_of_indicated_column(df, col, ascending=False):
    '''
    Return the rows of 'df' reordered by the string length of the values in column 'col'.

    Inputs:
        df        = Pandas dataframe to reorder (not modified)
        col       = name of the string column whose value lengths drive the order
        ascending = False (default) puts the longest values first; True the shortest
    Output:
        A new dataframe with the same rows and original index labels, in the new order
    '''
    lengths = df[col].str.len()
    ordered_labels = lengths.sort_values(ascending=ascending).index
    return df.reindex(ordered_labels)

In [9]:
# Order both lookup tables longest-name-first so longer strings are tagged before shorter ones.
office_df, place_df = (
    sort_by_length_of_indicated_column(frame, 'Name')
    for frame in (office_df, place_df)
)

In [10]:
# Remove the single-character official titles (the 7 shortest entries after sorting).
# Filtering on length instead of slicing [:-7] keeps this cell idempotent: re-running
# it cannot drop further rows, and it does not depend on a hard-coded row count.
# NOTE(review): rows whose 'Name' is blank (NaN) are dropped by this filter as well.
office_df = office_df[office_df['Name'].str.len() > 1]

In [11]:
# Inspect the end of the table: the shortest remaining titles should be two characters long.
office_df.tail()

Unnamed: 0,ID,Name
306,lso307,衛尉
309,lso310,廷尉
331,lso332,少府
330,lso331,甸師
319,lso320,太府


In [58]:
# Collapse the list into a single string for faster processing: the taggers then scan
# one string per scroll instead of being called once per paragraph.
# NOTE: this rebinds `sep`, which an earlier cell used for the dashed display separator.
sep = '|#|#|#|#|'  # paragraph separator, used again later to split the text back
all_paragraphs = sep.join(paragraphs)

### Deal with variant characters

In [60]:
# Normalise the variant character found in the text to the form used in the offices
# table (e.g. the table lists 衛尉), so that titles written with the variant still match.
all_paragraphs = all_paragraphs.replace('衞','衛')

In [62]:
%%time
# Annotate names: every row of bio_df is tried against the text, which makes this
# the slow step (see the wall time reported below).
annotated = annotate_name(all_paragraphs, bio_df, '姓名')

Wall time: 7.12 s


In [113]:
%%time
# annotate offices
# annotate_office() takes (tagging_unit, df, col): pass the offices dataframe and the
# 'Name' column that holds the titles. The former call used an undefined name `offices`
# and omitted the column argument, so it failed on a fresh kernel.
annotated = annotate_office(annotated, office_df, 'Name')

Wall time: 19.9 ms


In [114]:
# Split the tagged text back into one string per paragraph.
# NOTE: the variable name is misspelled ("annoteted"); it is kept as-is because later
# cells refer to it under this name.
annoteted_paragraphs = annotated.split(sep)

### Apply tagging on entire SongShu

In [12]:
sep = '|#|#|#|#|'  # paragraph separator

# Tag flat_bodies[40:60] and write the result to one text file: a header line per
# section, one tagged paragraph per line, then a dashed rule.
# The `with` block closes the file even if tagging raises part-way through.
with open('songshu.all.names.offices.tagged.040-059.txt', 'w', encoding='utf-8', newline='\n') as fo:
    # NOTE(review): `i` is the offset within the slice (0-19), not the index into
    # songshu.flat_bodies (40-59); the header line therefore reads "section 0" for
    # flat_bodies[40]. Kept unchanged so the output file format stays the same.
    for i, section in enumerate(songshu.flat_bodies[40:60]):
        # one paragraph per <div>
        paragraphs = [c.text for c in section.html.body.span.find_all('div')]
        # join once so each tagger scans a single string per section
        all_paragraphs = sep.join(paragraphs)
        all_paragraphs = all_paragraphs.replace('衞','衛')  # normalise variant character
        annotated = annotate_name(all_paragraphs, bio_df, '姓名')
        annotated = annotate_office(annotated, office_df, 'Name')
        fo.write(f"Flat_bodies section {i}\n")
        for line in annotated.split(sep):
            fo.write(line + '\n')
        fo.write(f"{'-'*60}\n")
        done = i + 1
        if done % 5 == 0:
            print(f"processed {done} sections")

processed 5 sections
processed 10 sections
processed 15 sections
processed 20 sections


In [80]:
# Leftover safety cell: closes the output file handle in case an earlier run was
# interrupted before reaching its own close. Closing an already-closed file is a no-op.
fo.close()

In [40]:
# Wrap tagged paragraph 1 in a dummy <tu> root element and parse it with lxml so the
# inserted tags can be queried.
soup = BeautifulSoup('<tu>' + annoteted_paragraphs[1] + '</tu>', 'lxml')

In [43]:
# Collect every <name> element of the parsed paragraph and count them.
myNames = soup.find_all('name')
len(myNames)

10

In [None]:
# Same inspection for tagged paragraph 8: parse it inside a dummy <tu> root element,
# then print the id attribute and the text of each <name> element.
soup = BeautifulSoup('<tu>' + annoteted_paragraphs[8] + '</tu>', 'lxml')
myNames = soup.find_all('name')
for person in myNames:
    print(person['id'], person.text)

## Extract Passages with Zhou (州), Jun (郡)

In [11]:
# Keep only the bodies whose path contains a "／" separator immediately followed by
# 州 or 郡 (e.g. "...／州郡一").
# The former character class ['州郡'] also accepted a literal apostrophe; the unused
# capture group and the unnecessary escape of "／" are dropped as well.
geo_path = re.compile(r"／[州郡]+")
flat_bodies = [
    body
    for path, body in zip(songshu.paths, songshu.flat_bodies)
    if geo_path.search(path) is not None
]

# Build a second SongShu object that holds only the selected bodies.
songshuGeo = SongShu(date='2018-8-21', creator='MF')
# NOTE(review): 'ShongShuGeo' looks like a misspelling of 'SongShuGeo'; left unchanged
# because files may already exist under this name -- confirm before renaming.
songshuGeo.filename = 'ShongShuGeo'
songshuGeo.flat_bodies = flat_bodies
songshuGeo.extract_paths()

In [25]:
# Show the first selected body to check that the filter picked a 州郡 chapter.
songshuGeo.flat_bodies[0]

<html>
<body>
<a class="gobookmark" href="hanji?@96^1115717931^70^^^^@@682676597" title="開啟書籤管理">史／正史／宋書／志　凡三十卷／卷三十五　志第二十五／州郡一(P.1027)..[底本：宋元明三朝遞修本]</a>
<span id="fontstyle" style="FONT-SIZE: 12pt;letter-spacing:1pt; LINE-HEIGHT: 18pt;width:99%;word-break:break-all">
<b><h3>宋書卷三十五</h3></b><b><h3>　　志第二十五</h3></b><b><h3>　　　州郡一</h3></b><b><h3>　　　　揚州　南徐州　徐州　南兗州　兗州</h3></b><div style="text-indent:2em;padding-left:0em;">唐堯之世，置十有二牧，及禹平水土，更制九州，冀州堯都，土界廣遠，濟、河為兗州，海、岱為青州，海、岱及淮為徐州，淮、海為揚州，荊及衡陽為荊州，荊、河為豫州，華陽、黑水為梁州，黑水、西河為雍州。自虞至殷無所改變。周氏既有天下，以徐并青，以梁并雍，分冀州之地以為幽、并。漢初又立徐、梁二州。武帝攘卻胡、越，開地斥境，南置交趾，北置朔方，改雍曰涼，改梁曰益，凡為十三州，而司隸部三輔、三河諸郡。東京無復朔方，改交趾曰交州，凡十二州；司隸所部如故。及三國鼎跱，吳得揚、荊、交三州，蜀得益州，魏氏猶得九焉。吳又分交為廣。魏末平蜀，又分益為梁。晉武帝太康元年，天下一統，凡十有六州。後又分涼、雍為秦，分荊、揚為江，分益為寧，分幽為平，而為二十矣。</div>
<div style="text-indent:2em;padding-left:0em;">自夷狄亂華，司、冀、雍、涼、青、并、兗、豫、幽、平諸州一時淪沒，遺民南渡，並僑置牧司，非舊土也。江左又分荊為湘，或離或合，凡有揚、荊、湘、江、梁、益、交、廣，其徐州則有過半，豫州唯得譙城而已。及至宋世，分揚州為南徐，徐州為南兗，揚州之江西悉屬豫州，分荊為雍，分荊、湘為郢，分荊為司，分廣為越，分青為冀，分梁為南北秦。太宗初，索虜南侵，青、冀、徐、兗及豫州淮西，並皆不守，自淮以北，化成虜庭。於是於鍾離置徐州，淮陰為北兗，而

In [12]:
# Number of bodies kept by the 州/郡 path filter.
len(songshuGeo.flat_bodies)

23