### This script extracts biographical data downloaded from a mainland Chinese website

###### Requires Python 3.6+
###### Last updated: 2019-02-23

In [1]:
from bs4 import BeautifulSoup
import os, time, sys

In [2]:
import glob
def getListOfFiles(dirName, fileExt=None):
    '''
        For the given dirName (path, folder), get the list of all files in the directory tree
        (all subfolders) with the file extension fireExt
    '''
    dirName = dirName.replace('\\', '/')  # MS Windows-specific
    if fileExt:  # if this is not None, empty, etc.
        dirStr = fr'{dirName}/**/*.{fileExt}'
    else:
        dirStr = fr'{dirName}/**/*'
    return list(glob.iglob(dirStr, recursive=True))


### <font color=red> *Make changes to the cell below to reflect your actual folder set-up*</font>

##### Place all biographical data for the dynasties of interest under this folder  

In [3]:
docPath = r'../BioData'  # <== YOUR DATA FOLDER
docList = getListOfFiles(docPath, 'html')

##### A sample directory listing follows

In [5]:
if os.name == 'nt': # MS Windows
    !dir ..\BioData
else: # UNIX, Linux, macOS, etc.
    !ls -l ../BioData

 Volume in drive C is OS
 Volume Serial Number is 18A5-AE9F

 Directory of C:\NLP\Raft\BioData

2019-02-26  10:39 AM    <DIR>          .
2019-02-26  10:39 AM    <DIR>          ..
2019-01-29  03:24 AM    <DIR>          EasternJin
2018-12-06  02:07 AM    <DIR>          Jin
2018-12-06  09:27 PM    <DIR>          Northern Dynasties - Northern Wei
2018-12-06  03:45 PM    <DIR>          Southern Dynasties - Liang
2018-12-06  11:28 AM    <DIR>          Southern Dynasties - Qi
2018-12-05  03:02 PM    <DIR>          Southern Dynasties - Song
               0 File(s)              0 bytes
               8 Dir(s)  30,450,577,408 bytes free


In [6]:
len(docList)

10789

##### function to create a bio entry from the contents (txt) of an HTML file

In [7]:
def createBioEntry(txt, id):
    BIO_ENTRY = {'id': id}  # initialize the BIOGRAPHY entry dict with the unique id
    title = None
    entry = None
    bs  = BeautifulSoup(txt, 'lxml')
    dataTable = bs.find_all('dl') # there should only be one instance of the <dl></dl> tag
    if len(dataTable) == 0:  # the HTML contains no real data
        return None
    for node in dataTable[0]:
        entry = None
        if node.name == 'dt':
            title = node.text
        if node.name == 'dd':
            entry = node.text
        if entry:
            #print(f"title={title}, entry={entry}")
            BIO_ENTRY[title] = entry
    return BIO_ENTRY

### Main code

#### Read .html files and put data in a dict() object

In [8]:
BIOGRAPHY = []  # bio database list
UNIQUE_ID = {}  # this dict keeps track of whether an entry has been processed before (duplicates)
i = 0
for fin in docList:
    i += 1
    bn = os.path.basename(fin)
    id = bn.replace('.html', '')  # removes '.' and the file extension gives us the unique bio entry id
    if id in UNIQUE_ID:
        UNIQUE_ID[id] += 1
        continue
    else:
        UNIQUE_ID[id] = 1
    with open(fin, "r", encoding="utf-8") as fi:
        txt = fi.read()
        entry = createBioEntry(txt, id)
        if entry:
            BIOGRAPHY.append(entry)
        if i % 1000 == 0:
            print(f"processed {i} entries...")

print(f"\nTotal no. of entries processed: {len(BIOGRAPHY)}")

processed 1000 entries...
processed 2000 entries...
processed 4000 entries...
processed 5000 entries...
processed 6000 entries...
processed 7000 entries...
processed 8000 entries...
processed 9000 entries...
processed 10000 entries...

Total no. of entries processed: 10187


In [9]:
len(BIOGRAPHY), len(UNIQUE_ID)

(10187, 10189)

In [10]:
# identify empty entries
s1 = set(BIOGRAPHY[i]['id'] for (i, v) in enumerate(BIOGRAPHY))  # extracts all id's
s2 = set(UNIQUE_ID.keys())  # same
s2.difference(s1)

{'218179', '230708'}

In [11]:
BIOGRAPHY[1000:1010]

[{'id': '309276', '姓名': '沮渠男成', '別名': '大且渠男成', '朝代': '東晉', '籍貫': '盧水'},
 {'id': '309278', '姓名': '沮渠百年', '朝代': '東晉'},
 {'id': '309283', '姓名': '沮渠鄯善', '朝代': '東晉'},
 {'id': '309284', '姓名': '赫連滿', '朝代': '東晉，北魏', '卒年': '427', '籍貫': '新興郡'},
 {'id': '309286', '姓名': '慕容敏', '朝代': '東晉', '籍貫': '棘城'},
 {'id': '309308', '姓名': '尹泓', '朝代': '東晉'},
 {'id': '309314',
  '姓名': '法泉',
  '號': '佛慧',
  '朝代': '宋，東晉',
  '籍貫': '隨州',
  '簡介': '法泉，俗姓時，隨州（治今湖北隨州）人。住蔣山太平興國寺，賜號佛慧。晚年奉詔住大相國智海禪寺。能爲詩，與東坡爲方外友。見《補續高僧傳》卷一〇。',
  '來源章節': '《全宋文》卷二〇〇五《全宋文》卷三四〇二'},
 {'id': '309319', '姓名': '法長', '朝代': '東晉'},
 {'id': '309323', '姓名': '法饒', '朝代': '東晉', '卒年': '351'},
 {'id': '309335', '姓名': '邵洎', '別名': '樂安內史洎', '朝代': '東晉', '籍貫': '安陽'}]

#### Write data to Excel file

In [12]:
import pandas as pd

In [25]:
biodf = pd.DataFrame(BIOGRAPHY)

In [26]:
## The Excel file will have the column names in the order indicated
columns = ['id','姓名','字','號','別名','生年','卒年','籍貫','朝代','簡介','著作列表','親屬','作品數','來源章節']
biodf = biodf.reindex(columns, axis=1)

In [27]:
def sort_by_length_of_indicated_column(df, col, ascending=False):
    s = df[col].str.len().sort_values(ascending=ascending).index
    df_out = df.reindex(s)
    return df_out

In [28]:
biodf = sort_by_length_of_indicated_column(biodf, '姓名')
biodf.rename(columns={'姓名':'name'}, inplace=True)  # change '姓名' to 'name' to match it with Fagushan

In [30]:
%%time
biodf.to_excel("BioChineseWebsite.xlsx")

Wall time: 3.99 s


In [34]:
biodf.shape

(10187, 2)

In [None]:
## We just need 'id' and 'name'
biodf = biodf.loc[:, ['id','name']]

#### Now merge this data set with Fagushan's persons data set

In [36]:
fagushandf = pd.read_excel('fagushan.persons.liusong.xlsx')
fagushandf = fagushan_df.loc[:, ['id', 'name']]

In [37]:
fagushandf.shape

(2263, 2)

In [39]:
combined_df = pd.merge(biodf, fagushandf, how='outer')
combined_df.shape

(12450, 2)

In [40]:
## Now sort the list before writing to Excel
combined_df = sort_by_length_of_indicated_column(combined_df, 'name')
combined_df.to_excel('BioCombinedSorted.xlsx')  # remember to delete single-character entries

#### Try reading back the Excel file

In [41]:
biodf2 = pd.read_excel("BioCombinedSorted.xlsx")

In [42]:
biodf2[:20]

Unnamed: 0.1,Unnamed: 0,id,name
0,0,90135488,段氏（慕容垂皇后）
1,11,90136201,顧某(張玄妹夫)
2,20,90136032,賀氏（拓跋珪母）
3,19,90135622,楊氏(苻承祖姨)
4,18,90135635,劉氏(何無忌母)
5,17,90134762,蔣氏（蔣子文妹）
6,16,90135361,任氏(皇甫謐母)
7,15,90135620,趙氏(孫道溫妻)
8,14,90134650,趙氏(吳康之妻)
9,13,90135359,崔氏(房景伯母)
