### 1. Download selected files from the gazette list

In [2]:
import os

## create the working folders used by the rest of the notebook
source_folder = "../workdata/doc"      # downloaded .doc files
dest_folder = "../workdata/docx"       # .docx files produced by the conversion step
speech_folder = "../workdata/speech"   # per-document speech CSVs

for folder in (source_folder, dest_folder, speech_folder):
    # only create the folder when nothing exists at that path yet
    if not os.path.exists(folder):
        os.makedirs(folder)

In [3]:
import pandas as pd

## load the gazette list
gazette_list = pd.read_csv("../rawdata/公報清單.csv", encoding="utf-8")

## meeting_id = last path component of docUrl with ".doc" removed
gazette_list['meeting_id'] = gazette_list.docUrl.apply(lambda x: str(x).split("/")[-1].replace(".doc", ""))

term_condition = (gazette_list.term == "10") # term
## NOTE(review): the next two conditions are defined but not applied to either selection below
sessionPeriod_condition = (gazette_list.sessionPeriod == "7") # session period
agendaType_condition = (gazette_list.agendaType == 3) # meeting type (3: committee)
## literal substring match; rows with a missing subject are treated as non-matching
subject_condition = gazette_list.subject.str.contains("社會福利及衛生環境委員會", regex=False, na=False) # meeting subject
subject_condition_full = gazette_list.subject.str.contains("社會福利及衛生環境", regex=False, na=False)

## select only term = 10 and subject containing "社會福利及衛生環境委員會"
gazette_list_selected = gazette_list.loc[term_condition & subject_condition, :].copy()
gazette_list_selected.to_csv("../workdata/公報_selected.csv", index=False, encoding="utf-8")

## same term filter with the shorter subject keyword "社會福利及衛生環境"
gazette_list_selected_full = gazette_list.loc[term_condition & subject_condition_full, :].copy()
gazette_list_selected_full.to_csv("../workdata/公報_selected_full.csv", index=False, encoding="utf-8")

(len(gazette_list_selected), len(gazette_list_selected_full))

(167, 238)

In [62]:
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'}

def download_file(url, destination, timeout=30):
    '''
    Download the file at ``url`` into the ``destination`` folder.

    The file name is the last path component of ``url``. The download is
    skipped when that file already exists in ``destination``, or when a
    ``.docx`` file with the same base name already exists in the module-level
    ``dest_folder`` (i.e. it has already been converted).

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    destination : str
        Folder the downloaded file is written to.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
        Failures (network error or non-200 status) are reported with a
        printed message instead of raising, so a batch loop keeps going.
    '''
    file_name = url.split("/")[-1]
    target_path = os.path.join(destination, file_name)
    converted_path = os.path.join(dest_folder, os.path.splitext(file_name)[0] + ".docx")

    ## nothing to do if the file was already downloaded or already converted
    if os.path.exists(target_path) or os.path.exists(converted_path):
        return

    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as error:
        print("Failed to download the file: %s (%s)" % (file_name, error))
        return

    if response.status_code == 200:
        ## context manager guarantees the handle is flushed and closed
        with open(target_path, "wb") as file_handle:
            file_handle.write(response.content)
    else:
        print("Failed to download the file: %s" % file_name)

## announce how many files are about to be processed
total_files = len(gazette_list_selected)
print("Start downloading...\n{}".format(total_files), end=" - ")

for count, url in enumerate(gazette_list_selected.docUrl, start=1):
    download_file(url, source_folder)
    ## progress marker after every 10th file
    if count % 10 == 0:
        print(count, end=".")

print("\nDownload completed!")

Start downloading...
167 - 10.20.30.40.50.60.70.80.90.100.110.120.130.140.150.160.
Download completed!


### 2. Convert all .doc to .docx

In [63]:
import os
import subprocess

## LibreOffice binary: defaults to the macOS install location and can be
## overridden with the SOFFICE_BIN environment variable
soffice_bin = os.environ.get("SOFFICE_BIN", "/Applications/LibreOffice.app/Contents/MacOS/soffice")

for path, dir_list, file_list in os.walk(source_folder):
    for file_name in file_list:
        ## only .doc files are converted; any other file in the folder is left untouched
        if not file_name.lower().endswith(".doc"):
            continue

        file = os.path.join(path, file_name)
        ## raises CalledProcessError if the converter exits with a non-zero status
        subprocess.check_output([soffice_bin, "--headless", "--convert-to", "docx", file, "--outdir", dest_folder])

        ## delete the source only once the converted file is actually there,
        ## so a conversion that wrote nothing cannot lose the download
        converted_file = os.path.join(dest_folder, os.path.splitext(file_name)[0] + ".docx")
        if os.path.exists(converted_file):
            os.remove(file)
        else:
            print("Conversion produced no output, keeping: %s" % file)

### 3. Split into individual speeches

In [21]:
import pandas as pd
import docx
import re

def read_docx(file_path):
    """
    Read a .docx file and return its paragraphs as one marked-up string.

    Every paragraph is first terminated with the marker "#@". A marker is then
    kept only where it is immediately followed by 2-6 characters and a
    full-width colon "：" (presumably a "speaker：" prefix); all other markers
    are removed, which merges those paragraphs into the preceding text.
    """
    document = docx.Document(file_path)
    marked_text = "".join(paragraph.text + "#@" for paragraph in document.paragraphs)
    return re.sub('#@(?!.{2,6}：)', '', marked_text)

def split_speeches(file_path):
    """
    Split one converted document into individual committee-member speeches.

    The text returned by ``read_docx`` is scanned for turns of the form
    ``@<1-2 CJK chars>委員<1-2 CJK chars>：<speech>``. A speech ends at the
    next ``#`` marker, or at the end of the text when no marker follows.

    Parameters
    ----------
    file_path : str
        Path to a ``.docx`` file. Its base name without the extension is used
        as ``meeting_id``.

    Returns
    -------
    pandas.DataFrame
        One row per matched speech with the columns ``speaker`` (the
        characters before and after "委員" joined together), ``speech_text``,
        ``speech_number`` (1-based order of appearance) and ``meeting_id``.
    """
    meeting_id = os.path.splitext(os.path.basename(file_path))[0]
    full_text = read_docx(file_path)

    # DOTALL: a speech may contain newline characters, which "." would
    # otherwise refuse to cross, silently dropping the whole speech.
    # (?:#|\Z): the last turn has no trailing "#" because read_docx strips the
    # final marker, so the end of the text must also terminate a speech.
    pattern = re.compile(
        r'@([\u4e00-\u9fa5]{1,2})委員([\u4e00-\u9fa5]{1,2})：(.+?)(?:#|\Z)',
        re.DOTALL,
    )
    matches = list(pattern.finditer(full_text))

    df = pd.DataFrame({
        'speaker': [match.group(1) + match.group(2) for match in matches],
        'speech_text': [match.group(3) for match in matches],
        'speech_number': list(range(1, len(matches) + 1)),
        'meeting_id': meeting_id,
    })
    return df

## write one speech CSV per converted .docx file
for docx_name in os.listdir(dest_folder):
    ## ignore anything in the folder that is not a .docx file
    if not docx_name.endswith(".docx"):
        continue

    ## os.listdir already yields bare file names, so no path splitting is needed
    docx_path = os.path.join(dest_folder, docx_name)
    csv_path = os.path.join(speech_folder, docx_name.replace(".docx", ".csv"))

    output_df = split_speeches(docx_path)
    output_df.to_csv(csv_path, index=False)