# KMST Maritime Accident Verdict Scraper

This notebook scrapes maritime accident investigation verdicts from the Korean Maritime Safety Tribunal (KMST) website. It:

1. Retrieves accident reports from the KMST website
2. Extracts vessel names and accident details from Korean text
3. Translates content from Korean to English using Google Translate
4. Saves the extracted data to CSV files

The scraper walks the paginated verdict list and translates each case title twice, once with the googletrans library and once with the Google Cloud Translation API, saving each set of results to its own CSV file.


In [22]:
import csv
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from google.cloud import translate_v2 as translate
from googletrans import Translator

In [3]:
# Site root; download links found in the verdict list are made absolute against it.
BASE_URL = "https://www.kmst.go.kr"
# Verdict list page requested by the scrapers, derived from the site root.
LIST_URL = f"{BASE_URL}/web/verdictList.do?menuIdx=121"


In [21]:
# Shared googletrans client; scrape_page uses it to translate case titles to English.
translator = Translator()


In [24]:
# Tell the Google Cloud client where to find Application Default Credentials.
# A GOOGLE_APPLICATION_CREDENTIALS value that is already exported is respected;
# otherwise fall back to the per-user gcloud path (resolved via "~") rather than
# one developer's absolute home directory, so the notebook runs on other machines.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    os.path.expanduser("~/.config/gcloud/application_default_credentials.json"),
)
# Cloud Translation (v2) client used by scrape_page_api.
translate_client = translate.Client()

In [5]:
def scrape_page(page_number, timeout=30):
    """Scrape one page of the verdict list, translating titles with googletrans.

    Args:
        page_number: Page of the verdict list to fetch; sent as the
            ``pageindex`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict mapping each English-translated case title to the absolute URL of
        the first attachment link in that row, or to "" when the row has no
        attachment. Rows without a title link are skipped. Because the dict is
        keyed by the translated title, rows whose titles translate to the same
        text overwrite one another.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    page_decisions = {}
    # Query parameters for the GET request.
    payload = {
        "menuIdx": "121",
        "pageindex": str(page_number)
    }
    response = requests.get(LIST_URL, params=payload, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table_body = soup.find("tbody", id="upStFormTbody")
    rows = table_body.find_all("tr") if table_body else []

    for row in rows:
        title_tag = row.find("a", href=lambda href: href and "javascript:moveDetail" in href)
        title = title_tag.text.strip() if title_tag else ""
        if not title:
            # Nothing to translate; skipping also avoids storing a junk "" key.
            continue
        translated_title = translator.translate(title, dest='en').text

        file_links = row.find_all("a", class_="fileY")
        decision_link = file_links[0].get("href", "") if file_links else ""

        # Resolve against the site root; urljoin leaves absolute hrefs intact
        # and copes with hrefs that lack a leading slash.
        if decision_link:
            decision_link = urljoin(BASE_URL, decision_link)

        page_decisions[translated_title] = decision_link

    return page_decisions

In [25]:
def scrape_page_api(page_number, timeout=30):
    """Scrape one page of the verdict list, translating titles with the Cloud Translation API.

    Args:
        page_number: Page of the verdict list to fetch; sent as the
            ``pageindex`` query parameter.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict mapping each English-translated case title to the absolute URL of
        the first attachment link in that row, or to "" when the row has no
        attachment. Rows without a title link are skipped. Because the dict is
        keyed by the translated title, rows whose titles translate to the same
        text overwrite one another.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    page_decisions = {}
    # Query parameters for the GET request.
    payload = {
        "menuIdx": "121",
        "pageindex": str(page_number)
    }
    response = requests.get(LIST_URL, params=payload, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table_body = soup.find("tbody", id="upStFormTbody")
    rows = table_body.find_all("tr") if table_body else []

    for row in rows:
        title_tag = row.find("a", href=lambda href: href and "javascript:moveDetail" in href)
        title = title_tag.text.strip() if title_tag else ""
        if not title:
            # Nothing to translate; skipping also saves a billable API call.
            continue
        # format_='text' keeps the API from treating the title as HTML and
        # returning entity-escaped output (e.g. "&#39;" for an apostrophe).
        translated_title = translate_client.translate(
            title, target_language='en', format_='text'
        )['translatedText']

        file_links = row.find_all("a", class_="fileY")
        decision_link = file_links[0].get("href", "") if file_links else ""

        # Resolve against the site root; urljoin leaves absolute hrefs intact
        # and copes with hrefs that lack a leading slash.
        if decision_link:
            decision_link = urljoin(BASE_URL, decision_link)

        page_decisions[translated_title] = decision_link

    return page_decisions

In [None]:
# Crawl listing pages 1 through 86 with the googletrans scraper and merge the
# per-page results; a title seen again on a later page replaces the earlier entry.
all_decisions = {}
for page in range(1, 87):
    all_decisions.update(scrape_page(page))

all_decisions

In [16]:
# Display the collected mapping of translated title to decision link.
all_decisions

{'Fishing boat Myeong Yoon -ho': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100553&fileSn=1',
 'Yein Line Woo -Guk Tea 5 Physician Bu -seon Geumo 7 Pharmaceutical Incident Incident': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100499&fileSn=1',
 "Fishing Path 26 Men's Lake Stranding Incident": 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100496&fileSn=1',
 'Fishing boat Gwangjeong No. 8, Fishing Pass Gwangjeong 88 Conflict Case': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100493&fileSn=1',
 'Fishing boat 2007 Yeonheungho, a crash in the frozen carrier Singyu': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100447&fileSn=1',
 "Fishing Path 101 Tongyeongho Sailor's Incident": 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100444&fileSn=1',
 'Fishing boat Yoon Sung -ho, bandit cargo ship J -ruby crash case': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100441&fileSn=1',
 'Stella Queen

In [18]:
# Save the googletrans results as a two-column CSV (case_name, url).
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when ../data is missing.
os.makedirs('../data', exist_ok=True)
with open('../data/all_decisions_googletrans.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['case_name', 'url'])
    # Each dict item is already a (case_name, url) pair, i.e. one CSV row.
    writer.writerows(all_decisions.items())

In [27]:
# Crawl listing pages 1 through 86 with the Cloud Translation scraper and merge
# the per-page results; a title seen again on a later page replaces the earlier entry.
all_decisions_api = {}
for page in range(1, 87):
    all_decisions_api.update(scrape_page_api(page))

all_decisions_api

{'Fishing vessel Myungyoonho Fishing vessel Daeyangho collision incident': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100553&fileSn=1',
 'The grounding incident of the towed vessel Geumoh 7 by the tugboat Woogukti 5': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100499&fileSn=1',
 'Fishing vessel No. 26 Namseongho grounding incident': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100496&fileSn=1',
 'Collision incident between fishing boats Gwangjeong 8 and Gwangjeong 88': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100493&fileSn=1',
 'Fishing vessel Yeonheungho 2007 collision with refrigerated transport vessel Sing Yue': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100447&fileSn=1',
 'Fishing vessel No. 101 Tongyeongho crew casualty incident': 'https://www.kmst.go.kr/web/atch/atchFileDownload.do?atchId=100444&fileSn=1',
 'Fishing boat Yun Seong-ho and cargo ship JC Ruby collision incident': 'https://www.kmst.g

In [28]:
# Save the Cloud Translation API results as a two-column CSV (case_name, url).
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when ../data is missing.
os.makedirs('../data', exist_ok=True)
with open('../data/all_decisions_google_api.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['case_name', 'url'])
    # Each dict item is already a (case_name, url) pair, i.e. one CSV row.
    writer.writerows(all_decisions_api.items())