# KMST Verdict Name Matcher

This notebook matches vessel names between Korean Maritime Safety Tribunal (KMST) verdicts and a vessel database. It:

1. Loads two datasets:
   - Extracted vessel names from KMST verdicts
   - Korean vessel database with infractions
   
2. Implements conservative name matching logic to identify vessels across datasets
   - Handles variations in vessel name formats
   - Matches based on numeric identifiers and name components

The goal is to link maritime incidents from KMST verdicts to specific vessels in the database.


In [1]:
import pandas as pd

In [13]:
# Display all rows of the matched_df DataFrame
pd.set_option('display.max_rows', None)

In [2]:
# Read the CSV files
decisions_df = pd.read_csv('../data/extracted_vessel_names.csv')
vessels_df = pd.read_csv('../data/Korean-vessels-for-infraction-scraping-02-18-25.csv')

In [12]:
decisions_df.head()

Unnamed: 0,case_name,url,vessel_name
0,Fishing vessel Myungyoonho Fishing vessel Daey...,https://www.kmst.go.kr/web/atch/atchFileDownlo...,"['Myungyoonho', 'Daeyangho']"
1,The grounding incident of the towed vessel Geu...,https://www.kmst.go.kr/web/atch/atchFileDownlo...,"['Geumoh 7', 'Woogukti 5']"
2,Fishing vessel No. 26 Namseongho grounding inc...,https://www.kmst.go.kr/web/atch/atchFileDownlo...,['26 Namseongho']
3,Collision incident between fishing boats Gwang...,https://www.kmst.go.kr/web/atch/atchFileDownlo...,"['Gwangjeong 8', 'Gwangjeong 88']"
4,Fishing vessel Yeonheungho 2007 collision with...,https://www.kmst.go.kr/web/atch/atchFileDownlo...,"['Yeonheungho 2007', 'Sing']"


In [None]:
vessels_df.head()

In [29]:
def are_names_matching(vessel_name, decision_vessel):
    """
    Conservative matching that requires exact match of key identifiers
    """
    v1 = clean_vessel_name(vessel_name)
    v2 = clean_vessel_name(decision_vessel)
    
    # Extract number and name components
    def extract_components(name):
        parts = name.split()
        numbers = []
        words = []
        for part in parts:
            if any(c.isdigit() for c in part):
                numbers.append(part)
            else:
                words.append(part)
        return numbers, words
    
    v1_numbers, v1_words = extract_components(v1)
    v2_numbers, v2_words = extract_components(v2)
    
    # If both names have numbers, they must match exactly
    if v1_numbers and v2_numbers:
        if set(v1_numbers) != set(v2_numbers):
            return False
    
    # Main vessel name (excluding numbers) must match exactly
    # Join words to handle slight spacing differences
    v1_main = ''.join(v1_words)
    v2_main = ''.join(v2_words)
    
    return v1_main == v2_main and (not v1_numbers or not v2_numbers or set(v1_numbers) == set(v2_numbers))

In [30]:
def match_decisions_to_vessels(decisions_df, vessels_df):
    # Create a copy of vessels_df
    result_df = vessels_df.copy()
    
    # Add new columns for case information
    result_df['matched_cases'] = None
    result_df['matched_urls'] = None
    
    # Convert string representations of lists to actual lists if needed
    def ensure_list(val):
        if isinstance(val, list):
            return val
        try:
            return eval(val)
        except (TypeError, ValueError, SyntaxError):
            return [str(val)]
    
    decisions_df['vessel_name'] = decisions_df['vessel_name'].apply(ensure_list)
    
    # Debug counter
    match_count = 0
    
    # Iterate through each vessel in vessels_df
    for idx, vessel_row in result_df.iterrows():
        matched_cases = []
        matched_urls = []
        
        # Get the vessel name from vessels_df
        vessel_name = vessel_row['Vessel Name']
        if pd.isna(vessel_name):  # Skip if vessel name is NaN
            continue
            
        vessel_name = str(vessel_name).strip()
        
        # Iterate through decisions
        for _, decision_row in decisions_df.iterrows():
            # Get list of vessel names from decision
            decision_vessels = decision_row['vessel_name']
            
            # Check each vessel name in the decision
            for decision_vessel in decision_vessels:
                if are_names_matching(vessel_name, decision_vessel):
                    matched_cases.append(decision_row['case_name'])
                    matched_urls.append(decision_row['url'] if 'url' in decision_row else None)
                    match_count += 1
                    break
        
        # Update the result dataframe if matches were found
        if matched_cases:
            result_df.at[idx, 'matched_cases'] = matched_cases
            result_df.at[idx, 'matched_urls'] = matched_urls
    
    print(f"Total matches found: {match_count}")
    if match_count > 0:
        print("\nExample matches:")
        matches = result_df[result_df['matched_cases'].notna()].head()
        for _, row in matches.iterrows():
            print(f"\nVessel: {row['Vessel Name']}")
            print(f"Matched cases: {row['matched_cases']}")
            
        # Print match statistics
        total_vessels = len(vessels_df)
        matched_vessels = len(result_df[result_df['matched_cases'].notna()])
        print(f"\nMatch Statistics:")
        print(f"Total vessels: {total_vessels}")
        print(f"Vessels with matches: {matched_vessels}")
        print(f"Match rate: {matched_vessels/total_vessels*100:.1f}%")
    else:
        print("\nNo matches found. Sample data:")
        print("\nFirst few vessel names:")
        print(vessels_df['Vessel Name'].head())
        print("\nFirst few decision vessel names:")
        print(decisions_df['vessel_name'].head())
    
    return result_df

In [24]:
def match_decisions_to_vessels(decisions_df, vessels_df):
    # Create a copy of vessels_df
    result_df = vessels_df.copy()
    
    # Add new columns for case information
    result_df['matched_cases'] = None
    result_df['matched_urls'] = None
    
    # Convert string representations of lists to actual lists if needed
    def ensure_list(val):
        if isinstance(val, list):
            return val
        try:
            return eval(val)
        except (TypeError, ValueError, SyntaxError):
            return [str(val)]
    
    decisions_df['vessel_name'] = decisions_df['vessel_name'].apply(ensure_list)
    
    # Debug counter
    match_count = 0
    
    # Iterate through each vessel in vessels_df
    for idx, vessel_row in result_df.iterrows():
        matched_cases = []
        matched_urls = []
        
        # Get the vessel name from vessels_df
        vessel_name = vessel_row['Vessel Name']
        if pd.isna(vessel_name):  # Skip if vessel name is NaN
            continue
            
        vessel_name = str(vessel_name).strip()
        
        # Iterate through decisions
        for _, decision_row in decisions_df.iterrows():
            # Get list of vessel names from decision
            decision_vessels = decision_row['vessel_name']
            
            # Check each vessel name in the decision
            for decision_vessel in decision_vessels:
                if are_names_similar(vessel_name, decision_vessel):
                    matched_cases.append(decision_row['case_name'])
                    matched_urls.append(decision_row['url'] if 'url' in decision_row else None)
                    match_count += 1
                    break  # Found a match in this decision, move to next decision
        
        # Update the result dataframe if matches were found
        if matched_cases:
            result_df.at[idx, 'matched_cases'] = matched_cases
            result_df.at[idx, 'matched_urls'] = matched_urls
    
    print(f"Total matches found: {match_count}")
    # Print a few examples of matches if any exist
    if match_count > 0:
        print("\nExample matches:")
        matches = result_df[result_df['matched_cases'].notna()].head()
        for _, row in matches.iterrows():
            print(f"\nVessel: {row['Vessel Name']}")
            print(f"Matched cases: {row['matched_cases']}")
            
        # Print match statistics
        total_vessels = len(vessels_df)
        matched_vessels = len(result_df[result_df['matched_cases'].notna()])
        print(f"\nMatch Statistics:")
        print(f"Total vessels: {total_vessels}")
        print(f"Vessels with matches: {matched_vessels}")
        print(f"Match rate: {matched_vessels/total_vessels*100:.1f}%")
    else:
        print("\nNo matches found. Sample data:")
        print("\nFirst few vessel names:")
        print(vessels_df['Vessel Name'].head())
        print("\nFirst few decision vessel names:")
        print(decisions_df['vessel_name'].head())
    
    return result_df

In [31]:
matched_df = match_decisions_to_vessels(decisions_df, vessels_df)

Total matches found: 24

Example matches:

Vessel: Je 1Venus
Matched cases: ['Fishing boat Je8 Taekyungho crew injury incident', 'Fishing vessel Je3 Cheongnamho crew injury incident', 'Fishing vessel Je5 Haechangho grounding incident', 'Fishing boat 203 Wonchangho Fishing boat Je3 Gwangrimho collision incident', 'Death of crew member of fishing boat Je7 Daeseongho', 'Fishing boat Je1 Jeongsuho fire incident', 'Fishing boat Je7 Changnamho crew death incident', 'Fishing boat Je2 Somangho capsize incident', 'Fishing boat Je3 Geumjinho grounding incident', 'Fishing boat Je2 Cheonyangho crew injury incident', 'Fishing boat Je7 Dongmyeongho crew death incident', 'The incident of injury of workers on the towed vessel Je7 Cheonghaeho by the tugboat 2002 Kyungilho', 'Fishing vessel Je1 Dongjinho grounding incident (summary)', 'Fishing boat Je7 Myeongjeongho crew disappearance incident', 'Fishing boat Je8 Changseungho capsize incident', 'Fishing boat Je3munseongho fire incident', 'Fishing vessel

In [27]:

matched_df['matched_cases']

0     [Fishing vessel No. 101 Tongyeongho crew casua...
1     [Fishing vessel No. 1 crew injury incident, Fi...
2     [Fishing boat No. 103 Subokho fire incident, F...
3     [Missing crew of fishing boat No. 107 Subokho,...
4     [Fishing vessel No. 1 crew injury incident, Bu...
5     [Fishing vessel No. 1 crew injury incident, Co...
6     [Fishing vessel No. 1 crew injury incident, Co...
7     [Fishing boat No. 103 Subokho fire incident, F...
8     [Fishing boat No. 211 Jin-ho and fishing boat ...
9     [Fishing vessel Myungyoonho Fishing vessel Dae...
10    [Fishing vessel No. 1 crew injury incident, Co...
11    [The grounding incident of the towed vessel Ge...
12    [The grounding incident of the towed vessel Ge...
13    [Caferi Passenger Ship Dream Island Engine Dam...
14    [Fishing vessel No. 101 Tongyeongho crew casua...
15    [Fishing boat Eun Jin-ho grounding incident, M...
16    [Fishing boat Eun Jin-ho grounding incident, F...
17    [Fishing boat Eun Jin-ho grounding inciden