In [4]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import os
import re

In [5]:
# IF YOU CHOOSE TO RUN THIS SCRIPT INDIVIDUALLY, REPLACE "~/Desktop/Univ. of Chicago/AI Judges/" WITH THE ACTUAL FILE PATH WHERE THE REPLICATION PACKAGE IS STORED ON YOUR COMPUTER
user_file_path = os.path.expanduser("~/Desktop/Univ. of Chicago/AI Judges/")

In [7]:
# Directory where HTML files are stored (gathered from Spamann & Klohn)
html_directory = os.path.join(user_file_path , "Replication Package", "Experiment Materials", "Spamann & Klohn Files")

# Directory where extracted texts will be stored
output_directory = os.path.join(user_file_path, "Replication Package", "Experiment Materials", "Input Materials", "Cases & Statute")
os.makedirs(output_directory, exist_ok=True)

# Mapping of case names to their corresponding HTML file names
file_paths = {
    "Appeal_Sainovic": "sainovic.html",
    "Appeal_Vasiljevic": "vasiljevic.html",
    "Statute": "index.html",
    "Trial_Horvat_1": "Croatian_part1.html",
    "Trial_Horvat_2": "Croatian_part2.html",
    "Trial_Horvat_3": "Croatian_part3.html",
    "Trial_Vukovic_1": "part1.html",
    "Trial_Vukovic_2": "part2.html",
    "Trial_Vukovic_3": "part3.html"
}

In [8]:
# Function to extract text and format it
def get_text_with_formatting(text):
    # Format text into paragraphs to maintain consistency with actual documents
    for element in text.find_all("br"):
        element.replace_with("\n")
    
    # Replace footnotes (<a class="fn">) with random, unique holder (which will later be removed)
    for fn_element in text.find_all(class_="fn"):
        fn_element.replace_with("__FN__HOLDER__")

    # Remove HTML comments
    comments = text.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    
    # Preserve structure of documents
    parts = []
    for element in text.descendants:
        if isinstance(element, str):
            parts.append(element.strip())
        elif element.name in ["p", "div"]:
            parts.append("\n\n")  # Double new line for paragraphs
        elif element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            parts.append("\n")
        elif element.name in ["ul", "ol"]:
            parts.append("\n")
        elif element.name == "li":
            parts.append("- " + "\n")
    
    # Join text and strip leading/trailing whitespace
    return "".join(parts).strip()

# Initialize dictionaries to store extracted text parts for Horvat and Vukovic
horvat_parts = []
vukovic_parts = []

# Loop through each file path and name
for case_name, file_name in file_paths.items():
    try:
        # Open the HTML file
        html_file_path = os.path.join(html_directory, file_name)
        with open(html_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # Remove all {% ... %} blocks
        cleaned_content = re.sub(re.compile(r'{%.*?%}', re.DOTALL), '', html_content)

        # Parse the HTML content via BeautifulSoup
        text = BeautifulSoup(cleaned_content, "html.parser")

        # Extract the formatted text
        extracted_text = get_text_with_formatting(text)
        extracted_text = extracted_text.replace("__FN__HOLDER__", " ") # Remove footnotes for clarity
        extracted_text = extracted_text.replace("UNITEDNATIONS", "UNITED NATIONS") # Correcting small typo from Spamann/Klohn experiment
        
        # Store the extracted text in their respective lists
        if "Horvat" in case_name:
            horvat_parts.append(extracted_text)
        elif "Vukovic" in case_name:
            vukovic_parts.append(extracted_text)
        else:
            globals()[case_name] = extracted_text

    except FileNotFoundError:
        print(f"File not found: {html_file_path}")
        continue

    except Exception as e:
        print(f"Error processing {html_file_path}: {e}")
        continue

globals()["Trial_Horvat"] = "\n\n".join(horvat_parts)
globals()["Trial_Vukovic"] = "\n\n".join(vukovic_parts)

# Calculate half the length of each text
overlap_size = 500 
horvat_length = len(Trial_Horvat) // 2
vukovic_length = len(Trial_Vukovic) // 2

# Split exactly in half
Trial_Horvat_1 = Trial_Horvat[:horvat_length + overlap_size]
Trial_Horvat_2 = Trial_Horvat[horvat_length - overlap_size:]

Trial_Vukovic_1 = Trial_Vukovic[:vukovic_length + overlap_size]
Trial_Vukovic_2 = Trial_Vukovic[vukovic_length - overlap_size:]

In [9]:
# Write each text object to a separate .txt file
texts = [
    ("Appeal_Sainovic.txt", Appeal_Sainovic),
    ("Appeal_Vasiljevic.txt", Appeal_Vasiljevic),
    ("Statute.txt", Statute),
    ("Trial_Horvat_1.txt", Trial_Horvat_1),
    ("Trial_Horvat_2.txt", Trial_Horvat_2),
    ("Trial_Horvat.txt", Trial_Horvat),
    ("Trial_Vukovic_1.txt", Trial_Vukovic_1),
    ("Trial_Vukovic_2.txt", Trial_Vukovic_2),
    ("Trial_Vukovic.txt", Trial_Vukovic)
]

# Loop through the list and write each text object to a file
for filename, text in texts:
    with open(os.path.join(output_directory, filename), "w", encoding="utf-8") as file:
        file.write(text)

In [5]:
# ADDITIONAL CODE (OPTIONAL)
# Calculate word & token counts
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

Appeal_Sainovic_WC = len(Appeal_Sainovic.split())
Appeal_Sainovic_TC = num_tokens_from_string(Appeal_Sainovic, "gpt-4o")
print(f"Appeal_Sainovic Word count: {Appeal_Sainovic_WC}")
print(f"Appeal_Sainovic Token count: {Appeal_Sainovic_TC}")

Appeal_Vasiljevic_WC = len(Appeal_Vasiljevic.split())
Appeal_Vasiljevic_TC = num_tokens_from_string(Appeal_Vasiljevic, "gpt-4o")
print(f"Appeal_Vasiljevic Word count: {Appeal_Vasiljevic_WC}")
print(f"Appeal_Vasiljevic Token count: {Appeal_Vasiljevic_TC}")

Trial_Horvat_WC = len(Trial_Horvat.split())
Trial_Horvat_TC = num_tokens_from_string(Trial_Horvat, "gpt-4o")
print(f"Trial_Horvat Word count: {Trial_Horvat_WC}")
print(f"Trial_Horvat Token count: {Trial_Horvat_TC}")

Trial_Vukovic_WC = len(Trial_Vukovic.split())
Trial_Vukovic_TC = num_tokens_from_string(Trial_Vukovic, "gpt-4o")
print(f"Trial_Vukovic Word count: {Trial_Vukovic_WC}")
print(f"Trial_Vukovic Token count: {Trial_Vukovic_TC}")

Statute_WC = len(Statute.split())
Statute_TC = num_tokens_from_string(Statute, "gpt-4o")
print(f"Statute Word count: {Statute_WC}")
print(f"Statute Token count: {Statute_TC}")

Total_TC_HS = Appeal_Sainovic_TC + Trial_Horvat_WC + Statute_TC
Total_TC_HV = Appeal_Vasiljevic_TC + Trial_Horvat_WC + Statute_TC
Total_TC_VS = Appeal_Sainovic_TC + Trial_Vukovic_TC + Statute_TC
Total_TC_VV = Appeal_Vasiljevic_TC + Trial_Vukovic_TC + Statute_TC

print(f"Horvat Sainovic Token count: {Total_TC_HS}")
print(f"Horvat Vasiljevic Token count: {Total_TC_HV}")
print(f"Vukovic Sainovic Token count: {Total_TC_VS}")
print(f"Vukovic Vasiljevic Token count: {Total_TC_VV}")

Appeal_Sainovic Word count: 38509
Appeal_Sainovic Token count: 58656
Appeal_Vasiljevic Word count: 34825
Appeal_Vasiljevic Token count: 48997
Trial_Horvat Word count: 163937
Trial_Horvat Token count: 227139
Trial_Vukovic Word count: 164596
Trial_Vukovic Token count: 230477
Statute Word count: 5465
Statute Token count: 7997
Horvat Sainovic Token count: 230590
Horvat Vasiljevic Token count: 220931
Vukovic Sainovic Token count: 297130
Vukovic Vasiljevic Token count: 287471
