In [None]:
# Standard library
import os
import re
import string
import glob
from pathlib import Path

# Third-party libraries
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import stanza
from stanza.utils.conll import CoNLL

# spaCy pipeline loaded from the installed `en_core_web_lg` package.
nlp = spacy.load("en_core_web_lg")
# NLTK's English stop-word list, held as a set for membership tests.
# NOTE(review): presumably requires the NLTK `stopwords` corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))
# Stanza English pipeline limited to the tokenize, pos, lemma and depparse processors.
# tokenize_pretokenized=False: raw text is passed in and Stanza does its own
# tokenization and sentence splitting.
nlp_stanza = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse', tokenize_pretokenized=False)

2025-11-02 11:18:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 35.8MB/s]                    
2025-11-02 11:18:01 INFO: Downloaded file to /Users/tuvshinselenge/stanza_resources/resources.json
2025-11-02 11:18:02 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-02 11:18:02 INFO: Using device: cpu
2025-11-02 11:18:02 INFO: Loading: tokenize
2025-11-02 11:18:02 INFO: Loading: pos
2025-11-02 11:18:03 INFO: Loading: lemma
2025-11-02 11:18:03 INFO: Loading: depparse
2025-11-02 11:18:03 INFO: Done loading processor

In [26]:
# Project layout: the project root is taken to be the parent of the current
# working directory, so the notebook must be started from a direct sub-folder
# of the project for these paths to resolve correctly.
PROJECT_ROOT = Path.cwd().resolve().parent
INPUT_DIR = PROJECT_ROOT / "paper_txt"          # plain-text papers to parse
OUTPUT_DIR = PROJECT_ROOT / "code" / "data"     # destination for the .conllu files
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)   # safe to re-run: no error if it exists

# Relative path strings for display only
try:
    REL_INPUT = INPUT_DIR.relative_to(PROJECT_ROOT)
    REL_OUTPUT = OUTPUT_DIR.relative_to(PROJECT_ROOT)
except ValueError:
    # relative_to() raises ValueError when a path is not located under
    # PROJECT_ROOT (e.g. after the directories above are edited to point
    # elsewhere); fall back to the absolute paths. Any other exception is a
    # real bug and is deliberately not swallowed.
    REL_INPUT = INPUT_DIR
    REL_OUTPUT = OUTPUT_DIR


### The CoNLL-U format

In [None]:
def clean_pdf_artifacts(text: str) -> str:
    """Clean common hyphenation artifacts left behind by PDF text extraction.

    Steps, in order:
      1. Map the Unicode hyphen (U+2010) to ASCII "-" so that the two
         de-hyphenation rules below also apply to it.
      2. Re-join words split across a line break when both neighbours are
         lowercase ASCII letters: "incor- porating" -> "incorporating".
      3. Close up spaced hyphens between ASCII letters in the general case:
         "Short- Term" -> "Short-Term".
      4. Map figure, en and em dashes (U+2012, U+2013, U+2014) to ASCII "-".

    Args:
        text: Text extracted from a PDF.

    Returns:
        The cleaned text. Text without any hyphen or dash characters
        (including the empty string) is returned unchanged.

    Note:
        Rule 2 cannot tell a line-break hyphen from a suspended hyphen, so
        "short- and long-term" becomes "shortand long-term".
    """
    # U+2010 is a true hyphen, so it must be normalised BEFORE de-hyphenation;
    # otherwise "incor\u2010 porating" would end up as "incor- porating".
    text = text.replace('\u2010', '-')
    # Fix hyphenation across line breaks: "incor- porating" -> "incorporating" (lowercase letters)
    text = re.sub(r'(?<=[a-z])-\s+(?=[a-z])', '', text)
    # Normalize spaced hyphens within words: "Short- Term" -> "Short-Term" (general case)
    text = re.sub(r'(?<=[A-Za-z])-\s+(?=[A-Za-z])', '-', text)
    # Figure/en/em dashes are punctuation, not word-internal hyphens, so they are
    # unified only AFTER the rules above: "model\u2014 which" must not be glued
    # into "modelwhich".
    text = re.sub('[\u2012\u2013\u2014]', '-', text)
    return text


# Collect the input papers in sorted order so every run processes them in the
# same sequence.
txt_files = sorted(INPUT_DIR.glob("*.txt"))

print(f"Found {len(txt_files)} text files in {REL_INPUT}")
print(f"Output folder: {REL_OUTPUT}\n")

# Names of the files that raised during processing, so the closing summary
# reflects what actually happened instead of always claiming success.
failed_files = []

for txt_path in txt_files:
    filename = txt_path.name
    try:
        print(f"Processing: {filename}")

        # errors="ignore": undecodable bytes are dropped rather than aborting the file.
        data = txt_path.read_text(encoding="utf-8", errors="ignore")
        data = clean_pdf_artifacts(data)

        # The whole paper is passed to the Stanza pipeline as one document.
        doc = nlp_stanza(data)

        # Same base name as the input, with the extension swapped to .conllu.
        output_filename = txt_path.with_suffix(".conllu").name
        output_path = OUTPUT_DIR / output_filename

        CoNLL.write_doc2conll(doc, str(output_path))

        print(f"Saved: 1 document ({len(doc.sentences)} sentences) → {output_filename}\n")

    except Exception as e:
        # Best-effort batch: one bad file must not stop the others, but the
        # failure is recorded so it is not lost in the scroll-back.
        failed_files.append(filename)
        print(f"Error processing {filename}: {e}\n")

if failed_files:
    print(f"Finished with errors: {len(failed_files)} of {len(txt_files)} files failed: {', '.join(failed_files)}")
else:
    print("All files processed!")


Found 20 text files in paper_txt
Output folder: code/data

Processing: 2509.20913v1.txt
✅ Saved: 5790 documents → 2509.20913v1.conllu

Processing: 2509.23158v1.txt
✅ Saved: 3545 documents → 2509.23158v1.conllu

Processing: 2510.05163v1.txt
✅ Saved: 2349 documents → 2510.05163v1.conllu

Processing: 2510.05736v1.txt
✅ Saved: 1094 documents → 2510.05736v1.conllu

Processing: 2510.07320v1.txt
✅ Saved: 2771 documents → 2510.07320v1.conllu

Processing: 2510.08116v1.txt
✅ Saved: 3620 documents → 2510.08116v1.conllu

Processing: 2510.08411v1.txt
✅ Saved: 1923 documents → 2510.08411v1.conllu

Processing: 2510.08662v1.txt
✅ Saved: 3159 documents → 2510.08662v1.conllu

Processing: 2510.08770v1.txt
✅ Saved: 1881 documents → 2510.08770v1.conllu

Processing: 2510.09187v1.txt
✅ Saved: 2189 documents → 2510.09187v1.conllu

Processing: 2510.10729v1.txt
✅ Saved: 1223 documents → 2510.10729v1.conllu

Processing: 2510.10822v1.txt
✅ Saved: 3131 documents → 2510.10822v1.conllu

Processing: 2510.11073v1.txt
