In [None]:
# Standard library
import glob
import os
import re
import string
from pathlib import Path
from typing import Iterable, List

# Third-party libraries
import pandas as pd
import spacy
import stanza
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from stanza.utils.conll import CoNLL

# Shared NLP resources: loaded once here and reused by the cells below.
nlp = spacy.load("en_core_web_lg")

# A set gives constant-time membership checks when filtering tokens.
stop_words = {*stopwords.words('english')}

# Stanza is configured for POS tagging only; tokenize_pretokenized=True tells it
# to accept input that has already been split into tokens.
nlp_stanza = stanza.Pipeline(
    'en',
    processors='tokenize,pos',
    tokenize_pretokenized=True,
)

  from .autonotebook import tqdm as notebook_tqdm
2025-10-31 16:36:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 91.4MB/s]                    
2025-10-31 16:36:59 INFO: Downloaded file to /Users/tuvshinselenge/stanza_resources/resources.json
2025-10-31 16:36:59 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2025-10-31 16:36:59 INFO: Using device: cpu
2025-10-31 16:36:59 INFO: Loading: tokenize
2025-10-31 16:36:59 INFO: Loading: pos
2025-10-31 16:37:01 INFO: Done loading processors!


In [2]:
# Project layout, anchored at the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
INPUT_DIR = PROJECT_ROOT.joinpath("paper_txt")
OUTPUT_DIR = PROJECT_ROOT.joinpath("code", "data")

# Create the output folder (and any missing parents); no error if it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Shortened, root-relative forms used only when printing paths.
# If either path cannot be made relative, both fall back to the absolute paths.
try:
    REL_INPUT, REL_OUTPUT = (
        directory.relative_to(PROJECT_ROOT) for directory in (INPUT_DIR, OUTPUT_DIR)
    )
except Exception:
    REL_INPUT, REL_OUTPUT = INPUT_DIR, OUTPUT_DIR


In [None]:
def prep_text(text):
    """
    Normalize raw text into a compact, lowercase form.

    The input is lowercased; then email addresses, URLs, and every character
    other than a-z, 0-9, whitespace, '.' and ',' are each replaced by a space.
    Finally, runs of whitespace (including newlines) are collapsed to a single
    space and leading/trailing whitespace is trimmed.

    Returns:
        str: cleaned text
    """
    # Applied in order: emails and URLs must be removed before the character
    # filter strips the '@', ':' and '/' that make them recognizable.
    substitutions = (
        (r'\S+@\S+', ' '),            # email addresses
        (r'http\S+|www\.\S+', ' '),   # URLs
        (r'[^a-z0-9\s.,]', ' '),      # anything but letters, digits, whitespace, '.' and ','
        (r'\s+', ' '),                # whitespace runs, newlines included
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
def lemmatize_texts(texts: Iterable[str], nlp: "spacy.language.Language") -> List[str]:
    """
    Lemmatize a sequence of texts using spaCy.

    For each text:
    - run it through the spaCy pipeline
    - keep only alphabetic tokens (no numbers/punctuation)
    - take each kept token's lemma, lowercased
    - join the lemmas with single spaces

    Args:
        texts: the strings to lemmatize; any iterable is accepted.
        nlp: a loaded spaCy pipeline.

    Returns:
        List[str]: one lemmatized string per input text, in input order.
        A text with no alphabetic tokens yields an empty string.
    """
    # nlp.pipe processes the texts in batches, which is much cheaper than one
    # nlp(text) call per item when there are thousands of short inputs.
    return [
        " ".join(tok.lemma_.lower() for tok in doc if tok.is_alpha)
        for doc in nlp.pipe(texts)
    ]

In [None]:
def remove_stopwords(
    texts: Iterable[str],
    stop_words: set[str],
) -> List[str]:
    """
    Remove stopwords and punctuation from a sequence of texts.

    For each text:
    - lowercase
    - tokenize with NLTK
    - drop tokens that are in `stop_words` or consist only of punctuation
      characters (including multi-character tokens such as "..." or "--")

    Args:
        texts: the strings to clean; any iterable is accepted.
        stop_words: lowercase tokens to discard.

    Returns:
        List[str]: cleaned texts, one per input, tokens joined by single spaces.
    """
    punctuation_chars = frozenset(string.punctuation)

    def is_punctuation(token: str) -> bool:
        # Check character by character. `token in string.punctuation` would be a
        # substring test against one long string, which lets tokens like "..."
        # or "--" slip through.
        return all(ch in punctuation_chars for ch in token)

    cleaned_texts: List[str] = []
    for text in texts:
        words = word_tokenize(text.lower())
        kept = [w for w in words if w not in stop_words and not is_punctuation(w)]
        cleaned_texts.append(" ".join(kept))
    return cleaned_texts

### The CoNLL format

In [6]:
# Convert every .txt file in INPUT_DIR into a POS-tagged CoNLL-U file in OUTPUT_DIR.
txt_files = sorted(INPUT_DIR.glob("*.txt"))

print(f"Found {len(txt_files)} text files in {REL_INPUT}")
print(f"Output folder: {REL_OUTPUT}\n")

for txt_path in txt_files:
    # Bind the name before the try block so the error handler can always report it.
    filename = txt_path.name
    try:
        print(f"Processing: {filename}")

        data = txt_path.read_text(encoding="utf-8")

        # Preprocessing pipeline: clean -> tokenize -> lemmatize -> drop stopwords.
        # NOTE(review): word_tokenize yields individual words, so each "text"
        # passed downstream is a single token and ends up as its own one-token
        # sentence in the output. Confirm this is intended rather than
        # sentence-level processing.
        prep_data = prep_text(data)
        token_data = word_tokenize(prep_data)
        lemma_data = lemmatize_texts(token_data, nlp)
        final_data = remove_stopwords(lemma_data, stop_words)

        # Tag each non-empty text with Stanza. The input is passed as one
        # pretokenized sentence (a list containing one list of tokens).
        docs = []
        for text in final_data:
            tokens = text.split()
            if tokens:
                docs.append(nlp_stanza([tokens]))

        output_filename = txt_path.with_suffix(".conllu").name
        output_path = OUTPUT_DIR / output_filename

        with output_path.open("w", encoding="utf-8") as f:
            for doc in docs:
                for sentence in doc.sentences:
                    for token in sentence.tokens:
                        for word in token.words:
                            # Missing values are written as "_". No lemma is
                            # produced by this pipeline, so the surface form
                            # is used as the lemma.
                            upos = getattr(word, "upos", None) or "_"
                            xpos = getattr(word, "xpos", None) or "_"
                            lemma = getattr(word, "lemma", None) or word.text
                            # CoNLL-U: ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
                            f.write(
                                f"{word.id}\t{word.text}\t{lemma}\t{upos}\t{xpos}\t_\t_\t_\t_\t_\n"
                            )
                    # CoNLL-U ends every sentence with exactly one blank line;
                    # no additional separator is written between documents.
                    f.write("\n")

        print(f"Saved: {len(docs)} documents → {output_filename}\n")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {filename}: {e}\n")

print("All files processed!")

Found 20 text files in paper_txt
Output folder: code/data

Processing: 2509.20913v1.txt
Saved: 6274 documents → 2509.20913v1.conllu

Processing: 2509.23158v1.txt
Saved: 3706 documents → 2509.23158v1.conllu

Processing: 2510.05163v1.txt
Saved: 2421 documents → 2510.05163v1.conllu

Processing: 2510.05736v1.txt
Saved: 1146 documents → 2510.05736v1.conllu

Processing: 2510.07320v1.txt
Saved: 2838 documents → 2510.07320v1.conllu

Processing: 2510.08116v1.txt
Saved: 3796 documents → 2510.08116v1.conllu

Processing: 2510.08411v1.txt
Saved: 2012 documents → 2510.08411v1.conllu

Processing: 2510.08662v1.txt
Saved: 3279 documents → 2510.08662v1.conllu

Processing: 2510.08770v1.txt
Saved: 2016 documents → 2510.08770v1.conllu

Processing: 2510.09187v1.txt
Saved: 2250 documents → 2510.09187v1.conllu

Processing: 2510.10729v1.txt
Saved: 1269 documents → 2510.10729v1.conllu

Processing: 2510.10822v1.txt
Saved: 3281 documents → 2510.10822v1.conllu

Processing: 2510.11073v1.txt
Saved: 8821 documents → 