In [None]:
# Standard library
import os
import re
import string
import glob
from pathlib import Path

# Third-party libraries
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import stanza
from stanza.utils.conll import CoNLL

# spaCy pipeline (large English model) used for lemmatization.
nlp = spacy.load("en_core_web_lg")

# NLTK English stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Stanza pipeline for POS tagging, lemmas and dependency parsing.
# tokenize_pretokenized=True: the input is passed in already split into tokens.
nlp_stanza = stanza.Pipeline(
    'en',
    processors='tokenize,pos,lemma,depparse',
    tokenize_pretokenized=True,
)

2025-11-02 11:18:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 35.8MB/s]                    
2025-11-02 11:18:01 INFO: Downloaded file to /Users/tuvshinselenge/stanza_resources/resources.json
2025-11-02 11:18:02 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-11-02 11:18:02 INFO: Using device: cpu
2025-11-02 11:18:02 INFO: Loading: tokenize
2025-11-02 11:18:02 INFO: Loading: pos
2025-11-02 11:18:03 INFO: Loading: lemma
2025-11-02 11:18:03 INFO: Loading: depparse
2025-11-02 11:18:03 INFO: Done loading processor

In [26]:
# The project root is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
INPUT_DIR = PROJECT_ROOT.joinpath("paper_txt")
OUTPUT_DIR = PROJECT_ROOT.joinpath("code", "data")

# Make sure the output folder exists before anything is written to it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Relative path strings for display only; fall back to the full paths
# if they cannot be expressed relative to PROJECT_ROOT.
try:
    REL_INPUT, REL_OUTPUT = (
        folder.relative_to(PROJECT_ROOT) for folder in (INPUT_DIR, OUTPUT_DIR)
    )
except Exception:
    REL_INPUT, REL_OUTPUT = INPUT_DIR, OUTPUT_DIR


In [27]:
def prep_text(text):
    """
    Normalize raw text into a lowercase, whitespace-collapsed string.

    The input is lowercased; e-mail addresses and URLs are blanked out; every
    character other than a-z, 0-9, whitespace, '.' and ',' is replaced by a
    space; finally each run of whitespace is squeezed to a single space and
    the ends are trimmed.

    Args:
        text (str): raw text to clean.

    Returns:
        str: cleaned text
    """
    # Applied in this order; every match is replaced by a single space.
    blanking_patterns = (
        r'\S+@\S+',            # email addresses
        r'http\S+|www\.\S+',   # URLs
        r'[^a-z0-9\s.,]',      # anything except letters, digits, whitespace, '.' and ','
    )

    cleaned = text.lower()
    for pattern in blanking_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Collapse spaces/newlines/tabs into one space and strip the edges.
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()

In [28]:
def lemmatize_texts(texts, nlp):
    """
    Lemmatize every entry of ``texts`` with the given spaCy pipeline.

    Each entry is passed through ``nlp`` (a ``None`` entry is treated as an
    empty string). Tokens flagged as stop words are dropped; the lemmas of the
    remaining tokens are lowercased and joined with single spaces.

    Args:
        texts: iterable of str (or None) entries to process.
        nlp: callable returning an iterable of tokens that expose
            ``lemma_`` and ``is_stop``.

    Returns:
        list[str]: one lemmatized string per input entry, in input order.
    """
    def _lemma_string(raw):
        parsed = nlp("" if raw is None else raw)
        return " ".join(tok.lemma_.lower() for tok in parsed if not tok.is_stop)

    return [_lemma_string(raw) for raw in texts]

In [29]:
def remove_stopwords(texts):
    """
    Remove English stopwords and punctuation tokens from a list of text strings.

    Every text is lowercased and tokenized with NLTK's ``word_tokenize``.
    Tokens found in the module-level ``stop_words`` set, and tokens made up
    entirely of ``string.punctuation`` characters, are dropped. The remaining
    tokens are joined back together with single spaces.

    Args:
        texts: iterable of str to clean.

    Returns:
        list[str]: one cleaned string per input text, in input order.
    """
    punctuation_chars = frozenset(string.punctuation)

    def _is_punctuation(token):
        # `token in string.punctuation` is a substring test on a str, so it
        # only catches single characters (plus accidental runs such as ",-.").
        # Checking every character also drops tokens like "...", "--",
        # "``" and "''".
        return all(ch in punctuation_chars for ch in token)

    cleaned_texts = []
    for text in texts:
        # Tokenize text
        words = word_tokenize(text.lower())

        # Filter out stopwords and punctuation-only tokens
        filtered_words = [
            w for w in words
            if w not in stop_words and not _is_punctuation(w)
        ]

        # Join tokens back into a single string
        cleaned_texts.append(" ".join(filtered_words))

    return cleaned_texts

### The CoNLL format

In [None]:
# Convert every paper in INPUT_DIR to a CoNLL-U file in OUTPUT_DIR.
txt_files = sorted(INPUT_DIR.glob("*.txt"))

print(f"Found {len(txt_files)} text files in {REL_INPUT}")
print(f"Output folder: {REL_OUTPUT}\n")

for txt_path in txt_files:
    # Bound before the try block so the error message below can always use it.
    filename = txt_path.name
    try:
        print(f"Processing: {filename}")

        data = txt_path.read_text(encoding="utf-8")

        # Preprocessing pipeline
        prep_data = prep_text(data)
        token_data = word_tokenize(prep_data)
        lemma_data = lemmatize_texts(token_data, nlp)
        final_data = remove_stopwords(lemma_data)

        # Create CoNLL-U with Stanza: one pretokenized, single-sentence
        # document per non-empty entry.
        # NOTE(review): `final_data` holds one entry per word-level token, so
        # each Stanza document is built from a single entry and the dependency
        # parser sees it without sentence context. Confirm this is intended.
        docs = [nlp_stanza([text.split()]) for text in final_data if text.strip()]

        output_filename = txt_path.with_suffix(".conllu").name
        output_path = OUTPUT_DIR / output_filename

        # Write all documents through one UTF-8 handle. write_doc2conll ends
        # each document with a blank line, so no extra separator is written
        # between documents (the CoNLL-U format uses exactly one blank line
        # after each sentence).
        if docs:
            with output_path.open("w", encoding="utf-8") as f:
                for doc in docs:
                    CoNLL.write_doc2conll(doc, f)

        print(f"Saved: {len(docs)} documents → {output_filename}\n")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {filename}: {e}\n")

print("All files processed!")

Found 20 text files in paper_txt
Output folder: code/data

Processing: 2509.20913v1.txt
✅ Saved: 5790 documents → 2509.20913v1.conllu

Processing: 2509.23158v1.txt
✅ Saved: 3545 documents → 2509.23158v1.conllu

Processing: 2510.05163v1.txt
✅ Saved: 2349 documents → 2510.05163v1.conllu

Processing: 2510.05736v1.txt
✅ Saved: 1094 documents → 2510.05736v1.conllu

Processing: 2510.07320v1.txt
✅ Saved: 2771 documents → 2510.07320v1.conllu

Processing: 2510.08116v1.txt
✅ Saved: 3620 documents → 2510.08116v1.conllu

Processing: 2510.08411v1.txt
✅ Saved: 1923 documents → 2510.08411v1.conllu

Processing: 2510.08662v1.txt
✅ Saved: 3159 documents → 2510.08662v1.conllu

Processing: 2510.08770v1.txt
✅ Saved: 1881 documents → 2510.08770v1.conllu

Processing: 2510.09187v1.txt
✅ Saved: 2189 documents → 2510.09187v1.conllu

Processing: 2510.10729v1.txt
✅ Saved: 1223 documents → 2510.10729v1.conllu

Processing: 2510.10822v1.txt
✅ Saved: 3131 documents → 2510.10822v1.conllu

Processing: 2510.11073v1.txt
