In [47]:
import os
import re
import string
import glob
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import stanza
from stanza.utils.conll import CoNLL
from pathlib import Path

# spaCy English pipeline; lemmatize_texts reads token lemmas from it.
nlp = spacy.load("en_core_web_lg")

# NLTK's English stopword list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Stanza pipeline limited to tokenization + POS tagging. It is configured for
# pretokenized input, i.e. it is called with lists of tokens, not raw strings.
nlp_stanza = stanza.Pipeline(
    'en',
    processors='tokenize,pos',
    tokenize_pretokenized=True,
)


2025-10-30 16:29:35 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 20.5MB/s]                    
2025-10-30 16:29:35 INFO: Downloaded file to /Users/tuvshinselenge/stanza_resources/resources.json
2025-10-30 16:29:36 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2025-10-30 16:29:36 INFO: Using device: cpu
2025-10-30 16:29:36 INFO: Loading: tokenize
2025-10-30 16:29:36 INFO: Loading: pos
2025-10-30 16:29:36 INFO: Done loading processors!


In [48]:
# Project root: $TUW_NLP2025_DIR when set, otherwise ~/TUW_NLP2025 ("~" is expanded).
BASE = Path(os.getenv("TUW_NLP2025_DIR", "~/TUW_NLP2025")).expanduser()

# Resolved locations, kept as plain strings:
#   input_folder  - directory holding the source .txt files
#   output_folder - directory that receives the generated data files
input_folder = str(BASE.joinpath("paper_txt").resolve())
output_folder = str(BASE.joinpath("code", "data").resolve())


In [49]:
def prep_text(t):
    """
    Normalize raw text into a single lowercase line.

    The text is lowercased, then the substitutions below are applied in order
    (the order matters: e-mail addresses and URLs must be stripped while their
    ':', '/' and '@' characters are still present).

    Args:
        t: Input string.

    Returns:
        The cleaned string containing only a-z, 0-9, '.', ',' and single
        spaces, with no leading or trailing whitespace.
    """
    substitutions = (
        (r'\S+@\S+', ' '),             # e-mail addresses
        (r'http\S+|www\.\S+', ' '),    # URLs
        (r'[^a-z0-9\s.,]', ' '),       # anything but letters, digits, whitespace, '.' and ','
        (r'\s+', ' '),                 # collapse runs of whitespace/newlines
    )

    cleaned = t.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [50]:
def lemmatize_texts(texts):
    """
    Lemmatize each text with the module-level spaCy pipeline ``nlp``.

    Args:
        texts: Iterable of strings.

    Returns:
        list[str]: One string per input text, consisting of the lowercased
        lemma of every token joined by single spaces. No filtering happens
        here: stopwords and punctuation tokens are kept.
    """
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the per-call overhead of invoking nlp() separately for many short inputs.
    return [
        " ".join(token.lemma_.lower() for token in doc)
        for doc in nlp.pipe(texts)
    ]

In [51]:
def remove_stopwords(texts):
    """
    Remove English stopwords and punctuation tokens from each text.

    Each text is lowercased and tokenized with ``word_tokenize``; tokens that
    are in the module-level ``stop_words`` set, or that consist solely of
    punctuation characters, are dropped.

    Args:
        texts: Iterable of strings.

    Returns:
        list[str]: One string per input text with the remaining tokens joined
        by single spaces (an empty string when every token was removed).
    """
    # A set gives a true per-character test. `token in string.punctuation`
    # would be a substring test against one long string and therefore lets
    # multi-character punctuation tokens such as "..." or "``" slip through.
    punctuation_chars = set(string.punctuation)

    cleaned_texts = []
    for text in texts:
        tokens = word_tokenize(text.lower())
        kept = [
            tok for tok in tokens
            if tok not in stop_words
            and not all(ch in punctuation_chars for ch in tok)
        ]
        cleaned_texts.append(" ".join(kept))

    return cleaned_texts

### Export to the CoNLL-U format

In [None]:
# glob.glob returns plain strings; wrap them in Path so .name / .read_text work.
# Sorting makes the processing order deterministic.
txt_files = sorted(Path(p) for p in glob.glob(os.path.join(input_folder, "*.txt")))

# Wrap in Path so the "/" join below works even if output_folder is a string.
output_dir = Path(output_folder)

print(f"📁 Found {len(txt_files)} text files in {input_folder}")
print(f"📂 Output folder: {output_folder}\n")

for txt_path in txt_files:
    # Bound before the try block so the error message can always name the file.
    filename = txt_path.name
    try:
        print(f"Processing: {filename}")

        data = txt_path.read_text(encoding="utf-8")

        # Preprocessing pipeline.
        # NOTE(review): word_tokenize yields a flat list of tokens, so the two
        # following steps receive one string per token; every entry of
        # final_data is therefore a single token (or "" when it was filtered
        # out) and sentence boundaries are not preserved. Confirm this is the
        # intended granularity.
        prep_data = prep_text(data)
        token_data = word_tokenize(prep_data)
        lemma_data = lemmatize_texts(token_data)
        final_data = remove_stopwords(lemma_data)

        # Generate the CoNLL-U annotations with Stanza
        docs = []
        for text in final_data:
            if text.strip():
                tokens = text.split()
                # The pipeline is called with a list of token lists
                # (one inner list per sentence).
                doc = nlp_stanza([tokens])
                docs.append(doc)

        # Swap only the real suffix (any casing) instead of replacing ".txt"
        # wherever it occurs in the name.
        output_filename = txt_path.with_suffix(".conllu").name
        output_path = output_dir / output_filename

        # Create the target directory on demand; a missing folder would
        # otherwise make every write fail.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with output_path.open("w", encoding="utf-8") as f:
            for doc in docs:
                for sentence in doc.sentences:
                    for token in sentence.tokens:
                        for word in token.words:
                            upos = word.upos or "_"
                            xpos = word.xpos if getattr(word, "xpos", None) else "_"
                            # Columns: ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC.
                            # LEMMA repeats the form (the input is already
                            # lemmatized); HEAD/DEPREL are fixed to 0/root
                            # because this loop writes no dependency information.
                            line = f"{word.id}\t{word.text}\t{word.text}\t{upos}\t{xpos}\t_\t0\troot\t_\t_\n"
                            f.write(line)
                    f.write("\n")
                # Additional blank line after each processed entry.
                f.write("\n")

        print(f"Saved: {len(docs)} documents → {output_filename}\n")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {filename}: {e}\n")

print("All files processed!")

📁 Found 0 text files in /Users/tuvshinselenge/TUW_NLP2025/paper_txt
📂 Output folder: /Users/tuvshinselenge/TUW_NLP2025/code/data

All files processed!
