In [5]:
import os
import re
import spacy
import spacy.cli
from typing import Dict

# Download SpaCy models
spacy.cli.download("en_core_web_sm")
spacy.cli.download("pt_core_news_sm")

# Contraction maps for English and Portuguese
english_contractions = {
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "you're": "you are",
    "we're": "we are",
    "they're": "they are",
    "i'm": "i am",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "won't": "will not",
    "can't": "cannot",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "couldn't": "could not",
    "wouldn't": "would not",
    "shouldn't": "should not",
}

portuguese_contractions = {
    "da": "de a",
    "das": "de as",
    "do": "de o",
    "dos": "de os",
    "dele": "de ele",
    "dela": "de ela",
    "deles": "de eles",
    "delas": "de elas",
    "nesta": "em esta",
    "nestas": "em estas",
    "neste": "em este",
    "nestes": "em estes",
    "nisto": "em isto",
    "pro": "para o",
    "pra": "para a",
    "pros": "para os",
    "pras": "para as",
}

# Expand contractions
def expand_contractions(text: str, language: str) -> str:
    if language == "en":
        contractions_map = english_contractions
    elif language == "pt":
        contractions_map = portuguese_contractions


    pattern = re.compile(r'\b(' + '|'.join(contractions_map.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_map[x.group()], text)

# Cleanse file
def cleanse_file(file_path: str, language: str):

    # load SpaCy model
    nlp = spacy.load("en_core_web_sm") if language == "en" else spacy.load("pt_core_news_sm")

    # read content
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # to lowercase
    content = content.lower()

    # remove contractions
    content = expand_contractions(content, language)

    # remove non ASCII
    content = re.sub(r'[^\x20-\x7EÀ-ÿ]+', ' ', content)

    # remove numbers
    content = re.sub(r'\d+', '', content)

    doc = nlp(content)

    #lemmtize ,if not stop words, punctuation, and short tokens
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and len(token.text) > 1
    ]

    # rejoin
    cleansed_content = ' '.join(filtered_tokens)

    # write back
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(cleansed_content)



# Process all text files
def process_txt_files(directory: str, language: str):
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory, filename)
            cleanse_file(file_path, language)





Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m

In [6]:
process_txt_files("training_data/EN/raw-documents","en")

Processed: training_data/EN/raw-documents/EN_UA_012611.txt
Processed: training_data/EN/raw-documents/EN_UA_104569.txt
Processed: training_data/EN/raw-documents/EN_UA_022319.txt
Processed: training_data/EN/raw-documents/EN_UA_002668.txt
Processed: training_data/EN/raw-documents/EN_UA_021377.txt
Processed: training_data/EN/raw-documents/EN_UA_026142.txt
Processed: training_data/EN/raw-documents/EN_UA_027816.txt
Processed: training_data/EN/raw-documents/EN_UA_016012.txt
Processed: training_data/EN/raw-documents/EN_UA_024628.txt
Processed: training_data/EN/raw-documents/EN_UA_019640.txt
Processed: training_data/EN/raw-documents/EN_UA_103011.txt
Processed: training_data/EN/raw-documents/EN_CC_100243.txt
Processed: training_data/EN/raw-documents/EN_UA_024321.txt
Processed: training_data/EN/raw-documents/EN_UA_100688.txt
Processed: training_data/EN/raw-documents/EN_UA_012803.txt
Processed: training_data/EN/raw-documents/EN_CC_200007.txt
Processed: training_data/EN/raw-documents/EN_UA_010095.t

In [7]:
process_txt_files("training_data/PT/raw-documents","pt")


Processed: training_data/PT/raw-documents/PT_88.txt
Processed: training_data/PT/raw-documents/PT_35.txt
Processed: training_data/PT/raw-documents/PT_37.txt
Processed: training_data/PT/raw-documents/PT_182.txt
Processed: training_data/PT/raw-documents/PT_18.txt
Processed: training_data/PT/raw-documents/PT_158.txt
Processed: training_data/PT/raw-documents/PT_85.txt
Processed: training_data/PT/raw-documents/PT_69.txt
Processed: training_data/PT/raw-documents/PT_136.txt
Processed: training_data/PT/raw-documents/PT_30.txt
Processed: training_data/PT/raw-documents/PT_07.txt
Processed: training_data/PT/raw-documents/PT_131.txt
Processed: training_data/PT/raw-documents/PT_47.txt
Processed: training_data/PT/raw-documents/PT_42.txt
Processed: training_data/PT/raw-documents/PT_38.txt
Processed: training_data/PT/raw-documents/PT_155.txt
Processed: training_data/PT/raw-documents/PT_29.txt
Processed: training_data/PT/raw-documents/PT_22.txt
Processed: training_data/PT/raw-documents/PT_168.txt
Proces

In [9]:
process_txt_files("target/EN/raw-documents","en")
process_txt_files("target/PT/raw-documents","pt")


Processed: target/EN/raw-documents/EN_UA_012611.txt
Processed: target/EN/raw-documents/EN_CC_300028.txt
Processed: target/EN/raw-documents/EN_CC_200134.txt
Processed: target/EN/raw-documents/EN_UA_104569.txt
Processed: target/EN/raw-documents/EN_UA_300017.txt
Processed: target/EN/raw-documents/EN_CC_200048.txt
Processed: target/EN/raw-documents/EN_UA_300117.txt
Processed: target/EN/raw-documents/EN_UA_300051.txt
Processed: target/EN/raw-documents/EN_UA_022319.txt
Processed: target/EN/raw-documents/EN_UA_002668.txt
Processed: target/EN/raw-documents/EN_UA_300032.txt
Processed: target/EN/raw-documents/EN_CC_200327.txt
Processed: target/EN/raw-documents/EN_UA_021377.txt
Processed: target/EN/raw-documents/EN_UA_300016.txt
Processed: target/EN/raw-documents/EN_UA_026142.txt
Processed: target/EN/raw-documents/EN_UA_027816.txt
Processed: target/EN/raw-documents/EN_UA_300025.txt
Processed: target/EN/raw-documents/EN_UA_016012.txt
Processed: target/EN/raw-documents/EN_UA_024628.txt
Processed: t

In [7]:
process_txt_files("dev_docs/EN/subtask-2-documents","en")
process_txt_files("dev_docs/PT/subtask-2-documents","pt")


Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_20.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_100004.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_100019.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_214.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_100003.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_22.txt
Processed: dev_docs/EN/subtask-2-documents/EN_CC_200036.txt
Processed: dev_docs/EN/subtask-2-documents/EN_CC_200065.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_100002.txt
Processed: dev_docs/EN/subtask-2-documents/EN_CC_200077.txt
Processed: dev_docs/EN/subtask-2-documents/EN_CC_200033.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_100013.txt
Processed: dev_docs/EN/subtask-2-documents/EN_CC_200054.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_215.txt
Processed: dev_docs/EN/subtask-2-documents/EN_UA_DEV_213.txt
Processed: dev_docs/EN/subtask-2-documents/EN_CC_200053.txt
Processed: dev_do