In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download ro_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting ro-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ro_core_news_sm-3.8.0/ro_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

In [2]:
import spacy
from spacy import displacy

# Load the English pipeline first, then the Romanian one; both are module-level
# so that later cells can reuse them without reloading.
nlp_en, nlp_ro = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "ro_core_news_sm")
)

In [3]:
# Print each pipeline's default stop-word set under its own heading,
# English first, with a blank line before the Romanian heading.
print("Stopwords in English:", nlp_en.Defaults.stop_words, sep="\n")

print("\nStopwords in Romanian:", nlp_ro.Defaults.stop_words, sep="\n")

Stopwords in English:
{'your', 'just', 'four', 'what', 'he', '‘ll', 'eleven', 'twelve', 'himself', 'nor', 'part', 'three', 'n’t', 'whoever', 'bottom', 'unless', 'such', 'does', 'some', 'ten', 'quite', 'except', 'ever', 'cannot', 'much', 'these', 'otherwise', 'thereupon', 'whole', 'give', 'mostly', 'besides', 'of', 'show', 'as', 'if', 'being', 'side', 'thence', 'while', 'beyond', 'therein', '’d', 'few', 'somehow', '’s', 'one', 'is', 'now', 'whether', 'alone', "'s", 'during', 'really', 'seem', 'get', 'too', 'therefore', 'over', 'third', 'toward', 'meanwhile', 'those', 'back', 'must', "n't", 'became', 'often', 'sometime', 'among', 'hereupon', '’m', 'whereby', 'only', 'her', 'made', "'ll", 'she', 'would', 'have', 'after', 'fifteen', 'off', 'we', 'twenty', 'per', 'another', 'latter', 'everywhere', 'upon', 'anyone', 'call', 'other', 'but', 'here', 'elsewhere', 'and', 'done', 'up', 'anyway', 'both', 'where', 'are', 'very', 'who', 'namely', "'ve", 'his', 'had', 'almost', 'along', 'again', 'nei

In [4]:
from tabulate import tabulate
from pathlib import Path

def preprocess_text(text, lang="en", idx=0):
    """Parse ``text`` with spaCy, save its dependency plot, and tabulate its content tokens.

    The text is lower-cased, run through the English or Romanian pipeline, and
    rendered with displaCy to ``./dependency_plot_{idx}.svg``. Stop words,
    punctuation, and whitespace tokens are dropped; the remaining tokens are
    printed as a grid table and returned.

    Args:
        text: The raw string to process.
        lang: Pipeline selector, either ``"en"`` or ``"ro"``.
        idx: Value interpolated into the SVG file name so that successive
            calls do not overwrite each other's plots.

    Returns:
        A list of ``(text, lemma, pos, tag, entity_type)`` tuples, one per kept
        token. ``entity_type`` is ``""`` for tokens outside any named entity.

    Raises:
        ValueError: If ``lang`` is neither ``"en"`` nor ``"ro"``.
    """
    if lang == "en":
        nlp = nlp_en
    elif lang == "ro":
        nlp = nlp_ro
    else:
        raise ValueError("Unsupported language. Choose 'en' or 'ro'.")

    # NOTE(review): lower-casing happens before the pipeline runs, so the
    # models never see the original capitalisation — confirm this is intended.
    doc = nlp(text.lower())

    svg = displacy.render(doc, style='dep', jupyter=False)

    # Read the entity label from each token itself. Looking labels up by token
    # text would also tag unrelated tokens that merely share their text with an
    # entity token elsewhere in the document.
    processed_tokens = [
        (token.text, token.lemma_, token.pos_, token.tag_, token.ent_type_)
        for token in doc
        if not token.is_stop and not token.is_punct and not token.is_space
    ]

    # write_text opens, writes, and closes the file, so no handle is left open.
    output_path = Path(f"./dependency_plot_{idx}.svg")
    output_path.write_text(svg, encoding="utf-8")

    headers = ["Token", "Lemma", "POS", "Tag", "Entity Type (NER)"]
    print(tabulate(processed_tokens, headers=headers, tablefmt="grid"))

    return processed_tokens

In [5]:
# Sample corpus for the preprocessing demo: five English passages followed by
# five Romanian passages. The order matters because the language codes are
# matched to these entries by position.
texts = [
    # English texts
    "Artificial intelligence is rapidly evolving, with machine learning models becoming more sophisticated. Researchers are constantly exploring new algorithms to improve accuracy and efficiency.",
    "Quantum computing has the potential to revolutionize cryptography, enabling secure communication through principles of quantum mechanics that prevent data interception.",
    "Climate change remains one of the most pressing global issues, as rising temperatures contribute to extreme weather events, biodiversity loss, and economic instability.",
    "The philosophical debate between determinism and free will has persisted for centuries, with neuroscientific research offering new perspectives on human decision-making processes.",
    "In the digital age, data privacy has become a critical concern, as corporations collect vast amounts of user information to enhance predictive analytics and targeted advertising.",

    # Romanian texts
    "Inteligența artificială evoluează rapid, iar modelele de învățare automată devin din ce în ce mai sofisticate. Cercetătorii explorează constant noi algoritmi pentru a îmbunătăți acuratețea și eficiența.",
    "Calculul cuantic are potențialul de a revoluționa criptografia, permițând comunicarea securizată prin principii ale mecanicii cuantice care împiedică interceptarea datelor.",
    "Schimbările climatice rămân una dintre cele mai presante probleme globale, deoarece creșterea temperaturilor contribuie la fenomene meteorologice extreme, pierderea biodiversității și instabilitate economică.",
    "Dezbaterea filozofică dintre determinism și liberul arbitru persistă de secole, iar cercetările în neuroștiințe oferă perspective noi asupra proceselor de luare a deciziilor umane.",
    "În era digitală, confidențialitatea datelor a devenit o preocupare esențială, deoarece corporațiile colectează cantități uriașe de informații despre utilizatori pentru a îmbunătăți analiza predictivă și publicitatea direcționată."
]


# Language codes aligned by position with the sample texts:
# five "en" entries followed by five "ro" entries.
languages = [code for code in ("en", "ro") for _ in range(5)]

In [6]:
# Process every sample with its matching language code; the position in the
# list is passed as `idx` so each call writes its own dependency-plot file.
processed_texts = [
    preprocess_text(sample, code, idx=position)
    for position, (sample, code) in enumerate(zip(texts, languages))
]

+---------------+---------------+-------+-------+---------------------+
| Token         | Lemma         | POS   | Tag   | Entity Type (NER)   |
| artificial    | artificial    | ADJ   | JJ    |                     |
+---------------+---------------+-------+-------+---------------------+
| intelligence  | intelligence  | NOUN  | NN    |                     |
+---------------+---------------+-------+-------+---------------------+
| rapidly       | rapidly       | ADV   | RB    |                     |
+---------------+---------------+-------+-------+---------------------+
| evolving      | evolve        | VERB  | VBG   |                     |
+---------------+---------------+-------+-------+---------------------+
| machine       | machine       | NOUN  | NN    |                     |
+---------------+---------------+-------+-------+---------------------+
| learning      | learning      | NOUN  | NN    |                     |
+---------------+---------------+-------+-------+---------------

In [7]:
# Show each source text next to its processed token tuples, followed by a
# 50-character rule to separate the entries.
for i, (original, processed) in enumerate(zip(texts, processed_texts)):
    print(f"Original: {original}", f"Processed: {processed}", "-" * 50, sep="\n")

Original: Artificial intelligence is rapidly evolving, with machine learning models becoming more sophisticated. Researchers are constantly exploring new algorithms to improve accuracy and efficiency.
Processed: [('artificial', 'artificial', 'ADJ', 'JJ', ''), ('intelligence', 'intelligence', 'NOUN', 'NN', ''), ('rapidly', 'rapidly', 'ADV', 'RB', ''), ('evolving', 'evolve', 'VERB', 'VBG', ''), ('machine', 'machine', 'NOUN', 'NN', ''), ('learning', 'learning', 'NOUN', 'NN', ''), ('models', 'model', 'NOUN', 'NNS', ''), ('sophisticated', 'sophisticated', 'ADJ', 'JJ', ''), ('researchers', 'researcher', 'NOUN', 'NNS', ''), ('constantly', 'constantly', 'ADV', 'RB', ''), ('exploring', 'explore', 'VERB', 'VBG', ''), ('new', 'new', 'ADJ', 'JJ', ''), ('algorithms', 'algorithm', 'NOUN', 'NNS', ''), ('improve', 'improve', 'VERB', 'VB', ''), ('accuracy', 'accuracy', 'NOUN', 'NN', ''), ('efficiency', 'efficiency', 'NOUN', 'NN', '')]
--------------------------------------------------
Original: Quantum

In [8]:
# Dependency Tree
# NER