In [1]:
%pip install transformers torch nltk

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK data packages used for English lemmatization. The WordNet
# corpus backs the WordNetLemmatizer imported above.
nltk.download('wordnet')
# Open Multilingual WordNet data — presumably required alongside 'wordnet' by
# the installed NLTK version; TODO confirm it is still needed.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
# Hugging Face Hub checkpoint names for the two translation directions.
MODEL_NAME_DANISH_TO_ENGLISH = "Helsinki-NLP/opus-mt-da-en"
MODEL_NAME_ENGLISH_TO_DANISH = "Helsinki-NLP/opus-mt-en-da"

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Danish -> English: tokenizer first, then the model, moved to the chosen device.
tokenizer_da_en = MarianTokenizer.from_pretrained(MODEL_NAME_DANISH_TO_ENGLISH)
model_da_en = MarianMTModel.from_pretrained(MODEL_NAME_DANISH_TO_ENGLISH)
model_da_en = model_da_en.to(device)

# English -> Danish: same loading sequence for the reverse direction.
tokenizer_en_da = MarianTokenizer.from_pretrained(MODEL_NAME_ENGLISH_TO_DANISH)
model_en_da = MarianMTModel.from_pretrained(MODEL_NAME_ENGLISH_TO_DANISH)
model_en_da = model_en_da.to(device)

# English lemmatizer backed by the WordNet data downloaded earlier.
lemmatizer = WordNetLemmatizer()

def translate(text, tokenizer, model, max_length=512):
    """
    Translates text using a MarianMT model.

    Args:
        text (str): Input text to translate.
        tokenizer: MarianMT tokenizer matching ``model``.
        model: MarianMT model.
        max_length (int): Maximum number of input tokens; longer input is truncated.
    Returns:
        str: Translated text with special tokens removed. An empty or
        whitespace-only ``text`` yields ``""`` without running the model.
    """
    # Nothing to translate: skip tokenization and generation entirely instead
    # of asking the model to generate from an input that contains no words.
    if isinstance(text, str) and not text.strip():
        return ""

    # Tokenize and place the tensors on the device of the model that was
    # passed in, so the function does not depend on a notebook-level global
    # and keeps working if the model lives on a different device.
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_length
    ).to(model.device)
    # Generate translation
    outputs = model.generate(**inputs)
    # Decode the first (only) sequence and remove special tokens
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def danish_lemmatizer(word, pos='n'):
    """
    Lemmatizes a Danish word via translation to English.

    Steps:
        1. Danish → English
        2. Lemmatize English word
        3. English lemma → Danish

    Args:
        word (str): Danish word to lemmatize.
        pos (str): WordNet part-of-speech tag forwarded to the English
            lemmatizer. Defaults to ``'n'`` (noun), the previous hard-coded value.
    Returns:
        str: Lower-cased Danish lemma. The original ``word`` is returned
        unchanged when it is blank, when either translation raises, or when
        a translation is empty after clean-up.
    """
    def _clean(fragment):
        # Normalise translator output: drop surrounding whitespace and
        # sentence punctuation, then lower-case. A token such as "Cats."
        # would otherwise not match any WordNet entry.
        return fragment.strip().strip(".,;:!?\"'").strip().lower()

    # Blank input carries nothing to lemmatize; avoid two model calls.
    if isinstance(word, str) and not word.strip():
        return word

    # Step 1: Danish → English
    try:
        english_translation = translate(word, tokenizer_da_en, model_da_en)
    except Exception as e:
        # Deliberate best-effort: report and fall back to the input.
        print(f"Translation error (da→en): {e}")
        return word  # Return original if translation fails

    english_word = _clean(english_translation)
    if not english_word:
        return word

    # Step 2: Lemmatize English word with the requested part of speech
    english_lemma = lemmatizer.lemmatize(english_word, pos=pos)

    # Step 3: English lemma → Danish
    try:
        danish_lemma = translate(english_lemma, tokenizer_en_da, model_en_da)
    except Exception as e:
        print(f"Translation error (en→da): {e}")
        return word

    danish_word = _clean(danish_lemma)
    if not danish_word:
        return word
    return danish_word

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/300M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/788k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/820k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/300M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/300M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [4]:
# Demo: run the translation-based lemmatizer over a few inflected Danish words
# and print each input next to the lemma it produced.
danish_words = ["løbende", "første", "katte", "spiser"]  # Test words
for word, lemma in zip(danish_words, map(danish_lemmatizer, danish_words)):
    print(f"Word: '{word}' → Lemma: '{lemma}'")

Word: 'løbende' → Lemma: 'løbende'
Word: 'første' → Lemma: 'første'
Word: 'katte' → Lemma: 'kat'
Word: 'spiser' → Lemma: 'spise'
