# Fine-tuning Greek BERT on multiword expressions for masked language modeling

In [None]:
# Install required packages
# !pip install datasets evaluate transformers[sentencepiece]
# !pip install accelerate
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
# !conda install nvidia/label/cuda-12.6.0::cuda-toolkit


# Import necessary libraries
from transformers import Trainer
import math
import pandas
import csv
from transformers import DataCollatorForLanguageModeling
import collections
import numpy as np
from transformers import default_data_collator
from transformers import Trainer, AutoModelForMaskedLM, AutoTokenizer, TrainingArguments
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
import torch



In [None]:
# Shared settings used by the cells below.
root = "data/"  # directory prefix prepended to the dataset file name
batch_size = 256  # per-device batch size for both training and evaluation
mlm_probability = 0.25  # masking rate handed to the MLM data collator

In [None]:
# Report what hardware is visible to PyTorch and choose the device that
# later cells move the model and inputs onto.
print("=== GPU Configuration ===")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")

if not has_cuda:
    print("No GPU available, using CPU")
    device = torch.device("cpu")
else:
    # Only GPU 0 is described, matching the single-device use below.
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU: {torch.cuda.get_device_name(0)}")
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {total_gib:.1f} GB")
    device = torch.device("cuda")

print(f"Using device: {device}")


def tokenize_function(examples):
    """
    Run the module-level ``tokenizer`` over a batch of text examples.

    Args:
        examples: Dictionary whose 'text' field holds the text samples

    Returns:
        The tokenizer output; when the tokenizer is a "fast" one, a
        'word_ids' entry is added with one word-id list per example.
    """
    encoded = tokenizer(examples["text"])

    # Only fast tokenizers expose word_ids(); otherwise return as-is.
    if not tokenizer.is_fast:
        return encoded

    num_examples = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(idx) for idx in range(num_examples)]
    return encoded


def group_texts(examples, chunk_size=64):
    """
    Concatenate tokenized examples and re-split them into fixed-size chunks.

    Args:
        examples: Dictionary mapping each feature name to a list of
            per-example sequences (must include 'input_ids')
        chunk_size: Integer specifying the size of text chunks (default: 64)

    Returns:
        Dictionary containing:
        - Every input feature regrouped into chunks of ``chunk_size`` items;
          a trailing remainder shorter than ``chunk_size`` is dropped
        - 'labels': an independent copy of the 'input_ids' chunks

    Raises:
        IndexError: if ``examples`` has no features.
        KeyError: if ``examples`` has no 'input_ids' feature.
    """
    # Flatten every feature in a single pass. ``sum(list_of_lists, [])``
    # re-copies the growing accumulator for each sequence (quadratic).
    concatenated_examples = {
        key: [item for sequence in sequences for item in sequence]
        for key, sequences in examples.items()
    }

    # The length of the first feature decides how many whole chunks fit.
    first_key = list(concatenated_examples)[0]
    total_length = len(concatenated_examples[first_key])
    total_length = (total_length // chunk_size) * chunk_size

    # Split each flattened feature into consecutive chunks.
    result = {
        key: [
            flat[start : start + chunk_size]
            for start in range(0, total_length, chunk_size)
        ]
        for key, flat in concatenated_examples.items()
    }

    # Copy each chunk so that editing labels in place can never modify
    # input_ids (a shallow list copy would share the inner lists).
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]

    return result

=== GPU Configuration ===
CUDA available: True
Number of GPUs: 1
Current GPU: NVIDIA GeForce GTX 1080
GPU Memory: 8.0 GB


In [4]:
# Smoke test of the pre-trained checkpoint: fill one [MASK] and print the
# top-5 completions before any fine-tuning happens.

# Pre-trained Greek BERT checkpoint to start from
model_checkpoint = "nlpaueb/bert-base-greek-uncased-v1"

# Load the pre-trained model for masked language modeling
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# Calculate and display the number of model parameters in millions
num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> Number of parameters: {round(num_parameters)}M'")

# Test sentence with a masked word (indicated by [MASK])
text = "Αυτή είναι μεγάλη [MASK]."  # "This is a big [MASK]."

# Load the tokenizer corresponding to the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize the input text and convert to PyTorch tensors
inputs = tokenizer(text, return_tensors="pt")

# Inference only: skip autograd bookkeeping so no graph is kept alive
# for the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits

# Column indices where the input equals the mask token id
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

# Logits at the masked position(s) of the first (only) sequence
mask_token_logits = token_logits[0, mask_token_index, :]

# torch.topk returns both values and indices; only the indices (token ids)
# of the first masked position are needed here
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Print the original sentence with each predicted word substituted for [MASK]
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> Number of parameters: 113M'
'>>> Αυτή είναι μεγάλη χαρα.'
'>>> Αυτή είναι μεγάλη διασκεδαση.'
'>>> Αυτή είναι μεγάλη επιτυχια.'
'>>> Αυτή είναι μεγάλη τιμη.'
'>>> Αυτή είναι μεγάλη επιλογη.'


  return forward_call(*args, **kwargs)


In [5]:
# Path of the CSV that feeds both the pandas preview and the HF dataset
csv_path = root + 'fullmwe.csv'

# Pandas view of the data, used for inspection only
df = pandas.read_csv(csv_path)
# NOTE(review): unique_df is only displayed in a later cell; the dataset
# used for training below is loaded from the CSV again and is therefore
# NOT de-duplicated. Confirm whether that is intended.
unique_df = df.drop_duplicates()
print(df.head())  # first 5 rows

# Plain Python lists of the two columns
texts = df["text"].tolist()
labels = df["label"].tolist()

# Load the same file through Hugging Face datasets as a single 'train' split
dataset_custom = load_dataset('csv', data_files={'train': csv_path})

# Tokenize in batches and drop the raw columns, leaving only the
# tokenizer outputs
tokenized_datasets = dataset_custom.map(
    tokenize_function,
    remove_columns=["text", "label"],
    batched=True,
)

# Regroup the tokenized examples into fixed-size chunks (group_texts default
# chunk size); batched=True hands group_texts many examples at once
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

                      text label
0      αδειάζω τη γωνιά σε     1
1  αδειάζω πιστόλι πάνω σε     1
2   αλλάζω τον αδόξαστο σε     1
3      αλλάζω την πίστη σε     1
4  δεν αλλάζω ούτε κόμα σε     1


In [6]:
# Display the DataFrame with exact duplicate rows dropped (inspection only)
unique_df

Unnamed: 0,text,label
0,αδειάζω τη γωνιά σε,1
1,αδειάζω πιστόλι πάνω σε,1
2,αλλάζω τον αδόξαστο σε,1
3,αλλάζω την πίστη σε,1
4,δεν αλλάζω ούτε κόμα σε,1
...,...,...
4485,χαράζομαι στο πρόσωπο,1
4486,χαράζομαι στην μνήμη,1
4487,χαράζομαι στο μυαλό,1
4489,χτυπάω στα νεύρα,1


In [7]:
# Display the chunked dataset produced by the group_texts mapping
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1804
    })
})

In [8]:
# Number of fixed-size chunks in the 'train' split
lm_datasets['train'].num_rows

1804

In [9]:
# 90/10 split of the chunked dataset, expressed as absolute row counts so
# the two parts always add up to the full dataset.
total_rows = lm_datasets['train'].num_rows
train_size = int(total_rows * 0.9)
test_size = total_rows - train_size

print(test_size)

# Fixed seed keeps the split reproducible across runs
split_dataset = lm_datasets['train'].train_test_split(
    test_size=test_size,
    train_size=train_size,
    seed=42,
)

# split_dataset now exposes 'train' and 'test'
n_train = len(split_dataset['train'])
n_test = len(split_dataset['test'])
print(f"Training set size: {n_train}")
print(f"Test set size: {n_test}")

181
Training set size: 1623
Test set size: 181


In [10]:
# Create the data collator for masked language modeling.
# - It applies token masking when batches are built from examples.
# - The masking rate comes from the module-level `mlm_probability` setting
#   (it is not a hard-coded 0.5); change it in the configuration cell.
data_collator = DataCollatorForLanguageModeling(
   tokenizer=tokenizer,
   mlm_probability=mlm_probability
)


In [11]:
# Preview the chunked training data: raw features plus the decoded text.
# The header is derived from the count so it cannot disagree with the loop
# (it previously announced 5 examples while showing only 1).
num_preview = 1
preview_features = ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels']

print(f"First {num_preview} example(s) from training set:")
for i in range(num_preview):
    example = lm_datasets["train"][i]
    print(f"\n=== Example {i+1} ===")

    # Print all features for this example
    print("\nFeatures:")
    for feature in preview_features:
        print(f"\n{feature}:")
        print(example[feature])

    # Decode the input_ids to show the actual text
    print("\nDecoded text:")
    print(tokenizer.decode(example['input_ids']))

    print("\n" + "="*50)

# Print some statistics
print("\nDataset Statistics:")
print(f"Number of training examples: {len(lm_datasets['train'])}")
print("\nFeature lengths in first example:")
# Fetch the first row once instead of re-reading it for every feature
first_example = lm_datasets['train'][0]
for feature in preview_features:
    print(f"{feature}: {len(first_example[feature])}")

First 5 examples from training set:

=== Example 1 ===

Features:

input_ids:
[101, 1335, 1626, 365, 2424, 356, 102, 101, 1335, 1626, 15308, 462, 356, 102, 101, 374, 1626, 362, 6646, 275, 1698, 1189, 356, 102, 101, 374, 1626, 350, 2192, 356, 102, 101, 366, 374, 1626, 509, 5320, 261, 356, 102, 101, 374, 1626, 1181, 351, 102, 101, 374, 1626, 21644, 351, 102, 101, 374, 1626, 355, 15994, 356, 102, 101, 374, 1626, 355, 4771]

token_type_ids:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

attention_mask:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

word_ids:
[None, 0, 0, 1, 2, 3, None, None, 0, 0, 1, 2, 3, None, None, 0, 0, 1, 2, 2, 2, 2, 3, None, None, 0, 0, 1, 2, 3, None, None, 0, 1, 1, 2, 3, 3, 4, 

In [12]:
# Show what the data collator does to real training rows: original text,
# masked text, and which tokens were selected.

# Collect the model-input columns of the first training row(s); the loop
# currently takes a single row.
collator_columns = ('input_ids', 'attention_mask', 'token_type_ids')
examples = []
for i in range(1):
    row = split_dataset['train'][i]
    example = {column: row[column] for column in collator_columns}
    examples.append(example)

# The collator returns a batch with masked input_ids and matching labels
batch = data_collator(examples)

separator = "\n" + "="*50

for i, (input_ids, labels) in enumerate(zip(batch['input_ids'], batch['labels'])):
    print(f"\n=== Example {i+1} ===")

    # Text as stored in the dataset, before masking
    original_ids = split_dataset['train'][i]['input_ids']
    print("\nOriginal text:")
    print(tokenizer.decode(original_ids))

    # Text after the collator modified the input ids
    print("\nMasked text:")
    print(tokenizer.decode(input_ids))

    # Positions whose label is not -100 are the ones selected by the collator
    mask_positions = torch.where(labels != -100)[0]

    print("\nMasked tokens details:")
    for pos in mask_positions:
        original_token = tokenizer.decode([labels[pos]])
        masked_token = tokenizer.decode([input_ids[pos]])
        print(f"Position {pos}: Original '{original_token}' -> Masked '{masked_token}'")

    # Attention mask of the same example, truncated for readability
    print("\nAttention mask sample (first 20 values):")
    print(batch['attention_mask'][i][:20].tolist())

    print(separator)

# Dataset and masking statistics
print("\nDataset Statistics:")
print(f"Total number of training examples: {len(split_dataset['train'])}")
print(f"Masking probability: {data_collator.mlm_probability}")

# Length of every stored feature in the first training row
print("\nFeature lengths in first example:")
first_row = split_dataset['train'][0]
for feature in ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels']:
    print(f"{feature}: {len(first_row[feature])}")


=== Example 1 ===

Original text:
δινω [SEP] [CLS] εγκυμονω κινδυνους [SEP] [CLS] ειμαι βαπορι [SEP] [CLS] ειμαι εξω φρενων [SEP] [CLS] ειμαι θηριο [SEP] [CLS] ειμαι μεγας και πολυς [SEP] [CLS] ειμαι περδικι [SEP] [CLS] ειμαι πετσι και κοκαλο [SEP] [CLS] ειμαι προβατο του θεου [SEP] [CLS] ειμαι γατακι [SEP] [CLS] ειμαι yolo [SEP] [CLS] ειμαι κλαιν [SEP] [CLS]

Masked text:
δινω [SEP] [CLS] εγκυμον [MASK] [MASK] [SEP] [CLS] ειμαι βαπορι [SEP] [CLS] ειμαι [MASK] φρενων [SEP] [CLS] [MASK] θηριο [SEP] [CLS] ειμαι μεγας [MASK] πολυς [SEP] [CLS] ειμαι περδικι [SEP] [CLS] ειμαι [MASK] [MASK] κοκαλο [SEP] [CLS] ειμαι προβατο του θεου [SEP] [CLS] [MASK] γατακι [SEP] [CLS] ειμαι yolo [SEP] [CLS] ειμαι κλαιν [SEP] [CLS]

Masked tokens details:
Position 6: Original '##ω' -> Masked '[MASK]'
Position 7: Original 'κινδυνους' -> Masked '[MASK]'
Position 17: Original 'εξω' -> Masked '[MASK]'
Position 21: Original 'ειμαι' -> Masked '[MASK]'
Position 27: Original 'και' -> Masked '[MASK]'
Position 38: Or

In [13]:
# Log in to Hugging Face Hub : hf auth login

In [14]:
# Configure training arguments
training_args = TrainingArguments(
   # Output directory for saving model checkpoints
   output_dir=f"bert-base-greek-uncased-v5-finetuned-polylex-mg",
   overwrite_output_dir=True,
   # Evaluate model after each epoch
   num_train_epochs=20,  # Number of training epochs
   learning_rate=0.000005,  # Learning rate for optimization
   weight_decay=0.01,  # L2 regularization factor
   # Batch sizes for training and evaluation
   per_device_train_batch_size=batch_size,
   per_device_eval_batch_size=batch_size,
   push_to_hub=True,  # Whether to push model to Hugging Face Hub
   fp16=False,  # Disable mixed precision training
   logging_steps=10  # How often to log training metrics
)

# Extract model name from checkpoint path
model_name = model_checkpoint.split("/")[-1]

# Initialize the trainer with all components
trainer = Trainer(
   model=model,  # The model to train
   args=training_args,  # Training configuration
   train_dataset=split_dataset["train"],  # Training data
   eval_dataset=split_dataset["test"],  # Evaluation data
   data_collator=data_collator,  # Data preprocessing helper
   tokenizer=tokenizer,  # Tokenizer for text processing
)

eval_results = trainer.evaluate()

print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


  trainer = Trainer(
  return forward_call(*args, **kwargs)


>>> Perplexity: 189.36


In [15]:
# %%capture
# Fine-tune, store the model and tokenizer side by side on disk, then
# re-measure perplexity on the held-out split.
trainer.train()

# Save locally: model first, then tokenizer, into the same directory
local_save_path = "bert-base-greek-uncased-v5-finetuned-polylex-mg"
for artifact in (model, tokenizer):
    artifact.save_pretrained(local_save_path)
print(f"Model saved locally to: {local_save_path}")

# Perplexity after fine-tuning is exp(eval loss)
eval_results = trainer.evaluate()
final_perplexity = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity: {final_perplexity:.2f}")

Step,Training Loss
10,3.9668
20,2.9094
30,2.5296
40,2.3654
50,2.233
60,2.2228
70,2.1821
80,2.1145
90,2.0496
100,2.028


Model saved locally to: bert-base-greek-uncased-v5-finetuned-polylex-mg


  return forward_call(*args, **kwargs)


>>> Perplexity: 6.62


In [16]:

# Reload the fine-tuned checkpoint from the local save directory and place
# the model on the GPU when one is available.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_checkpoint = "bert-base-greek-uncased-v5-finetuned-polylex-mg"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"Model loaded on: {device}")






Model loaded on: cuda


In [17]:
# Probe phrases: a verb plus article followed by a single [MASK] slot that
# the models are asked to fill in the cells below.
samples = [
    "αθετώ το [MASK]",
    "ανοίγω το [MASK]",
    "γίνομαι το [MASK]",
    "ζητάω το [MASK]",
    "κρατάω την [MASK]",
    "κάνω το [MASK]",
    "καταθέτω το [MASK]",
    "κλείνω τα [MASK]",
    "παίρνω τον [MASK]",
    "παίρνω την [MASK]",
    "πληρώνω τη [MASK]",
    "ρίχνω το [MASK]",
    "ρίχνω τη [MASK]",
    "σπάω την [MASK]",
    "σώζω την [MASK]",
    "σκύβω το [MASK]",
    "συμπληρώνω την [MASK]",
    "τραβάω τα [MASK]",
    "φοβάμαι το [MASK]",
    "χάνω την [MASK]",
]

In [18]:

# Fill the [MASK] slot of every sample with the currently loaded model and
# print the two most likely completions as comma-terminated lines.
num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> Number of parameters: {round(num_parameters)}M'")

# Results list (nothing in this cell appends to it)
outcome = []

# Inference only: disable autograd so no graph is built for each forward
# pass; the predictions themselves are unaffected.
with torch.no_grad():
    for text in samples:
        # Tokenize and move the tensors to the device the model lives on
        inputs = tokenizer(text, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Logits for all tokens in the sequence
        token_logits = model(**inputs).logits

        # torch.where returns a tuple of index tensors; [1] holds the
        # column (token position) indices of the [MASK] token
        mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

        # Logits at the [MASK] position(s) of the first (only) sequence
        mask_token_logits = token_logits[0, mask_token_index, :]

        # Ids of the 2 highest-scoring tokens for the first [MASK] position
        top_tokens = torch.topk(mask_token_logits, 2, dim=1).indices[0].tolist()

        # Print the original text with [MASK]
        print(text + ',')

        # Print the text once per predicted token, replacing [MASK]
        for token in top_tokens:
            print(text.replace(tokenizer.mask_token, tokenizer.decode([token])) + ',')

'>>> Number of parameters: 113M'
αθετώ το [MASK],
αθετώ το νομο,
αθετώ το λαθος,
ανοίγω το [MASK],
ανοίγω το παραθυρο,
ανοίγω το δρομο,
γίνομαι το [MASK],
γίνομαι το θυμα,
γίνομαι το φως,
ζητάω το [MASK],
ζητάω το καλυτερο,
ζητάω το ιδιο,
κρατάω την [MASK],
κρατάω την ισορροπια,
κρατάω την ψυχραιμια,
κάνω το [MASK],
κάνω το σωστο,
κάνω το λαθος,
καταθέτω το [MASK],
καταθέτω το χερι,
καταθέτω το νομο,
κλείνω τα [MASK],
κλείνω τα ματια,
κλείνω τα αυτια,
παίρνω τον [MASK],
παίρνω τον αερα,
παίρνω τον δρομο,
παίρνω την [MASK],
παίρνω την αποφαση,
παίρνω την ευθυνη,
πληρώνω τη [MASK],
πληρώνω τη νυφη,
πληρώνω τη ζωη,
ρίχνω το [MASK],
ρίχνω το βαρος,
ρίχνω το μπαλακι,
ρίχνω τη [MASK],
ρίχνω τη ματια,
ρίχνω τη μια,
σπάω την [MASK],
σπάω την ισορροπια,
σπάω την καρδια,
σώζω την [MASK],
σώζω την παρτιδα,
σώζω την ζωη,
σκύβω το [MASK],
σκύβω το κεφαλι,
σκύβω το προσωπο,
συμπληρώνω την [MASK],
συμπληρώνω την ωρα,
συμπληρώνω την σειρα,
τραβάω τα [MASK],
τραβάω τα μαλλια,
τραβάω τα γελια,
φοβάμαι τ

In [None]:
# Baseline for comparison: run the same probe phrases through the original
# (not fine-tuned) Greek BERT checkpoint from NLPAUEB.
model_checkpoint="nlpaueb/bert-base-greek-uncased-v1"

# Initialize the model for masked language modeling task
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# Calculate and display the model's size in millions of parameters
num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> Number of parameters: {round(num_parameters)}M'")

# Load the corresponding tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Results list (nothing in this cell appends to it)
outcome = []

# Inference only: disable autograd so no graph is built for each forward
# pass. Model and inputs are both left where they were created (no device
# move happens in this cell).
with torch.no_grad():
    for text in samples:
        # Tokenize the input text and convert to PyTorch tensors
        inputs = tokenizer(text, return_tensors="pt")

        # Logits for all tokens in the sequence
        token_logits = model(**inputs).logits

        # torch.where returns a tuple; [1] selects the column indices,
        # i.e. the position of the [MASK] token
        mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

        # [0] selects the first batch item, mask_token_index the MASK position
        mask_token_logits = token_logits[0, mask_token_index, :]

        # topk returns values and indices; only the indices are needed
        top_tokens = torch.topk(mask_token_logits, 2, dim=1).indices[0].tolist()

        # Print the original masked text with comma
        print(text + ',')

        # Print variations where [MASK] is replaced with predicted tokens
        for token in top_tokens:
            print(text.replace(tokenizer.mask_token, tokenizer.decode([token])) + ',')

'>>> Number of parameters: 113M'
αθετώ το [MASK],
αθετώ το κουμπι,
αθετώ το το,
ανοίγω το [MASK],
ανοίγω το newline,
ανοίγω το κουμπι,
γίνομαι το [MASK],
γίνομαι το αλλο,
γίνομαι το νεο,
ζητάω το [MASK],
ζητάω το newline,
ζητάω το να,
κρατάω την [MASK],
κρατάω την newline,
κρατάω την αρχικη,
κάνω το [MASK],
κάνω το οχι,
κάνω το σωστο,
καταθέτω το [MASK],
καταθέτω το newline,
καταθέτω το σχεδιο,
κλείνω τα [MASK],
κλείνω τα newline,
κλείνω τα ιταλικα,
παίρνω τον [MASK],
παίρνω τον πρωτο,
παίρνω τον χρονο,
παίρνω την [MASK],
παίρνω την καρτα,
παίρνω την ημερα,
πληρώνω τη [MASK],
πληρώνω τη σκεψη,
πληρώνω τη μουσικη,
ρίχνω το [MASK],
ρίχνω το κουμπι,
ρίχνω το υπολοιπο,
ρίχνω τη [MASK],
ρίχνω τη νυχτα,
ρίχνω τη μιση,
σπάω την [MASK],
σπάω την newline,
σπάω την πληρη,
σώζω την [MASK],
σώζω την πληρη,
σώζω την τριχα,
σκύβω το [MASK],
σκύβω το κουμπι,
σκύβω το newline,
συμπληρώνω την [MASK],
συμπληρώνω την newline,
συμπληρώνω την θα,
τραβάω τα [MASK],
τραβάω τα newline,
τραβάω τα ηλεκτρονικα,
