In [21]:
# Standard library imports
import os

# Third-party imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Machine learning and metrics
from sklearn.metrics import (
    classification_report, 
    precision_recall_fscore_support, 
    confusion_matrix
)
from sklearn.model_selection import KFold

# Transformers and Hugging Face
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from transformers.modeling_outputs import TokenClassifierOutput

# Progress bars
from tqdm import trange
from tqdm.notebook import tqdm

In [22]:

class CustomModel(nn.Module):
  def __init__(self,pretrained,num_labels):
    super(CustomModel,self).__init__()
    self.num_labels = num_labels
    #Load Model with given checkpoint and extract its body
    self.model = pretrained
    self.dropout = nn.Dropout(0.1,inplace=False)
    self.classifier = nn.Linear(768,num_labels) # load and initialize weights

  def forward(self, input_ids=None, attention_mask=None,labels=None):
    #Extract outputs from the body
    outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

    #Add custom layers
    sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

    logits = self.classifier(sequence_output[:,0,:].view(-1,768)) # calculate losses

    loss = None
    if labels is not None:

      pos_weight = torch.ones([768])
      loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
      loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))


      # loss_fct = nn.CrossEntropyLoss()
      # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)




In [23]:
import logging
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Configure logging
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO
)
logger = logging.getLogger(__name__)


class BertInputItem:
    """An item with all the necessary attributes for finetuning BERT."""

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def convert_examples_to_inputs(example_texts, example_labels, label2idx, 
                              max_seq_length, tokenizer, verbose=0):
    """
    Loads a data file into a list of `InputBatch`s.
    
    Args:
        example_texts: List of text examples
        example_labels: List of corresponding labels
        label2idx: Dictionary mapping labels to indices
        max_seq_length: Maximum sequence length
        tokenizer: BERT tokenizer
        verbose: Verbosity level
    
    Returns:
        List of BertInputItem objects
    """
    input_items = []
    examples = zip(example_texts, example_labels)
    
    for (ex_index, (text, label)) in enumerate(examples):
        # Create a list of token ids
        input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
        
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]

        # All our tokens are in the first input segment (id 0)
        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        # Verify lengths
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label2idx[label]

        input_items.append(
            BertInputItem(
                text=text,
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label_id
            )
        )

    return input_items


def get_data_loader(features, max_seq_length, batch_size, shuffle=True):
    """
    Create a DataLoader from features.
    
    Args:
        features: List of BertInputItem objects
        max_seq_length: Maximum sequence length
        batch_size: Batch size for the DataLoader
        shuffle: Whether to shuffle the data
    
    Returns:
        DataLoader object
    """
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    
    return dataloader


def evaluate(model, dataloader):
    """
    Evaluate the model on the given dataloader.
    
    Args:
        model: BERT model to evaluate
        dataloader: DataLoader containing evaluation data
    
    Returns:
        Tuple of (eval_loss, correct_labels, predicted_labels)
    """
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration", disable=True)):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            outputs = model(
                input_ids, 
                attention_mask=input_mask,
                token_type_ids=segment_ids, 
                labels=label_ids
            )

        logits = outputs.logits
        tmp_eval_loss = outputs.loss
        
        # Get predictions
        predicted_outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids_cpu = label_ids.to('cpu').numpy()

        predicted_labels += list(predicted_outputs)
        correct_labels += list(label_ids_cpu)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    # Calculate average loss
    eval_loss = eval_loss / nb_eval_steps

    # Convert to numpy arrays
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels

In [24]:
# Configuration
root = 'data/'
CORPUS_PATH = root + 'training_v4.csv'

# Training hyperparameters
GRADIENT_ACCUMULATION_STEPS = 2
NUM_TRAIN_EPOCHS = 160
MAX_SEQ_LENGTH = 100
BATCH_SIZE = 128
LEARNING_RATE = 0.000001
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 2.5
PATIENCE = 80
WEIGHT_DECAY = 0.001
NFOLDS = 10

# Load data
df = pandas.read_csv(CORPUS_PATH)
print(df.head())

texts = df.text.tolist()
labels = df.label.tolist()
in_polylex = df.polylex.tolist()
context = df.context.tolist()

# Create label mapping
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}

# Initialize results storage
correct = []
predicted = []
error_checking = []

# Cross-validation setup
kf = KFold(n_splits=NFOLDS)

for train_index, test_index in kf.split(texts):
    # Load model and tokenizer

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_checkpoint = "bert-base-greek-uncased-v5-finetuned-polylex-mg"
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    model.to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    print(f"Model loaded on: {device}")








    # BERT_MODEL = "snousias/bert-base-greek-uncased-v3-finetuned-polylex"
    # tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
    # model = AutoModelForSequenceClassification.from_pretrained(
    #     BERT_MODEL, 
    #     num_labels=len(label2idx)
    # )
    
    print(f"Label mapping: {label2idx}")
    
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Split data
    train_texts = [texts[i] for i in train_index]
    train_labels = [labels[i] for i in train_index]
    
    dev_texts = [texts[i] for i in test_index]
    dev_labels = [labels[i] for i in test_index]
    dev_polylex = [in_polylex[i] for i in test_index]
    dev_context = [context[i] for i in test_index]

    # Convert to features
    train_features = convert_examples_to_inputs(
        train_texts, train_labels, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0
    )
    dev_features = convert_examples_to_inputs(
        dev_texts, dev_labels, label2idx, MAX_SEQ_LENGTH, tokenizer
    )

    # Create dataloaders
    train_dataloader = get_data_loader(
        train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True
    )
    dev_dataloader = get_data_loader(
        dev_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False
    )

    # Calculate training steps
    num_train_steps = int(
        len(train_dataloader.dataset) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS
    )
    num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

    # Setup optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
            'weight_decay': WEIGHT_DECAY
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
            'weight_decay': 0.0
        }
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps, 
        num_training_steps=num_train_steps
    )

    # Model saving setup
    OUTPUT_DIR = root
    MODEL_FILE_NAME = "pytorch_model_test.bin"

    # Training loop
    loss_history = []
    no_improvement = 0

    for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration", disable=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            outputs = model(
                input_ids, 
                attention_mask=input_mask, 
                labels=label_ids, 
                token_type_ids=segment_ids
            )
            loss = outputs[0]

            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        # Evaluate on development set
        dev_loss, _, _ = evaluate(model, dev_dataloader)

        # Early stopping check
        if len(loss_history) == 0 or dev_loss < min(loss_history):
            no_improvement = 0
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            no_improvement += 1

        if no_improvement >= PATIENCE:
            print("No improvement on development set. Finish training.")
            break

        loss_history.append(dev_loss)

    # Final evaluation
    _, train_correct, train_predicted = evaluate(model, train_dataloader)
    _, dev_correct, dev_predicted = evaluate(model, dev_dataloader)

    # Error analysis
    error_case_indices = list(np.where(dev_correct != dev_predicted))[0]
    for e in error_case_indices:
        error_checking.append({
            "text": dev_texts[e],
            "true_label": dev_labels[e],
            "in_polylex": dev_polylex[e],
            "context_provided": dev_context[e]
        })
    
    print(f"Error cases: {error_checking}")

    # Performance metrics
    print("Training performance:", precision_recall_fscore_support(
        train_correct, train_predicted, average="micro"
    ))
    print("Development performance:", precision_recall_fscore_support(
        dev_correct, dev_predicted, average="micro"
    ))

    # Store results
    correct.extend(dev_correct)
    predicted.extend(dev_predicted)

# Final results
print(classification_report(correct, predicted))
print(confusion_matrix(correct, predicted))

loading configuration file bert-base-greek-uncased-v5-finetuned-polylex-mg\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.54.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 35000
}

loading weights file bert-base-greek-uncased-v5-finetuned-polylex-mg\model.safetensors
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at bert-base-greek-uncased-v5-finetu

                                                text  label  polylex  context
0  Αν κάποια στιγμή πέσουν λίπη στα κάρβουνα και ...      0        0        1
1  Μέσα σε λίγα λεπτά άναψαν τα αίματα και ο διαπ...      1        1        1
2  Σε ακραίες περιπτώσεις που έχετε κόψει βαθιά τ...      0        0        1
3  Κάθε φορά που έμπαινε καλάθι, έβγαζαν τις ίδιε...      1        1        1
4          Έπεσε το ποτήρι από το χέρι μου κι έσπασε      0        0        1


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja


Model loaded on: cuda
Label mapping: {0: 0, 1: 1}


Epoch:   0%|          | 0/160 [00:00<?, ?it/s]Exception ignored in: <generator object tqdm_notebook.__iter__ at 0x000001D39BFB25E0>
Traceback (most recent call last):
  File "c:\Users\int\.conda\envs\transformers_01\Lib\site-packages\tqdm\notebook.py", line 255, in __iter__
    self.disp(bar_style='danger')
  File "c:\Users\int\.conda\envs\transformers_01\Lib\site-packages\tqdm\notebook.py", line 225, in <lambda>
    self.disp = lambda *_, **__: None

KeyboardInterrupt: 
Epoch:   0%|          | 0/160 [00:23<?, ?it/s]


ValueError: Expected input batch_size (12800) to match target batch_size (128).

In [None]:
# %%script false --no-raise-error

# Configuration
root = 'data/'
CORPUS_PATH = root + 'training_v4.csv'

# Training hyperparameters
GRADIENT_ACCUMULATION_STEPS = 2
NUM_TRAIN_EPOCHS = 160
MAX_SEQ_LENGTH = 100
BATCH_SIZE = 128
LEARNING_RATE = 0.000001
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 2.5
PATIENCE = 80
WEIGHT_DECAY = 0.001
NFOLDS = 10

# Load data
df = pandas.read_csv(CORPUS_PATH)
print(df.head())

texts = df.text.tolist()
labels = df.label.tolist()
in_polylex = df.polylex.tolist()
context = df.context.tolist()

# Create label mapping
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}

# Initialize results storage
correct = []
predicted = []
error_checking = []

# Cross-validation setup
kf = KFold(n_splits=NFOLDS)

for train_index, test_index in kf.split(texts):
    # Load model and tokenizer
    BERT_MODEL = "nlpaueb/bert-base-greek-uncased-v1"
    tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        BERT_MODEL, 
        num_labels=len(label2idx)
    )
    
    print(f"Label mapping: {label2idx}")
    
    # Setup device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Split data
    train_texts = [texts[i] for i in train_index]
    train_labels = [labels[i] for i in train_index]
    
    dev_texts = [texts[i] for i in test_index]
    dev_labels = [labels[i] for i in test_index]
    dev_polylex = [in_polylex[i] for i in test_index]
    dev_context = [context[i] for i in test_index]

    # Convert to features
    train_features = convert_examples_to_inputs(
        train_texts, train_labels, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0
    )
    dev_features = convert_examples_to_inputs(
        dev_texts, dev_labels, label2idx, MAX_SEQ_LENGTH, tokenizer
    )

    # Create dataloaders
    train_dataloader = get_data_loader(
        train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True
    )
    dev_dataloader = get_data_loader(
        dev_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False
    )

    # Calculate training steps
    num_train_steps = int(
        len(train_dataloader.dataset) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS
    )
    num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

    # Setup optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
            'weight_decay': WEIGHT_DECAY
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
            'weight_decay': 0.0
        }
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps, 
        num_training_steps=num_train_steps
    )

    # Model saving setup
    OUTPUT_DIR = root
    MODEL_FILE_NAME = "pytorch_model_test.bin"

    # Training loop
    loss_history = []
    no_improvement = 0

    for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration", disable=True)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            outputs = model(
                input_ids, 
                attention_mask=input_mask, 
                labels=label_ids, 
                token_type_ids=segment_ids
            )
            loss = outputs[0]

            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        # Evaluate on development set
        dev_loss, _, _ = evaluate(model, dev_dataloader)

        # Early stopping check
        if len(loss_history) == 0 or dev_loss < min(loss_history):
            no_improvement = 0
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            no_improvement += 1

        if no_improvement >= PATIENCE:
            print("No improvement on development set. Finish training.")
            break

        loss_history.append(dev_loss)

    # Final evaluation
    _, train_correct, train_predicted = evaluate(model, train_dataloader)
    _, dev_correct, dev_predicted = evaluate(model, dev_dataloader)

    # Error analysis
    error_case_indices = list(np.where(dev_correct != dev_predicted))[0]
    for e in error_case_indices:
        error_checking.append({
            "text": dev_texts[e],
            "true_label": dev_labels[e],
            "in_polylex": dev_polylex[e],
            "context_provided": dev_context[e]
        })
    
    print(f"Error cases: {error_checking}")

    # Performance metrics
    print("Training performance:", precision_recall_fscore_support(
        train_correct, train_predicted, average="micro"
    ))
    print("Development performance:", precision_recall_fscore_support(
        dev_correct, dev_predicted, average="micro"
    ))

    # Store results
    correct.extend(dev_correct)
    predicted.extend(dev_predicted)

# Final results
print(classification_report(correct, predicted))
print(confusion_matrix(correct, predicted))

In [None]:
print(classification_report(correct, predicted))

In [20]:
# Standard library imports
import os

# Third-party imports
import numpy as np
import pandas as pd
import torch
from tqdm import trange
from tqdm.notebook import tqdm

# Machine learning imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import (
    classification_report, 
    precision_recall_fscore_support, 
    confusion_matrix
)

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Configuration
root = 'data/'
CORPUS_PATH = root + 'training_v4.csv'

# Training hyperparameters
GRADIENT_ACCUMULATION_STEPS = 2
NUM_TRAIN_EPOCHS = 80
MAX_SEQ_LENGTH = 100
BATCH_SIZE = 64
LEARNING_RATE = 0.000001
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 10
PATIENCE = 80
NFOLDS = 10
verbose=False

# Load data
df = pd.read_csv(CORPUS_PATH)
print(df.head())

texts = df.text.tolist()
labels = df.label.tolist()
in_polylex = df.polylex.tolist()
context = df.context.tolist()

# Create label mapping
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}

# Initialize results storage
correct = []
predicted = []
error_checking = []

# Cross-validation setup
kf = KFold(n_splits=NFOLDS)

for train_index, test_index in kf.split(texts):

    # Split data
    train_texts = [texts[i] for i in train_index]
    train_labels = [labels[i] for i in train_index]
    
    dev_texts = [texts[i] for i in test_index]
    dev_labels = [labels[i] for i in test_index]
    dev_polylex = [in_polylex[i] for i in test_index]
    dev_context = [context[i] for i in test_index]

    # Setup baseline classifier pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('lr', LogisticRegression(multi_class="ovr", solver="lbfgs"))
    ])

    # Grid search parameters
    parameters = {
        'lr__C': [0.1, 0.5, 1, 2, 5, 10, 100, 1000]
    }

    # Train baseline classifier
    best_classifier = GridSearchCV(pipeline, parameters, cv=5, verbose=1)
    best_classifier.fit(train_texts, train_labels)
    best_predictions = best_classifier.predict(dev_texts)

    # Calculate baseline accuracy
    baseline_accuracy = np.mean(best_predictions == dev_labels)
    if verbose:
        print(f"Baseline accuracy: {baseline_accuracy:.4f}")

    # Store results
    correct.extend(dev_labels)
    predicted.extend(best_predictions)

    # Error analysis
    error_case_indices = list(np.where(dev_labels != best_predictions))[0]
    for e in error_case_indices:
        error_checking.append({
            "text": dev_texts[e],
            "true_label": dev_labels[e],
            "in_polylex": dev_polylex[e],
            "context_provided": dev_context[e]
        })
    
    if verbose:
        print(f"Error cases: {error_checking}")

# Final results
print("\n" + "="*50)
print("FINAL RESULTS")
print("="*50)
print(classification_report(correct, predicted))
print("\nConfusion Matrix:")
print(confusion_matrix(correct, predicted))

                                                text  label  polylex  context
0  Αν κάποια στιγμή πέσουν λίπη στα κάρβουνα και ...      0        0        1
1  Μέσα σε λίγα λεπτά άναψαν τα αίματα και ο διαπ...      1        1        1
2  Σε ακραίες περιπτώσεις που έχετε κόψει βαθιά τ...      0        0        1
3  Κάθε φορά που έμπαινε καλάθι, έβγαζαν τις ίδιε...      1        1        1
4          Έπεσε το ποτήρι από το χέρι μου κι έσπασε      0        0        1
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 