In [13]:
import torch
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from transformers import  XLMRobertaTokenizerFast
from datasets import Dataset
from transformers import AdamW
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score

from transformers import  DataCollatorForTokenClassification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
import warnings
from torch.nn.functional import softmax



### Set up for labels and test data

In [14]:
warnings.filterwarnings("ignore")

In [25]:
def extract_refs(json_path):
    cultural_refs_tag = []
    cultural_refs_IDs = {}

    with open(json_path, "r") as f:
        cultural_data = json.load(f)
        cultural_refs_tag.extend([entry['tag'] for entry in cultural_data])
        cultural_refs_IDs.update({entry['tag']: entry['id'] for entry in cultural_data})

    return cultural_refs_tag, cultural_refs_IDs

ner_tags_path = "data/ner_tags.json"
cultural_ref_tags_path = "data/cultural_tags.json"

ner_tags, tag2id_ner = extract_refs(ner_tags_path)
cultural_tags, tag2id_cultural = extract_refs(cultural_ref_tags_path)

num_labels_culture = len(tag2id_cultural)
num_labels_ner = len(tag2id_ner)

id2tag_ner = {v: k for k, v in tag2id_ner.items()}
id2tag_cultural = {v: k for k, v in tag2id_cultural.items()}

In [26]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
test_data = "data/test_merged_output.conll"

#### Tokenizing the test data

In [27]:
def parse_conllu(file_path):
    " This function create a usable dataset of the sparse .conllu file with the NER tags and cultural refferences"
    sentences = []
    tags = []
    cultural_references = []
    
    with open(file_path, "r",encoding="utf-8") as file:
        tokens = []
        ner_tags = []
        cultural_ref = []
        
        for line in file:
            line = line.strip()
                       
            if not line:
                if tokens:  
                    sentences.append(tokens)
                    tags.append(ner_tags)
                    cultural_references.append(cultural_ref)
              
                tokens = []
                ner_tags = []
                cultural_ref = []
                continue

            parts = line.split("\t")
            
            if len(parts) >= 3:
                token = parts[0]  
                ner_tag = parts[1] 
                cultural_ref_val = parts[2]
                
                tokens.append(token)
                ner_tags.append(ner_tag)
                cultural_ref.append(cultural_ref_val)
        
       
        if tokens:
            sentences.append(tokens)
            tags.append(ner_tags)
            cultural_references.append(cultural_ref)
    
    return sentences, tags, cultural_references

In [28]:
sentences_test, tags_test, cultural_refs_test = parse_conllu(test_data)
test_data = [{"tokens": s, "ner_tags": t, "cultural_ref": c} for s, t, c in zip(sentences_test, tags_test, cultural_refs_test)]
dataset_test = Dataset.from_list(test_data)

In [29]:
def tokenize_and_align_labels_with_cultural(example):
    tokenized = tokenizer(
        example['tokens'], 
        truncation=True, 
        padding=True,
        is_split_into_words=True
    )

    labels = []
    cultural_ref = []

    for i, word in enumerate(example['tokens']):
        if example['ner_tags'][i] != "O":
            
            labels.append(tag2id_ner.get(example['ner_tags'][i], -100))
            
            cultural_ref.append(tag2id_cultural.get(example['cultural_ref'][i], -100))
        else:
            labels.append(-100)
            cultural_ref.append(-100)

    tokenized['labels'] = labels
    tokenized['cultural_ref'] = cultural_ref
    return tokenized


In [30]:
tokenized_cultural_test = dataset_test.map(tokenize_and_align_labels_with_cultural)

                                                                 

In [31]:
tokenized_cultural_test

Dataset({
    features: ['tokens', 'ner_tags', 'cultural_ref', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2952
})

## 1. Inicialization of models

### 1.a. NER tagger 

In [18]:
ner_model = XLMRobertaForTokenClassification.from_pretrained("models/xlmr-ner-head/checkpoint-3018")
ner_model.eval()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

### 1.a. Culture tagger 

In [19]:
cultural_model = XLMRobertaForTokenClassification.from_pretrained("models/xlmr-cultural-head/checkpoint-700")
cultural_model.eval()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

## 2. NER and Cultural prediction connection

### 2.a Getting the predictions from the NER model

In [34]:
# sentence = "Barack Obama was born in Hawaii and lived in the United States. Ivett is a student in Copenhagen. Bence is also in Copenhagen, but further."
sentence = "So, it was a chill afternoon in New York City, and Emily Roberts was kicking back at this low-key café near Central Park, waiting on Dr. Sanjay Mehta. Dude finally shows up, all jazzed about some big-deal archaeology gig happening in Athens, Greece. Apparently, Professor Laura Kim—yeah, the legend from Seoul National University—is dropping some serious knowledge bombs there. Meanwhile, way over in Tokyo, Kenji Tanaka was neck-deep in robot parts when his phone lit up. It was Maria Gonzalez hitting him up from Madrid, trying to rope him into some wild collab with NASA out in Houston, Texas. Talk about long-distance hustle. On the other side of the world, Liam O'Connor was off the grid, hiking through the misty trails of Connemara National Park in Ireland, totally ghosting his phone. Poor Sofia Petrova was blowing up his messages from her freezing office in St. Petersburg, Russia, probably thinking he got eaten by sheep or something. Back in Cape Town, Dr.Amina Yusuf was setting up for a Zoom with Michael Zhang, who just touched down in Beijing after a whirlwind week at some high-profile summit in Berlin. Man was running on fumes and airport coffee. Meanwhile, in sunny Sydney, Noah Thompson was catching up with Priya Desai over flat whites. They were cracking up, talking about their wild college days back at Oxford University—you know, late-night cramming...."

In [None]:
def tokenize_and_get_offsets(sentence, tokenizer):
    """
    Tokenizes the sentence and returns input IDs, attention mask, and offsets.
    """
    encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, is_split_into_words=False, truncation=True)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    offset_mapping = encoding["offset_mapping"][0] 
    word_ids = encoding.word_ids()
    encoding.pop("offset_mapping") 
    
    return input_ids, attention_mask, offset_mapping, word_ids

def predict_ner(encoding, ner_model):
    
    with torch.no_grad():
        outputs = ner_model(**encoding)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)[0]
    return predictions

def extract_entities(input_ids, word_ids, predictions, tokenizer, id2label):

    entities = []
    current_word = None
    current_entity = None
    current_label = "O"

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue  # Skip special tokens

        label = id2label[predictions[idx].item()]
        word = tokenizer.convert_ids_to_tokens(int(input_ids[0][idx]))
        
        # Skip subword tokens (keep only the first token of a word)
        if word_idx != current_word:
            if current_entity and current_label != "O":
                entities.append((" ".join(current_entity), current_label))
            if label.startswith("B-"):
                current_entity = [word.lstrip("▁")]
                current_label = label[2:]
            elif label.startswith("I-") and current_label == label[2:]:
                current_entity.append(word.lstrip("▁"))
            else:
                current_entity = []
                current_label = "O"
            current_word = word_idx
        else:
            # Continuation of the same word (likely subword)
            if current_entity is not None:
                current_entity.append(word.lstrip("▁"))

    # Append the last entity if any
    if current_entity and current_label != "O":
        entities.append((" ".join(current_entity), current_label))
    
    return entities


In [None]:
input_ids, attention_mask, offset_mapping, word_ids = tokenize_and_get_offsets(sentence, tokenizer)

predictions = predict_ner({"input_ids": input_ids, "attention_mask": attention_mask}, ner_model)

id2label = ner_model.config.id2label
entities = extract_entities(input_ids, word_ids, predictions, tokenizer, id2label)

print("Extracted Named Entities with Tags:")
for word, tag in entities:
    print(f"{word}: {tag}")

Extracted Named Entities with Tags:
New: LOC
New York: LOC
New York City ,: LOC
Emily: PER
Emily Roberts: PER
Central: LOC
Central Park ,: LOC
Sanjay: PER
Sanjay Meh ta .: PER
Athen s ,: LOC
Greece .: LOC
Laura: PER
Laura Kim — ye ah ,: PER
Seoul: ORG
Seoul National: ORG
Seoul National University — is: ORG
Tokyo ,: LOC
Ke nji: PER
Ke nji Tan aka: PER
Maria: PER
Maria Go nza lez: PER
Madrid ,: LOC
NASA: ORG
Houston ,: LOC
Texas .: LOC
Liam: PER
Liam O ' Con nor: PER
Con ne mara: LOC
Con ne mara National: LOC
Con ne mara National Park: LOC
Ireland ,: LOC
Sofia: PER
Sofia Petrov a: PER
St .: LOC
St . Petersburg ,: LOC
Russia ,: LOC
Cape: LOC
Cape Town ,: LOC
Michael: PER
Michael Z hang ,: PER
Beijing: LOC
Berlin .: LOC
Sydney ,: LOC
No ah: PER
No ah Thompson: PER
Pri ya: PER
Pri ya Desa i: PER
Oxford: ORG
Oxford University — you: ORG


### 2.b. Pass the NER entites to the Cultural model

In [51]:
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[-1.2601,  3.3004,  0.5962,  0.6516,  1.1875,  0.1818, -2.2296,
          -0.1563, -1.0434, -1.3488],
         [-0.9783,  0.5731, -2.5981, -0.5789, -2.1552,  4.4398, -0.9211,
           2.4038,  1.1361, -1.4119],
         [-0.8675,  1.6432, -2.4132, -0.2053, -1.9445,  4.1493, -1.3196,
           2.1429,  0.5877, -1.5297],
         [-2.8530,  4.5201,  2.3758,  0.5015,  0.1321,  0.3668, -2.8938,
          -0.7940, -0.9317, -3.4310],
         [-2.6802,  4.1746,  2.7399,  0.3276,  2.2364, -0.8926, -2.7958,
          -1.1117, -1.3645, -3.2229],
         [-1.0831,  2.7994,  0.2827,  0.6256,  1.0628,  0.0573, -1.8229,
          -0.1489, -1.0642, -1.0602]]]), hidden_states=None, attentions=None)

In [57]:
for entity_text, ner_tag in entities:    
    encoding = tokenizer(
        entity_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=32 
    )

    with torch.no_grad():
        outputs = cultural_model(**encoding)
  
    logits = outputs.logits[0]
    
    entity_logits = logits[0]
    probs = softmax(entity_logits, dim=0)
    predicted_id = torch.argmax(probs).item()
    predicted_label = cultural_model.config.id2label[predicted_id]

    print(f"Entity: {entity_text}, NER Tag: {ner_tag}, Cultural Tag: {predicted_label}")

Entity: New, NER Tag: LOC, Cultural Tag: Latin
Entity: New York, NER Tag: LOC, Cultural Tag: Eastern Asian
Entity: New York City ,, NER Tag: LOC, Cultural Tag: African
Entity: Emily, NER Tag: PER, Cultural Tag: Latin
Entity: Emily Roberts, NER Tag: PER, Cultural Tag: Latin
Entity: Central, NER Tag: LOC, Cultural Tag: African
Entity: Central Park ,, NER Tag: LOC, Cultural Tag: African
Entity: Sanjay, NER Tag: PER, Cultural Tag: Latin
Entity: Sanjay Meh ta ., NER Tag: PER, Cultural Tag: Latin
Entity: Athen s ,, NER Tag: LOC, Cultural Tag: African
Entity: Greece ., NER Tag: LOC, Cultural Tag: African
Entity: Laura, NER Tag: PER, Cultural Tag: Latin
Entity: Laura Kim — ye ah ,, NER Tag: PER, Cultural Tag: Latin
Entity: Seoul, NER Tag: ORG, Cultural Tag: Eastern Asian
Entity: Seoul National, NER Tag: ORG, Cultural Tag: African
Entity: Seoul National University — is, NER Tag: ORG, Cultural Tag: African
Entity: Tokyo ,, NER Tag: LOC, Cultural Tag: African
Entity: Ke nji, NER Tag: PER, Cultura