In [13]:
import torch
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from transformers import  XLMRobertaTokenizerFast
from datasets import Dataset
from transformers import AdamW
from transformers import XLMRobertaTokenizerFast, XLMRobertaForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score

from transformers import  DataCollatorForTokenClassification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
import warnings
from torch.nn.functional import softmax



## Commonly used

In [14]:
warnings.filterwarnings("ignore")

In [15]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
test_data = "data/test_merged_output.conll"

In [16]:
def extract_refs(json_path):
    cultural_refs_tag = []
    cultural_refs_IDs = {}

    with open(json_path, "r") as f:
        cultural_data = json.load(f)
        cultural_refs_tag.extend([entry['tag'] for entry in cultural_data])
        cultural_refs_IDs.update({entry['tag']: entry['id'] for entry in cultural_data})

    return cultural_refs_tag, cultural_refs_IDs

ner_tags_path = "data/ner_tags.json"
cultural_ref_tags_path = "data/cultural_tags.json"

ner_tags, tag2id_ner = extract_refs(ner_tags_path)
cultural_tags, tag2id_cultural = extract_refs(cultural_ref_tags_path)

num_labels_culture = len(tag2id_cultural)
num_labels_ner = len(tag2id_ner)

In [17]:
ner_tags_path = "./data/ner_tags.json"
cultural_ref_tags_path = "./data/cultural_tags.json"

ner_tags, tag2id_ner = extract_refs(ner_tags_path)
cultural_tags, tag2id_cultural = extract_refs(cultural_ref_tags_path)

id2tag_ner = {v: k for k, v in tag2id_ner.items()}
id2tag_cultural = {v: k for k, v in tag2id_cultural.items()}

## 1. Inicialization of models

### 1.a. NER tagger 

In [18]:
ner_model = XLMRobertaForTokenClassification.from_pretrained("models/xlmr-ner-head/checkpoint-3018")
ner_model.eval()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

### 1.a. Culture tagger 

In [19]:
cultural_model = XLMRobertaForTokenClassification.from_pretrained("models/xlmr-cultural-head/checkpoint-700")
cultural_model.eval()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

## 2. NER and Cultural prediction connection

### 2.a Getting the predictions from the NER model

In [20]:
# sentence = "Barack Obama was born in Hawaii and lived in the United States. Ivett is a student in Copenhagen. Bence is also in Copenhagen, but further."
sentence = "So, it was a chill afternoon in New York City, and Emily Roberts was kicking back at this low-key café near Central Park, waiting on Dr. Sanjay Mehta. Dude finally shows up, all jazzed about some big-deal archaeology gig happening in Athens, Greece. Apparently, Professor Laura Kim—yeah, the legend from Seoul National University—is dropping some serious knowledge bombs there. Meanwhile, way over in Tokyo, Kenji Tanaka was neck-deep in robot parts when his phone lit up. It was Maria Gonzalez hitting him up from Madrid, trying to rope him into some wild collab with NASA out in Houston, Texas. Talk about long-distance hustle. On the other side of the world, Liam O'Connor was off the grid, hiking through the misty trails of Connemara National Park in Ireland, totally ghosting his phone. Poor Sofia Petrova was blowing up his messages from her freezing office in St. Petersburg, Russia, probably thinking he got eaten by sheep or something. Back in Cape Town, Dr.Amina Yusuf was setting up for a Zoom with Michael Zhang, who just touched down in Beijing after a whirlwind week at some high-profile summit in Berlin. Man was running on fumes and airport coffee. Meanwhile, in sunny Sydney, Noah Thompson was catching up with Priya Desai over flat whites. They were cracking up, talking about their wild college days back at Oxford University—you know, late-night cramming...."

In [21]:
# Tokenize
encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, is_split_into_words=False, truncation=True)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]
offset_mapping = encoding["offset_mapping"][0]
word_ids = encoding.word_ids()
encoding.pop("offset_mapping")

with torch.no_grad():
    outputs = ner_model(**encoding)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)[0]

#ensure thet the labels are the same as the models label
id2label = ner_model.config.id2label

entities = []
current_word = None
current_entity = None
current_label = "O"

for idx, word_idx in enumerate(word_ids):
    if word_idx is None:
        continue  # Skip special tokens

    label = id2label[predictions[idx].item()]
    word = tokenizer.convert_ids_to_tokens(int(input_ids[0][idx]))
    
    # Skip subword tokens (keep only first token of a word)
    if word_idx != current_word:
        if current_entity and current_label != "O":
            entities.append((" ".join(current_entity), current_label))
        if label.startswith("B-"):
            current_entity = [word.lstrip("▁")]
            current_label = label[2:]
        elif label.startswith("I-") and current_label == label[2:]:
            current_entity = [word.lstrip("▁")]
        else:
            current_entity = []
            current_label = "O"
        current_word = word_idx
    else:
        # Continuation of the same word (likely subword)
        if current_entity is not None:
            current_entity.append(word.lstrip("▁"))

if current_entity and current_label != "O":
    entities.append((" ".join(current_entity), current_label))

# to see if the entites are rigth:
print("Extracted Named Entities with Tags:")
for word, tag in entities:
    print(f"{word}: {tag}")


Extracted Named Entities with Tags:
New: LOC
York: LOC
City ,: LOC
Emily: PER
Roberts: PER
Central: LOC
Park ,: LOC
Sanjay: PER
Meh ta .: PER
Athen s ,: LOC
Greece .: LOC
Laura: PER
Kim — ye ah ,: PER
Seoul: ORG
National: ORG
University — is: ORG
Tokyo ,: LOC
Ke nji: PER
Tan aka: PER
Maria: PER
Go nza lez: PER
Madrid ,: LOC
NASA: ORG
Houston ,: LOC
Texas .: LOC
Liam: PER
O ' Con nor: PER
Con ne mara: LOC
National: LOC
Park: LOC
Ireland ,: LOC
Sofia: PER
Petrov a: PER
St .: LOC
Petersburg ,: LOC
Russia ,: LOC
Cape: LOC
Town ,: LOC
Michael: PER
Z hang ,: PER
Beijing: LOC
Berlin .: LOC
Sydney ,: LOC
No ah: PER
Thompson: PER
Pri ya: PER
Desa i: PER
Oxford: ORG
University — you: ORG


### 2.b. Pass the NER entites to the Cultural model

In [22]:
for entity_text, ner_tag in entities:    
    encoding = tokenizer(
        entity_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=32 
    )

    with torch.no_grad():
        outputs = cultural_model(**encoding)
  
    logits = outputs.logits[0]
    
    entity_logits = logits[0]
    probs = softmax(entity_logits, dim=0)
    predicted_id = torch.argmax(probs).item()
    predicted_label = cultural_model.config.id2label[predicted_id]

    print(f"Entity: {entity_text}, NER Tag: {ner_tag}, Cultural Tag: {predicted_label}")

Entity: New, NER Tag: LOC, Cultural Tag: Latin
Entity: York, NER Tag: LOC, Cultural Tag: Eastern Asian
Entity: City ,, NER Tag: LOC, Cultural Tag: African
Entity: Emily, NER Tag: PER, Cultural Tag: Latin
Entity: Roberts, NER Tag: PER, Cultural Tag: Latin
Entity: Central, NER Tag: LOC, Cultural Tag: African
Entity: Park ,, NER Tag: LOC, Cultural Tag: African
Entity: Sanjay, NER Tag: PER, Cultural Tag: Latin
Entity: Meh ta ., NER Tag: PER, Cultural Tag: Latin
Entity: Athen s ,, NER Tag: LOC, Cultural Tag: African
Entity: Greece ., NER Tag: LOC, Cultural Tag: African
Entity: Laura, NER Tag: PER, Cultural Tag: Latin
Entity: Kim — ye ah ,, NER Tag: PER, Cultural Tag: Latin
Entity: Seoul, NER Tag: ORG, Cultural Tag: Eastern Asian
Entity: National, NER Tag: ORG, Cultural Tag: African
Entity: University — is, NER Tag: ORG, Cultural Tag: African
Entity: Tokyo ,, NER Tag: LOC, Cultural Tag: African
Entity: Ke nji, NER Tag: PER, Cultural Tag: Latin
Entity: Tan aka, NER Tag: PER, Cultural Tag: Lat

## 3. Visualize results