In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!git clone https://github.com/sskyisthelimit/DS_Internship.git

In [None]:
cd DS_Internship/NLP/

In [None]:
!pip install -r requirements.txt

In [None]:
cd src

In [None]:
from inference import (post_process_tokens_and_labels,
                       predict_ner_labels, labels, label2id, id2label,
                       AutoTokenizer, AutoModelForTokenClassification)

In [None]:
device = "cuda:0"

model_name = "sskyisthelimit/mount-ner-model"
ch_model_base = "e1c89f5ecd230ded080eebefecf5c3cfbcc2adf4"
ch_tokenizer_base = "3812b840d1845edc8bc80585d53575ae2f08340f"

tokenizer_base = AutoTokenizer.from_pretrained(model_name, revision=ch_tokenizer_base)
model_base = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    revision=ch_model_base).to(device)

In [8]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class BertDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer):
        super().__init__()

        self.tokenizer = tokenizer


        self.dataframe = dataframe


    def __len__(self):
        return len(self.dataframe)


    def __getitem__(self, index):
        sentence = str(self.dataframe.iloc[index]['sentence'])
        tags = list(self.dataframe.iloc[index]['tags'])
        return sentence, tags

In [9]:
from ast import literal_eval
import pandas as pd

dataset_path = "/content/drive/MyDrive/Datasets/NLP/test_dataset_v2.csv"
batch_size = 32

data = pd.read_csv(dataset_path, converters={'tags': literal_eval})

dataset_test = BertDataset(dataframe=data, tokenizer=tokenizer_base)



In [10]:
data[data["source"] == "HUMAN"].head()

Unnamed: 0,names,sentence,tags,source
0,"['Appalachian Mountains', 'Great Smoky Mountai...","The Appalachian Mountains, stretching across t...","[O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O, ...",HUMAN
1,"['Tien Shan Range', 'Celestial Mountains', 'Kh...","The Tien Shan Range in Central Asia, known as ...","[O, B-MOUNTAIN, I-MOUNTAIN, I-MOUNTAIN, O, O, ...",HUMAN
2,['Drakensberg Mountains'],"South Africa's Drakensberg Mountains, with its...","[O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, ...",HUMAN
3,"['Hotakadake', 'Yarigatake', 'Southern Alps']","The Southern Alps of Japan, including the Hota...","[O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, B-M...",HUMAN
4,"['Rocky Mountains', 'Banff', 'Mount Robson']",The Rocky Mountains in Canada showcase the stu...,"[O, B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O, ...",HUMAN


In [11]:
import torch
from sklearn.metrics import classification_report
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy

v2_test_labels = ["O", "B-MOUNTAIN", "I-MOUNTAIN"]
v2_test_label2id = {label: idx for idx, label in enumerate(v2_test_labels)}
v2_test_id2label = {idx: label for label, idx in v2_test_label2id.items()}

# SpaCy setup for visualization
nlp = spacy.blank("en")  # Create a blank SpaCy pipeline

def create_spacy_doc(tokens, labels, label_map):
    """
    Create a SpaCy Doc object with labeled entities for visualization.

    Args:
    tokens (list): List of tokens.
    labels (list): List of corresponding labels.
    label_map (dict): Mapping from label IDs to label names.

    Returns:
    Doc: A SpaCy Doc object with labeled entities.
    """
    doc = Doc(nlp.vocab, words=tokens)
    entities = []
    for i, label in enumerate(labels):
        if label > 0:  # Assuming 0 is 'O' or non-entity
            entities.append(Span(doc, i, i + 1, label=label_map[label]))
    doc.ents = entities
    return doc

def post_process_tokens(tokens):
    """
    Post-process tokens and labels by merging subwords and removing special tokens like [CLS] and [SEP].

    Args:
    tokens (list): List of tokenized input words.
    predicted_labels (list): List of predicted labels.

    Returns:
    final_tokens (list): List of final tokens with subwords merged.
    final_labels (list): List of final labels corresponding to each token.
    """
    final_tokens = []

    for token in tokens:
        if token not in ['[CLS]', '[SEP]']:
            if token.startswith("##"):
                final_tokens[-1] += token[2:]
            else:
                final_tokens.append(token)

    return final_tokens


def compute_metrics_and_visualize(data_test, model, tokenizer, device, label_map):

    y_true, y_pred = [], []

    for idx in range(len(data_test)):
        sentence, labels = data_test[idx]
        pred_tokens, pred_labels = predict_ner_labels(sentence, model, tokenizer, device, False)
        tokens = post_process_tokens(tokenizer.tokenize(sentence))
        tokens, labels = post_process_tokens_and_labels(tokens, labels)
        labels = [label2id[label] for label in labels]

        # Adjust labels for equivalence rules
        adjusted_preds = [
            true if (label == true or (label, true) in [(3, 1), (1, 3), (2, 4), (4, 2)]) else 0
            for label, true in zip(pred_labels, labels)
        ]

        y_true.extend(labels)
        y_pred.extend(adjusted_preds)
        
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=list(label_map.values())))

compute_metrics_and_visualize(dataset_test, model_base, tokenizer_base, device, v2_test_id2label)


Classification Report:
              precision    recall  f1-score   support

           O       0.98      1.00      0.99      4693
  B-MOUNTAIN       1.00      0.83      0.91       433
  I-MOUNTAIN       1.00      0.90      0.95       415

    accuracy                           0.98      5541
   macro avg       0.99      0.91      0.95      5541
weighted avg       0.98      0.98      0.98      5541



In [12]:
dataset_test_human = BertDataset(dataframe=data[data["source"] == "HUMAN"], tokenizer=tokenizer_base)
compute_metrics_and_visualize(dataset_test_human, model_base, tokenizer_base, device, v2_test_id2label)

Classification Report:
              precision    recall  f1-score   support

           O       0.97      1.00      0.99      2376
  B-MOUNTAIN       1.00      0.72      0.84       163
  I-MOUNTAIN       1.00      0.83      0.91       126

    accuracy                           0.98      2665
   macro avg       0.99      0.85      0.91      2665
weighted avg       0.98      0.98      0.97      2665

