In [1]:
%env CUDA_VISIBLE_DEVICES=2

env: CUDA_VISIBLE_DEVICES=2


In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


def load_model_and_tokenizer(model_name_or_path):
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(model_name_or_path).to("cuda")
    return tokenizer, model


def infer(model, tokenizer, text):
    # Tokenize the input text
    tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to("cuda")

    # Get predictions
    outputs = model(**tokens)
    predictions = outputs.logits.argmax(dim=-1).squeeze().tolist()

    # Get labels from the tokenizer
    labels = [model.config.id2label[pred] for pred in predictions]

    # Combine tokens and labels
    tokens = tokenizer.convert_ids_to_tokens(tokens.input_ids.squeeze().tolist())
    results = list(zip(tokens, labels))

    return results


def format_predictions(predictions):
    formatted_result = []
    for token, label in predictions:
        if token not in ["[CLS]", "[SEP]", "[PAD]"]:
            formatted_result.append((token, label))
    return formatted_result


def get_ner(formatted_predictions):
    text = ""
    actual_label = ""
    labels = []

    for token, label in formatted_predictions:
        if label.startswith("B"):
            if text:
                labels.append((text, actual_label))
            actual_label = label.split("-")[1]
            text = token.replace("▁", " ")
        elif text != "" and label.startswith("I") and actual_label == label.split("-")[1]:
            text += token.replace("▁", " ")
        else:
            if text and actual_label:
                labels.append((text.strip(), actual_label))
            text = ""
            actual_label = ""

    # Ensure the last entity is added
    if text and actual_label:
        labels.append((text.strip(), actual_label))

    return labels

2024-05-28 11:42:57.927312: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-28 11:42:57.950261: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load your trained model and tokenizer
model_name_or_path = "/mnt/sdb/stt/mata/models/ner_models/train/xlm-roberta-large-finetuned-conll03-english-hug-28-05-2024/xlm-roberta-large"
tokenizer, model = load_model_and_tokenizer(model_name_or_path)


In [4]:
# Example text for inference
text = "Shavkat Mirziyoyev Rossiya bilan hamkorlikda kam quvvatli AES qurilishi haqida gapirdi."

In [5]:
# Perform inference
predictions = infer(model, tokenizer, text)

# Format predictions
formatted_predictions = format_predictions(predictions)
ner = get_ner(formatted_predictions)
ner

[(' Shavkat Mirziyoyev', 'PERSON'), ('Rossiya', 'GPE')]