In [None]:
# !pip install mistralai
# !pip install huggingface_hub
# !pip install transformers
# !pip install torch
# !pip install openai
# !pip install seqeval

In [None]:
import os

from huggingface_hub import login
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re
import pickle
from collections import Counter
from openai import OpenAI
from mistralai import Mistral
import random

# Credentials are read from the environment instead of being pasted into the
# notebook. Each lookup falls back to the empty string, which is exactly the
# placeholder value that used to be hard-coded here.
login(os.environ.get("HF_TOKEN", ""))
client_openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
client_mistral = Mistral(api_key=os.environ.get("MISTRAL_API_KEY", ""))

# Turn off the memory-efficient and flash scaled-dot-product attention backends.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

# NOTE: a later cell rebinds the global name `tokenizer` to a BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Set the seed for reproducibility
random.seed(42)


In [None]:


# system_prompt_nepali = "तपाईं एक उत्कृष्ट भाषाविज्ञ हुनुहुन्छ। "

def generate_NER(model_name,prompt,system_prompt="You are an excellent linguist. "):
    """Send ``prompt`` to the selected chat backend and return the reply text.

    Args:
        model_name: Backend selector. Only ``"openai"`` is handled; any other
            value makes no request at all.
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message. Defaults to the English
            instruction used so far; pass a translated instruction to run a
            fully Nepali prompt instead of editing this function.

    Returns:
        The stripped reply text, or ``""`` when ``model_name`` is not
        ``"openai"`` or the reply carries no text content.
    """
    generated_text = ""

    if model_name == "openai":
        completion = client_openai.chat.completions.create(
            model='gpt-4o',
            messages=[
                {'role': 'system', "content": system_prompt},
                {'role': 'user', 'content': prompt},
            ],
            max_tokens=500,
            temperature=0,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            n=1,
        )

        # The message content may be None; treat that as an empty reply
        # instead of crashing on ``None.strip()``.
        content = completion.choices[0].message.content
        generated_text = (content or "").strip()

    return generated_text


In [None]:
!git clone https://github.com/nowalab/everest-ner.git
!git clone https://github.com/nowalab/DanfeNER.git

In [None]:
def parse_file(file_path):
  """Read a whitespace-separated BIO file into parallel token/label lists.

  Every non-blank line contributes its first column as the token and its
  second column as the label; any further columns are ignored. Blank lines
  separate sentences.

  Args:
    file_path: Path of the BIO file; it is decoded as UTF-8.

  Returns:
    A ``(tokens, labels)`` tuple of two lists of lists, one inner list per
    sentence. Runs of blank lines never produce empty sentences.

  Raises:
    ValueError: If a non-blank line has fewer than two columns.
  """
  tokens, labels = [], []
  t, l = [], []
  # Explicit UTF-8: the corpus is Devanagari text, so the platform default
  # encoding must not be relied upon.
  with open(file_path, "r", encoding="utf-8") as f:
    for line_number, line in enumerate(f, start=1):
      tmp = line.split()
      if not tmp:
        # Sentence boundary; ignore it when no tokens were collected, so that
        # consecutive blank lines do not yield empty sentences.
        if t:
          tokens.append(t)
          labels.append(l)
          t, l = [], []
        continue
      if len(tmp) < 2:
        raise ValueError(
          f"{file_path}:{line_number}: expected 'token label', got {line!r}"
        )
      t.append(tmp[0])
      l.append(tmp[1])
  # Flush the final sentence when the file does not end with a blank line.
  if t:
    tokens.append(t)
    labels.append(l)
  return tokens, labels

def get_news_data_sets():
  """Parse the EverestNER train and test BIO files.

  Returns:
    A ``(train_data, test_data)`` tuple where each element is whatever
    ``parse_file`` returns for the corresponding file.
  """
  train_path = "everest-ner/EverestNER-train-bio.txt"
  test_path = "everest-ner/EverestNER-test-bio.txt"
  # Tuple elements are evaluated left to right: train is parsed before test.
  return parse_file(train_path), parse_file(test_path)

def get_tweets_data_sets():
  """Parse the DanfeNER train and test BIO files.

  Returns:
    A ``(train_data, test_data)`` tuple where each element is whatever
    ``parse_file`` returns for the corresponding file.
  """
  train_path = "DanfeNER/DanfeNER-train-bio.txt"
  test_path = "DanfeNER/DanfeNER-test-bio.txt"
  # Tuple elements are evaluated left to right: train is parsed before test.
  return parse_file(train_path), parse_file(test_path)



In [None]:
# Entity-type names; the same strings are used as keys of the per-sentence
# annotation dictionaries built later in the notebook.
entities_list = ["Location", "Date", "Person", "Organization", "Event"]

# English wording for prompts: every entity type is referred to by its own name.
english_entities_dict = {entity_name: entity_name for entity_name in entities_list}

# Nepali wording for prompts, keyed by the entity-type name.
nepali_entities_dict = {
    "Location": "स्थानको नाम",
    "Date": "मिति या समय",
    "Person": "व्यक्तिको नाम",
    "Organization": "सङ्घ संस्थाको नाम",
    "Event": "उत्सव, पर्व या घटनाको नाम",
}

In [None]:
# Parse both corpora from disk; each of the four names holds the
# (sentences, labels) pair returned by parse_file for one file.
news_train,news_test=get_news_data_sets()
tweet_train,tweet_test=get_tweets_data_sets()

In [None]:
# Unpack the parsed news corpus into parallel sentence / label lists.
news_train_sentences, news_train_labels = news_train
news_test_sentences, news_test_labels = news_test
# Cell output: (number of train sentences, number of test sentences).
len(news_train_sentences),len(news_test_sentences)

In [None]:
# Unpack the parsed tweet corpus into parallel sentence / label lists.
tweet_train_sentences, tweet_train_labels = tweet_train
tweet_test_sentences, tweet_test_labels = tweet_test
# Cell output: (number of train sentences, number of test sentences).
len(tweet_train_sentences),len(tweet_test_sentences)

In [None]:
# tweet_train_sentences[0],tweet_train_labels[0]

In [None]:
def process_examples_with_correct_entity_spans(examples):
    """
    Build, for every example, one annotated copy of the sentence per entity type.

    In the copy for a given entity type, each span that starts at a
    ``B-<type>`` label and continues over the directly following ``I-<type>``
    labels is wrapped as ``@@first ... last##``; all other tokens are left
    unchanged. An ``I-<type>`` label that does not continue such a span is
    ignored.

    Args:
        examples (list): ``(tokens, labels)`` pairs; ``labels[i]`` is the BIO
            label of ``tokens[i]``.

    Returns:
        list: One dictionary per example, keyed by "Location", "Date",
              "Person", "Organization" and "Event" (in that order), whose
              values are the space-joined sentence with only that entity type
              annotated.
    """
    entity_types = ("Location", "Date", "Person", "Organization", "Event")
    results = []

    for tokens, labels in examples:
        per_type = {}

        for entity_type in entity_types:
            begin_tag = f"B-{entity_type}"
            inside_tag = f"I-{entity_type}"
            marked = tokens[:]  # Work on a copy so the input tokens stay untouched
            total = len(tokens)
            pos = 0

            while pos < total:
                if labels[pos] != begin_tag:
                    # Not the start of a span of this type (includes stray I- labels)
                    pos += 1
                    continue

                # Extend the span over the run of I- labels that follows the B- label
                end = pos
                while end + 1 < total and labels[end + 1] == inside_tag:
                    end += 1

                marked[pos] = "@@" + marked[pos]
                marked[end] = marked[end] + "##"
                pos = end + 1

            per_type[entity_type] = " ".join(marked)

        results.append(per_type)

    return results




def _build_tagged_dataset(sentences, labels_per_sentence):
    """Map each space-joined sentence to ``[annotations, labels]``.

    ``annotations`` is the per-entity-type dictionary produced by
    ``process_examples_with_correct_entity_spans`` for that sentence and
    ``labels`` is the sentence's original label list. Because the result is
    keyed by the sentence text, a sentence that occurs more than once keeps
    only its last occurrence.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    if len(sentences) != len(labels_per_sentence):
        raise ValueError("sentences and labels_per_sentence differ in length")

    tagged = {}
    for tokens, labels in zip(sentences, labels_per_sentence):
        # The helper returns a one-element list (one dict per example);
        # appending the labels yields the [annotations, labels] entry.
        entry = process_examples_with_correct_entity_spans([(tokens, labels)])
        entry.append(labels)
        tagged[" ".join(tokens)] = entry
    return tagged


train_datasets_with_tagging = _build_tagged_dataset(news_train_sentences, news_train_labels)
test_datasets_with_tagging = _build_tagged_dataset(news_test_sentences, news_test_labels)


In [None]:
# Cell output: number of entries in each tagged dict (keyed by joined sentence text).
len(train_datasets_with_tagging),len(test_datasets_with_tagging)

In [None]:
# Cache the tagged datasets on disk. The relative "output_path/" directory is
# used so that the files land where the loading cell reads them from; the
# directory must already exist.
with open("output_path/train_datasets_with_tagging.pkl", "wb") as file:
    pickle.dump(train_datasets_with_tagging, file)

with open("output_path/test_datasets_with_tagging.pkl", "wb") as file:
    pickle.dump(test_datasets_with_tagging, file)

In [None]:
# Reload the cached tagged datasets, rebinding the in-memory names.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files produced by this notebook.
with open("output_path/train_datasets_with_tagging.pkl", "rb") as file:  # "rb" stands for read binary
    train_datasets_with_tagging = pickle.load(file)

with open("output_path/test_datasets_with_tagging.pkl", "rb") as file:  # "rb" stands for read binary
    test_datasets_with_tagging = pickle.load(file)


In [None]:
import random
def get_random_k_examples_with_entity(k,entity):
    """Sample ``k`` training sentences that contain at least one ``entity`` span.

    A sentence qualifies when its label list contains ``B-<entity>``. Checking
    the labels (rather than looking for "@" in the annotated text) keeps
    sentences that merely contain a literal "@" character from being selected.

    Args:
        k: Number of sentences to draw.
        entity: Entity-type name, e.g. "Person".

    Returns:
        dict mapping each sampled sentence to
        ``[sentence annotated for entity, labels]``.

    Raises:
        ValueError: Raised by ``random.sample`` when ``k`` exceeds the number
            of qualifying sentences.
    """
    begin_tag = f"B-{entity}"
    entity_data = {
        sentence: [entry[0][entity], entry[1]]
        for sentence, entry in train_datasets_with_tagging.items()
        if begin_tag in entry[1]
    }

    # Sampling uses the global `random` state, so results depend on the seed
    # and on how many draws were made before this call.
    selected_keys = random.sample(list(entity_data), k)
    return {key: entity_data[key] for key in selected_keys}

In [None]:
# Draw one fixed pool of 100 random demonstration sentences per entity type.
# The draws go through the global `random` state in `entities_list` order, so
# the pools depend on the seed and on any earlier use of `random`.
entity_random_k_examples = {}
for entity in entities_list:
    entity_random_k_examples[entity] = get_random_k_examples_with_entity(100, entity)

# Persist the pools so that prompt construction can reuse the same examples.
with open("output_path/entity_random_k_examples.pkl", "wb") as file:
    pickle.dump(entity_random_k_examples, file)

In [None]:
import random
def get_random_k_examples_without_entity(k):
    """Sample ``k`` training sentences whose annotations contain an "@".

    For each training sentence the per-entity annotated texts are scanned in
    dictionary order; when at least one of them contains "@", the sentence is
    kept together with the LAST such annotated text and its labels.

    NOTE(review): despite the name, this keeps sentences that DO carry an
    annotation for some entity type -- confirm the intended meaning.

    Args:
        k: Number of sentences to draw with ``random.sample``.

    Returns:
        dict mapping each sampled sentence to ``[annotated text, labels]``.
    """
    pool = {}

    for sentence, entry in train_datasets_with_tagging.items():
        annotations, labels = entry[0], entry[1]
        marked_texts = [text for text in annotations.values() if "@" in text]
        if marked_texts:
            pool[sentence] = [marked_texts[-1], labels]

    chosen = random.sample(list(pool.keys()), k)
    return {sentence: pool[sentence] for sentence in chosen}

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load BERT tokenizer and model
# NOTE: this rebinds the global `tokenizer`, replacing the Llama tokenizer
# that the setup cell loaded under the same name.
tokenizer = BertTokenizer.from_pretrained("path/bert-npvec1/tokenizer")
# output_hidden_states=True is requested here, although the embedding helper
# in this cell only reads `last_hidden_state`.
model = BertModel.from_pretrained("path/bert-npvec1", output_hidden_states=True)

# Function to get sentence embeddings
def get_sentence_embedding(sentence):
    """Embed ``sentence`` with the global BERT ``tokenizer`` and ``model``.

    The input is tokenized (truncated to at most 512 tokens) and run through
    the model without gradient tracking. The hidden state at sequence
    position 0 is squeezed and returned as a NumPy array.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        model_output = model(**encoded)
    # Position 0 of the last hidden layer serves as the sentence vector.
    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.squeeze().numpy()



In [None]:
# One space-joined string per training sentence, in corpus order.
sentences = [" ".join(tokens) for tokens in news_train_sentences]
# Embed every training sentence; row i belongs to sentences[i].
train_embeddings = np.array(list(map(get_sentence_embedding, sentences)))

# Cache the embeddings so that the pass over the training set can be skipped later.
with open("output_path/train_embeddings.pkl", "wb") as file:
    pickle.dump(train_embeddings, file)


In [None]:
# Reload the cached training-sentence embeddings written by the previous cell.
with open("output_path/train_embeddings.pkl", "rb") as file:  # "rb" stands for read binary
    train_embeddings = pickle.load(file)

In [None]:

# Function to get top k most similar sentences
def get_top_k_similar_sentences(query, embeddings,sentences, k,train_datasets_with_tagging,entity):
    """Return the ``k`` candidate sentences most similar to ``query``.

    Args:
        query: Sentence to embed with ``get_sentence_embedding``.
        embeddings: Candidate embeddings; ``embeddings[i]`` belongs to
            ``sentences[i]``.
        sentences: Candidate sentences (keys of ``train_datasets_with_tagging``).
        k: Maximum number of neighbours to return. Values ``<= 0`` return an
            empty dict (previously ``k == 0`` returned every candidate because
            the slice ``[-0:]`` selects the whole array).
        train_datasets_with_tagging: Mapping sentence -> ``[annotations, labels]``.
        entity: Entity-type name selecting which annotated text to return.

    Returns:
        dict ordered from most to least similar, mapping each neighbour
        sentence to ``[sentence annotated for entity, labels]``. When fewer
        than ``k`` candidates exist, all of them are returned.
    """
    if k <= 0:
        return {}

    # Compute embedding for the query sentence
    query_embedding = get_sentence_embedding(query)

    # Compute cosine similarity between query and all sentences
    similarities = cosine_similarity([query_embedding], embeddings)[0]

    # Indices of the k largest similarities, in descending order of similarity
    top_k_indices = similarities.argsort()[-k:][::-1]

    final_dict = {}
    for idx in top_k_indices:
        sentence = sentences[idx]
        entry = train_datasets_with_tagging[sentence]
        final_dict[sentence] = [entry[0][entity], entry[1]]
    return final_dict



In [None]:
# For every entity type, list the training sentences that contain at least one
# span of that type. Membership is decided from the BIO labels (a "B-<type>"
# label) rather than by looking for "@" in the annotated text, so a sentence
# that merely contains a literal "@" character is not picked up.
entity_sentences = {}
for entity in entities_list:
    begin_tag = f"B-{entity}"
    entity_sentences[entity] = [
        sentence
        for sentence, entry in train_datasets_with_tagging.items()
        if begin_tag in entry[1]
    ]

In [None]:
sentences = [" ".join(tokens) for tokens in news_train_sentences]

# First position of every distinct training sentence. This matches what
# `sentences.index(...)` returns, without rescanning the list per lookup.
first_index = {}
for idx, sentence in enumerate(sentences):
    first_index.setdefault(sentence, idx)

# For each entity type and each test sentence, retrieve up to 100 nearest
# training sentences among those that contain the entity type.
entity_NN_k_examples_testsentences = {}

for entity in entities_list:
    selected_sentences = entity_sentences[entity]
    selected_embeddings = [train_embeddings[first_index[s]] for s in selected_sentences]
    entity_wise = {}
    for count, test_sentence in enumerate(test_datasets_with_tagging, start=1):
        entity_wise[test_sentence] = get_top_k_similar_sentences(
            test_sentence,
            selected_embeddings,
            selected_sentences,
            100,
            train_datasets_with_tagging,
            entity,
        )
        # Lightweight progress report every 100 test sentences.
        if count % 100 == 0:
            print(entity, count)
    entity_NN_k_examples_testsentences[entity] = entity_wise

with open("output_path/entity_NN_k_examples_testsentences.pkl", "wb") as file:
    pickle.dump(entity_NN_k_examples_testsentences, file)

In [None]:
# Reload the cached random demonstration pools (entity type -> sentence -> [annotated, labels]).
with open("output_path/entity_random_k_examples.pkl", "rb") as file:  # "rb" stands for read binary
    entity_random_k_examples = pickle.load(file)

In [None]:
# Reload the cached nearest-neighbour pools from the same relative path the
# notebook writes them to, instead of a machine-specific absolute path.
with open("output_path/entity_NN_k_examples_testsentences.pkl", "rb") as file:  # "rb" stands for read binary
    entity_NN_k_examples_testsentences = pickle.load(file)

In [None]:
def create_prompt_with_k_NN_examples(k,entity,entity_prompt,test_sentence):
    """Build the labelling prompt with the first ``k`` nearest-neighbour examples.

    Examples come from the global ``entity_NN_k_examples_testsentences``
    (entity -> test sentence -> neighbour sentence -> [annotated, labels]) in
    its stored order.

    Args:
        k: Number of demonstrations. ``0`` builds a zero-shot prompt and does
            not touch the neighbour pool, so it works for any sentence. A
            negative value includes every stored neighbour.
        entity: Entity-type name used to look up the neighbour pool.
        entity_prompt: Wording for the entity type inserted into the instruction.
        test_sentence: Sentence to be labelled.

    Returns:
        The prompt string.

    Raises:
        KeyError: If ``k != 0`` and no neighbours are stored for
            ``entity`` / ``test_sentence``.
    """
    # English variant of the instruction:
    # person_prompt ="The task is to label " + entity_prompt + " entities in the given Nepali sentence."
    person_prompt ="गर्नुपर्ने काम भनेको दिइएको नेपाली वाक्यमा " + entity_prompt + "लाई @@ ## भित्र लेबल गर्नु हो।"
    if k > 0:
        # English variant of the few-shot preamble:
        #  person_prompt+=" Below are some examples with Input and Output pairs. For the prediction, you should generate the output in the same format as in the examples. Do not give any explanations. \n Examples:"
        person_prompt+="तल वाक्य र लेबल गरेका नतिजाका केही उदाहरणहरू दिइएका छन्। वाक्यलाई लेबल गर्दा उदाहरणको जस्तै ढाँचामा मात्र गर्नुहोस्। कुनै थप व्याख्या नगर्नुहोस्। \n उदाहरणहरू:  "
    else:
        person_prompt+=" Output the whole sentence and enclose the entity within @@ and ##."
        # Nepali variant of the zero-shot instruction:
        #  person_prompt+=" पुरा वाक्यनै नतिजामा राख्नुहोस् र लेबललाई @@ र ## भित्र राख्नुहोस्।"

    # Collect demonstrations only when some are requested; the zero-shot path
    # must not require an entry in the neighbour pool.
    k_nn_examples=[]
    if k != 0:
        neighbours = entity_NN_k_examples_testsentences[entity][test_sentence]
        for sentence in neighbours:
            k_nn_examples.append([sentence, neighbours[sentence][0]])
            if len(k_nn_examples) == k:
                break

    if k==0:
        person_prompt+="\n" + "Input: " + test_sentence + "\n" + "Output: "
        # Nepali variant of the zero-shot input/output cue:
        # person_prompt += "\n" + "वाक्य: " + test_sentence + "\n" + "नतिजा: "
    else:
        for example in k_nn_examples:
            # English variant of one demonstration:
            # person_prompt+="\n" + "Input: " + i[0] + "\n" + "Output: "+ i[1] + "\n"
            person_prompt += "\n" + "वाक्य: " + example[0] + "\n" + "नतिजा: "+ example[1] + "\n"
        # English variant of the final request:
        # person_prompt+="\n" + " Now predict the output for the following input sentence. \n Input: " + test_sentence + "\n"
        person_prompt += "\n" + "अब तल दिईएको वाक्यलाई लेबल गर्नुहोस्।  \nवाक्य: " + test_sentence + "\n"
    return person_prompt

In [None]:


def create_prompt_with_k_random_examples(k,entity,entity_prompt,test_sentence):
    """Build the labelling prompt with the first ``k`` pre-sampled random examples.

    Examples come from the global ``entity_random_k_examples``
    (entity -> sentence -> [annotated, labels]) in its stored order.

    Args:
        k: Number of demonstrations. ``0`` builds a zero-shot prompt and does
            not touch the example pool. A negative value includes every
            stored example.
        entity: Entity-type name used to look up the example pool.
        entity_prompt: Wording for the entity type inserted into the instruction.
        test_sentence: Sentence to be labelled.

    Returns:
        The prompt string.

    Raises:
        KeyError: If ``k != 0`` and no pool is stored for ``entity``.
    """
    # English variant of the instruction:
    # person_prompt ="The task is to label " + entity_prompt + " entities in the given Nepali sentence."
    person_prompt ="गर्नुपर्ने काम भनेको दिइएको नेपाली वाक्यमा " + entity_prompt + "लाई @@ ## भित्र लेबल गर्नु हो।"
    if k > 0:
        # English variant of the few-shot preamble:
        #  person_prompt+="  Below are some examples with Input and Output pairs. \nFor the prediction, you should generate the output in the same format as in the examples.  Do not give any explanations. \nExamples:"
        person_prompt+="तल वाक्य र लेबल गरेका नतिजाका केही उदाहरणहरू दिइएका छन्।\nवाक्यलाई लेबल गर्दा उदाहरणको जस्तै ढाँचामा मात्र गर्नुहोस्। कुनै थप व्याख्या नगर्नुहोस्। \nउदाहरणहरू: "
    else:
        person_prompt+=" Output the whole sentence and enclose the entity within @@ and ##."
        # Nepali variant of the zero-shot instruction:
        # person_prompt+="पुरा वाक्यनै नतिजामा राख्नुहोस् र लेबललाई @@ र ## भित्र राख्नुहोस्।"

    # Collect demonstrations only when some are requested; the zero-shot path
    # must not require the example pool to exist for this entity.
    random_k_examples=[]
    if k != 0:
        pool = entity_random_k_examples[entity]
        for sentence in pool:
            random_k_examples.append([sentence, pool[sentence][0]])
            if len(random_k_examples) == k:
                break

    if k==0:
        person_prompt+="\n" + "Input: " + test_sentence + "\n" + "Output: "
        # Nepali variant of the zero-shot input/output cue:
        # person_prompt += "\n" + "वाक्य: " + test_sentence + "\n" + "नतिजा: "
    else:
        for example in random_k_examples:
            # English variant of one demonstration:
            # person_prompt+="\n" + "Input: " + i[0] + "\n" + "Output: "+ i[1] + "\n"
            person_prompt += "\n" + "वाक्य: " + example[0] + "\n" + "नतिजा: "+ example[1] + "\n"
        # English variant of the final request:
        # person_prompt+= "Now predict the output for the following input sentence. \nInput: " + test_sentence + "\n"
        person_prompt +=  "अब तल दिईएको वाक्यलाई लेबल गर्नुहोस्। \nवाक्य: " + test_sentence + "\n"
    return person_prompt

In [None]:
# Smoke test: print the zero-shot (k=0) Person prompt for one sample sentence.
print(create_prompt_with_k_random_examples(0,"Person",nepali_entities_dict["Person"],"उपाधि दौड मा रहे को इंग्लिस प्रिमियर लिग फुटबल क्लब लिभरपुल एफए कप को तेस्रो चरण बाटै बाहिरिए को छ ।"))
# Disabled model call; note that `a` is not defined anywhere in the visible notebook.
# print(generate_NER("openai",a))