In [1]:
!pip install tokenizers -q
!pip install git+https://github.com/csebuetnlp/normalizer
!pip install transformers -q

Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-w__hc9dt
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-w__hc9dt
  Resolved https://github.com/csebuetnlp/normalizer to commit d405944dde5ceeacb7c2fd3245ae2a9dea5f35c9
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji==1.4.2 (from normalizer==0.0.1)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy==6.0.3 (from normalizer==0.0.1)
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nor

In [2]:
from normalizer import normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle
import torch
from transformers import ElectraTokenizer

In [None]:

# Path handed to ElectraTokenizer.from_pretrained below
# (looks like a training checkpoint directory -- TODO confirm).
model_name = '/content/drive/MyDrive/Thesis/BERTOUTPUT/checkpoint-11000/'
# Module-level tokenizer; Gtoken() reads this global when it tokenizes text.
tokenizer = ElectraTokenizer.from_pretrained(model_name)

In [3]:
class TrieNode:
    """A single node of the character trie.

    Attributes:
        children: maps one character to the child ``TrieNode`` reached by it.
        is_end_of_word: True when the path from the root to this node spells
            a complete inserted entry.
        entity_type: entity tag stored by ``Trie.insert`` on terminal nodes;
            ``None`` on nodes that do not end an entry.
    """

    def __init__(self):
        self.children = {}
        self.is_end_of_word = False
        # Declared here so every node has the attribute; previously it only
        # existed on nodes that Trie.insert had marked as terminal.
        self.entity_type = None

class Trie:
    """Character-level trie used as a gazetteer for BIO entity tagging.

    Entries are inserted character by character together with an entity type.
    ``search`` scans a string and returns one one-hot tag vector per character.
    """

    def __init__(self):
        self.root = TrieNode()
        self.entity_tags = ["PER", "LOC", "CW", "CORP", "GRP", "PROD"]
        # Tag name -> integer id; the id is the index set to 1 in the one-hot vector.
        self.tag_encoding = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "I-LOC": 4,
                             "B-CW": 5, "I-CW": 6, "B-CORP": 7, "I-CORP": 8,
                             "B-GRP": 9, "I-GRP": 10, "B-PROD": 11, "I-PROD": 12}

    def insert(self, word, entity_type):
        """Add ``word`` to the trie and label it with ``entity_type``.

        Inserting the same ``word`` again overwrites the stored entity type.

        Raises:
            ValueError: if ``tag_encoding`` has no ``B-``/``I-`` tag for
                ``entity_type``. Without this check the entry would be accepted
                and ``search`` would fail with a KeyError on the first match.
        """
        begin_tag = "B-{}".format(entity_type)
        inside_tag = "I-{}".format(entity_type)
        if begin_tag not in self.tag_encoding or inside_tag not in self.tag_encoding:
            raise ValueError("unknown entity type: {!r}".format(entity_type))

        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True
        node.entity_type = entity_type

    def search(self, sentence):
        """Tag every character of ``sentence`` and return one-hot vectors.

        Returns:
            A list with one entry per character of ``sentence``; each entry is
            a list of 0/1 ints with a single 1 at the character's tag id.

        Matching rules (unchanged from the original implementation):
          * for each start position the walk stops at the FIRST complete
            entry, so the shortest entry starting there wins;
          * every start position is tried, so a match that begins inside an
            earlier match overwrites the tags written by that earlier match.
        NOTE(review): longest-match / non-overlapping tagging may have been
        intended; confirm before changing, since it alters the produced features.
        """
        outside_id = self.tag_encoding.get("O", 0)
        tag_ids = [outside_id] * len(sentence)

        for start in range(len(sentence)):
            node = self.root
            for end in range(start, len(sentence)):
                node = node.children.get(sentence[end])
                if node is None:
                    break
                if node.is_end_of_word:
                    entity_type = node.entity_type
                    tag_ids[start] = self.tag_encoding["B-" + entity_type]
                    inside_id = self.tag_encoding["I-" + entity_type]
                    for k in range(start + 1, end + 1):
                        tag_ids[k] = inside_id
                    break

        # Vector width follows the tag table instead of a hard-coded 13.
        width = max(self.tag_encoding.values()) + 1
        one_hot_encoding = [[0] * width for _ in tag_ids]
        for position, tag in enumerate(tag_ids):
            one_hot_encoding[position][tag] = 1
        return one_hot_encoding

import numpy as np

def group_encodings_by_word(encoding, sentence):
    """Reduce per-character encodings to one encoding per word.

    ``encoding`` and ``sentence`` are walked in parallel. Words are runs of
    characters separated by the space character ``" "`` only; for each word the
    encoding of its first character is kept.

    Returns:
        A NumPy array stacking the kept encodings in word order. When there
        are no words the result is an empty 1-D array.
    """
    first_char_encodings = []
    inside_word = False

    for char_encoding, char in zip(encoding, sentence):
        if char == " ":
            # A space closes whatever word was open.
            inside_word = False
        elif not inside_word:
            # First character of a new word: this is the encoding we keep.
            first_char_encodings.append(np.array(char_encoding))
            inside_word = True

    return np.array(first_char_encodings)

import pickle

def save_trie(trie, filename):
    """Pickle ``trie`` into the file at ``filename`` (opened in binary write mode)."""
    with open(filename, "wb") as sink:
        pickle.Pickler(sink).dump(trie)

def load_trie(filename):
    """Unpickle and return the object stored in the file at ``filename``.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, "rb") as source:
        return pickle.Unpickler(source).load()

if __name__ == "__main__":
    # Small self-contained demo: build a trie, tag a sentence, print the result.
    trie = Trie()

    # (surface form, entity type) pairs; "Apple" appears twice, so the later
    # PROD entry overwrites the earlier CORP one.
    demo_entities = [
        ("John", "PER"),
        ("New York", "LOC"),
        ("iPhone 14", "PROD"),
        ("টোকিও বিশ্ববিদ্যালয়", "CORP"),
        ("Apple", "CORP"),
        ("Apple", "PROD"),
    ]
    for surface_form, entity_type in demo_entities:
        trie.insert(surface_form, entity_type)

    # Tag the sentence character by character and print each character's vector.
    sentence = "John টোকিও বিশ্ববিদ্যালয় এ পড়ে and owns an iPhone 14 made by Apple, not Microsoft."
    encoding = trie.search(sentence)
    for char_encoding, char in zip(encoding, sentence):
        print(char_encoding, char)

    # Collapse to one vector per space-separated word and print word/vector pairs.
    word_encodings = group_encodings_by_word(encoding, sentence)
    for token, token_encoding in zip(sentence.split(), word_encodings):
        print(token, token_encoding)

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] J
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] o
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] h
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] n
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] ট
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ো
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ক
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ি
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ও
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]  
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ব
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ি
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] শ
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ্
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ব
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ব
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ি
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] দ
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ্
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] য
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] া
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] ল
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 

In [None]:
# Replace the demo trie built above with a previously pickled trie read from this path.
# NOTE(review): load_trie unpickles the file, so only point it at a file you trust.
trie = load_trie('/content/drive/MyDrive/Bracu/THESIS/Trie DS/Trie old.bin')


In [None]:

def Gtoken(text):
  """Tokenize ``text`` with the module-level ``tokenizer`` and wrap the result.

  Returns:
      The tokens joined by single spaces and wrapped as ``"< tok tok ... >"``.
      Text that yields no tokens gives ``"<  >"``.

  Special tokens are not added, and token ids 0 and 1 are dropped from the
  result, as before.
  NOTE(review): ids 0/1 are presumably [PAD]/[UNK] in this vocabulary, which
  means unknown tokens vanish silently -- confirm that this is intended.
  """
  # Encode once, without special tokens and without padding. The previous
  # version tokenized the text twice (the first result was unused) and padded
  # to 314 ids only to filter the padding out again.
  token_ids = tokenizer.encode(text, add_special_tokens=False)
  tokenized = tokenizer.convert_ids_to_tokens([tid for tid in token_ids if tid > 1])
  return "< "+" ".join(tokenized)+" >"

In [None]:
%cd /content

/content


In [None]:
# Extend the loaded trie with one entity list per type, read from '<TYPE>.txt'
# in the current working directory (one entity per line).
entity_tags = ["PER", "LOC", "CW", "CORP", "GRP", "PROD"]
for entity in entity_tags:
    with open('{}.txt'.format(entity), 'r', encoding = 'utf-8') as file:
        print(entity)
        for line in file:
            # Gtoken returns "< tok tok >". Trie.search can only start a match
            # on a key's first character, so a key that keeps the "< " / " >"
            # wrapper can never match inside a (wrapped) sentence. Strip it.
            key = Gtoken(normalize(line.strip()))[2:-2]
            if key:  # skip blank lines / entries that tokenize to nothing
                trie.insert(key, entity)

# save_trie(tree, "Trie.bin")

PER
LOC
CW
CORP
GRP
PROD


In [None]:
%cd /content/drive/MyDrive/Bracu/THESIS/Trie DS/

/content/drive/MyDrive/Bracu/THESIS/Trie DS


In [None]:

# Build a fresh trie from the knowledge-base lists under 'KE/' (relative to the
# current working directory) and pickle it to /content/Trie.bin.
tree = Trie()
entity_tags = ["PER", "LOC", "CW", "CORP", "GRP", "PROD"]
for entity in entity_tags:
    # Two source files per entity type, read in this order.
    entity_files = ('KE/{}/{}.txt'.format(entity, entity), 'KE/{}.txt'.format(entity))
    for entity_file in entity_files:
        with open(entity_file, 'r', encoding = 'utf-8') as file:
            print(entity)
            for line in file:
                # Gtoken returns "< tok tok >". Trie.search can only start a
                # match on a key's first character, so a key that keeps the
                # "< " / " >" wrapper can never match inside a (wrapped)
                # sentence. Strip it.
                key = Gtoken(normalize(line.strip()))[2:-2]
                if key:  # skip blank lines / entries that tokenize to nothing
                    tree.insert(key, entity)

save_trie(tree, "/content/Trie.bin")


PER
PER
LOC
LOC
CW
CW
CORP
CORP
GRP
GRP
PROD
PROD


In [None]:
# Tokenize one sentence, tag it with the loaded trie and show one vector per token.
sentence = Gtoken(normalize("আন্দ্রেয়া নাভাগেরো এবং অ্যাগোস্টিনো বেজানোর প্রতিকৃতি বাড়ীর ছোট বউ অ্যামিটি ইনস্টিটিউট অফ বায়োটেকনোলজি"))
encoding = trie.search(sentence)

# Inserted AFTER the search above, so it does not affect this cell's output.
# The "< " / " >" wrapper added by Gtoken is stripped: with the wrapper the key
# could only ever match a sentence consisting of exactly this one entity.
extra_key = Gtoken(normalize('কুত্তা'))[2:-2]
if extra_key:
    trie.insert(extra_key, 'PER')

word_encodings = group_encodings_by_word(encoding,sentence)

# Print each token next to its one-hot tag vector.
for word, word_encoding in zip(sentence.split(), word_encodings):
    print(word, word_encoding)

print(word_encodings.shape)

< [1 0 0 0 0 0 0 0 0 0 0 0 0]
আন্দ্রে [0 0 0 0 0 1 0 0 0 0 0 0 0]
##য়া [1 0 0 0 0 0 0 0 0 0 0 0 0]
নাভ [1 0 0 0 0 0 0 0 0 0 0 0 0]
##াগের [1 0 0 0 0 0 0 0 0 0 0 0 0]
##ো [1 0 0 0 0 0 0 0 0 0 0 0 0]
এবং [1 0 0 0 0 0 0 0 0 0 0 0 0]
অ্যা [1 0 0 0 0 0 0 0 0 0 0 0 0]
##গো [1 0 0 0 0 0 0 0 0 0 0 0 0]
##স্টি [1 0 0 0 0 0 0 0 0 0 0 0 0]
##নো [1 0 0 0 0 0 0 0 0 0 0 0 0]
বেজ [1 0 0 0 0 0 0 0 0 0 0 0 0]
##ানোর [1 0 0 0 0 0 0 0 0 0 0 0 0]
প্রতিকৃতি [0 0 0 0 0 1 0 0 0 0 0 0 0]
বাড়ীর [1 0 0 0 0 0 0 0 0 0 0 0 0]
ছোট [0 0 0 0 0 1 0 0 0 0 0 0 0]
বউ [0 0 0 0 0 1 0 0 0 0 0 0 0]
অ্যাম [0 0 0 0 0 1 0 0 0 0 0 0 0]
##িটি [0 0 0 0 0 0 1 0 0 0 0 0 0]
ইনস্টিটিউট [0 0 0 0 0 1 0 0 0 0 0 0 0]
অফ [1 0 0 0 0 0 0 0 0 0 0 0 0]
বায়ো [1 0 0 0 0 0 0 0 0 0 0 0 0]
##টেক [1 0 0 0 0 0 0 0 0 0 0 0 0]
##নো [1 0 0 0 0 0 0 0 0 0 0 0 0]
##লজি [1 0 0 0 0 0 0 0 0 0 0 0 0]
> [1 0 0 0 0 0 0 0 0 0 0 0 0]
(26, 13)


In [None]:
def Gazetteer(sentence, pad = True, dtype ='torch', max_len = 64, pad_value = -100.):
  """Return gazetteer features (one one-hot tag vector per token) for ``sentence``.

  The sentence is normalized, tokenized with ``Gtoken``, tagged character by
  character with the module-level ``trie`` and reduced to one vector per token.

  Args:
      sentence: raw input text.
      pad: when True the result always has ``max_len`` rows: longer inputs are
          truncated, shorter ones are filled with ``pad_value``.
      dtype: ``'numpy'`` -> NumPy array, ``'tensorflow'`` -> TensorFlow tensor,
          anything else -> torch tensor.
      max_len: number of rows of the padded result (default 64, as before).
      pad_value: fill value for padding rows (default -100., as before).

  Returns:
      A 2-D array/tensor of shape ``(rows, num_tags)``. Padded results are
      float32; unpadded results keep the integer one-hot values.

  Fixes over the previous version:
    * ``tensor.size(0)`` was called on a NumPy array, where ``size`` is an int,
      so the ``pad=True`` path raised TypeError;
    * ``pad=False, dtype='numpy'`` referenced the undefined ``padded_tensor``;
    * the truncation path ignored ``dtype``;
    * ``pad=False, dtype='torch'`` returned a NumPy array, not a torch tensor.
  """
  sentence = Gtoken(normalize(sentence))
  encoding = trie.search(sentence)
  features = group_encodings_by_word(encoding, sentence)

  # Width of one tag vector; 13 is the width Trie.search has always produced.
  num_tags = len(encoding[0]) if encoding else 13
  if features.ndim != 2:
    # No tokens at all: group_encodings_by_word returns an empty 1-D array.
    features = features.reshape(0, num_tags)

  if pad:
    padded = np.full((max_len, num_tags), pad_value, dtype=np.float32)
    kept = min(features.shape[0], max_len)
    padded[:kept] = features[:kept]
    features = padded

  if dtype == 'numpy':
    return features
  elif dtype == 'tensorflow':
    import tensorflow as tf
    return tf.convert_to_tensor(features)
  else:
    return torch.from_numpy(features)