In [3]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
#from keras import ops
import numpy as np
import tensorflow as tf
from keras import layers
#from datasets import load_dataset
from collections import Counter
# from conlleval import evaluate

In [4]:
print(keras.__version__)


2.10.0


## Utility Classes

In [5]:
class ops:
    @staticmethod
    def shape(tensor):
        return tf.shape(tensor)

    @staticmethod
    def arange(start, stop, step=1):
        return tf.range(start, stop, step)

## Build layers

Let's start by defining a TransformerBlock layer:

In [6]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

Next, let's define a TokenAndPositionEmbedding layer:

In [7]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = ops.shape(inputs)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

### Build the NER model class as a keras.Model subclass


In [8]:
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

## Make the NER label lookup table


In [9]:
def make_tag_lookup_table():
    
    iob_labels = ["B", "I"]
    ner_labels = ["PER", "ORG", "LOC", "EVE",'NUM','MON','LAN','TIME']
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))


mapping = make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-EVE', 9: 'I-EVE', 10: 'B-NUM', 11: 'I-NUM', 12: 'B-MON', 13: 'I-MON', 14: 'B-LAN', 15: 'I-LAN', 16: 'B-TIME', 17: 'I-TIME'}


Get a list of all tokens in the training dataset. This will be used to create the vocabulary.

In [12]:
training_set_elements = []
with open('C:/Users/TESTUSER/Desktop/UniversityCoursesFiles/uniYear4/AI/ArabicNamedEntityRecognition/code/data/train.txt') as my_file:
    training_set_elements = my_file.readlines()

for i in range(20):
    print(training_set_elements[i])

طµظˆط±ط© O

ط¹ظ…ظ„ط© O

ظˆط±ظ‚ظٹط© O

ظ…ظ† O

ظپط¦ط© O

500 B-MONEY

ظ…ظ„ط² I-MONEY

ط®ظ„ط§ظ„ O

ظپطھط±ط© O

ط§ظ„ط§ظ†طھط¯ط§ط¨ B-EVENT

ط§ظ„ط¨ط±ظٹط·ط§ظ†ظٹ I-EVENT

ط¹ظ„ظ‰ I-EVENT

ظپظ„ط³ط·ظٹظ† I-EVENT

. O

 

ط±ط³ط§ظ„ط© O

ط§ظ„ط´ظٹط® O

ط¹ط«ظ…ط§ظ† B-PERS

ط²ظ‚ظˆطھ I-PERS

ظپظٹ O


In [11]:
all_tokens = sum(conll_data["train"]["tokens"], [])
all_tokens_array = np.array(list(map(str.lower, all_tokens)))

counter = Counter(all_tokens_array)
print(len(counter))

num_tags = len(mapping)
vocab_size = 20000

# We only take (vocab_size - 2) most commons words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# The StringLook class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)

NameError: name 'conll_data' is not defined