# Recurrent neural network

### Import the data, prepare it for training, and train the BiLSTM tagger

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import warnings
import numpy as np

# Silence every Python warning for the rest of the kernel session.
# NOTE(review): the "ignore" action is applied with no category or module
# restriction, so warnings from any library are hidden as well — consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

# Load and preprocess data
def load_data(file_path):
    """Read a token-per-line tagged corpus into parallel sentence/label lists.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``word tag``. Blank (or whitespace-only) lines separate sentences.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences[i]`` is the list of words
        of the i-th sentence and ``labels[i]`` is the list of tags aligned
        with it, position by position.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
        OSError: If the file cannot be opened.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank line: close the sentence collected so far. Consecutive
                # blank lines must not produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'word tag', got {line.rstrip()!r}"
                )
            word, tag = fields
            sentence.append(word)
            label.append(tag)
    # The file may end without a trailing blank line; without this flush the
    # final sentence would be silently dropped.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

data, label_data = load_data("../data/train_cleaned.txt")
test_data, test_label_data = load_data("../data/test_cleaned.txt")
val_data, val_label_data = load_data("../data/val_cleaned.txt")

# Prepare vocabulary and labels
# Both inventories come from the training split only. They are sorted so that
# the index given to each word/tag is the same on every run; iterating a set
# of strings directly yields an order that can change between interpreter
# sessions, which made the mappings (and the trained weights that depend on
# them) irreproducible.
all_words = sorted({word for sentence in data for word in sentence})
all_tags = sorted({tag for tags in label_data for tag in tags})

# Real words start at index 2; 0 and 1 are reserved for the padding and
# unknown-word placeholders.
word2idx = {word: idx + 2 for idx, word in enumerate(all_words)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1
idx2word = {idx: word for word, idx in word2idx.items()}

# NOTE(review): tag indices start at 0, so index 0 belongs to a real tag. If
# label sequences are padded with 0 later on, padded positions are
# indistinguishable from that tag — confirm this is intended.
label2idx = {tag: idx for idx, tag in enumerate(all_tags)}
idx2label = {idx: tag for tag, idx in label2idx.items()}

# Convert data to indices
def convert_to_indices(sentences, labels, word2idx, label2idx):
    """Map words and tags to their integer ids.

    Args:
        sentences: Iterable of word sequences.
        labels: Iterable of tag sequences.
        word2idx: Word-to-id mapping; must contain the key ``"<UNK>"``, whose
            id is used for any word missing from the mapping.
        label2idx: Tag-to-id mapping; every tag in ``labels`` must be a key.

    Returns:
        ``(X, y)``: two lists of lists of ids, shaped like ``sentences`` and
        ``labels`` respectively.

    Raises:
        KeyError: If a tag is not present in ``label2idx``.
    """
    X = []
    for tokens in sentences:
        token_ids = []
        for token in tokens:
            # Out-of-vocabulary words fall back to the <UNK> id.
            token_ids.append(word2idx.get(token, word2idx["<UNK>"]))
        X.append(token_ids)

    y = []
    for tag_sequence in labels:
        y.append([label2idx[tag_name] for tag_name in tag_sequence])

    return X, y

# Turn every split into id sequences using the training-derived mappings.
X_train, y_train = convert_to_indices(
    sentences=data, labels=label_data, word2idx=word2idx, label2idx=label2idx
)
X_val, y_val = convert_to_indices(
    sentences=val_data, labels=val_label_data, word2idx=word2idx, label2idx=label2idx
)
X_test, y_test = convert_to_indices(
    sentences=test_data, labels=test_label_data, word2idx=word2idx, label2idx=label2idx
)

# Pad sequences
max_len = 100


def _pad_post(sequences):
    """Bring every sequence to exactly ``max_len`` ids, padding at the end."""
    return pad_sequences(sequences, maxlen=max_len, padding="post")


# Inputs and labels of a split are padded the same way so they stay aligned.
X_train, y_train = _pad_post(X_train), _pad_post(y_train)
X_val, y_val = _pad_post(X_val), _pad_post(y_val)
X_test, y_test = _pad_post(X_test), _pad_post(y_test)

# Build the model
# Layer stack: word embedding -> dropout -> bidirectional LSTM that emits one
# vector per time step -> per-time-step softmax over the tag inventory.
vocab_size = len(word2idx)
num_tags = len(label2idx)

tagger_layers = [
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    Dropout(0.3),
    Bidirectional(LSTM(units=64, return_sequences=True)),
    TimeDistributed(Dense(num_tags, activation="softmax")),
]
model = Sequential(tagger_layers)

# The sparse loss lets the targets stay as integer tag ids (no one-hot encoding).
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Train the model
# Append a trailing axis of size 1 to the label arrays before fitting.
y_train = y_train[..., np.newaxis]
y_val = y_val[..., np.newaxis]

fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,
)
model.fit(X_train, y_train, **fit_options)

# Evaluate the model
y_test = np.expand_dims(y_test, -1)


def report_metrics(split_name, X, y):
    """Print Keras loss/accuracy for one split plus a padding-free accuracy.

    The accuracy returned by ``model.evaluate`` is averaged over every
    position of the fixed-length sequences, padded ones included, so it
    overstates how well real tokens are tagged. The extra figure printed here
    only counts positions whose input id is not 0, the ``<PAD>`` id.

    Args:
        split_name: Label used in the printed lines (e.g. ``"Test"``).
        X: Padded input id array for the split.
        y: Label id array for the split, with or without a trailing axis of
            size 1.

    Returns:
        The list returned by ``model.evaluate`` (loss first, then accuracy).
    """
    results = model.evaluate(X, y)
    print(f"{split_name} metrics:")
    print(f"{split_name} Loss:", results[0])
    print(f"{split_name} Accuracy:", results[1])

    predicted = np.argmax(model.predict(X, verbose=0), axis=-1)
    gold = np.reshape(y, predicted.shape)
    real_tokens = X != 0
    # Guard against a split with no real tokens (the mean would be NaN).
    if real_tokens.any():
        token_accuracy = float(np.mean(predicted[real_tokens] == gold[real_tokens]))
        print(f"{split_name} Accuracy (padding excluded):", token_accuracy)
    return results


results = report_metrics("Test", X_test, y_test)
results = report_metrics("Validation", X_val, y_val)

# Save the model
model.save("../model/arabic_ner_model.h5")

# Save mappings
# The word and label lookups are stored next to the model so that inference
# code can rebuild the exact encoding used during training.
for pickle_path, mapping in (
    ("../model/word2idx.pkl", word2idx),
    ("../model/idx2label.pkl", idx2label),
):
    with open(pickle_path, "wb") as out_file:
        pickle.dump(mapping, out_file)

Epoch 1/10
[1m723/723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 136ms/step - accuracy: 0.9412 - loss: 0.2566 - val_accuracy: 0.9879 - val_loss: 0.0491
Epoch 2/10
[1m723/723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 126ms/step - accuracy: 0.9901 - loss: 0.0377 - val_accuracy: 0.9915 - val_loss: 0.0333
Epoch 3/10
[1m723/723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 122ms/step - accuracy: 0.9944 - loss: 0.0198 - val_accuracy: 0.9919 - val_loss: 0.0311
Epoch 4/10
[1m723/723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 120ms/step - accuracy: 0.9964 - loss: 0.0123 - val_accuracy: 0.9925 - val_loss: 0.0310
Epoch 5/10
[1m723/723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 118ms/step - accuracy: 0.9976 - loss: 0.0083 - val_accuracy: 0.9927 - val_loss: 0.0318
Epoch 6/10
[1m723/723[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 125ms/step - accuracy: 0.9984 - loss: 0.0058 - val_accuracy: 0.9927 - val_loss: 0.0340
Epoch 7/1



Validation metrics:
Validation Loss: 0.04141910746693611
Validation Accuracy: 0.9925218820571899


In [10]:
# Load the model and make predictions
loaded_model = tf.keras.models.load_model("../model/arabic_ner_model.h5")

# Load mappings
# NOTE: unpickling can execute arbitrary code; only load mapping files that
# were written by the training cell of this notebook.
with open("../model/word2idx.pkl", "rb") as file:
    word2idx = pickle.load(file)
with open("../model/idx2label.pkl", "rb") as file:
    idx2label = pickle.load(file)

# Prepare a sample sentence for prediction
# NOTE(review): whitespace splitting leaves punctuation attached to the word
# (the last token here ends with "."), so such tokens only match vocabulary
# entries that include the punctuation; otherwise they map to <UNK>.
sample_sentence = "صورة لعملة ورقية من فئة 500 ملز خلال فترة الانتداب البريطاني على فلسطين.".split()
sample_indices = [word2idx.get(word, word2idx["<UNK>"]) for word in sample_sentence]
# truncating="post" keeps the FIRST max_len tokens of an over-long sentence, so
# output position i always belongs to token i. With the default ("pre") the
# leading tokens are dropped and the slice printed below would be misaligned.
sample_padded = pad_sequences(
    [sample_indices], maxlen=max_len, padding="post", truncating="post"
)
predictions = loaded_model.predict(sample_padded)
# One tag id per time step: the class with the highest softmax score.
predicted_tags = [idx2label[int(tag_id)] for tag_id in np.argmax(predictions[0], axis=-1)]

# Print predictions
print("Sentence:", sample_sentence)
# Only the leading positions correspond to real tokens; the rest is padding.
print("Predicted Tags:", predicted_tags[:len(sample_sentence)])




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 698ms/step
Sentence: ['صورة', 'لعملة', 'ورقية', 'من', 'فئة', '500', 'ملز', 'خلال', 'فترة', 'الانتداب', 'البريطاني', 'على', 'فلسطين.']
Predicted Tags: ['O', 'O', 'O', 'O', 'O', 'B-MON', 'I-MON', 'O', 'O', 'B-EVE', 'I-EVE', 'I-EVE', 'O']
