In [1]:
import sys
sys.path.append('../backend')
from assemblage import load_data
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load and preprocess data
data = load_data("../data/train_cleaned.txt")
sentences, labels = zip(*data)

# Tokenization
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
X_sequences = tokenizer.texts_to_sequences(sentences)

# Padding
max_length = max(len(seq) for seq in X_sequences)
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post')

# Encode labels
unique_labels = set(label for seq in labels for label in seq)
label_encoder = LabelEncoder()
label_encoder.fit(list(unique_labels))
y_encoded = [[label_encoder.transform([l])[0] for l in seq] for seq in labels]
y_padded = pad_sequences(y_encoded, maxlen=max_length, padding='post', value=label_encoder.transform(['O'])[0])
y_categorical = [to_categorical(seq, num_classes=len(unique_labels)) for seq in y_padded]

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_padded, np.array(y_categorical), test_size=0.2)

# Define model
input_layer = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length)(input_layer)
bi_lstm = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)
output_layer = TimeDistributed(Dense(len(unique_labels), activation='softmax'))(bi_lstm)

model = Model(input_layer, output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(
    X_train, 
    np.array(y_train), 
    batch_size=32, 
    epochs=10, 
    validation_data=(X_val, np.array(y_val))
)

# Evaluate model
results = model.evaluate(X_val, np.array(y_val))
print(f"Validation Loss: {results[0]}, Validation Accuracy: {results[1]}")

# Save model
model.save("arabic_ner_model.h5")

# Example inference
def predict_sentence(sentence):
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')
    predictions = model.predict(padded_sequence)
    predicted_labels = [label_encoder.inverse_transform([np.argmax(p)])[0] for p in predictions[0]]
    return list(zip(sentence.split(), predicted_labels))

test_sentence = "صورة عملة ورقية من فئة 500 ملز خلال فترة الانتداب البريطاني على فلسطين"
print(predict_sentence(test_sentence))




Epoch 1/10
[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 439ms/step - accuracy: 0.9717 - loss: 0.1577 - val_accuracy: 0.9893 - val_loss: 0.0409
Epoch 2/10
[1m472/579[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m44s[0m 419ms/step - accuracy: 0.9901 - loss: 0.0366

KeyboardInterrupt: 