In [1]:
import sys
import json
sys.path.append('../backend')
from assemblage import load_data
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import numpy as np
from joblib import dump

# Load and preprocess data
# Each item returned by load_data unpacks into a (sentence, label sequence)
# pair; zip(*...) transposes the items into two parallel tuples.
data = load_data("../data/train_cleaned.txt")
sentences, labels = zip(*data)

# Tokenization
# Fit the vocabulary on the training sentences, then map them to integer ids.
# NOTE(review): the Tokenizer is used with its default settings; if the
# sentences are raw strings, its filtering could yield a token count that
# differs from the label count for a sentence -- confirm against load_data.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)
X_sequences = tokenizer.texts_to_sequences(sentences)

# Padding
# The longest training sequence fixes the model input length. It is written
# to config.json so the same length is available outside this script.
max_length = max(map(len, X_sequences))
with open('../model/config.json', 'w') as config_file:
    config_file.write(json.dumps({'max_length': max_length}))
X_padded = pad_sequences(X_sequences, padding='post', maxlen=max_length)

# Encode labels
# `labels` holds one tag sequence per sentence; collect the distinct tags
# and fit the encoder on them.
unique_labels = set(label for seq in labels for label in seq)
label_encoder = LabelEncoder()
label_encoder.fit(list(unique_labels))

# Build the tag -> id table once instead of calling
# label_encoder.transform() for every single token: each transform() call
# re-validates its input and searches the class array, which dominates
# preprocessing time on a large corpus. transform() returns the position of
# a label in classes_, so the ids produced here are identical.
label_to_id = {label: index for index, label in enumerate(label_encoder.classes_)}
y_encoded = [[label_to_id[l] for l in seq] for seq in labels]

# Pad the tag sequences to the model input length; padded positions receive
# the id of the 'O' tag. The lookup goes through transform() so a corpus
# without an 'O' tag still fails with the encoder's own error.
pad_label_id = label_encoder.transform(['O'])[0]
y_padded = pad_sequences(y_encoded, maxlen=max_length, padding='post', value=pad_label_id)

# One-hot encode each padded tag sequence (one array per sentence).
y_categorical = [to_categorical(seq, num_classes=len(unique_labels)) for seq in y_padded]

# Prepare the data
X_train = X_padded
y_train = np.array(y_categorical)

# Define model
# Sizes derived from the fitted preprocessing objects.
vocab_size = len(tokenizer.word_index) + 1  # +1 leaves room for id 0, the default fill value of pad_sequences
num_classes = len(unique_labels)

# Layers: token embedding -> bidirectional LSTM emitting one vector per
# timestep -> per-timestep softmax over the tag set.
token_embedding = Embedding(input_dim=vocab_size, output_dim=128)
sequence_encoder = Bidirectional(
    LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
)
tag_classifier = TimeDistributed(Dense(num_classes, activation='softmax'))

# Wire the layers together with the functional API.
input_layer = Input(shape=(max_length,))
embedding_layer = token_embedding(input_layer)
bi_lstm = sequence_encoder(embedding_layer)
output_layer = tag_classifier(bi_lstm)

model = Model(input_layer, output_layer)
# NOTE(review): the Embedding layer does not mask the padding id, and the
# padded label positions are filled with the 'O' tag, so the loss and the
# reported accuracy appear to include padded timesteps -- confirm whether
# masking (e.g. mask_zero=True) is wanted before trusting the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the model
# A single pass over the training data in mini-batches of 32 sequences.
history = model.fit(X_train, y_train, epochs=1, batch_size=32)




# Save the model in native Keras format, then store the fitted tokenizer and
# label encoder next to it so the same preprocessing objects can be reloaded.
model.save('../model/ner_model.keras')
for fitted_object, target_path in (
    (tokenizer, '../model/tokenizer.joblib'),
    (label_encoder, '../model/label_encoder.joblib'),
):
    dump(fitted_object, target_path)

print("Model saved successfully")

723/723 ━━━━━━━━━━━━━━━━━━━━ 328s 439ms/step - accuracy: 0.9731 - loss: 0.1468
Model saved successfully
