# Load libraries

In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import pandas as pd
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NER model

In [41]:

# Load dataset
with open("animal_ner_english_dataset.json", "r") as file:
    data = json.load(file)

# Extract sentences and corresponding tags
sentences_list = [item["sentence"] for item in data]
labels_list = [item["category"] for item in data]

# Tokenization
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences_list)
vocab = tokenizer.word_index
VOCAB_SIZE = len(vocab) + 1  # Add 1 for padding

# Create tag mapping
unique_labels = sorted(set(tag for tags in labels_list for tag in tags))
if "O" not in unique_labels:
    unique_labels.append("O")  # Ensure "O" exists
tag_map = {label: i for i, label in enumerate(unique_labels)}
NUM_CLASSES = len(tag_map)

# Hyperparameters
MAX_LEN = 100
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 100
EMBEDDING_DIM = 100
RNN_UNITS = 100


# Function to prepare data
def get_params(sentences_list, labels_list):
    tokenized_sentences = tokenizer.texts_to_sequences(sentences_list)
    sentences_padded = pad_sequences(tokenized_sentences, maxlen=MAX_LEN, padding='post')

    # Convert tags to numerical labels
    t_labels = [[tag_map[label] for label in sentence_labels] for sentence_labels in labels_list]

    # Padding for tags
    labels_padded = pad_sequences(t_labels, maxlen=MAX_LEN, padding='post', value=tag_map["O"])
    labels_padded = np.array(labels_padded, dtype=np.int32)

    return np.array(sentences_padded), labels_padded


# Split into train (80%), validation (10%), and test (10%)
train_sentences, temp_sentences, train_labels, temp_labels = train_test_split(
    sentences_list, labels_list, test_size=0.2, random_state=42
)
val_sentences, test_sentences, val_labels, test_labels = train_test_split(
    temp_sentences, temp_labels, test_size=0.5, random_state=42
)

# Tokenize and pad data
t_sentences, t_labels = get_params(train_sentences, train_labels)
v_sentences, v_labels = get_params(val_sentences, val_labels)
test_sentences, test_labels = get_params(test_sentences, test_labels)

train_dataset = tf.data.Dataset.from_tensor_slices((t_sentences, t_labels)).shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices((v_sentences, v_labels)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences, test_labels)).batch(BATCH_SIZE)

# Build the NER model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=RNN_UNITS, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=RNN_UNITS, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(NUM_CLASSES, activation='softmax'))
])

# Compile the model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(train_dataset, validation_data=val_dataset, epochs=15)


# Function to predict entity tags
def predict_entities(sentence):
    reverse_tag_map = {v: k for k, v in tag_map.items()}

    # Tokenize sentence
    sentence_seq = tokenizer.texts_to_sequences([sentence.split()])
    sentence_padded = pad_sequences(sentence_seq, maxlen=MAX_LEN, padding='post')

    # Predict
    predictions = model.predict(sentence_padded)[0]

    # Get tags
    predicted_tags = [reverse_tag_map[np.argmax(word_pred)] for word_pred in predictions[:len(sentence.split())]]

    return list(zip(sentence.split(), predicted_tags))




Epoch 1/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 259ms/step - accuracy: 0.8472 - loss: 0.8318 - val_accuracy: 0.9943 - val_loss: 0.0537
Epoch 2/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 176ms/step - accuracy: 0.9944 - loss: 0.0471 - val_accuracy: 0.9943 - val_loss: 0.0384
Epoch 3/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 191ms/step - accuracy: 0.9944 - loss: 0.0359 - val_accuracy: 0.9943 - val_loss: 0.0347
Epoch 4/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 210ms/step - accuracy: 0.9946 - loss: 0.0316 - val_accuracy: 0.9943 - val_loss: 0.0317
Epoch 5/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 197ms/step - accuracy: 0.9946 - loss: 0.0286 - val_accuracy: 0.9943 - val_loss: 0.0278
Epoch 6/15
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 203ms/step - accuracy: 0.9945 - loss: 0.0251 - val_accuracy: 0.9943 - val_loss: 0.0233
Epoch 7/15
[1m25/25[0m [

In [2]:

# Load JSON dataset
with open("dataset.json", "r", encoding="utf-8") as f:
    datastore = json.load(f)

# Initialize lists
sentences = []
categories = []

# Load texts and labels
for item in datastore:
    sentences.append(item['sentence'])
    categories.append(item['category'])

print(f"Loaded {len(sentences)} samples")

unique_categories = list(set(categories))
print(f"Unique categories: {unique_categories}")

# Encode labels into numerical values
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(categories)  

# Check encoding
print(f"Example category mapping: {dict(zip(unique_categories, label_encoder.transform(unique_categories)))}")

# Tokenization parameters
vocab_size = 10000
max_length = 32
embedding_dim = 16
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Initialize tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)

# Convert texts into numerical sequences
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert labels to a numpy array
encoded_labels = np.array(encoded_labels, dtype=np.int32)

# Split data into training and testing sets 
training_size = int(len(sentences) * 0.8)  # 80% for training, 20% for testing
training_sentences = padded_sequences[:training_size]
testing_sentences = padded_sequences[training_size:]

training_labels = encoded_labels[:training_size]
testing_labels = encoded_labels[training_size:]

print(f"Training size: {len(training_sentences)}, Testing size: {len(testing_sentences)}")

# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(48, activation='relu'),
    tf.keras.layers.Dropout(0.3),  # Adding Dropout for regularization
    tf.keras.layers.Dense(len(unique_categories), activation='softmax')  # Multi-class classification
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=8,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

# Print model summary
print(model.summary())

# Train the model 
num_epochs = 30
history = model.fit(training_sentences, training_labels, epochs=num_epochs, validation_data=(testing_sentences, testing_labels), verbose=2)

print("Training complete!")

# Function to predict the category of an animal 
def predict_animal(sentence):
    sequence = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(sequence, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    prediction = model.predict(padded)
    predicted_label = np.argmax(prediction)
    return label_encoder.inverse_transform([predicted_label])[0]

# Example prediction
test_sentence = "There might be some cat in the picture, am I right?"
print(f"Predicted category: {predict_animal(test_sentence)}")

# Save the model in Keras format
model.save("animal_classifier_nlp.keras")

# Save the tokenizer
import json
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

print("Model and tokenizer saved!")


Loaded 1000 samples
Unique categories: ['sheep', 'butterfly', 'cow', 'spider', 'dog', 'hen', 'elephant', 'panda', 'horse', 'cat', 'squirrel', 'monkey']
Example category mapping: {'sheep': np.int64(9), 'butterfly': np.int64(0), 'cow': np.int64(2), 'spider': np.int64(10), 'dog': np.int64(3), 'hen': np.int64(5), 'elephant': np.int64(4), 'panda': np.int64(8), 'horse': np.int64(6), 'cat': np.int64(1), 'squirrel': np.int64(11), 'monkey': np.int64(7)}
Training size: 800, Testing size: 200




None
Epoch 1/30
25/25 - 21s - 859ms/step - accuracy: 0.0975 - loss: 2.4839 - val_accuracy: 0.0650 - val_loss: 2.4856
Epoch 2/30
25/25 - 1s - 51ms/step - accuracy: 0.1163 - loss: 2.4755 - val_accuracy: 0.0650 - val_loss: 2.4825
Epoch 3/30
25/25 - 1s - 49ms/step - accuracy: 0.1350 - loss: 2.4278 - val_accuracy: 0.0800 - val_loss: 2.3982
Epoch 4/30
25/25 - 1s - 42ms/step - accuracy: 0.2062 - loss: 2.1898 - val_accuracy: 0.2050 - val_loss: 2.1168
Epoch 5/30
25/25 - 1s - 40ms/step - accuracy: 0.2500 - loss: 1.9580 - val_accuracy: 0.2950 - val_loss: 1.8356
Epoch 6/30
25/25 - 1s - 52ms/step - accuracy: 0.3587 - loss: 1.7012 - val_accuracy: 0.4200 - val_loss: 1.6130
Epoch 7/30
25/25 - 1s - 43ms/step - accuracy: 0.4000 - loss: 1.5102 - val_accuracy: 0.4650 - val_loss: 1.3702
Epoch 8/30
25/25 - 1s - 44ms/step - accuracy: 0.4437 - loss: 1.2984 - val_accuracy: 0.5100 - val_loss: 1.1606
Epoch 9/30
25/25 - 1s - 53ms/step - accuracy: 0.5537 - loss: 1.1065 - val_accuracy: 0.6600 - val_loss: 0.9876
Epo