In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LayerNormalization, Dropout, MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

Mounted at /content/drive


In [2]:
# Load and preprocess data
def load_and_preprocess_data(train_filepath, val_filepath):
    train_df = pd.read_csv(train_filepath).dropna()
    val_df = pd.read_csv(val_filepath).dropna()

    # Text and labels
    X_train, y_train = train_df['WORD'], train_df['LABEL']
    X_test, y_test = val_df['WORD'], val_df['LABEL']

    return X_train, y_train, X_test, y_test

In [6]:
# Paths to your manually created CSV files
train_filepath = '/content/drive/MyDrive/seq2seq/train_dataset.csv'
val_filepath = '/content/drive/MyDrive/seq2seq/validation_dataset.csv'

In [7]:
X_train, y_train, X_test, y_test = load_and_preprocess_data(train_filepath, val_filepath)

In [8]:
# Parameters
Vx = 20000  # Vocabulary size for words
Vy = len(y_train.unique())  # Number of unique labels (dynamically computed)
MAX_SEQUENCE_LENGTH = 128
EMBEDDING_DIM = 100
L = 256

In [9]:
# Text Tokenization
text_tokenizer = Tokenizer(num_words=Vx, oov_token='<ERROR TOKEN>', filters='')
text_tokenizer.fit_on_texts(X_train)
text_sequences_train = text_tokenizer.texts_to_sequences(X_train)
text_sequences_test = text_tokenizer.texts_to_sequences(X_test)

In [10]:
# Get the word to index mapping for input language
num_words_input = len(text_tokenizer.word_index) + 1  # FOR PADDING
max_len_text = max(len(s) for s in text_sequences_train)

In [11]:
# Label Tokenization
li_tokenizer = Tokenizer(num_words=Vy+1, filters='', oov_token='<ERROR TOKEN>')
li_tokenizer.fit_on_texts(y_train)
li_sequences_train = li_tokenizer.texts_to_sequences(y_train)
li_sequences_test = li_tokenizer.texts_to_sequences(y_test)


In [12]:
# Get the word to index mapping for output language
num_words_output = len(li_tokenizer.word_index) + 1  # FOR PADDING
max_len_li = max(len(s) for s in li_sequences_train)


In [13]:
text_inputs_train = pad_sequences(text_sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
text_inputs_test = pad_sequences(text_sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
li_targets_train = pad_sequences(li_sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
li_targets_test = pad_sequences(li_sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [14]:
def transformer_block(inputs, embed_dim, num_heads, ff_dim, rate=0.1, training=False):
    attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
    attn_output = Dropout(rate)(attn_output, training=training)
    out1 = LayerNormalization(epsilon=1e-6)(inputs + attn_output)
    ffn_output = Dense(ff_dim, activation="relu")(out1)
    ffn_output = Dense(embed_dim)(ffn_output)
    ffn_output = Dropout(rate)(ffn_output, training=training)
    return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

In [15]:
def token_and_position_embedding(inputs, maxlen, vocab_size, embed_dim):
    token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)(tf.range(start=0, limit=maxlen, delta=1))
    return token_emb + pos_emb

In [16]:

def LIModel(num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32):
    inputs = tf.keras.Input(shape=(maxlen,))
    x = token_and_position_embedding(inputs, maxlen, vocab_size, embed_dim)
    x = transformer_block(x, embed_dim, num_heads, ff_dim)
    x = Dropout(0.1)(x)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_tags, activation="softmax")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model


In [17]:
# Define custom loss function
class CustomNonPaddingTokenLoss(tf.keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [18]:
def acc(y_true, y_pred):
    targ = tf.cast(y_true, dtype='int32')
    pred = tf.cast(tf.argmax(y_pred, axis=-1), dtype='int32')
    correct = tf.cast(tf.equal(targ, pred), dtype='float32')
    mask = tf.cast(tf.greater(targ, 0), dtype='float32')
    n_correct = tf.reduce_sum(mask * correct)
    n_total = tf.reduce_sum(mask)
    return n_correct / n_total


In [19]:
# Compile model
model = LIModel(num_tags=Vy + 1, vocab_size=Vx, embed_dim=EMBEDDING_DIM, num_heads=4, ff_dim=64)
model.compile(optimizer='adam', loss=CustomNonPaddingTokenLoss(), metrics=[acc])


In [20]:
# Train model with Early Stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(text_inputs_train, li_targets_train, batch_size=32, epochs=15, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/15
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 350ms/step - acc: 0.6866 - loss: 0.9817 - val_acc: 0.8378 - val_loss: 0.5341
Epoch 2/15
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m322s[0m 351ms/step - acc: 0.9302 - loss: 0.2285 - val_acc: 0.8565 - val_loss: 0.6110
Epoch 3/15
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m293s[0m 354ms/step - acc: 0.9768 - loss: 0.0882 - val_acc: 0.8257 - val_loss: 0.6367
Epoch 4/15
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 377ms/step - acc: 0.9787 - loss: 0.0748 - val_acc: 0.8561 - val_loss: 0.7552


In [21]:
# Evaluate model
loss, accuracy = model.evaluate(text_inputs_test, li_targets_test)

[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 114ms/step - acc: 0.8450 - loss: 0.5436


In [22]:
# Predict and convert predictions to labels
y_pred_prob = model.predict(text_inputs_test)
y_pred = np.argmax(y_pred_prob, axis=-1)

[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 126ms/step


In [23]:
# Flatten the arrays for computing the metrics
y_test_flat = li_targets_test.flatten()
y_pred_flat = y_pred.flatten()

In [24]:
# Remove padding from the flattened arrays
non_zero_indices = y_test_flat != 0
y_test_flat_non_zero = y_test_flat[non_zero_indices]
y_pred_flat_non_zero = y_pred_flat[non_zero_indices]

In [25]:
# Compute classification report
print("Classification Report:\n", classification_report(y_test_flat_non_zero, y_pred_flat_non_zero))
print("Accuracy:", accuracy_score(y_test_flat_non_zero, y_pred_flat_non_zero))
print("Macro F1 Score:", f1_score(y_test_flat_non_zero, y_pred_flat_non_zero, average='macro'))
print("Precision:", precision_score(y_test_flat_non_zero, y_pred_flat_non_zero, average='macro'))
print("Recall:", recall_score(y_test_flat_non_zero, y_pred_flat_non_zero, average='macro'))

Classification Report:
               precision    recall  f1-score   support

           1       0.83      0.30      0.44        83
           2       0.79      0.98      0.88      6507
           3       0.93      0.82      0.87      3220
           4       1.00      1.00      1.00      1560
           5       0.64      0.35      0.45      1065
           6       0.76      0.76      0.76       851
           7       0.91      0.11      0.19       541
           8       0.98      0.60      0.75       310

    accuracy                           0.84     14137
   macro avg       0.86      0.61      0.67     14137
weighted avg       0.84      0.84      0.82     14137

Accuracy: 0.8389332956072717
Macro F1 Score: 0.6677767984204612
Precision: 0.8557154222449969
Recall: 0.6148136582880699
