In [1]:
# Attach Google Drive to the Colab runtime; the dataset CSVs read further
# down are addressed through this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

In [39]:
# Read the training and validation CSVs from Drive, then discard every row
# that contains a missing value in any column.
train_data = pd.read_csv('/content/drive/MyDrive/seq2seq/train_dataset.csv')
train_data = train_data.dropna()

val_data = pd.read_csv('/content/drive/MyDrive/seq2seq/validation_dataset.csv')
val_data = val_data.dropna()

In [40]:
# Extract the input words and their labels as plain Python lists of str.
#
# The frames are loaded as `train_data` / `val_data`. The names previously
# used here (`train_dataset` / `validation_dataset`) are never defined in this
# notebook, so the cell raised NameError on a fresh kernel ("Restart & Run
# All") and only worked when stale variables were left over in the session.
X_train = train_data['WORD'].astype(str).tolist()
y_train = train_data['LABEL'].astype(str).tolist()
X_val = val_data['WORD'].astype(str).tolist()
y_val = val_data['LABEL'].astype(str).tolist()

In [50]:
# Turn the string labels into integer class ids. The encoder learns its
# classes from the training labels only and is then applied to both splits,
# so both use the same label -> id mapping.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [51]:
# Fit the tokenizer vocabulary on the training texts only.
# NOTE(review): the Tokenizer is used with its default (word-level) settings;
# confirm that is intended for single-word inputs.
max_words = 15000  # vocabulary cap passed as num_words; adjust as needed
tokenizer = Tokenizer(
    oov_token='<OOV>',  # placeholder token for out-of-vocabulary words
    num_words=max_words,
)
tokenizer.fit_on_texts(X_train)

In [52]:
# Convert each text to a sequence of token ids, then bring every sequence to
# the same fixed length with the padding placed after the tokens.
max_len = 100  # fixed sequence length fed to the model

X_train_seq, X_val_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val)
)
X_train_pad, X_val_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_val_seq)
)

In [53]:
# Hyperparameters consumed by the BiLSTM definition.
embedding_dim, lstm_units = 100, 128  # embedding width, units per LSTM direction
dropout_rate = 0.3                    # passed as `dropout` to each LSTM layer

In [54]:
# Define the BiLSTM classifier: embedding -> two stacked bidirectional LSTMs
# -> softmax over the label classes.

# Size the embedding table to the ids the tokenizer can actually emit. With
# `num_words` set, texts_to_sequences never produces an index >= num_words,
# so rows beyond that cap would be dead parameters.
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

# One output unit per class known to the fitted label encoder.
num_classes = len(label_encoder.classes_)

# NOTE(review): the padding index 0 is not masked (mask_zero keeps its
# default), so the LSTMs also read the post-padding. Consider mask_zero=True
# once behaviour with all-padding rows has been confirmed on the target GPU.
model = Sequential([
    # `input_length` is deprecated in Keras 3 and is not needed here: the
    # input shape is supplied by the explicit model.build(...) call that follows.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Bidirectional(LSTM(units=lstm_units, dropout=dropout_rate, return_sequences=True)),
    Bidirectional(LSTM(units=lstm_units, dropout=dropout_rate)),
    Dense(num_classes, activation='softmax')
])



In [59]:
# Build the model's weights for inputs of shape (batch, max_len); the batch
# dimension is left unspecified (None).
batch_input_shape = (None, max_len)
model.build(batch_input_shape)

In [60]:
# Configure training: Adam optimizer, sparse categorical cross-entropy
# (the targets are integer class ids), and accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [61]:
# Print the Keras summary of the model defined and built above.
model.summary()

In [66]:
# Train for up to 50 epochs, stopping once validation loss has failed to
# improve for 2 consecutive epochs; the best weights seen are then restored.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=2,
)

validation_pair = (X_val_pad, y_val_encoded)
history = model.fit(
    x=X_train_pad,
    y=y_train_encoded,
    validation_data=validation_pair,
    batch_size=8,
    epochs=50,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/50
[1m4139/4139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 25ms/step - accuracy: 0.9682 - loss: 0.1259 - val_accuracy: 0.7267 - val_loss: 0.9357
Epoch 2/50
[1m4139/4139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 25ms/step - accuracy: 0.9772 - loss: 0.0936 - val_accuracy: 0.7259 - val_loss: 0.9634
Epoch 3/50
[1m4139/4139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 26ms/step - accuracy: 0.9779 - loss: 0.0840 - val_accuracy: 0.7268 - val_loss: 0.8598
Epoch 4/50
[1m4139/4139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 26ms/step - accuracy: 0.9773 - loss: 0.0783 - val_accuracy: 0.7261 - val_loss: 0.9370
Epoch 5/50
[1m4139/4139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 25ms/step - accuracy: 0.9798 - loss: 0.0699 - val_accuracy: 0.7268 - val_loss: 0.7979
Epoch 6/50
[1m4139/4139[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 25ms/step - accuracy: 0.9785 - loss: 0.0716 - val_accuracy: 0.7386 - val_loss: 0.675

In [67]:
# Score the trained model on the validation split. evaluate() returns the
# loss followed by the compiled metric (accuracy).
eval_results = model.evaluate(X_val_pad, y_val_encoded, verbose=1)
loss, accuracy = eval_results
print(f'Validation Loss: {loss:.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')

[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.8678 - loss: 0.5032
Validation Loss: 0.5420
Validation Accuracy: 0.8579


In [68]:
# Per-class precision / recall / F1 on the validation split.
y_pred_prob = model.predict(X_val_pad)   # class probabilities, one row per sample
y_pred = np.argmax(y_pred_prob, axis=1)  # most probable class id per sample

# Pin the report rows to every class the encoder knows. Without `labels`,
# scikit-learn infers the label set from y_true/y_pred and raises ValueError
# if it does not match the length of `target_names`, which happens as soon as
# a rare class is absent from the validation labels and never predicted.
class_ids = np.arange(len(label_encoder.classes_))
print(classification_report(
    y_val_encoded,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
))

[1m442/442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step
              precision    recall  f1-score   support

     ENGLISH       0.97      0.83      0.89      3220
   MALAYALAM       0.81      0.98      0.89      6507
       MIXED       0.89      0.28      0.42       541
        NAME       0.79      0.79      0.79       851
      NUMBER       0.99      0.75      0.85       310
       OTHER       0.61      0.42      0.50      1065
       PLACE       0.93      0.45      0.60        83
         SYM       1.00      1.00      1.00      1560

    accuracy                           0.86     14137
   macro avg       0.87      0.69      0.74     14137
weighted avg       0.86      0.86      0.85     14137

