### ***ALL IN ONE***

In [None]:
# Mount Google Drive
# The dataset CSV paths and the saved-model paths used further down all live
# under /content/drive, so the drive has to be mounted before anything else runs.
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LayerNormalization, Dropout, MultiHeadAttention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Load and preprocess data
def load_and_preprocess_data(train_filepath, val_filepath):
    """Read the train and validation CSVs and return their Word/Tag columns.

    Both files are read with ``pd.read_csv`` and then passed through
    ``dropna()``, which removes every row containing a missing value in
    *any* column (not only ``Word`` or ``Tag``).

    Args:
        train_filepath: Path of the training CSV.
        val_filepath: Path of the validation CSV.

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)`` of pandas Series:
        the ``Word`` and ``Tag`` columns of the training file followed by
        the ``Word`` and ``Tag`` columns of the validation file.

    Raises:
        KeyError: If either file lacks a ``Word`` or ``Tag`` column.
    """
    # Read both files first (train, then validation), then pick the columns.
    train_frame, val_frame = [
        pd.read_csv(csv_path).dropna()
        for csv_path in (train_filepath, val_filepath)
    ]
    return (
        train_frame['Word'],
        train_frame['Tag'],
        val_frame['Word'],
        val_frame['Tag'],
    )

# Define parameters
# Length every token/label sequence is padded (or truncated) to by
# pad_sequences; also used as the model's input length and as the size of
# the position-embedding table.
MAX_SEQUENCE_LENGTH = 128
# Default embedding width (embed_dim) used by create_and_compile_model.
EMBEDDING_DIM = 100

In [None]:

# Define function for creating and compiling the model
def create_and_compile_model(num_tags, vocab_size, embed_dim=EMBEDDING_DIM, num_heads=4, ff_dim=64):
    """Build and compile a single-block Transformer token tagger.

    The network sums token and position embeddings, applies one Transformer
    encoder block, then a small feed-forward head that emits a softmax over
    ``num_tags`` classes at each of the ``MAX_SEQUENCE_LENGTH`` positions.
    Label id 0 is treated as padding and is excluded from both the loss and
    the accuracy metric.

    Args:
        num_tags: Size of the output softmax, i.e. the number of label ids
            including the padding id 0.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of the token and position embeddings.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward layers.

    Returns:
        A compiled ``tf.keras.Model`` using the Adam optimizer, a masked
        sparse categorical cross-entropy loss and a masked accuracy metric
        named ``acc``.
    """

    def transformer_block(inputs, embed_dim, num_heads, ff_dim, rate=0.1):
        # Dropout is called without an explicit ``training`` flag so that
        # Keras enables it during fit() and disables it for evaluate() and
        # predict(). Passing training=False would make these layers no-ops.
        attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(inputs, inputs)
        attn_output = Dropout(rate)(attn_output)
        out1 = LayerNormalization(epsilon=1e-6)(inputs + attn_output)
        ffn_output = Dense(ff_dim, activation="relu")(out1)
        ffn_output = Dense(embed_dim)(ffn_output)
        ffn_output = Dropout(rate)(ffn_output)
        return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    def token_and_position_embedding(inputs, maxlen, vocab_size, embed_dim):
        token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
        # NOTE(review): this Embedding is applied to a concrete tf.range
        # tensor rather than to a symbolic model tensor, so its weights may
        # not be tracked or trained by the model - confirm.
        pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)(tf.range(start=0, limit=maxlen, delta=1))
        return token_emb + pos_emb

    inputs = tf.keras.Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = token_and_position_embedding(inputs, MAX_SEQUENCE_LENGTH, vocab_size, embed_dim)
    x = transformer_block(x, embed_dim, num_heads, ff_dim)
    x = Dropout(0.1)(x)
    x = Dense(ff_dim, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(num_tags, activation="softmax")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # Define custom loss function
    class CustomNonPaddingTokenLoss(tf.keras.losses.Loss):
        """Sparse categorical cross-entropy averaged over non-padding labels."""

        def __init__(self, name="custom_ner_loss"):
            super().__init__(name=name)

        def call(self, y_true, y_pred):
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
            loss = loss_fn(y_true, y_pred)
            # Positions whose true label is 0 are padding and contribute nothing.
            mask = tf.cast((y_true > 0), dtype=loss.dtype)
            # divide_no_nan yields 0 (not NaN) if a batch contains only padding.
            return tf.math.divide_no_nan(tf.reduce_sum(loss * mask), tf.reduce_sum(mask))

    def acc(y_true, y_pred):
        """Accuracy over positions whose true label is not padding (id 0)."""
        targ = tf.cast(y_true, dtype='int32')
        pred = tf.cast(tf.argmax(y_pred, axis=-1), dtype='int32')
        correct = tf.cast(tf.equal(targ, pred), dtype='float32')
        mask = tf.cast(tf.greater(targ, 0), dtype='float32')
        n_correct = tf.reduce_sum(mask * correct)
        n_total = tf.reduce_sum(mask)
        # Guard against an all-padding batch, which would otherwise give NaN.
        return tf.math.divide_no_nan(n_correct, n_total)

    model.compile(optimizer='adam', loss=CustomNonPaddingTokenLoss(), metrics=[acc])
    return model

In [None]:
# Train/validation CSV locations on the mounted Drive, keyed by language.
# Each entry has exactly two keys: "train" and "validation".
# The insertion order is the order in which the languages are iterated.
datasets = {
    "Tamil": {
        "train": "/content/drive/MyDrive/seq2seq/TAMIL DATASET/tamil_train (1).csv",
        "validation": "/content/drive/MyDrive/seq2seq/TAMIL DATASET/tamil_val.csv",
    },
    "Malayalam": {
        "train": "/content/drive/MyDrive/seq2seq/MALAYALAM DATASET/Final_mal_train(80%).csv",
        "validation": "/content/drive/MyDrive/seq2seq/MALAYALAM DATASET/Final_mal_dev(20%).csv",
    },
    "Tulu": {
        "train": "/content/drive/MyDrive/seq2seq/TULU DATASET/tulu_train (1) (1).csv",
        "validation": "/content/drive/MyDrive/seq2seq/TULU DATASET/tulu_val (1).csv",
    },
    "Kannada": {
        "train": "/content/drive/MyDrive/seq2seq/KANNADA DATASET/kn_train (1) (1).csv",
        "validation": "/content/drive/MyDrive/seq2seq/KANNADA DATASET/kn_val (1).csv",
    },
}

In [None]:
# Train, save and evaluate one model per dataset.
# For every language: load the CSVs, tokenize words and tags, pad to
# MAX_SEQUENCE_LENGTH, fit a fresh model with early stopping, save it to
# Drive, then report metrics on the held-out split with padding removed.
for language, paths in datasets.items():
    print(f"\nProcessing {language} dataset...")

    # The held-out split is stored under the 'validation' key of `datasets`.
    X_train, y_train, X_test, y_test = load_and_preprocess_data(paths['train'], paths['validation'])

    # Parameters
    Vx = 20000  # You may want to adjust this parameter based on your vocabulary size
    Vy = len(y_train.unique())  # distinct raw tag strings in the training split (informational)

    # Text Tokenization (filters='' keeps punctuation as part of the tokens)
    text_tokenizer = Tokenizer(num_words=Vx, oov_token='<OOV>', filters='')
    text_tokenizer.fit_on_texts(X_train)
    text_sequences_train = text_tokenizer.texts_to_sequences(X_train)
    text_sequences_test = text_tokenizer.texts_to_sequences(X_test)

    # Label Tokenization
    # Id 0 is reserved for padding and id 1 for <OOV>, so real tags start at
    # id 2. No num_words cap is set: Tokenizer only keeps ids strictly below
    # num_words, so a cap of Vy + 1 would fold the rarest tag into <OOV>.
    li_tokenizer = Tokenizer(filters='', oov_token='<OOV>')
    li_tokenizer.fit_on_texts(y_train)
    li_sequences_train = li_tokenizer.texts_to_sequences(y_train)
    li_sequences_test = li_tokenizer.texts_to_sequences(y_test)
    # The output layer must cover every label id: 0 (padding) .. len(word_index).
    num_tags = len(li_tokenizer.word_index) + 1

    # Padding
    text_inputs_train = pad_sequences(text_sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    text_inputs_test = pad_sequences(text_sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    li_targets_train = pad_sequences(li_sequences_train, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    li_targets_test = pad_sequences(li_sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # Create and compile model
    model = create_and_compile_model(num_tags=num_tags, vocab_size=Vx)

    # Train model with Early Stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit(text_inputs_train, li_targets_train, batch_size=32, epochs=15, validation_split=0.2, callbacks=[early_stopping])

    # Save the trained model
    model_path = f'/content/drive/MyDrive/seq2seq/{language.lower()}_transformer_model.h5'
    model.save(model_path)

    # Evaluate model
    loss, accuracy = model.evaluate(text_inputs_test, li_targets_test)
    print(f"Evaluation Results for {language}:")
    print(f"Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

    # Predict and convert predictions to labels
    y_pred_prob = model.predict(text_inputs_test)
    y_pred = np.argmax(y_pred_prob, axis=-1)

    # Flatten the arrays for computing the metrics
    y_test_flat = li_targets_test.flatten()
    y_pred_flat = y_pred.flatten()

    # Remove padding from the flattened arrays (true label id 0 is padding)
    non_zero_indices = y_test_flat != 0
    y_test_flat_non_zero = y_test_flat[non_zero_indices]
    y_pred_flat_non_zero = y_pred_flat[non_zero_indices]

    # Compute classification report
    print(f"Classification Report for {language}:\n", classification_report(y_test_flat_non_zero, y_pred_flat_non_zero))
    print("Accuracy:", accuracy_score(y_test_flat_non_zero, y_pred_flat_non_zero))
    print("Macro F1 Score:", f1_score(y_test_flat_non_zero, y_pred_flat_non_zero, average='macro'))
    print("Precision:", precision_score(y_test_flat_non_zero, y_pred_flat_non_zero, average='macro'))
    print("Recall:", recall_score(y_test_flat_non_zero, y_pred_flat_non_zero, average='macro'))