### ***ALL IN ONE***

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import numpy as np

# Define function to load and preprocess data
def load_and_preprocess_data(train_filepath, val_filepath):
    """Load the train/validation CSVs and integer-encode their tags.

    Both files must provide ``Word`` and ``Tag`` columns. Rows missing either
    value are dropped; NaNs in any other column do not affect which rows are
    kept.

    Args:
        train_filepath: Path of the training CSV.
        val_filepath: Path of the validation CSV.

    Returns:
        Tuple ``(X_train, y_train_encoded, X_val, y_val_encoded,
        label_encoder)``: the word lists, their encoded tag arrays, and the
        ``LabelEncoder`` fitted on the training tags.

    Raises:
        KeyError: If a file lacks the ``Word`` or ``Tag`` column.
        ValueError: If a split has no usable rows, or the validation split
            contains a tag that never occurs in the training split.
    """
    required_columns = ['Word', 'Tag']

    def _read_split(filepath):
        # Read one CSV and keep only rows where both required fields exist.
        frame = pd.read_csv(filepath)
        missing = [col for col in required_columns if col not in frame.columns]
        if missing:
            raise KeyError(f"{filepath}: missing required column(s) {missing}")
        # Restrict the NaN check to the columns actually used, so unrelated
        # (possibly entirely empty) columns cannot wipe out valid rows.
        frame = frame.dropna(subset=required_columns)
        if frame.empty:
            raise ValueError(f"{filepath}: no rows with both 'Word' and 'Tag' values")
        return frame

    # Load datasets
    train_data = _read_split(train_filepath)
    val_data = _read_split(val_filepath)

    # Extract text and labels
    X_train = train_data['Word'].astype(str).tolist()
    y_train = train_data['Tag'].astype(str).tolist()
    X_val = val_data['Word'].astype(str).tolist()
    y_val = val_data['Tag'].astype(str).tolist()

    # Encode labels; the encoder is fitted on the training tags only, so the
    # class ids match what the model is trained to predict.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # Fail early, naming the file, instead of relying on the encoder's
    # generic "unseen labels" error.
    unseen = sorted(set(y_val) - set(label_encoder.classes_))
    if unseen:
        raise ValueError(
            f"{val_filepath}: validation tags not present in training data: {unseen}"
        )
    y_val_encoded = label_encoder.transform(y_val)

    return X_train, y_train_encoded, X_val, y_val_encoded, label_encoder

# Define function to create and train the model
def create_and_train_model(X_train, y_train_encoded, X_val, y_val_encoded, label_encoder):
    """Train a two-layer BiLSTM tag classifier and print validation metrics.

    Fits a Keras ``Tokenizer`` on ``X_train``, pads every sample to a fixed
    length, trains with early stopping on the validation loss, then prints
    the validation loss/accuracy and a per-class classification report.

    Args:
        X_train: Training texts (list of strings).
        y_train_encoded: Integer class ids for ``X_train``.
        X_val: Validation texts (list of strings).
        y_val_encoded: Integer class ids for ``X_val``.
        label_encoder: Fitted encoder whose ``classes_`` maps class ids to
            tag names.

    Returns:
        None. All results are printed.
    """
    # Tokenization
    max_words = 15000
    tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(X_train)

    # Sequences and padding
    max_len = 100
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_val_seq = tokenizer.texts_to_sequences(X_val)
    X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
    X_val_pad = pad_sequences(X_val_seq, maxlen=max_len, padding='post')

    # Model parameters
    embedding_dim = 100
    lstm_units = 128
    dropout_rate = 0.3

    # Token ids emitted by the tokenizer are capped below max_words, so the
    # embedding table needs at most that many rows.
    vocab_size = min(max_words, len(tokenizer.word_index) + 1)
    # One output unit per class known to the encoder.
    num_classes = len(label_encoder.classes_)

    # Define BiLSTM model
    # NOTE(review): the zero padding is not masked (mask_zero is left at its
    # default), so the LSTMs also run over the padded positions - confirm this
    # is intended.
    model = Sequential([
        # input_length is omitted: it is deprecated in Keras 3 and the padded
        # input already fixes the sequence length.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Bidirectional(LSTM(units=lstm_units, dropout=dropout_rate, return_sequences=True)),
        Bidirectional(LSTM(units=lstm_units, dropout=dropout_rate)),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Training with early stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    model.fit(
        X_train_pad, y_train_encoded,
        epochs=8,
        batch_size=8,
        validation_data=(X_val_pad, y_val_encoded),
        callbacks=[early_stopping],
        verbose=1
    )

    # Evaluation
    loss, accuracy = model.evaluate(X_val_pad, y_val_encoded, verbose=1)
    print(f'Validation Loss: {loss:.4f}')
    print(f'Validation Accuracy: {accuracy:.4f}')

    # Classification report
    y_pred_prob = model.predict(X_val_pad)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Report only the classes present in the validation labels, and pick the
    # matching names by class id. Passing the full classes_ list alongside a
    # shorter labels list would pair names with rows positionally and print
    # rows under the wrong class names.
    present_labels = np.unique(y_val_encoded)
    target_names = np.asarray(label_encoder.classes_)[present_labels]
    print(classification_report(
        y_val_encoded,
        y_pred,
        labels=present_labels,
        target_names=target_names,
        # Classes that are never predicted score 0 without emitting a
        # warning for every affected metric.
        zero_division=0,
    ))

# Drive folder shared by every dataset file. Note the space before the final
# slash: it is part of the path string and must be kept.
_DATA_DIR = "/content/drive/MyDrive/DATASET/all language correct format dataset /"

# (language, training file name, validation file name), in processing order.
_DATASET_FILES = [
    ("Tamil", "correct_tamil_dataset.csv", "correct_tamil_validation"),
    ("Malayalam", "Final_mal_train(80%)  (1).csv", "Final_mal_dev(20%) (1).csv"),
    ("Tulu", "correct_tulu_train_set", "correct_tulu_validation_set"),
    ("Kannada", "correct_kannada_train", "correct_kannada_validation"),
]

# Map each language to the full paths of its train/validation files.
datasets = {
    lang_name: {
        "train": _DATA_DIR + train_name,
        "validation": _DATA_DIR + val_name,
    }
    for lang_name, train_name, val_name in _DATASET_FILES
}

# Load, train and evaluate one model per language, one after another.
for language, paths in datasets.items():
    print(f"\nProcessing {language} dataset...")
    X_train, y_train_encoded, X_val, y_val_encoded, label_encoder = load_and_preprocess_data(
        paths['train'],
        paths['validation'],
    )
    create_and_train_model(X_train, y_train_encoded, X_val, y_val_encoded, label_encoder)



Processing Tamil dataset...
Epoch 1/8




[1m1692/1692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 23ms/step - accuracy: 0.5959 - loss: 1.1652 - val_accuracy: 0.7752 - val_loss: 0.6150
Epoch 2/8
[1m1692/1692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 24ms/step - accuracy: 0.8193 - loss: 0.4932 - val_accuracy: 0.7717 - val_loss: 0.5733
Epoch 3/8
[1m1692/1692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 23ms/step - accuracy: 0.9281 - loss: 0.2740 - val_accuracy: 0.7243 - val_loss: 1.1044
Epoch 4/8
[1m1692/1692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9579 - loss: 0.1796 - val_accuracy: 0.7223 - val_loss: 1.1238
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.7792 - loss: 0.5672
Validation Loss: 0.5840
Validation Accuracy: 0.7601
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    Location       0.00      0.00      0.00         1
       Other       0.97      0.73      0.83       496
          en       0.79      0.40      0.53       160
        name       0.97      1.00      0.99       183
         sym       0.84      0.81      0.82      1000
          tm       0.24      0.64      0.35       144

    accuracy                           0.76      1984
   macro avg       0.64      0.60      0.59      1984
weighted avg       0.84      0.76      0.78      1984


Processing Malayalam dataset...
Epoch 1/8




[1m4728/4728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 25ms/step - accuracy: 0.7110 - loss: 0.9009 - val_accuracy: 0.8359 - val_loss: 0.5633
Epoch 2/8
[1m4728/4728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 25ms/step - accuracy: 0.9332 - loss: 0.2122 - val_accuracy: 0.8123 - val_loss: 0.6153
Epoch 3/8
[1m4728/4728[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 25ms/step - accuracy: 0.9683 - loss: 0.1183 - val_accuracy: 0.7525 - val_loss: 0.6829
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8556 - loss: 0.4836
Validation Loss: 0.5619
Validation Accuracy: 0.8365
[1m295/295[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step
              precision    recall  f1-score   support

     ENGLISH       0.98      0.80      0.88      2230
   MALAYALAM       0.81      0.97      0.89      4371
       MIXED       0.55      0.08      0.14       375
        NAME       0.65      0.74      0.69       504
     



[1m3690/3690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 25ms/step - accuracy: 0.6455 - loss: 1.0155 - val_accuracy: 0.8270 - val_loss: 0.5538
Epoch 2/8
[1m3690/3690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 23ms/step - accuracy: 0.9052 - loss: 0.2991 - val_accuracy: 0.8453 - val_loss: 0.5216
Epoch 3/8
[1m3690/3690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 23ms/step - accuracy: 0.9477 - loss: 0.1879 - val_accuracy: 0.7485 - val_loss: 0.6609
Epoch 4/8
[1m3690/3690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 23ms/step - accuracy: 0.9609 - loss: 0.1404 - val_accuracy: 0.7385 - val_loss: 0.7165
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8437 - loss: 0.5214
Validation Loss: 0.5206
Validation Accuracy: 0.8433
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step
              precision    recall  f1-score   support

     English       0.97      0.82      0.89       742
     



[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 23ms/step - accuracy: 0.7538 - loss: 0.7140 - val_accuracy: 0.8339 - val_loss: 0.3760
Epoch 2/8
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 22ms/step - accuracy: 0.9666 - loss: 0.0973 - val_accuracy: 0.8456 - val_loss: 0.3975
Epoch 3/8
[1m3750/3750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 23ms/step - accuracy: 0.9903 - loss: 0.0348 - val_accuracy: 0.8396 - val_loss: 0.4481
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8553 - loss: 0.3262
Validation Loss: 0.3748
Validation Accuracy: 0.8295
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
              precision    recall  f1-score   support

          en       0.77      0.99      0.87      1109
          kn       0.93      0.63      0.75       634
    location       0.00      0.00      0.00        13
       mixed       0.85      0.50      0.63       180
        na

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
