In [None]:
  # Mount Google Drive at /content/drive (Colab only); the dataset is read from
  # a path under this mount point.
  from google.colab import drive
  drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Add
from tensorflow.keras.optimizers import Adam

In [None]:
# Load the token-level corpus (one WORD/LABEL pair per row) and regroup it into sentences.
# Rows with a missing value in any column are discarded.
# NOTE(review): `read_csv` treats cells such as "NA", "null" or "None" as missing by
# default, so tokens spelled that way would be removed by `dropna()` — confirm whether
# `keep_default_na=False` is needed for this corpus.
df = pd.read_csv('/content/drive/MyDrive/seq2seq/Final_mal_data.csv')
df = df.dropna()

sentences = []
labels = []

current_sentence = []
current_labels = []

# Walk the two columns in lockstep; this avoids building a Series per row as
# `iterrows()` does.
for word, label in zip(df['WORD'], df['LABEL']):
    if word == '.':
        # A full stop closes the current sentence. It is appended to the text only
        # (its own label is not recorded); a '.' with no preceding words is ignored.
        if current_sentence:
            current_sentence.append(word)
            sentences.append(' '.join(current_sentence))
            labels.append(' '.join(current_labels))
            current_sentence = []
            current_labels = []
    else:
        current_sentence.append(word)
        current_labels.append(label)

# Flush the final sentence when the data does not end with a '.' row; without this
# the trailing tokens were silently dropped.
if current_sentence:
    sentences.append(' '.join(current_sentence))
    labels.append(' '.join(current_labels))
    current_sentence = []
    current_labels = []

df_preprocessed = pd.DataFrame({'Sentence': sentences, 'Labels': labels})

# Build one vocabulary for the words and one for the labels, then encode every
# sentence / label string as a list of integer ids.
sentence_texts = df_preprocessed['Sentence']
label_texts = df_preprocessed['Labels']

text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(sentence_texts)
text_sequences = text_tokenizer.texts_to_sequences(sentence_texts)

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(label_texts)
label_sequences = label_tokenizer.texts_to_sequences(label_texts)

# Longest encoded sequence on each side; every sequence is padded up to this width.
max_text_len = max(map(len, text_sequences))
max_label_len = max(map(len, label_sequences))

# Pad at the end ('post') and keep the results as NumPy arrays.
text_sequences_padded = np.array(
    pad_sequences(text_sequences, maxlen=max_text_len, padding='post')
)
label_sequences_padded = np.array(
    pad_sequences(label_sequences, maxlen=max_label_len, padding='post')
)

# Hold out 30% of the sentences for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text_sequences_padded,
    label_sequences_padded,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Inspect the padded width of the word-id sequences (length of the longest encoded sentence).
max_text_len

92

In [None]:
# Inspect the padded width of the label-id sequences (length of the longest encoded label string).
max_label_len

92

In [None]:

# Decoder inputs: all-zero arrays with the same shape and dtype as the label arrays.
# No start token and no shifted gold labels are fed, so the decoder receives only
# zeros as its input sequence.
decoder_input_train = np.zeros(y_train.shape, dtype=y_train.dtype)
decoder_input_test = np.zeros(y_test.shape, dtype=y_test.dtype)

# Targets get a trailing axis of size 1: (samples, timesteps) -> (samples, timesteps, 1).
decoder_output_train = y_train[..., np.newaxis]
decoder_output_test = y_test[..., np.newaxis]

# Hyperparameters for the encoder-decoder model (BiLSTM encoder, LSTM decoder).
# The +1 leaves room for id 0, which the padded arrays use as filler.
vocab_size = 1 + len(text_tokenizer.word_index)
label_vocab_size = 1 + len(label_tokenizer.word_index)
embedding_dim, lstm_units = 100, 128



In [None]:
# Encoder: embed the padded word ids and run a bidirectional LSTM over them.
# `mask_zero=True` is set on this embedding only.
encoder_inputs = Input(shape=(max_text_len,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True))
# `encoder_output` (the per-timestep outputs) is not used in this cell; only the
# final hidden/cell states of both directions are.
encoder_output, forward_h, forward_c, backward_h, backward_c = encoder_lstm(embedding_layer)
# Sum the forward and backward final states element-wise so each keeps `lstm_units`
# features and can be passed as the initial state of the unidirectional decoder LSTM.
encoder_states = [Add()([forward_h, backward_h]), Add()([forward_c, backward_c])]

# Decoder: one LSTM initialised with the encoder states, followed by a per-timestep
# softmax over the label vocabulary. `decoder_lstm` and `decoder_dense` hold the
# output tensors of those layers, not the layer objects. There is no attention layer.
# NOTE(review): the decoder embedding has no `mask_zero` and no weighting is set up
# here, so padded timesteps appear to count towards the loss and the `accuracy`
# metric — confirm whether that is intended.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=label_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True)(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(label_vocab_size, activation='softmax')(decoder_lstm)

# Two-input model: [word ids, decoder input ids] -> per-timestep label distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_dense)
optimizer = Adam(learning_rate=0.001)
# Sparse loss: targets are integer label ids, not one-hot vectors.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary of the compiled model.
model.summary()



In [None]:
# First training run: 50 epochs, batch size 32, 10% of the training data held out
# for validation. No sample weights or class weights are passed to `fit`.
history = model.fit(
    x=[X_train, decoder_input_train],
    y=decoder_output_train,
    batch_size=32,
    epochs=50,
    validation_split=0.1,
)

Epoch 1/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.8854 - loss: 0.4898 - val_accuracy: 0.9542 - val_loss: 0.1331
Epoch 2/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9569 - loss: 0.1263 - val_accuracy: 0.9605 - val_loss: 0.1187
Epoch 3/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9654 - loss: 0.1043 - val_accuracy: 0.9625 - val_loss: 0.1102
Epoch 4/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9679 - loss: 0.0947 - val_accuracy: 0.9640 - val_loss: 0.1065
Epoch 5/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9699 - loss: 0.0871 - val_accuracy: 0.9643 - val_loss: 0.1022
Epoch 6/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9721 - loss: 0.0780 - val_accuracy: 0.9653 - val_loss: 0.0999
Epoch 7/50
[1m104/104

In [None]:
# Evaluate the model on the held-out split.
y_pred = model.predict([X_test, decoder_input_test])

# Most probable label id at every timestep (argmax over the label-vocabulary axis).
y_pred_sequences = np.argmax(y_pred, axis=-1)

# Flatten to one entry per token position for the sklearn metrics.
y_test_flat = y_test.flatten()
y_pred_flat = y_pred_sequences.flatten()

# Keep only positions that carry a real label (id 0 is padding).
non_zero_indices = y_test_flat != 0
y_test_flat_non_zero = y_test_flat[non_zero_indices]
y_pred_flat_non_zero = y_pred_flat[non_zero_indices]

# Score only the label ids that occur in the ground truth. Without `labels=`, a
# padding id (0) predicted on a real token is counted as an extra class with zero
# support, which lowers the macro average and triggers undefined-metric warnings.
# Because 0 is excluded, the report shows a "micro avg" row instead of "accuracy";
# the overall token accuracy is printed separately below.
label_ids = np.unique(y_test_flat_non_zero)
label_names = [label_tokenizer.index_word[int(label_id)] for label_id in label_ids]

# Compute classification report
print("Classification Report:\n", classification_report(
    y_test_flat_non_zero, y_pred_flat_non_zero,
    labels=label_ids, target_names=label_names, zero_division=0,
))
print("Accuracy:", accuracy_score(y_test_flat_non_zero, y_pred_flat_non_zero))
print("Macro F1 Score:", f1_score(
    y_test_flat_non_zero, y_pred_flat_non_zero,
    labels=label_ids, average='macro', zero_division=0,
))

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.80      0.84      0.82      6522
           2       0.72      0.74      0.73      2987
           3       0.52      0.43      0.47      1071
           4       0.63      0.60      0.61      1015
           5       0.20      0.15      0.17       491
           6       0.61      0.41      0.49       331
           7       0.47      0.11      0.17        75
           8       0.00      0.00      0.00         5

    accuracy                           0.72     12497
   macro avg       0.44      0.36      0.39     12497
weighted avg       0.71      0.72      0.71     12497

Accuracy: 0.7210530527326559
Macro F1 Score: 0.38544371067950994


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Second training run: 18 epochs, batch size 8, 20% validation split.
# `model` is not rebuilt in this cell, so when the notebook is run top to bottom
# these epochs continue from the weights left by the previous `fit` call, and
# `history` is overwritten.
history = model.fit(
    x=[X_train, decoder_input_train],
    y=decoder_output_train,
    batch_size=8,
    epochs=18,
    validation_split=0.2,
)

Epoch 1/18
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9943 - loss: 0.0163 - val_accuracy: 0.9840 - val_loss: 0.0747
Epoch 2/18
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.9923 - loss: 0.0217 - val_accuracy: 0.9844 - val_loss: 0.0669
Epoch 3/18
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9952 - loss: 0.0132 - val_accuracy: 0.9855 - val_loss: 0.0687
Epoch 4/18
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.9968 - loss: 0.0091 - val_accuracy: 0.9860 - val_loss: 0.0655
Epoch 5/18
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9975 - loss: 0.0072 - val_accuracy: 0.9855 - val_loss: 0.0684
Epoch 6/18
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9978 - loss: 0.0066 - val_accuracy: 0.9858 - val_loss: 0.0692
Epoch 7/18
[1m368/36

In [None]:
# Evaluate the model on the held-out split.
y_pred = model.predict([X_test, decoder_input_test])

# Most probable label id at every timestep (argmax over the label-vocabulary axis).
y_pred_sequences = np.argmax(y_pred, axis=-1)

# Flatten to one entry per token position for the sklearn metrics.
y_test_flat = y_test.flatten()
y_pred_flat = y_pred_sequences.flatten()

# Keep only positions that carry a real label (id 0 is padding).
non_zero_indices = y_test_flat != 0
y_test_flat_non_zero = y_test_flat[non_zero_indices]
y_pred_flat_non_zero = y_pred_flat[non_zero_indices]

# Score only the label ids that occur in the ground truth. Without `labels=`, a
# padding id (0) predicted on a real token is counted as an extra class with zero
# support, which lowers the macro average and triggers undefined-metric warnings.
# Because 0 is excluded, the report shows a "micro avg" row instead of "accuracy";
# the overall token accuracy is printed separately below.
label_ids = np.unique(y_test_flat_non_zero)
label_names = [label_tokenizer.index_word[int(label_id)] for label_id in label_ids]

# Compute classification report
print("Classification Report:\n", classification_report(
    y_test_flat_non_zero, y_pred_flat_non_zero,
    labels=label_ids, target_names=label_names, zero_division=0,
))
print("Accuracy:", accuracy_score(y_test_flat_non_zero, y_pred_flat_non_zero))
print("Macro F1 Score:", f1_score(
    y_test_flat_non_zero, y_pred_flat_non_zero,
    labels=label_ids, average='macro', zero_division=0,
))

[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.80      0.89      0.84      6522
           2       0.85      0.75      0.80      2987
           3       0.50      0.47      0.48      1071
           4       0.70      0.61      0.65      1015
           5       0.21      0.16      0.19       491
           6       0.63      0.47      0.54       331
           7       0.50      0.12      0.19        75
           8       0.00      0.00      0.00         5

    accuracy                           0.76     12497
   macro avg       0.47      0.39      0.41     12497
weighted avg       0.75      0.76      0.75     12497

Accuracy: 0.7552212531007442
Macro F1 Score: 0.4105805378663947


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Helper used to normalise decoded label strings before they are tabulated.
def clean_label_sequence(label_sequence):
    """Join the items of ``label_sequence`` into one single-spaced string.

    Each item is converted with ``str``. Leading and trailing whitespace is removed
    and every run of whitespace collapses to a single space.
    """
    joined = ' '.join(str(item) for item in label_sequence)
    return ' '.join(joined.split())

# Convert numerical sequences back to text labels.
# Predictions are decoded position by position and cut at the true label length of
# each sentence (labels are post-padded with 0, so the non-zero count is that length).
# `sequences_to_texts` would skip every predicted 0, shifting the following labels
# left so they no longer line up with the words, and it would keep anything predicted
# in the padded tail. A predicted 0 inside the sentence is shown as '<pad>' instead.
id_to_label = label_tokenizer.index_word
true_lengths = np.count_nonzero(y_test, axis=1)
y_pred_labels = [
    ' '.join(id_to_label.get(int(label_id), '<pad>') for label_id in pred_row[:length])
    for pred_row, length in zip(y_pred_sequences, true_lengths)
]
y_test_labels = label_tokenizer.sequences_to_texts(y_test)

# Post-process test data: rebuild the sentences from their word ids.
test_sentences = text_tokenizer.sequences_to_texts(X_test)

# Normalise whitespace in the decoded label strings.
predicted_labels = [clean_label_sequence(labels.split()) for labels in y_pred_labels]
true_labels = [clean_label_sequence(labels.split()) for labels in y_test_labels]

# One row per test sentence: text, gold labels, predicted labels.
df_predictions = pd.DataFrame({
    'Sentence': test_sentences,
    'True Labels': true_labels,
    'Predicted Labels': predicted_labels
})

# Display a sample of the predictions
df_predictions


Unnamed: 0,Sentence,True Labels,Predicted Labels
0,pwli trailer,malayalam english,malayalam english
1,lucifer mass kuranja strogillatha trailormadhu...,name english malayalam mixed other english eng...,other english malayalam malayalam malayalam en...
2,mammookkaye ee traileril kandu njettattavar aa...,mixed malayalam mixed malayalam malayalam mala...,mixed malayalam mixed malayalam malayalam mala...
3,nivin pauli annel ithe oru onnonnara padam ayene,name name malayalam malayalam malayalam malaya...,other other malayalam english malayalam malaya...
4,oh enthoot kidu trailer laletta kidu mass love...,malayalam malayalam malayalam english name mal...,english malayalam malayalam english malayalam ...
...,...,...,...
1572,i like mammuka lalettan evenly smiling face op...,english english name name english other other ...,english english mixed mixed english english en...
1573,pirates caribian alle etu atanta angana,english english malayalam malayalam name malay...,other name malayalam malayalam malayalam malay...
1574,happy birthday mammookkka love sooooo much,english english mixed english english english,english english name english english english
1575,lalettan fans like adiiiii,name english english malayalam,name english english


In [None]:
# Number of sentences in the test split (inputs).
len(X_test)

1577

In [None]:
# Number of sentences in the training split (inputs).
len(X_train)

3679

In [None]:
# Number of label sequences in the test split; should equal len(X_test).
len(y_test)

1577

In [None]:
# NOTE(review): identical to the previous cell; `len(y_train)` may have been
# intended here — confirm.
len(y_test)

1577

In [None]:
# Write the prediction table to a CSV file in the current working directory
# (without the index column), then confirm on stdout.
output_csv = 'predicted_labels_cleaned.csv'
df_predictions.to_csv(output_csv, index=False)
print("Cleaned predictions saved to 'predicted_labels_cleaned.csv'.")

In [None]:
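# Disabled experiment (kept commented out for reference): class-weighted training
# with early stopping and best-model checkpointing.
# NOTE(review): Keras may reject `class_weight` for 3-D targets such as
# `decoder_output_train`; per-timestep `sample_weight` is the usual alternative —
# confirm before re-enabling.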
# from sklearn.utils.class_weight import compute_class_weight
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# # Compute class weights to handle imbalanced classes
# class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.flatten())
# class_weights_dict = dict(enumerate(class_weights))

# # Define early stopping and model checkpoint callbacks
# early_stopping = EarlyStopping(monitor='val_loss', patience=3)
# model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True)

# # Train the model with class weights and callbacks
# history = model.fit(
#     [X_train, decoder_input_train], decoder_output_train,
#     epochs=15, batch_size=4,
#     validation_split=0.1,
#     class_weight=class_weights_dict,  # Add this line to handle class imbalance
#     callbacks=[early_stopping, model_checkpoint]  # Add callbacks for early stopping and saving the best model
# )
# in addition