In [62]:
import tensorflow as tf

In [63]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Attention, Concatenate
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk




In [64]:

nltk.download('punkt')
nltk.download('stopwords')
data = pd.read_csv("AMMUSED (1).csv")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
data['processed_text'] = data['title'].apply(preprocess_text)



In [66]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_text = tfidf_vectorizer.fit_transform(data['processed_text']).toarray()



In [67]:

data['text_length'] = data['processed_text'].apply(len)
data['hashtag_count'] = data['processed_text'].apply(lambda x: x.count('#'))
data['url_count'] = data['title'].apply(lambda x: x.count('http'))



In [68]:

X_features = np.hstack((X_text, data[['text_length', 'hashtag_count', 'url_count']].values))



In [69]:

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['label_x'])



In [70]:

X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)



In [81]:
def build_mbi_lstm_model(input_dim):
    inputs = Input(shape=(input_dim,))

    # Attention Layer
    x = Dense(128, activation='relu')(inputs)
    attention = Dense(1, activation='tanh')(x)
    attention = Dense(4, activation='softmax')(attention)
    x = Concatenate()([x, attention])
    x = tf.keras.layers.Reshape((128, x.shape[1]))(x)
    lstm_out = Bidirectional(LSTM(64, return_sequences=False))(x)
    outputs = Dense(len(np.unique(y)), activation='softmax')(lstm_out)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model



In [82]:
# Build and train model
model = build_mbi_lstm_model(X_train.shape[1])
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.2)



Epoch 1/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - accuracy: 0.6306 - loss: 0.6564 - val_accuracy: 0.6878 - val_loss: 0.6181
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.6752 - loss: 0.6312 - val_accuracy: 0.6878 - val_loss: 0.6173
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.6752 - loss: 0.6290 - val_accuracy: 0.6878 - val_loss: 0.6222
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.6795 - loss: 0.6265 - val_accuracy: 0.6878 - val_loss: 0.6246
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.6760 - loss: 0.6299 - val_accuracy: 0.6878 - val_loss: 0.6183


<keras.src.callbacks.history.History at 0x7cefe6a45e10>

In [83]:
# Evaluate model
y_pred = np.argmax(model.predict(X_test), axis=1)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Convert label_encoder.classes_ to strings if it contains boolean values
target_names = [str(class_name) for class_name in label_encoder.classes_]

print(classification_report(y_test, y_pred, target_names=target_names)) # Use converted target_names

[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step
Accuracy: 0.6749528005034613
              precision    recall  f1-score   support

       False       0.62      0.26      0.36      3440
        True       0.68      0.91      0.78      6094

    accuracy                           0.67      9534
   macro avg       0.65      0.58      0.57      9534
weighted avg       0.66      0.67      0.63      9534

