This model uses a single Bidirectional LSTM layer (64 units per direction, with a dropout rate of 0.2 applied to the LSTM inputs to reduce overfitting) on top of frozen pre-trained GloVe word embeddings. The final output layer uses a sigmoid activation function to output a probability between 0 and 1 for binary classification.

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Mount Google Drive inside the Colab runtime so files under /content/drive are readable.
# This cell only works on Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive')

# Read the tweets CSV into a DataFrame; later cells use its 'text' and 'target' columns.
file_path = "/content/drive/MyDrive/CS298/tweets.csv"
df = pd.read_csv(file_path)

# Clean the data
def clean_text(text):
    """Normalize one raw tweet for tokenization.

    The steps run in this order: strip URLs, strip HTML tags, strip
    punctuation, strip digits, lowercase. Whitespace is left untouched, so
    removed tokens may leave consecutive spaces behind.

    Args:
        text: The raw tweet. Non-string values (e.g. the NaN that
            ``pd.read_csv`` produces for an empty cell, or ``None``) are
            treated as missing text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)  # Remove URLs (everything up to the next whitespace)
    text = re.sub(r'<.*?>', '', text)    # Remove HTML tags (non-greedy, single line)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (underscores survive: they match \w)
    text = re.sub(r'\d+', '', text)      # Remove digits
    return text.lower()                  # Convert text to lowercase

# Run every tweet through clean_text, replacing the 'text' column with the result
df['text'] = df['text'].map(clean_text)

Mounted at /content/drive


In [None]:
# Build the vocabulary from the cleaned tweets and encode each tweet as a list of word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad every sequence at the end up to the length of the longest tweet
maxlen = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=maxlen, padding='post')

# Create word embeddings using pre-trained GloVe embeddings
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a GloVe-style text file.

    Each line of the file is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        filepath: Path to the embeddings text file (read as UTF-8).
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` float array. Rows for words
        that are missing from the file (and row 0, if unused) stay all-zero.
    """
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # GloVe files are UTF-8; do not rely on the platform's default encoding.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            # Skip blank lines and lines with fewer than embedding_dim values,
            # which would otherwise crash the unpacking or be silently
            # broadcast across the whole row.
            if len(parts) <= embedding_dim:
                continue

            idx = word_index.get(parts[0])
            if idx is not None:
                embedding_matrix[idx] = np.asarray(
                    parts[1:embedding_dim + 1], dtype=np.float32
                )

    return embedding_matrix

# Keep 100 components per word; the file name suggests 100-dimensional GloVe vectors
embedding_dim = 100
embedding_matrix = create_embedding_matrix(
    filepath='/content/drive/MyDrive/CS298/glove.6B.100d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)


In [None]:
# Split the data into training and testing sets (80/20, fixed seed so the split is repeatable)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the BiLSTM model
model = tf.keras.Sequential([
    # Declare the input shape explicitly; this replaces the deprecated
    # `input_length` argument of Embedding.
    tf.keras.Input(shape=(maxlen,)),
    # Frozen GloVe embeddings. The matrix is loaded through a Constant
    # initializer rather than the legacy `weights=` kwarg so the layer is
    # built the same way under both TF2 Keras and Keras 3.
    tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    # NOTE(review): sequences are post-padded and no mask is set, so the
    # forward LSTM also reads the trailing padding — consider mask_zero=True.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, dropout=0.2)),
    # Single sigmoid unit: probability of the positive class
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Fit for 10 epochs in mini-batches of 32; the held-out split is scored after each epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Predict on the test set and threshold the sigmoid outputs at 0.5 to get hard labels
y_pred = model.predict(X_test) > 0.5

# Ground-truth labels for the same test rows
y_true = y_test

# Score the predictions with each metric, in reporting order
accuracy, precision, recall, f1 = (
    score_fn(y_true, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 score:', f1),
):
    print(label, value)

Accuracy: 0.8036769533814839
Precision: 0.8038194444444444
Recall: 0.7134052388289677
F1 score: 0.7559183673469387
