To build a model that uses a Long Short-Term Memory (LSTM) neural network to identify whether a tweet reports a disaster, we need to follow these steps:

Data collection: We need to collect a dataset of tweets that inform about disasters and tweets that do not inform about disasters.

Data pre-processing: We need to clean and preprocess the data by removing URLs, mentions, special characters, and stop words, converting the text to lowercase, and stemming the remaining words.

Feature extraction: We need to extract features from the preprocessed data. In this case, we will use word embeddings to represent each word in the text as a vector.

Train the LSTM model: We will train the LSTM model on the preprocessed and feature extracted data.

Evaluate the model: We will evaluate the performance of the LSTM model using metrics such as accuracy, precision, recall, and F1 score.

The training step below fits the model for 10 epochs with a batch size of 32 and uses the validation data to evaluate the performance after each epoch.

In [None]:
# Step 1: Data collection
import pandas as pd
from google.colab import drive
# Attach Google Drive to this Colab runtime so the dataset file is reachable.
mount_point = '/content/drive'
drive.mount(mount_point)

# Read the tweet dataset from the mounted Drive into a DataFrame.
file_path = "/content/drive/MyDrive/CS298/tweets.csv"
df = pd.read_csv(file_path)

Mounted at /content/drive


In [None]:
# Step 2: Data pre-processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK data this notebook relies on: the English stop-word list and
# the Punkt models used by nltk.word_tokenize.
# Recent NLTK releases (3.9+) load the tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' package, so both ids are requested. On an
# older NLTK the unknown 'punkt_tab' id should only print an error and return
# False rather than raise.
nltk.download('stopwords')
nltk.download('punkt_tab')
# The Porter stemmer is instantiated directly; nothing is downloaded for it.
stemmer = PorterStemmer()
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Function to clean and preprocess the text
# Stop-word lists already loaded from NLTK, keyed by language name.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess(text):
    """
    Clean one tweet and return it as a space-joined string of stemmed tokens.

    The steps run in this order: strip URLs, strip @mentions, drop every
    character that is not a letter, digit or whitespace, lowercase, tokenize
    with NLTK, remove English stop words, and stem what is left.

    Args:
    text (str): Raw tweet text.

    Returns:
    str: The stemmed, stop-word-free tokens joined by single spaces.
    """
    # Remove URLs, mentions, and special characters
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@[^\s]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Look the stop words up in a set that is built once. Calling
    # stopwords.words('english') inside the comprehension would reload the
    # list and scan it linearly for every single token.
    stop_words = _english_stop_words()
    words = [stemmer.stem(word) for word in tokens if word not in stop_words]
    # Join the words back into a string
    return ' '.join(words)

# Clean every tweet and overwrite the raw text column with the result
cleaned_text = df['text'].apply(preprocess)
df['text'] = cleaned_text

In [None]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """
    Build the weight matrix for an Embedding layer from a GloVe text file.

    Each non-empty line of the file is read as a token followed by its
    whitespace-separated vector components. Only tokens present in
    word_index are used; every other row, including row 0, stays all-zero.

    Args:
    filepath (str): Path to the GloVe word embeddings file (read as UTF-8).
    word_index (dict): A dictionary mapping words to their respective indices in the tokenizer.
    embedding_dim (int): Number of leading vector components kept per word.

    Returns:
    embedding_matrix (numpy.ndarray): A matrix of shape (len(word_index) + 1, embedding_dim), where row i holds the embedding of the word whose index is i.

    Raises:
    ValueError: If a vocabulary word's vector has fewer than embedding_dim components or contains a non-numeric component.
    """
    vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Read explicitly as UTF-8 so non-ASCII tokens do not depend on the
    # platform's default encoding.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank (e.g. trailing) lines carry no embedding.
                continue
            word, vector = parts[0], parts[1:]
            idx = word_index.get(word)
            if idx is None:
                continue
            if len(vector) < embedding_dim:
                # Without this check a one-component vector would be
                # silently broadcast across the whole row.
                raise ValueError(
                    f"Embedding for {word!r} has {len(vector)} components, "
                    f"expected at least {embedding_dim}"
                )
            embedding_matrix[idx] = np.array(
                vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split

# Fit a tokenizer on the cleaned tweets and encode each tweet as a list of word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad every sequence to the length of the longest one
maxlen = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=maxlen)

# Build a frozen Embedding layer initialised from the 100-dimensional GloVe vectors
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved 0 index
embedding_matrix = create_embedding_matrix(
    '/content/drive/MyDrive/CS298/glove.6B.100d.txt',
    tokenizer.word_index,
    embedding_dim,
)
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Stack the network: pre-built embedding layer -> 128-unit LSTM -> single sigmoid output
model = Sequential([
    embedding_layer,
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Fit for 10 epochs in batches of 32, passing a validation pair to Keras.
# NOTE(review): the validation pair is the test split, which the evaluation
# cells further down also score against. Consider a separate validation split.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Threshold the model outputs on the test set at 0.5 to get binary labels
y_pred = model.predict(X_test)
y_pred = y_pred > 0.5

# True labels of the test set
y_true = y_test

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    scorer(y_true, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

for label, value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 score:', f1),
):
    print(label, value)

Accuracy: 0.7728168089297439
Precision: 0.75809199318569
Recall: 0.6856702619414484
F1 score: 0.7200647249190939


In [None]:
# Let Keras report the loss and the compiled accuracy metric on the test split
loss, accuracy = model.evaluate(X_test, y_test)
for label, value in (('Loss:', loss), ('Accuracy:', accuracy)):
    print(label, value)

Loss: 0.589633047580719
Accuracy: 0.7728168368339539
