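Three models are trained and evaluated on a held-out test split. The improved BERT model reaches 86.13% accuracy, whereas
SBERT embeddings with a Random Forest classifier (91.99%) and
the improved XGBoost model on top of SBERT embeddings (92.58%, i.e. roughly 93%) both score in the 90s.
Note that the BERT model predicts the binary `Verified` label, while the two SBERT-based models predict the six-class `Disaster` label, so these accuracies are not directly comparable.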

# Improved BERT Model

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import class_weight

# Fetch the NLTK corpora this cell relies on: the English stop-word list
# and WordNet.
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Read the tweets CSV into a DataFrame
tweets = pd.read_csv('/content/DisasterTweets.csv')

# Lemmatizer and English stop-word set consumed by clean_text below
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Enhanced text cleaning function
def clean_text(text):
    """Normalise one tweet before tokenisation.

    Strips URLs, deletes every character that is not an ASCII letter or
    whitespace, lowercases the result, drops stop words and lemmatizes the
    remaining words.

    Args:
        text: Raw tweet text. A value that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV cell) is treated as an
            empty tweet instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', '', text)  # Removes URLs
    # Removes non-alphabetic characters. They are deleted, not replaced by a
    # space, so "don't" becomes "dont" and "fire/flood" becomes "fireflood".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()  # To lowercase the text
    words = text.split()  # Splits into words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # To lemmatize and remove stopwords
    return ' '.join(words)

# Store a cleaned copy of every tweet next to the raw text
tweets['clean_text'] = tweets['Tweets'].map(clean_text)

# Cast the target column to int
tweets['Verified'] = tweets['Verified'].astype(int)

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    tweets['clean_text'],
    tweets['Verified'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_sentences(sentences, max_len=512):
    """Tokenize a list of sentences with the module-level BERT tokenizer.

    Args:
        sentences: The texts to encode, passed straight to the tokenizer.
        max_len: Upper bound on tokens per sentence; longer inputs are
            truncated.

    Returns:
        Whatever the tokenizer returns for these options; padding is enabled
        and TensorFlow tensors are requested.
    """
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'tf',
    }
    return tokenizer(sentences, **tokenizer_options)

import numpy as np

# Hold out part of the training data for validation. Early stopping below
# restores the weights with the best val_loss, so validating on the test set
# would let the test data pick the checkpoint and inflate the reported score.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# To encode the data
train_encodings = encode_sentences(X_fit.tolist())
val_encodings = encode_sentences(X_val.tolist())
test_encodings = encode_sentences(X_test.tolist())

# Converts the labels to int32 for TensorFlow compatibility
y_train = tf.convert_to_tensor(y_fit.values, dtype=tf.int32)
y_val = tf.convert_to_tensor(y_val.values, dtype=tf.int32)
y_test = tf.convert_to_tensor(y_test.values, dtype=tf.int32)

# Loads pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Checks for class imbalance and computes class weights.
# compute_class_weight documents `classes` as an ndarray (np.unique of the
# labels), and the weights come back in that same order, so key the dict by
# the actual label rather than by position.
train_labels = y_train.numpy()
label_values = np.unique(train_labels)
class_weights = class_weight.compute_class_weight('balanced', classes=label_values, y=train_labels)
class_weights_dict = {int(label): float(weight) for label, weight in zip(label_values, class_weights)}

# Compiling the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)  # Reduced learning rate
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)  # Logits output
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# To train the model with early stopping and class weights
model.fit(
    {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
    y_train,
    validation_data=({'input_ids': val_encodings['input_ids'], 'attention_mask': val_encodings['attention_mask']}, y_val),
    batch_size=16,
    epochs=10,
    callbacks=[early_stopping],
    class_weight=class_weights_dict  # Applying class weights to handle imbalance
)

# Evaluating the model on the test set, which was not used during training
# or checkpoint selection
model.evaluate({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']}, y_test)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


[0.3515661358833313, 0.861328125]

# SBERT on disaster tweets data, with manually defined stop words (no NLTK stop-word download; the lemmatizer still needs the NLTK WordNet corpus)

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.0


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer

# Hand-written English stop-word set, so the NLTK stop-word corpus does not
# have to be downloaded for this cell.
stop_words = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
    "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
    'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should',
    "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
    'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

# Initialise the lemmatizer. WordNetLemmatizer looks words up in the NLTK
# 'wordnet' corpus, so fetch that corpus if it is not installed yet; without
# it the first lemmatize() call fails with LookupError when this cell is run
# in a fresh runtime.
import nltk

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    """Normalise one tweet before it is embedded.

    Removes URLs and non-alphabetic characters, converts to lowercase,
    removes stop words and lemmatizes the remaining words.

    Args:
        text: Raw tweet text. A value that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV cell) is treated as an
            empty tweet instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', '', text)  # Removes URLs
    # Removes non-alphabetic characters. Apostrophes are deleted here, so
    # "don't" becomes "dont" and no longer matches the "don't" stop-word entry.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()  # Lowercase the text
    words = text.split()  # Splits into words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatizes and removes stopwords
    return ' '.join(words)

# Load the raw tweets
file_path = '/content/DisasterTweets.csv'
disaster_tweets_df = pd.read_csv(file_path)

# Cleaned version of every tweet goes into a new 'clean_text' column
disaster_tweets_df['clean_text'] = disaster_tweets_df['Tweets'].map(clean_text)

# Integer-encode the 'Disaster' column, which is used as the target
label_encoder = LabelEncoder()
disaster_tweets_df['Disaster_encoded'] = label_encoder.fit_transform(disaster_tweets_df['Disaster'])

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    disaster_tweets_df['clean_text'],
    disaster_tweets_df['Disaster_encoded'],
    test_size=0.2,
    random_state=42,
)

# Pre-trained SBERT sentence encoder
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the training texts first, then the test texts
X_train_embeddings, X_test_embeddings = (
    sbert_model.encode(texts.tolist(), batch_size=16, show_progress_bar=True)
    for texts in (X_train, X_test)
)

# Fit a Random Forest on the training embeddings
classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_embeddings, y_train)

# Predict the test split and score it
y_pred = classifier.predict(X_test_embeddings)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Report whether the 90% target was met
print("Achieved 90%+ accuracy!" if accuracy >= 0.90 else "Consider further tuning to improve accuracy.")




Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Accuracy: 91.99%
Achieved 90%+ accuracy!


# Improved XGBoost model on SBERT embeddings

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.0


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Hand-written English stop-word set used by clean_text in this cell
stop_words = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it',
    "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above',
    'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should',
    "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
    'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

# Text cleaning function without lemmatisation
def clean_text(text):
    """Normalise one tweet before it is embedded.

    Removes URLs and non-alphabetic characters, converts to lowercase and
    removes stop words. No lemmatisation is applied.

    Args:
        text: Raw tweet text. A value that is not a ``str`` (for example the
            float NaN pandas produces for an empty CSV cell) is treated as an
            empty tweet instead of raising ``TypeError`` inside ``re.sub``.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+', '', text)  # Removes URLs
    # Removes non-alphabetic characters. Apostrophes are deleted here, so
    # "don't" becomes "dont" and no longer matches the "don't" stop-word entry.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

file_path = '/content/DisasterTweets.csv'
disaster_tweets_df = pd.read_csv(file_path)

# Applying text cleaning to the 'Tweets' column
disaster_tweets_df['clean_text'] = disaster_tweets_df['Tweets'].apply(clean_text)

# Encoding the target label
label_encoder = LabelEncoder()
disaster_tweets_df['Disaster_encoded'] = label_encoder.fit_transform(disaster_tweets_df['Disaster'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(disaster_tweets_df['clean_text'], disaster_tweets_df['Disaster_encoded'], test_size=0.2, random_state=42)

# Loading the pre-trained SBERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# To generate SBERT embeddings for both the training and test datasets
X_train_embeddings = sbert_model.encode(X_train.tolist(), batch_size=16, show_progress_bar=True)
X_test_embeddings = sbert_model.encode(X_test.tolist(), batch_size=16, show_progress_bar=True)

# XGBoost model on SBERT embeddings.
# use_label_encoder is deliberately not passed: the installed XGBoost ignores
# it and prints 'Parameters: { "use_label_encoder" } are not used.' on every
# fit, including each cross-validation refit. The labels are already integer
# encoded by the LabelEncoder above.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_train)),
    eval_metric='mlogloss',
    learning_rate=0.01,
    max_depth=6,
    n_estimators=500,
)

# Training the XGBoost model
xgb_model.fit(X_train_embeddings, y_train)

# Predicting on the test set
y_pred_xgb = xgb_model.predict(X_test_embeddings)

# Evaluating accuracy
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Model Test Accuracy: {xgb_accuracy * 100:.2f}%")

# Cross-validation to ensure model robustness
cross_val_scores = cross_val_score(xgb_model, X_train_embeddings, y_train, cv=5)
print(f"Cross-Validation Accuracy: {cross_val_scores.mean() * 100:.2f}%")

# Classification Report
# Pass the full list of encoded class ids so target_names always lines up
# with the labels, even if some class is missing from the test split.
class_ids = np.arange(len(label_encoder.classes_))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, labels=class_ids, target_names=label_encoder.classes_))




Batches:   0%|          | 0/128 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Parameters: { "use_label_encoder" } are not used.



XGBoost Model Test Accuracy: 92.58%


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Accuracy: 92.62%

Classification Report:
              precision    recall  f1-score   support

     Drought       0.94      0.90      0.92       167
  Earthquake       1.00      0.99      1.00       102
      Floods       0.86      0.83      0.85        84
  Hurricanes       1.00      0.92      0.96        36
   Tornadoes       0.91      0.94      0.92        32
    Wildfire       0.86      0.98      0.92        91

    accuracy                           0.93       512
   macro avg       0.93      0.93      0.93       512
weighted avg       0.93      0.93      0.93       512

