In [1]:
pip install transformers datasets scikit-learn pandas torch


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install 'accelerate>=0.26.0'

Note: you may need to restart the kernel to use updated packages.


In [3]:
import re
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [4]:
# ============================================================================
# 1. Text Cleaning Function
# ============================================================================

def clean_text(text: str) -> str:
    """
    Normalise a tweet before tokenisation.

    Steps, applied in this order:
      - lower-cases the text
      - removes URLs (every run of non-space characters starting at ``http``)
      - removes mentions (``@username``)
      - drops the ``#`` symbol but keeps the hashtag word
      - collapses runs of whitespace and trims both ends

    Args:
        text: Raw tweet text. Missing values (``None`` or float NaN, which is
            how pandas represents empty cells) are treated as an empty tweet;
            any other non-string value is converted with ``str`` first.

    Returns:
        The cleaned, lower-cased text (possibly an empty string).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'http\S+', '', text)         # remove URLs
    text = re.sub(r'@\w+', '', text)              # remove @mentions
    text = re.sub(r'#', '', text)                # remove hashtag symbol (keep the word)
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    return text



In [5]:
# ============================================================================
# 2. Dataset Class for PyTorch
# ============================================================================

class DisasterDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that tokenises one text per item.

    Args:
        texts: The input texts. A pandas object is re-indexed to 0..n-1; any
            other iterable is materialised into a list.
        labels: Integer class labels aligned with ``texts``, or ``None`` for
            unlabeled data (e.g. the test set).
        tokenizer: Callable tokenizer that returns a mapping of tensors when
            invoked with ``return_tensors='pt'``.
        max_length: Every item is truncated / padded to this many tokens.

    Raises:
        ValueError: If ``labels`` is given but its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length: int = 128):
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = self._positional(texts)
        self.labels = self._positional(labels) if labels is not None else None
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _positional(values):
        """Return ``values`` in a form that can be indexed with 0..n-1."""
        if hasattr(values, 'reset_index'):
            # pandas Series: drop the original index so lookups are positional.
            return values.reset_index(drop=True)
        return list(values)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        # Call the tokenizer directly instead of the legacy `encode_plus` method.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch dimension of size 1; drop it
        # so that the DataLoader can stack items itself.
        item = {key: tensor.squeeze(0) for key, tensor in encoding.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item



In [6]:
# ============================================================================
# 3. Compute Metrics for Evaluation
# ============================================================================

def compute_metrics(pred):
    """
    Compute binary-classification metrics for a Trainer evaluation step.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, one row per example).

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    # The predicted class is the column with the highest score.
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 reports 0 (instead of warning) when a metric is
    # undefined, e.g. when the model predicts a single class for every example.
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds, zero_division=0),
        'recall': recall_score(labels, preds, zero_division=0),
        'f1': f1_score(labels, preds, zero_division=0),
    }



In [7]:
# ============================================================================
# 4. Load and Prepare Data
# ============================================================================

# Read the train / test CSVs (adjust the paths if the files live elsewhere) and
# attach a `text_clean` column holding the output of clean_text for every row.
# Only the cleaned text is used as model input here; keyword / location
# features could be combined with it if desired.
df_train = (
    pd.read_csv('train.csv', dtype={'id': np.int32, 'target': np.int8})
    .assign(text_clean=lambda frame: frame['text'].apply(clean_text))
)
df_test = (
    pd.read_csv('test.csv', dtype={'id': np.int32})
    .assign(text_clean=lambda frame: frame['text'].apply(clean_text))
)

# Model inputs and targets used by the cells below.
train_texts, train_labels = df_train['text_clean'], df_train['target']
test_texts = df_test['text_clean']



In [8]:
# ============================================================================
# 5. Set Up the Transformer Model and Tokenizer
# ============================================================================

model_name = 'bert-base-uncased'

# One tokenizer instance is shared by every dataset built in this notebook.
tokenizer = BertTokenizer.from_pretrained(model_name)

# Baseline classifier with a 2-class head. The cross-validation loop and the
# full-data retraining below each load their own fresh copy, so this instance
# is not the one that gets trained.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# ============================================================================
# 6. Cross-Validation Training with StratifiedKFold
# ============================================================================

N_FOLDS = 5  # Define the number of folds for cross-validation

# Newer transformers releases call this argument `eval_strategy`; older ones
# only know `evaluation_strategy`. Use whichever the installed version accepts
# so the cell runs on both.
EVAL_STRATEGY_KWARG = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

# Out-of-fold class predictions, filled fold by fold (integer class ids).
oof_preds = np.zeros(len(df_train), dtype=int)
fold_metrics = {}

print(f"Starting {N_FOLDS}-fold cross-validation...")

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_texts, train_labels), 1):
    print(f"\n===== Fold {fold} =====")
    # Positional slices are enough: DisasterDataset re-indexes its inputs.
    train_texts_fold = train_texts.iloc[train_idx]
    train_labels_fold = train_labels.iloc[train_idx]
    val_texts_fold = train_texts.iloc[val_idx]
    val_labels_fold = train_labels.iloc[val_idx]

    # Create datasets
    train_dataset = DisasterDataset(train_texts_fold, train_labels_fold, tokenizer, max_length=128)
    val_dataset = DisasterDataset(val_texts_fold, val_labels_fold, tokenizer, max_length=128)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_strategy="epoch",
        # Bound disk usage: without a limit every epoch of every fold leaves a
        # full checkpoint behind. The best checkpoint is still retained because
        # load_best_model_at_end is enabled.
        save_total_limit=1,
        learning_rate=2e-5,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        seed=42,
        disable_tqdm=False,
        logging_dir=f'./logs_fold_{fold}',
        **{EVAL_STRATEGY_KWARG: "epoch"},
    )

    # Reinitialize the model for each fold for fairness
    model_fold = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    trainer = Trainer(
        model=model_fold,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate fold performance (uses the best checkpoint reloaded after training)
    eval_results = trainer.evaluate()
    print(f"Fold {fold} evaluation: {eval_results}")
    fold_metrics[fold] = eval_results

    # Get out-of-fold predictions: val_idx holds positions in the full training
    # set, so each row is predicted by a model that never trained on it.
    preds_output = trainer.predict(val_dataset)
    preds = np.argmax(preds_output.predictions, axis=1)
    oof_preds[val_idx] = preds

# Overall cross-validation performance (accuracy_score / f1_score come from the
# notebook's import cell)
cv_acc = accuracy_score(train_labels, oof_preds)
cv_f1 = f1_score(train_labels, oof_preds)
print("\n===== Overall CV Performance =====")
print(f"Accuracy: {cv_acc:.4f}")
print(f"F1 Score: {cv_f1:.4f}")



Starting 5-fold cross-validation...

===== Fold 1 =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4139,0.417671,0.827971,0.780314,0.835115,0.806785
2,0.3195,0.441213,0.839133,0.81155,0.815267,0.813404
3,0.2336,0.488389,0.841103,0.821151,0.806107,0.813559


Fold 1 evaluation: {'eval_loss': 0.4883894622325897, 'eval_accuracy': 0.8411030860144452, 'eval_precision': 0.8211508553654744, 'eval_recall': 0.8061068702290076, 'eval_f1': 0.8135593220338984, 'eval_runtime': 12.4881, 'eval_samples_per_second': 121.956, 'eval_steps_per_second': 7.687, 'epoch': 3.0}

===== Fold 2 =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3996,0.384422,0.83979,0.87963,0.7263,0.795645
2,0.3302,0.392196,0.846356,0.883212,0.740061,0.805324
3,0.2192,0.448345,0.846356,0.858362,0.769113,0.81129


Fold 2 evaluation: {'eval_loss': 0.3921959102153778, 'eval_accuracy': 0.8463558765594222, 'eval_precision': 0.8832116788321168, 'eval_recall': 0.7400611620795107, 'eval_f1': 0.8053244592346089, 'eval_runtime': 12.1448, 'eval_samples_per_second': 125.404, 'eval_steps_per_second': 7.905, 'epoch': 3.0}

===== Fold 3 =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.401,0.414846,0.829284,0.856884,0.723242,0.784411
2,0.3014,0.437139,0.832567,0.862069,0.7263,0.788382
3,0.2069,0.498464,0.827315,0.815832,0.772171,0.793401


Fold 3 evaluation: {'eval_loss': 0.4371393620967865, 'eval_accuracy': 0.8325673013788575, 'eval_precision': 0.8620689655172413, 'eval_recall': 0.7262996941896025, 'eval_f1': 0.7883817427385892, 'eval_runtime': 12.3799, 'eval_samples_per_second': 123.022, 'eval_steps_per_second': 7.754, 'epoch': 3.0}

===== Fold 4 =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3952,0.420283,0.835742,0.811728,0.804281,0.807988
2,0.3724,0.404947,0.845598,0.832013,0.802752,0.817121
3,0.2266,0.438392,0.843627,0.854949,0.766055,0.808065


Fold 4 evaluation: {'eval_loss': 0.4049474895000458, 'eval_accuracy': 0.8455978975032852, 'eval_precision': 0.8320126782884311, 'eval_recall': 0.8027522935779816, 'eval_f1': 0.8171206225680934, 'eval_runtime': 12.1506, 'eval_samples_per_second': 125.261, 'eval_steps_per_second': 7.901, 'epoch': 3.0}

===== Fold 5 =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4492,0.393715,0.840342,0.836334,0.781346,0.807905
2,0.2908,0.407349,0.840342,0.847716,0.766055,0.804819
3,0.2388,0.459853,0.839685,0.817337,0.807339,0.812308


Fold 5 evaluation: {'eval_loss': 0.39371541142463684, 'eval_accuracy': 0.8403416557161629, 'eval_precision': 0.8363338788870703, 'eval_recall': 0.7813455657492355, 'eval_f1': 0.807905138339921, 'eval_runtime': 12.177, 'eval_samples_per_second': 124.99, 'eval_steps_per_second': 7.884, 'epoch': 3.0}

===== Overall CV Performance =====
Accuracy: 0.8412
F1 Score: 0.8067


In [16]:
# ============================================================================
# 7. Retrain on Full Training Data and Predict on Test Set
# ============================================================================

print("\nRetraining on full training data...")

FULL_OUTPUT_DIR = './results_full'

train_dataset_full = DisasterDataset(train_texts, train_labels, tokenizer, max_length=128)
test_dataset = DisasterDataset(test_texts, labels=None, tokenizer=tokenizer, max_length=128)

training_args_full = TrainingArguments(
    output_dir=FULL_OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=1,  # keep only the most recent epoch checkpoint on disk
    seed=42,
    logging_dir='./logs_full',
)

# You can start from the best checkpoint from CV or reinitialize
model_full = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer_full = Trainer(
    model=model_full,
    args=training_args_full,
    train_dataset=train_dataset_full,
    compute_metrics=compute_metrics,
)

trainer_full.train()

# Epoch checkpoints are written to `checkpoint-*` sub-folders only. Save the
# final model (and the tokenizer) into the output directory itself so that
# `from_pretrained('./results_full')` in the app cell can load it.
trainer_full.save_model(FULL_OUTPUT_DIR)
tokenizer.save_pretrained(FULL_OUTPUT_DIR)

# Make predictions on the test set (no labels, so only the raw scores are used)
test_preds = trainer_full.predict(test_dataset)
test_pred_labels = np.argmax(test_preds.predictions, axis=1)





Retraining on full training data...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.585
100,0.4777
150,0.4355
200,0.4546
250,0.3593
300,0.4038
350,0.4415
400,0.3953
450,0.3975
500,0.3786


In [17]:
# Pair every test-set `id` with the class predicted for it by the model.
submission = df_test[['id']].assign(target=test_pred_labels)

# Write the two-column table to disk without the DataFrame index.
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully.")


Submission file 'submission.csv' created successfully.


In [18]:
# Load the TensorBoard notebook extension so that the `%tensorboard` magic
# used in the next cell is available.
%load_ext tensorboard


In [20]:
# Open TensorBoard inline on `logs_full/`, the logging_dir configured for the
# full-data training run.
%tensorboard --logdir logs_full/

Reusing TensorBoard on port 6006 (pid 24877), started 0:00:02 ago. (Use '!kill 24877' to kill it.)

In [21]:
import streamlit as st
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load model and tokenizer
model_path = './results_full'
tokenizer_name = 'bert-base-uncased'


@st.cache_resource
def load_model_and_tokenizer(model_dir: str, tokenizer_id: str):
    """
    Load the classifier and its tokenizer once per Streamlit server process.

    Streamlit re-executes the script on every widget interaction; caching the
    loaded objects avoids re-reading the model weights on each rerun.

    Args:
        model_dir: Directory (or model id) passed to
            ``BertForSequenceClassification.from_pretrained``.
        tokenizer_id: Name or path passed to ``BertTokenizer.from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    loaded_model = BertForSequenceClassification.from_pretrained(model_dir)
    loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_id)
    return loaded_model, loaded_tokenizer


model, tokenizer = load_model_and_tokenizer(model_path, tokenizer_name)

# Prediction function with adjustable threshold
def predict_with_threshold(text, model, tokenizer, threshold=0.5, max_length=128):
    """
    Predict whether the input text is a disaster or not using a custom threshold.

    Args:
        text: The text to classify. It is tokenised as given; no additional
            cleaning is applied inside this function.
        model: Sequence-classification model whose output exposes ``logits``
            with one row per input and one column per class.
        tokenizer: Callable tokenizer returning a mapping of PyTorch tensors.
        threshold: The text is labelled a disaster when the softmax probability
            of class 1 is strictly greater than this value.
        max_length: Inputs are truncated / padded to this many tokens.

    Returns:
        ``"DISASTER"`` or ``"NOT DISASTER"``.
    """
    model.eval()
    # Call the tokenizer directly instead of the legacy `encode_plus` method.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    # Keep the inputs on the same device as the model so the call also works
    # when the model has been moved off the CPU.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Probability of class 1 for the single input row.
    disaster_prob = torch.softmax(logits, dim=1)[0, 1].item()
    return "DISASTER" if disaster_prob > threshold else "NOT DISASTER"

# Streamlit UI
st.title("Disaster Prediction App")
st.write("Enter text below to predict whether it indicates a disaster or not.")

user_input = st.text_input("Enter text:")
threshold = st.slider("Set Classification Threshold", min_value=0.0, max_value=1.0, value=0.5)

if st.button("Predict"):
    # Treat whitespace-only input the same as empty input.
    entered_text = user_input.strip()
    if entered_text:
        prediction = predict_with_threshold(entered_text, model, tokenizer, threshold)
        st.write(f"Prediction: {prediction}")
    else:
        st.write("Please enter some text.")


In [27]:
# Export the loaded model together with its tokenizer so that the
# 'disaster_model' directory can be loaded on its own.
EXPORT_DIR = 'disaster_model'
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)