In [1]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [25]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
def clean_text(tweet):
    #Only valid characters
    tweet = re.sub(r'[^A-Za-z0-9\s]', '', tweet)
    # Remove single and double letter words
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove extra spaces
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    #remove stopwords
    stop_words = set(stopwords.words('english'))
    tweet = ' '.join(word for word in tweet.split() if word not in stop_words)
    return tweet


In [19]:
def tokenize(tweet, tokenizer):
    return tokenizer.encode_plus(tweet, add_special_tokens = True, max_length=512, return_tensors='pt', return_attention_mask = True, truncation=True, padding='max_length')

In [17]:
def prepare_inputs(df):
    df = df.fillna('')
    df['cleaned_tweet'] = df['text'].astype(str).apply(clean_text)
    df['tokens'] = df['cleaned_tweet'].apply(lambda x: tokenize(x, tokenizer))
    return df


In [9]:
class DisasterData(Dataset):
    def __init__(self, enc_tokens, labels = None):
        self.data = enc_tokens.reset_index(drop=True)
        #account for no testing labels
        self.labels = labels.reset_index(drop=True) if labels is not None else None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if idx >= len(self.data):
            raise IndexError("Index out of bounds.")

        encoding = self.data[idx]

        #account for no testing labels
        if self.labels is not None:
          label = self.labels[idx]
          return {
              'input_ids' : torch.tensor(encoding['input_ids'].squeeze(0)),
              'attention_mask' : torch.tensor(encoding ['attention_mask'].squeeze(0)),
              'labels' : torch.tensor(label, dtype = torch.long)}
        else:
          return {
              'input_ids' : torch.tensor(encoding['input_ids'].squeeze(0)),
              'attention_mask' : torch.tensor(encoding ['attention_mask'].squeeze(0))}

In [10]:
def calculate_f1(preds, labels):
    tp = ((preds == 1) & (labels == 1)).sum().item()
    fp = ((preds == 1) & (labels == 0)).sum().item()
    fn = ((preds == 0) & (labels == 1)).sum().item()

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return f1

In [11]:
def get_data_loader(x, y, batch_size):
    #account for no testing labels
    if y is None: ds = DisasterData(x, None)
    else: ds = DisasterData(x, y)
    return DataLoader(ds, batch_size = batch_size, shuffle = y is not None)


In [12]:
def fine_tune(n_epochs, learning_rate, train_loader, val_loader):
    torch.cuda.empty_cache()
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
    for epoch in range(n_epochs):
        model.train()
        training_loss = 0
        for batch_num, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask_enc = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask = attention_mask_enc)
            logits = outputs.last_hidden_state[:, 0, :]

            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            training_loss += loss.item()

        print(f"TRAINING LOSS: {training_loss}")
        model.eval()
        val_loss = 0
        correct = 0
        f1_score_sum = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask_enc = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask_enc)
                logits = outputs.last_hidden_state[:, 0, :]

                loss = criterion(logits, labels)
                val_loss += loss.item()

                preds = torch.argmax(logits, dim=1)
                correct += (preds == labels).sum().item()
                f1_score_sum += calculate_f1(preds, labels)

        print(f"EPOCH: {epoch + 1}")
        print(f"VAL_LOSS: {val_loss}")
        print(f"ACCURACY: {correct/len(val_y)}")
        print(f"F1 Score Average: {f1_score_sum/len(val_loader)}")


In [13]:
def search_parameters(n_epochs, learning_rates, batch_sizes, train_x, train_y, val_x, val_y):
  for batch_size in batch_sizes:
    train_loader = get_data_loader(train_x, train_y, batch_size)
    val_loader = get_data_loader(val_x, val_y, batch_size)
    for n_epoch in n_epochs:
          for learning_rate in learning_rates:
              print(f"BATCH SIZE: {batch_size}, EPOCHS: {n_epoch}, LEARNING RATE: {learning_rate}")
              fine_tune(n_epoch, learning_rate, train_loader, val_loader)

In [14]:
def compute_predictions(model, data_loader):
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

            logits = outputs.last_hidden_state[:, 0, :]

            preds = torch.argmax(logits, dim=1).tolist()
            predictions.extend(preds)
    return predictions

In [26]:
#clean training data
train_clean = prepare_inputs(train)

In [27]:
#extract train/val splits
train_x, val_x, train_y, val_y = train_test_split(train_clean['tokens'].reset_index(drop=True), train_clean['target'].reset_index(drop=True), test_size=0.2, random_state = 15, shuffle = True)

In [28]:
#set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#cross-entropy loss for classification
criterion = nn.CrossEntropyLoss()

In [None]:
#define parameter search space
n_epochs = [5, 8]
learning_rates = [1e-4, 1e-5]
batch_sizes = [32, 64]

#run grid search
search_parameters(n_epochs, learning_rates, batch_sizes, train_x, train_y, val_x, val_y)

In [29]:
#identify best hyperparameters
BEST_N_EPOCH = 5
BEST_LEARNING_RATE = 1e-5
BEST_BATCH_SIZE = 64

In [30]:
#get loaders with optimal batch_size
train_loader = get_data_loader(train_x, train_y, BEST_BATCH_SIZE)
val_loader = get_data_loader(val_x, val_y, BEST_BATCH_SIZE)

#fine tune using best number of epochs and lr for optimizer
fine_tune(BEST_N_EPOCH, BEST_LEARNING_RATE, train_loader, val_loader)


  'input_ids' : torch.tensor(encoding['input_ids'].squeeze(0)),
  'attention_mask' : torch.tensor(encoding ['attention_mask'].squeeze(0)),


TRAINING LOSS: 167.1251493692398
EPOCH: 1
VAL_LOSS: 12.192704170942307
ACCURACY: 0.7806959947472094
F1 Score Average: 0.736220585329946
TRAINING LOSS: 54.456102162599564
EPOCH: 2
VAL_LOSS: 11.246410429477692
ACCURACY: 0.8076165462902167
F1 Score Average: 0.7747694312693164
TRAINING LOSS: 43.03291627764702
EPOCH: 3
VAL_LOSS: 10.949333637952805
ACCURACY: 0.8207485226526592
F1 Score Average: 0.7755147182236176
TRAINING LOSS: 36.574724555015564
EPOCH: 4
VAL_LOSS: 10.51359498500824
ACCURACY: 0.824688115561392
F1 Score Average: 0.7709723526696676
TRAINING LOSS: 32.942984119057655
EPOCH: 5
VAL_LOSS: 11.940500617027283
ACCURACY: 0.8207485226526592
F1 Score Average: 0.784529579480401


In [31]:
#prepare testing data inputs
test_clean = prepare_inputs(test)

#get test data loader
test_loader = get_data_loader(test_clean['tokens'].reset_index(drop=True), None, 32)

#compute predictions, store in data frame, and write to csv
final_preds = compute_predictions(model, test_loader)
final_preds_df = pd.DataFrame({'id': test_clean['id'].reset_index(drop = True), 'target': final_preds})
final_preds_df.to_csv('final_submission.csv', index=False)


  'input_ids' : torch.tensor(encoding['input_ids'].squeeze(0)),
  'attention_mask' : torch.tensor(encoding ['attention_mask'].squeeze(0))}
