In [21]:
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn import CrossEntropyLoss

In [2]:
# Load the labelled training tweets; the path is relative to the notebook's working directory.
tweets_df = pd.read_csv('data/train.csv')

Data Preprocessing

In [3]:
# Preview the first rows to see which columns are available.
tweets_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Column dtypes and non-null counts (reveals which columns have missing values).
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Class balance of the `target` label.
tweets_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [6]:
# Work on a copy so the raw frame `tweets_df` is not modified by the cleaning steps.
tweets_clean = tweets_df.copy()

In [7]:
# Keyword frequencies; some values contain URL-encoded spaces ('%20'), e.g. 'forest%20fire'.
tweets_clean['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [8]:
# Fill missing keywords with a placeholder and decode URL-encoded spaces ('%20').
# `regex=False` makes the replacement a plain literal substitution.
tweets_clean['keyword'] = (
    tweets_clean['keyword']
    .fillna('unknown')
    .str.replace('%20', ' ', regex=False)
)

In [9]:
# Combine 'keyword' and 'text' into 'combined_text'
def combine_text(row):
    """Build the single input string for one tweet.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that provides
            'keyword' and 'text' entries.

    Returns:
        str: The keyword and the tweet text joined into one labelled string.
    """
    kw, body = row['keyword'], row['text']
    return f'keyword: {kw} text: {body}'

# Row-wise apply (axis=1): every row is handed to combine_text.
tweets_clean['combined_text'] = tweets_clean.apply(combine_text, axis=1)

In [10]:
# Text cleaning
def clean_text(text):
    """Normalise a raw tweet string before tokenisation.

    Steps, applied in this order: drop URLs, drop @mentions, drop the '#'
    sign (the hashtag word itself is kept), strip remaining punctuation,
    lowercase.

    Args:
        text: Raw string to clean.

    Returns:
        str: The cleaned, lowercased text. Whitespace is left untouched, so
        removed tokens may leave consecutive spaces behind.
    """
    # Order matters: URLs and mentions have to be removed before the generic
    # punctuation pass strips the characters that delimit them.
    removal_patterns = (
        r'http\S+',   # links
        r'@\w+',      # full mentions
        r'#',         # hashtag sign only, hashtag text stays
        r'[^\w\s]',   # any leftover punctuation
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Overwrite 'combined_text' with its cleaned version (clean_text is applied per value).
tweets_clean['combined_text'] = tweets_clean['combined_text'].apply(clean_text)

In [11]:
# Drop the raw columns that are no longer needed once 'combined_text' exists.
df = tweets_clean.drop(columns=['keyword', 'location', 'text'])
df.head()

Unnamed: 0,id,target,combined_text
0,1,1,keyword unknown text our deeds are the reason ...
1,4,1,keyword unknown text forest fire near la ronge...
2,5,1,keyword unknown text all residents asked to sh...
3,6,1,keyword unknown text 13000 people receive wild...
4,7,1,keyword unknown text just got sent this photo ...


Define a Dataset Class

In [13]:
class DisasterTweetsDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: Sequence of input strings (list, tuple, pandas Series, ...).
        targets: Sequence of integer class labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading batch dimension of size 1.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        # Materialise as lists so that ``self.texts[idx]`` is always a
        # positional lookup. A pandas Series keeps its (possibly shuffled)
        # index labels, and Series[int] would look items up by label instead.
        self.texts = list(texts)
        self.targets = list(targets)
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f'texts and targets must have the same length, '
                f'got {len(self.texts)} and {len(self.targets)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` with the batch
            dimension removed, plus ``'labels'`` as a ``torch.long`` scalar.
        """
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also collapse the sequence axis when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(target, dtype=torch.long),
        }

Load and Tokenize Data

In [15]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
all_texts = df['combined_text'].tolist()
all_targets = df['target'].tolist()
train_texts, test_texts, train_targets, test_targets = train_test_split(
    all_texts,
    all_targets,
    test_size=0.2,
    random_state=42,
)

# Both splits share the same tokenizer and the same padded sequence length.
MAX_LEN = 128
train_dataset = DisasterTweetsDataset(train_texts, train_targets, tokenizer, max_len=MAX_LEN)
test_dataset = DisasterTweetsDataset(test_texts, test_targets, tokenizer, max_len=MAX_LEN)

Define a DataLoader

In [16]:
# Shuffle only the training batches. Evaluation order has no effect on the
# accuracy, so the test loader keeps dataset order for repeatable runs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Define the Model

In [18]:
# Load the 'bert-base-uncased' checkpoint with a 2-label classification head.
# The classifier weights are newly initialised (see the warning in the cell output).
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training

In [22]:
optimizer = AdamW(model.parameters(), lr=1e-5)
# NOTE: the loop below uses the loss the model returns when `labels` are
# passed, so `loss_fn` is not used here; the name is kept for compatibility.
loss_fn = CrossEntropyLoss()

# Train on GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Track the loss so that training progress is visible per epoch.
        running_loss += loss.item()
        num_batches += 1

    # An empty loader yields NaN instead of a ZeroDivisionError.
    mean_loss = running_loss / num_batches if num_batches else float('nan')
    print(f'Epoch {epoch + 1}/{epochs} completed - mean training loss: {mean_loss:.4f}')

Epoch 1/5 completed
Epoch 2/5 completed
Epoch 3/5 completed
Epoch 4/5 completed
Epoch 5/5 completed


Evaluation

In [23]:
# Switch the model to inference mode before scoring the held-out split.
model.eval()
correct = 0
total = 0
with torch.no_grad():  # gradients are not needed for evaluation
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Only the logits are needed here, so labels are not passed to the
        # model and the forward pass skips the unused loss computation.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

# Guard against an empty test loader, which would raise ZeroDivisionError.
accuracy = correct / total if total else float('nan')
print(f'Accuracy: {accuracy}')

Accuracy: 0.8266579120157583


In [25]:
# Save the trained model and the tokenizer
# Both are written to the same directory (relative to the working directory).
model_dir = 'model/'

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')