In [12]:
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the labelled training tweets from the local CSV into a DataFrame.
tweets_df = pd.read_csv('data/train.csv')

Data Preprocessing

In [3]:
# Preview the first rows to see which columns are available.
tweets_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Summarise dtypes and non-null counts to spot columns with missing values.
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Check the class balance of the `target` label.
tweets_df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

In [6]:
# Work on a copy so the cleaning steps below leave the raw frame `tweets_df` untouched.
tweets_clean = tweets_df.copy()

In [7]:
# Inspect keyword frequencies; some values contain URL-encoded spaces ('%20').
tweets_clean['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [8]:
# Fill missing keywords with a placeholder, then decode URL-encoded spaces
# ('%20' -> ' '). `regex=False` makes this a literal substring replacement, and
# the vectorised `.str` accessor avoids calling a Python lambda per row.
tweets_clean['keyword'] = (
    tweets_clean['keyword']
    .fillna('unknown')
    .str.replace('%20', ' ', regex=False)
)

In [9]:
# Combine 'keyword' and 'text' into 'combined_text'
def combine_text(row):
    """Build the single model-input string for one row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) that can be indexed
            with 'keyword' and 'text'.

    Returns:
        str: The two fields joined as 'keyword: <keyword> text: <text>'.
    """
    kw, body = row['keyword'], row['text']
    return f'keyword: {kw} text: {body}'

# Apply row-wise (axis=1) so each row's 'keyword' and 'text' are merged into one string.
tweets_clean['combined_text'] = tweets_clean.apply(combine_text, axis=1)

In [10]:
# Text cleaning
def clean_text(text):
    """Normalise a raw string before tokenisation.

    The removals run in a fixed order: URLs (anything from 'http' up to the
    next whitespace), whole @mentions, '#' signs (the hashtag word is kept),
    then every remaining character that is neither a word character nor
    whitespace. The result is lowercased.

    Args:
        text: Raw string to clean.

    Returns:
        str: The cleaned, lowercased string. Whitespace left behind by removed
        tokens is not collapsed.
    """
    # Order matters: URLs and mentions must go before generic punctuation
    # stripping, otherwise their ':', '/', '.' and '@' would be removed first
    # and the leftover fragments would survive as words.
    removal_patterns = (
        r'http\S+',   # URLs
        r'@\w+',      # full mentions
        r'#',         # hashtag sign only; the tag text stays
        r'[^\w\s]',   # any remaining punctuation / symbols
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Overwrite the column with its cleaned version; note that this also strips
# the ':' separators inserted by combine_text.
tweets_clean['combined_text'] = tweets_clean['combined_text'].apply(clean_text)

In [11]:
# Drop the raw 'keyword' and 'text' columns (now folded into 'combined_text')
# along with 'location', leaving 'id', 'target' and 'combined_text'.
df = tweets_clean.drop(columns=['keyword', 'location', 'text'])
df.head()

Unnamed: 0,id,target,combined_text
0,1,1,keyword unknown text our deeds are the reason ...
1,4,1,keyword unknown text forest fire near la ronge...
2,5,1,keyword unknown text all residents asked to sh...
3,6,1,keyword unknown text 13000 people receive wild...
4,7,1,keyword unknown text just got sent this photo ...


Define a Dataset Class

In [13]:
class DisasterTweetsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Iterable of strings (list, NumPy array, pandas Series, ...).
        targets: Iterable of integer class labels, same length as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            expected to return a mapping with 'input_ids' and 'attention_mask'
            tensors. Assumes each tensor has a leading batch axis of size 1
            (as a Hugging Face tokenizer would produce) -- confirm at the
            call site.
        max_len: Length passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``targets`` differ in length.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        # Materialise as plain lists so __getitem__ indexes by POSITION.
        # A pandas Series indexed with an int does a label lookup, which
        # raises KeyError (or returns the wrong row) once the index is no
        # longer 0..n-1, e.g. after train_test_split or row filtering.
        self.texts = list(texts)
        self.targets = list(targets)
        if len(self.texts) != len(self.targets):
            raise ValueError(
                f'texts and targets must have the same length, '
                f'got {len(self.texts)} and {len(self.targets)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at position ``idx``.

        Returns:
            dict: 'input_ids' and 'attention_mask' tensors with the batch axis
            removed, plus 'labels' as a scalar ``torch.long`` tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # squeeze(0) drops only the batch axis; a bare squeeze() would also
            # collapse the sequence axis whenever it has length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.targets[idx], dtype=torch.long),
        }

Load and Tokenize Data