In [1]:
import torch
from torch import nn
import matplotlib.pyplot as plt

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [2]:
import pandas as pd

df = pd.read_csv("./data/Disaster_Tweet_Classification/train.csv")
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [3]:
df = df[['text', 'target']].dropna()
df

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [4]:
import re


def preprocessing(text):
    text = text.lower()
    text = re.sub(r"http\s+", "", text)
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


df['text'] = df['text'].apply(preprocessing)
df

Unnamed: 0,text,target
0,our deeds are the reason of this earthquake ma...,1
1,forest fire near la ronge sask canada,1
2,all residents asked to shelter in place are be...,1
3,13000 people receive wildfires evacuation orde...,1
4,just got sent this photo from ruby alaska as s...,1
...,...,...
7608,two giant cranes holding a bridge collapse int...,1
7609,ariaahrary thetawniest the out of control wild...,1
7610,m194 0104 utc5km s of volcano hawaii httptcozd...,1
7611,police investigating after an ebike collided w...,1


In [5]:
from collections import Counter


def tokenize(text):
    return re.findall(r"\b\w+\b", text.lower())


def build_vocab(texts, min_freq=1):
    counter = Counter()
    for text in texts:
        tokens = tokenize(text)
        counter.update(tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

In [6]:
def encode(text, vocab, max_len=100):
    tokens = tokenize(text)
    token_ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]

    if len(token_ids) < max_len:
        token_ids += [vocab['<PAD>']] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]

    return token_ids


In [7]:
from torch.utils.data import Dataset


class TweetDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len=100):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        x = encode(self.texts[idx], self.vocab, self.max_len)
        y = self.labels[idx]
        return torch.tensor(x), torch.tensor(y)

In [8]:
texts = df['text'].tolist()
labels = df['target'].tolist()

vocab = build_vocab(texts)
dataset = TweetDataset(texts, labels, vocab, max_len=100)

from torch.utils.data import DataLoader

train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [9]:
class DisasterTweet_LSTM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 300, padding_idx=0)
        self.BiLSTM = nn.LSTM(input_size=300, hidden_size=256, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(in_features=256 * 2, out_features=1024)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(in_features=1024, out_features=1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (h_n, c_n) = self.BiLSTM(embedded)

        h_forward = h_n[0]
        h_backward = h_n[1]
        h_cat = torch.cat((h_backward, h_forward), dim=1)

        out = self.fc1(h_cat)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

In [10]:
model = DisasterTweet_LSTM(vocab_size=len(vocab))
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


In [11]:
n_epochs = 10
model.to(device)

for epoch in range(n_epochs):
    train_loss = 0
    for batch, (X, y) in enumerate(train_loader):
        X, y = X.to(device), y.to(device).float()
        y_pred = model(X).squeeze(1)
        loss = loss_fn(y_pred, y)
        train_loss += loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(train_loader)

    print("Epoch:",epoch+1,"| Train Loss:", train_loss)





Epoch: 1 | Train Loss: tensor(0.5617, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 2 | Train Loss: tensor(0.3538, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 3 | Train Loss: tensor(0.1760, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 4 | Train Loss: tensor(0.1795, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 5 | Train Loss: tensor(0.0737, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 6 | Train Loss: tensor(0.0421, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 7 | Train Loss: tensor(0.0327, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 8 | Train Loss: tensor(0.0260, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 9 | Train Loss: tensor(0.0329, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 10 | Train Loss: tensor(0.0316, device='cuda:0', grad_fn=<DivBackward0>)


In [12]:
import pandas as pd

test_df = pd.read_csv("./data/Disaster_Tweet_Classification/test.csv")  # assumes a column called 'text'
test_texts = test_df['text'].tolist()

# Encode using your existing vocab and encode function
encoded_test = [encode(text, vocab, max_len=50) for text in test_texts]


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class TestDataset(torch.utils.data.Dataset):
    def __init__(self, encoded_texts):
        self.encoded_texts = encoded_texts

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        return torch.tensor(self.encoded_texts[idx])

test_dataset = TestDataset(encoded_test)
test_loader = DataLoader(test_dataset, batch_size=32)


In [14]:
model.eval()
all_preds = []

with torch.inference_mode():
    for batch in test_loader:
        batch = batch.to(device)
        outputs = model(batch)
        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).long()
        preds = preds.squeeze().cpu().numpy()
        all_preds.extend(preds)


In [15]:
submission = pd.DataFrame({
    'id': test_df['id'],  # assuming test.csv has an 'id' column
    'target': all_preds
})

submission.to_csv("submission.csv", index=False)


In [16]:
submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
