In [1]:
import torch
from torch import nn
import torch.autograd as autograd
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification


import matplotlib.pyplot as plt


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
BATCH_SIZE = 8
NUM_EPOCHS = 3

In [3]:
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

df_train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_train.shape

(7613, 5)

Belowe we can see, that keywords and location has missiong data.
Also, I will not use these columns in my model, since they can dissmiss into training
and create the issues.(But maybe I will change my mind :) )

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


From text describe we can see that minimum of words in our data is 7, what mean that
it no need to delete/change row from our dataset

In [6]:
df_train_text_length = df_train['text'].apply(lambda x : len(x))

df_train_text_length.describe()

count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: text, dtype: float64

In [7]:
train_x, test_x, train_y, test_y = train_test_split(df_train['text'], df_train['target'], train_size = 0.9)
train_x = train_x.reset_index(drop=True)
train_y = train_y.reset_index(drop=True)
test_x = test_x.reset_index(drop=True)
test_y = test_y.reset_index(drop=True)

In [8]:
train_x.shape

(6851,)

# Data Preprocessing

In [9]:
class DisasterTweetDataset(Dataset):

    def __init__(self, texts , values, max_length=160, tokenizer_name='bert-base-uncased', is_train=False):
        self.texts = texts
        self.targets = values
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        self.is_train = is_train
        
    def __len__(self):
        return len(self.texts)
        
    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        
        if self.is_train==True:
                    return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'targets': torch.as_tensor(target, dtype=torch.long),
            'text': text
        }
        else:
            return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'text': text
        }

In [10]:
train_tokens = DisasterTweetDataset(train_x, train_y, is_train=True)
validation_tokens = DisasterTweetDataset(test_x,test_y, is_train=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
train_loader = DataLoader(train_tokens, 
                          batch_size=BATCH_SIZE,
                          shuffle=True)
next(iter(train_loader))

{'input_ids': tensor([[  101, 18558,  1054,  ...,     0,     0,     0],
         [  101,  1030,  4086,  ...,     0,     0,     0],
         [  101,  3571,  2003,  ...,     0,     0,     0],
         ...,
         [  101, 15490, 17761,  ...,     0,     0,     0],
         [  101,  3099,  4473,  ...,     0,     0,     0],
         [  101,  6884,  4062,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'targets': tensor([0, 1, 0, 0, 0, 1, 0, 1]),
 'text': ['INFO R. CURFEW IN OPER UNTIL 2030 Z. TAXIWAYS FOXTROT 5 &amp; FOXTROT 6 NAVBL. WND: 060/5. EXP INST APCH. RWY 05. DAMP. TMP: 10. QNH: 1028.',
  '@Sweet2Young -runs at her for setting my forest on fire and bites the shit out of her neck-',
  'Fear is the mind killer. Fear is the little-death that brings total oblite

In [12]:
val_loader = DataLoader(validation_tokens, 
                          batch_size=BATCH_SIZE,
                          shuffle=True)
next(iter(val_loader))

{'input_ids': tensor([[  101,  1045,  2514,  ...,     0,     0,     0],
         [  101,  2006,  4606,  ...,     0,     0,     0],
         [  101,  2484,  2730,  ...,     0,     0,     0],
         ...,
         [  101,  1030, 20912,  ...,     0,     0,     0],
         [  101,  4062, 13456,  ...,     0,     0,     0],
         [  101,  1016, 15042,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'targets': tensor([0, 0, 1, 0, 1, 0, 1, 0]),
 'text': ["I feel like I'm drowning inside my own body!!",
  'On plus side LOOK AT THE SKY LAST NIGHT IT WAS ABLAZE http://t.co/qqsmshaJ3N',
  '24 killed in two simultaneous rail crash as acute floods derail the two trains #India #mumbai... http://t.co/4KBWPCmMbM',
  'Mopheme and Bigstar Johnson are a problem in this game bod

In [28]:
class DisasterTweetModel(nn.Module):
    def __init__(self, bert_base_name='bert-base-uncased', dropout_rate=0.3):
        super(DisasterTweetModel,self).__init__()
        self.bert_layer = BertForSequenceClassification.from_pretrained(bert_base_name)        
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(2, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, input, device=torch.device("cuda")):
        # Extract input tensors directly
        input_ids = input['input_ids']
        attention_mask = input['attention_mask']
        
        input_ids.to(device)
        attention_mask.to(device)
        
        # Get BERT outputs
        out = self.bert_layer(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  return_dict=True)
        out = out['logits']
        
        out = self.classifier(F.relu(out))
        out = self.sigmoid(out)
        return out

In [33]:
losses = []
loss_function = nn.BCELoss()
model = DisasterTweetModel()
optimizer = optim.SGD(model.parameters(), lr=0.001)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-Training Validation

In [34]:
pre_trained_validation_loss = []

with torch.no_grad():
    for data in val_loader:
        pred = model(data, device=torch.device('cpu'))
        loss = loss_function(pred.view(-1), data['targets'].float())
        pre_trained_validation_loss.append(loss.item())
        torch.cuda.empty_cache()  # Free unused GPU memory


In [38]:
device = torch.device("cuda")
model.to(device)
train_loader.to(device)

AttributeError: 'DataLoader' object has no attribute 'to'

In [37]:
for epoch in range(1, NUM_EPOCHS+1):
    model.train()
    train_loss = 0
    torch.multiprocessing.set_start_method('spawn')
    

    print(f'EPOCH {epoch}')
    print('--------------------------------------------------')
    for input in train_loader:
        
        model.zero_grad()

        tag_scores = model(input, device=torch.device("cuda"))

        loss = loss_function(tag_scores.view(-1), input['targets'].float())
        train_loss += loss.item()
                
        if epoch == NUM_EPOCHS:
            losses.append(loss)
        
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        
    avg_train_loss = train_loss / len(train_loader)
    print(f'Average loss: {avg_train_loss}')
    print('--------------------------------------------------')

EPOCH 1
--------------------------------------------------


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)