In [27]:
import torch
from torch import nn
import torch.autograd as autograd
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification


import matplotlib.pyplot as plt


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
BATCH_SIZE = 16
NUM_EPOCHS = 3

In [3]:
df_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv', index_col=0)
df_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv', index_col=0)

df_train.head(5)

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_train.shape

(7613, 4)

Belowe we can see, that keywords and location has missiong data.
Also, I will not use these columns in my model, since they can dissmiss into training
and create the issues.(But maybe I will change my mind :) )

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7613 entries, 1 to 10873
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   keyword   7552 non-null   object
 1   location  5080 non-null   object
 2   text      7613 non-null   object
 3   target    7613 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 297.4+ KB


From text describe we can see that minimum of words in our data is 7, what mean that
it no need to delete/change row from our dataset

In [9]:
df_train_text_length = df_train['text'].apply(lambda x : len(x))

df_train_text_length.describe()

count    7613.000000
mean      101.037436
std        33.781325
min         7.000000
25%        78.000000
50%       107.000000
75%       133.000000
max       157.000000
Name: text, dtype: float64

In [5]:
train_x, test_x, train_y, test_y = train_test_split(df_train['text'], df_train['target'], train_size = 0.7, shuffle = 42)

# Data Preprocessing

In [7]:
class DisasterTweetDataset(Dataset):

    def __init__(self, texts , values, max_length=160, tokenizer_name='bert-base-uncased', is_train=False):
        self.texts = texts
        self.targets = values
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
        self.is_train = is_train
        
    def __len__(self):
        return len(self.texts)
        
    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        
        if self.is_train==True:
                    return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'targets': torch.as_tensor(target, dtype=torch.long),
            'text': text
        }
        else:
            return {
            'input_ids': torch.as_tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.as_tensor(attention_mask, dtype=torch.long),
            'text': text
        }

In [8]:
train_tokens = DisasterTweetDataset(train_x, train_y, is_train=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [18]:
train_tokens[100]

{'input_ids': tensor([  101,  8299,  1024,  1013,  1013,  1056,  1012,  2522,  1013,  1043,
          4801,  2063,  2575,  2290,  3501,  2102,  2243,  2629,  2018,  1037,
          1001,  3167,  2378,  9103,  2854,  4926,  2023,  2621,  1029,  3191,
          2256,  6040,  1004, 23713,  1025,  2156,  2129,  1037,  1001, 15468,
          2064,  2393,  1001, 27178,  3051,  6806,  3126,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [19]:
train_loader = DataLoader(train_tokens, 
                          batch_size=BATCH_SIZE,
                          shuffle=True)

In [51]:
class DisasterTweetModel(nn.Module):
    def __init__(self, bert_base_name='bert-base-uncased', dropout_rate=0.3):
        super(DisasterTweetModel,self).__init__()
        self.bert_layer = BertForSequenceClassification.from_pretrained(bert_base_name)
    
    def forward(self, input, attention_mask):
        bert_out = self.bert_layer(input_ids=input, attention_mask=attention_mask)
        pooled_output = bert_out[1]  # The [CLS] token output
        probabilities = F.sigmoid(self.classifier(pooled_output))
        return probabilities

In [52]:
losses = []
loss_function = nn.BCELoss()
model = DisasterTweetModel()
optimizer = optim.SGD(model.parameters(), lr=0.001)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
print(model)

DisasterTweetModel(
  (bert_layer): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_

In [None]:
input = 