In [2]:
import time
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [3]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

NUM_EPOCHS = 2

In [4]:
df = pd.read_csv('test.csv')
df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [5]:
dfsub = pd.read_csv('sample_submission.csv')

In [6]:
df_train = pd.read_csv('train.csv')
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [7]:
df_train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [8]:
df_train[df_train['target']==0] 

Unnamed: 0,id,keyword,location,text,target
15,23,,,What's up man?,0
16,24,,,I love fruits,0
17,25,,,Summer is lovely,0
18,26,,,My car is so fast,0
19,28,,,What a goooooooaaaaaal!!!!!!,0
...,...,...,...,...,...
7581,10833,wrecked,Lincoln,@engineshed Great atmosphere at the British Li...,0
7582,10834,wrecked,,Cramer: Iger's 3 words that wrecked Disney's s...,0
7584,10837,,,These boxes are ready to explode! Exploding Ki...,0
7587,10841,,,Sirens everywhere!,0


In [9]:
train_texts = df_train.iloc[:5330]['text'].values
train_labels = df_train.iloc[:5330]['target'].values

valid_texts = df_train.iloc[5330:6091]['text'].values
valid_labels = df_train.iloc[5330:6091]['target'].values

test_texts = df_train.iloc[6091:]['text'].values
test_labels = df_train.iloc[6091:]['target'].values

In [10]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [11]:
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)

In [12]:
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)

In [13]:
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [14]:
train_encodings[0]

Encoding(num_tokens=84, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [15]:
class DisastersTweetsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = DisastersTweetsDataset(train_encodings, train_labels)
valid_dataset = DisastersTweetsDataset(valid_encodings, valid_labels)
test_dataset = DisastersTweetsDataset(test_encodings, test_labels)

In [16]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=True)

In [17]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
def computing_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs['loss'], outputs['logits']

            _, predicted_labels = torch.max(logits, 1)

            num_examples += labels.size(0)

            correct_pred += (predicted_labels == labels).sum()
    return correct_pred.float()/num_examples * 100



In [29]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    
    model.train()
    
    for batch_idx, batch in enumerate(train_loader):
        
        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits = outputs['loss'], outputs['logits']
        
        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()
        
        ### Logging
        if not batch_idx % 32:
            print (f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                   f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                   f'Loss: {loss:.4f}')
            
    model.eval()

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{computing_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{computing_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {computing_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 0001/0002 | Batch 0000/0334 | Loss: 0.1158
Epoch: 0001/0002 | Batch 0032/0334 | Loss: 0.4097
Epoch: 0001/0002 | Batch 0064/0334 | Loss: 0.0286
Epoch: 0001/0002 | Batch 0096/0334 | Loss: 0.0290
Epoch: 0001/0002 | Batch 0128/0334 | Loss: 0.0563
Epoch: 0001/0002 | Batch 0160/0334 | Loss: 0.0234
Epoch: 0001/0002 | Batch 0192/0334 | Loss: 0.0402
Epoch: 0001/0002 | Batch 0224/0334 | Loss: 0.0540
Epoch: 0001/0002 | Batch 0256/0334 | Loss: 0.2980
Epoch: 0001/0002 | Batch 0288/0334 | Loss: 0.0695
Epoch: 0001/0002 | Batch 0320/0334 | Loss: 0.0163
training accuracy: 97.62%
valid accuracy: 78.58%
Time elapsed: 1.12 min
Epoch: 0002/0002 | Batch 0000/0334 | Loss: 0.1533
Epoch: 0002/0002 | Batch 0032/0334 | Loss: 0.1657
Epoch: 0002/0002 | Batch 0064/0334 | Loss: 0.1449
Epoch: 0002/0002 | Batch 0096/0334 | Loss: 0.1083
Epoch: 0002/0002 | Batch 0128/0334 | Loss: 0.0093
Epoch: 0002/0002 | Batch 0160/0334 | Loss: 0.0065
Epoch: 0002/0002 | Batch 0192/0334 | Loss: 0.0249
Epoch: 0002/0002 | Batch 022

In [33]:
outputs

SequenceClassifierOutput(loss=tensor(0.0039, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-2.7628,  2.6699],
        [ 2.9455, -2.7005]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)