In [None]:
!pip install torch torchvision transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 8.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point, so this must run before any loading.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from LoadData import LoadDataAndProcessing, LoadTestDataAndProcessing
from RumourDataSet import RumourDataset
from torch.utils.data import DataLoader

In [None]:
# Single place to change when the data folder on the mounted Drive moves.
DATA_DIR = "/content/drive/MyDrive/NLP/data"

train_file = f"{DATA_DIR}/train_data_all.json"
train_label_file = f"{DATA_DIR}/train.label.txt"
dev_file = f"{DATA_DIR}/dev_data_all.json"
dev_label_file = f"{DATA_DIR}/dev.label.txt"

# One loader object per split, each given a data file and its matching label file.
load_twitter_train_data = LoadDataAndProcessing(train_file, train_label_file)
load_twitter_dev_data = LoadDataAndProcessing(dev_file, dev_label_file)

In [None]:
# Run each split's preparation step (train first, then dev) and keep the
# resulting example collections for the Dataset wrappers built further down.
train_input, dev_input = (
    load_twitter_train_data.prepareDataset(),
    load_twitter_dev_data.prepareDataset(),
)

In [None]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from transformers import AdamW


class RumourDataset(Dataset):
    """Torch dataset that encodes (source, reply) text pairs for BERT.

    Every element of ``data`` is indexed with the keys ``'text_source'`` and
    ``'text_reply'``; labelled examples additionally carry a ``'label'`` key.
    Each example is encoded as ``[CLS] source [SEP] reply [SEP]`` and then
    padded or truncated to exactly ``maxlen`` tokens.
    """

    def __init__(self, data, maxlen):
        """Store the examples and load the ``bert-base-uncased`` tokenizer.

        Args:
            data: indexable collection of examples (see class docstring).
            maxlen: target sequence length; values above 512 are capped at 512.
        """
        self.data = data
        self.maxlen = min(maxlen, 512)

        # Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode one example.

        Returns:
            ``(token_ids, attention_mask, label)`` when the example has a
            ``'label'`` key, otherwise ``(token_ids, attention_mask)``.
            Both tensors are 1-D ``long`` tensors of length ``maxlen``.
        """
        sample = self.data[index]
        # Detect labelled examples by key rather than by field count, so extra
        # fields on an example do not silently drop (or wrongly require) a label.
        has_label = 'label' in sample

        # [CLS] source [SEP] reply [SEP]
        tokens = (
            ["[CLS]"]
            + self.tokenizer.tokenize(sample['text_source'])
            + ["[SEP]"]
            + self.tokenizer.tokenize(sample['text_reply'])
            + ["[SEP]"]
        )

        pad_count = self.maxlen - len(tokens)
        if pad_count > 0:
            # Too short: right-pad up to maxlen.
            tokens = tokens + ['[PAD]'] * pad_count
        else:
            # Too long (or exact fit): cut and make sure the sequence still ends in [SEP].
            pad_count = 0
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        # Indices of the tokens in the BERT vocabulary, as a tensor.
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # 1 for real tokens, 0 for padding. Built from the known pad count
        # instead of assuming that the [PAD] token has vocabulary id 0.
        attn_mask = torch.ones(len(tokens), dtype=torch.long)
        if pad_count:
            attn_mask[-pad_count:] = 0

        if has_label:
            return tokens_ids_tensor, attn_mask, sample['label']
        return tokens_ids_tensor, attn_mask

In [None]:
# Tunable preprocessing / batching settings for the train and dev splits.
# Any encoded example longer than MAX_LEN tokens is truncated to MAX_LEN.
MAX_LEN = 500
BATCH_SIZE = 8
NUM_WORKERS = 2

# Creating instances of the training and development sets
train_set = RumourDataset(data=train_input, maxlen=MAX_LEN)
dev_set = RumourDataset(data=dev_input, maxlen=MAX_LEN)

# Creating instances of the training and development dataloaders.
# Only the training data is shuffled; the development loader keeps a fixed
# order so that evaluation batches are the same on every run.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True)
dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class RumorClassifier(nn.Module):
    """Pretrained BERT encoder topped with a one-logit linear classification head."""

    def __init__(self):
        super().__init__()
        # Encoder: pretrained 'bert-base-uncased' weights.
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        # Head: maps the 768-dimensional [CLS] embedding to a single logit,
        # since the task is binary classification.
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        """Compute one raw logit per input sequence.

        Args:
            seq: tensor of shape [B, T] holding token ids.
            attn_masks: tensor of shape [B, T]; passed to BERT as the attention
                mask so that PAD positions do not contribute.

        Returns:
            The output of the linear head applied to the first-position
            ([CLS]) hidden state of every sequence.
        """
        encoded = self.bert_layer(seq, attention_mask=attn_masks)
        # Position 0 of the last hidden state is the [CLS] representation.
        cls_embedding = encoded.last_hidden_state[:, 0]
        return self.cls_layer(cls_embedding)

In [None]:
gpu = 0  # index of the CUDA device used for the model and for every batch

print("Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...")
# Module.cuda() moves the parameters to the GPU and returns the module itself,
# so construction and device placement can be chained.
model = RumorClassifier().cuda(gpu)
print("Done creating the rumour classifier.")

Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the rumour classifier.


In [None]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy that takes raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
# Adam over every parameter of the model, i.e. both the BERT encoder and the linear head.
optimizer = optim.Adam(params=model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune ``net`` and checkpoint it whenever development accuracy improves.

    Args:
        net: model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        opti: optimizer stepping the parameters of ``net``.
        train_loader: iterable of ``(seq, attn_masks, labels)`` training batches.
        dev_loader: iterable of development batches, forwarded to ``evaluate``.
        max_eps: number of epochs to run.
        gpu: CUDA device index that every batch is moved to.

    Side effects:
        Prints loss/accuracy of the current batch every 100 iterations and a
        development summary after every epoch. Saves ``net.state_dict()`` to
        ``sstcls_<epoch>.dat`` each time development accuracy beats the best
        value seen so far.
    """
    best_acc = 0
    for ep in range(max_eps):
        # Restart the timer every epoch so the evaluation pass of the previous
        # epoch is not reported as training time of iteration 0.
        st = time.time()

        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            # Clear gradients
            opti.zero_grad()
            # Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            # Obtaining the logits from the model
            logits = net(seq, attn_masks)

            # Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            # Backpropagating the gradients
            loss.backward()

            # Optimization step
            opti.step()

            if it % 100 == 0:
                # Accuracy of this single batch only, as a plain Python float.
                acc = float(get_accuracy_from_logits(logits, labels))
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        # Keep the running best as a Python float rather than a device tensor.
        dev_acc = float(dev_acc)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            print("-----------------------------------------------------------------")
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))


In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of examples whose thresholded prediction equals the label.

    Args:
        logits: raw (pre-sigmoid) model outputs, one value per example, e.g.
            shape [B, 1] or [B].
        labels: 0/1 targets, one value per example, e.g. shape [B] or [B, 1].

    Returns:
        A 0-dim float tensor with the batch accuracy.
    """
    # Flatten both sides before comparing: with mismatched shapes such as
    # [B] vs [B, 1] an element-wise == would broadcast to [B, B] and silently
    # produce a wrong accuracy.
    preds = (torch.sigmoid(logits) > 0.5).long().reshape(-1)
    return (preds == labels.reshape(-1)).float().mean()

def evaluate(net, criterion, dataloader, gpu):
    """Compute accuracy and loss of ``net`` over every example in ``dataloader``.

    Args:
        net: model called as ``net(seq, attn_masks)``; it is switched to eval
            mode and left in that mode.
        criterion: loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        dataloader: iterable of ``(seq, attn_masks, labels)`` batches.
        gpu: CUDA device index that every batch is moved to.

    Returns:
        ``(accuracy, loss)`` as Python floats, averaged per example.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no examples.
    """
    net.eval()

    total_correct, total_loss = 0.0, 0.0
    n_examples = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)

            # Weight each batch by its size so a short final batch does not
            # count as much as a full one. This assumes `criterion` returns
            # the mean over the batch, which is the default reduction of the
            # BCEWithLogitsLoss used in this notebook.
            batch_size = labels.size(0)
            total_loss += criterion(logits.squeeze(-1), labels.float()).item() * batch_size
            total_correct += float(get_accuracy_from_logits(logits, labels)) * batch_size
            n_examples += batch_size

    return total_correct / n_examples, total_loss / n_examples

In [None]:
num_epoch = 4

# Fine-tune the classifier; arguments are passed by name to make the wiring explicit.
train(
    net=model,
    criterion=criterion,
    opti=optimizer,
    train_loader=train_loader,
    dev_loader=dev_loader,
    max_eps=num_epoch,
    gpu=gpu,
)

Iteration 0 of epoch 0 complete. Loss: 0.7939672470092773; Accuracy: 0.25; Time taken (s): 1.114790916442871


RuntimeError: ignored

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it was produced by a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("/content/drive/MyDrive/NLP/data/test_input.pickle", "rb") as test_file:
    test_input = pickle.load(test_file)

In [None]:
# Wrap the test examples with the same maximum length (500) and batch size (8)
# as the train/dev splits; no shuffling, so predictions keep the input order.
test_set = RumourDataset(test_input, 500)
test_loader = DataLoader(dataset=test_set, batch_size=8, num_workers=2)

In [None]:
def predict(net, test_loader, model_file=None, device=None):
    """Return 0/1 predictions for every example yielded by ``test_loader``.

    Args:
        net: model called as ``net(seq, attn_masks)`` and returning one logit
            per example.
        test_loader: iterable of ``(seq, attn_masks)`` batches.
        model_file: optional path to a saved ``state_dict``; when given, the
            weights are loaded into ``net`` before predicting.
        device: device the batches are moved to. Defaults to the device of
            ``net``'s parameters (CPU if the model has no parameters), so the
            function no longer depends on a global ``gpu`` variable.

    Returns:
        A flat list of Python ints (0 or 1), in loader order.
    """
    if device is None:
        first_param = next(net.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    # Load weights, remapping them onto the target device.
    if model_file is not None:
        net.load_state_dict(torch.load(model_file, map_location=device))

    # Inference mode: without this, dropout stays active whenever the model
    # was last left in training mode (e.g. after an interrupted training run).
    net.eval()

    predictions = []

    with torch.no_grad():
        for seq, attn_masks in test_loader:
            seq, attn_masks = seq.to(device), attn_masks.to(device)
            logits = net(seq, attn_masks)
            batch_preds = (torch.sigmoid(logits) > 0.5).long()
            # Flatten explicitly: squeeze() would turn a single-example batch
            # into a 0-d array, which cannot be passed to list.extend.
            predictions.extend(batch_preds.reshape(-1).cpu().tolist())

    return predictions

In [None]:
# train() writes 'sstcls_<epoch>.dat' only for epochs that improved development
# accuracy, so this checkpoint exists only if epoch 1 was such an epoch.
model_file = "sstcls_1.dat"
prediction = predict(net=model, test_loader=test_loader, model_file=model_file)
len(prediction)

In [None]:
# Build the submission table: the positional index becomes the 'Id' column and
# the model outputs become the 'Predicted' column.
pred = (
    pd.DataFrame(data=prediction, columns=['Predicted'])
    .reset_index()
    .rename(columns={'index': 'Id'})
)
print(pred)

pred.to_csv('test_prediction.csv', index=False)

      Id  Predicted
0      0          0
1      1          0
2      2          0
3      3          0
4      4          0
..   ...        ...
553  553          0
554  554          0
555  555          1
556  556          0
557  557          0

[558 rows x 2 columns]


In [None]:
# Print the current GPU status (memory in use, utilisation) via the nvidia-smi
# binary at /opt/bin — handy for checking how much GPU memory the run occupies.
!/opt/bin/nvidia-smi

Sun May  1 16:13:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    30W /  70W |  12072MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces