In [None]:
!pip install torch torchvision transformers



In [None]:
import json
from pprint import pprint
import pandas as pd
import re
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class DataProcessing():
  """Load a tweet JSON dump plus its label file and build a modelling DataFrame.

  The JSON file is expected to hold a top-level object whose values each
  contain a ``'data'`` list of tweet dicts and an ``'includes'`` dict with a
  ``'users'`` list. The label file holds one label per line; the label on
  (zero-based, blank lines skipped) line ``i`` is matched to the JSON entry
  stored under the key ``str(i)``.
  """

  # URL matcher assembled from adjacent raw-string pieces. A single multi-line
  # literal would embed the line breaks and indentation into the pattern
  # (re.VERBOSE is not used), which silently changes what it matches.
  _URL_PATTERN = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
    r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'
    r'(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
    r'''[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
  )

  def __init__(self, data_file, label_file):
    """Read the tweet JSON into ``self.data`` and the labels into ``self.df_label``.

    Args:
      data_file: path to the UTF-8 JSON file with the tweet data.
      label_file: path to a text file with one label per line.
    """
    with open(data_file, 'r', encoding='utf-8') as f:
      self.data = json.load(f)

    # Read the labels line by line instead of pd.read_csv(delimiter='\n'):
    # recent pandas versions refuse '\n' as a delimiter. Blank lines are
    # skipped so the positional index still lines up with the JSON keys.
    with open(label_file, 'r', encoding='utf-8') as f:
      labels = [line.strip() for line in f if line.strip()]
    # Single column named 0, matching what read_csv(header=None) produced.
    self.df_label = pd.DataFrame({0: labels})

  def filter_keys(self):
    """Drop unwanted fields from every tweet/user and flatten ``includes``.

    Mutates ``self.data`` in place: each entry keeps its ``'data'`` list, gains
    a top-level ``'users'`` list (moved out of ``'includes'``) and loses the
    ``'includes'`` and ``'errors'`` keys.

    Returns:
      The mutated ``self.data`` dict.
    """
    # choose what keys do we want
    # wanted_tweet_keys = ("id","text","lang","created_at","author_id","public_metrics")
    # wanted_user_keys = ("author_id","user_name","description","verified","public_metrics","created_at")
    wanted_tweet_keys = ("id","text","author_id","lang")
    wanted_user_keys = ("author_id","verified","public_metrics","created_at")

    for entry in self.data.values():
      for tweet in entry['data']:
        # materialise the key list first: the dict is modified while filtering
        for k in [key for key in tweet if key not in wanted_tweet_keys]:
          tweet.pop(k, None)

      for user in entry['includes']['users']:
        for k in [key for key in user if key not in wanted_user_keys]:
          user.pop(k, None)

      # remove outer list 'includes' and 'errors' data
      entry['users'] = entry.pop('includes')['users']
      # pop with a default: 'errors' is optional and must go regardless of how
      # many other top-level keys the entry happens to carry
      entry.pop('errors', None)

    return self.data

  def replace_urls(self, text):
    """Return ``text`` with every URL-looking substring replaced by ``{url}``."""
    return self._URL_PATTERN.sub("{url}", text)

  @staticmethod
  def _load_stop_words():
    """Return the NLTK English stop words as a set, fetching the corpus if absent."""
    try:
      return set(stopwords.words('english'))
    except LookupError:
      # corpus not installed yet (e.g. fresh kernel): download once and retry
      nltk.download('stopwords', quiet=True)
      return set(stopwords.words('english'))

  def _clean_tweet_text(self, text, stop_words):
    """Normalise one tweet text; ``stop_words`` is a collection of tokens to drop."""
    text = text.lower() # lowercase the text
    text = " ".join(text.split()) # remove duplicate spaces or newline
    text = self.replace_urls(text) # replace url
    text = re.sub(r'@[^ ]+', '@mention', text) # replace mention
    # text = re.sub(r'#', '', text) # remove hashtag
    text = re.sub(r'([a-z])\1{2,}', r'\1', text) # collapse 3+ repeats of a letter to one
    text = re.sub(r'[^a-z@{}# ]', '', text) # remove non-letter characters except for @, {}, #
    # remove stop words token by token (text is a plain str here, so split on
    # whitespace and re-join rather than using a pandas-style .apply)
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text.strip()

  def clean_text(self):
    """Clean the ``'text'`` field of every tweet in ``self.data`` in place.

    Returns:
      The mutated ``self.data`` dict.
    """
    # build the stop-word set once instead of once per tweet
    stop_words = self._load_stop_words()
    for entry in self.data.values():
      for tweet in entry['data']:
        tweet['text'] = self._clean_tweet_text(tweet['text'], stop_words)

    return self.data

  def preprocessing(self):
    """Run key filtering followed by text cleaning and return ``self.data``."""
    self.data = self.filter_keys()
    self.data = self.clean_text()

    return self.data

  def prepare_dataset(self):
    """Preprocess the data and combine it with the labels into one DataFrame.

    Returns:
      DataFrame with columns ``index`` (original label position),
      ``rumour_label`` (1 for the label ``'rumour'``, 0 otherwise),
      ``source_data`` (first tweet dict of each entry) and ``combined_text``
      (list of the cleaned texts of all tweets of the entry).
    """
    data = self.preprocessing()
    df_data = pd.DataFrame.from_dict(data, orient='columns').T
    # keep accessible tweets only: drop labels whose position has no JSON entry
    missing = [i for i in self.df_label.index if str(i) not in data]
    self.df_label = self.df_label.drop(missing)
    # set label of rumour as 1 and non-rumour as 0
    self.df_label['rumour_label'] = self.df_label[0].apply(lambda x: 1 if x == 'rumour' else 0)

    df = pd.DataFrame()
    df['rumour_label'] = self.df_label['rumour_label']
    # set_axis re-labels the per-entry series with the label index; this relies
    # on the JSON entries appearing in the same order as the remaining labels
    df['source_data'] = df_data['data'].map(lambda x: x[0]).set_axis(df.index)
    # df['combined_text'] = df_data['data'].map(lambda x: ' - '.join([n['text'] for n in x])).set_axis(df.index)
    df['combined_text'] = df_data['data'].map(lambda x: [n['text'] for n in x]).set_axis(df.index)
    df = df.reset_index()

    return df

In [None]:
# train_data = DataProcessing('train_data_sample.json').preprocessing()
# dev_data = DataProcessing('dev_data_sample.json').preprocessing()

In [None]:
# Build the training and development DataFrames: each call loads the tweet
# JSON and its label file, preprocesses the tweets and returns one row per entry.
df_train = DataProcessing('train_data.json', 'train.label.txt').prepare_dataset()
df_dev = DataProcessing('dev_data.json', 'dev.label.txt').prepare_dataset()

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class SSTDataset(Dataset):
    """Torch dataset yielding BERT-ready tensors for the source tweet of each row.

    Expects a DataFrame with a ``source_data`` column holding dicts that have a
    ``'text'`` key, and a ``rumour_label`` column holding the target label.
    """

    def __init__(self, df, maxlen):
        """
        Args:
            df: DataFrame with ``source_data`` and ``rumour_label`` columns.
            maxlen: fixed length (in tokens, including [CLS]/[SEP]) every
                sequence is padded or truncated to.
        """
        #Store the contents of the file in a pandas dataframe
        self.df = df

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

        self.maxlen = maxlen

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the row at position ``index``.

        ``token_ids`` and ``attention_mask`` are 1-D long tensors of length
        ``maxlen``; the mask is 1 for real tokens and 0 for padding.
        """
        #Positional lookup: samplers hand out positions 0..len-1, which only
        #coincide with index labels when the frame has a default RangeIndex
        sentence = self.df['source_data'].iloc[index]['text']
        label = self.df['rumour_label'].iloc[index]

        #Preprocessing the text to be suitable for BERT
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']
        if len(tokens) < self.maxlen:
            n_real = len(tokens)
            tokens = tokens + ['[PAD]'] * (self.maxlen - n_real) #Padding sentences
        else:
            #Pruning to maxlen while keeping a closing [SEP]
            tokens = tokens[:self.maxlen-1] + ['[SEP]']
            n_real = len(tokens)

        #Indices of the tokens in the BERT vocabulary, as a tensor
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        #Attention mask derived from the number of real tokens, so it does not
        #depend on which numeric id the tokenizer assigns to [PAD]
        attn_mask = torch.tensor([1] * n_real + [0] * (len(tokens) - n_real), dtype=torch.long)

        return tokens_ids_tensor, attn_mask, label

In [None]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
train_set = SSTDataset(df = df_train, maxlen = 150)
dev_set = SSTDataset(df = df_dev, maxlen = 150)

#Creating instances of training and development dataloaders
#Only the training data is shuffled. The development metrics are averaged
#per batch, so a fixed batch order keeps the reported dev scores reproducible.
train_loader = DataLoader(train_set, batch_size = 32, num_workers = 2, shuffle=True)
dev_loader = DataLoader(dev_set, batch_size = 32, num_workers = 2, shuffle=False)

print("Done preprocessing training and development data.")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Done preprocessing training and development data.


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder topped with a single-output linear layer."""

    def __init__(self):
        super().__init__()
        # Encoder initialised from the pretrained 'bert-base-uncased' checkpoint
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        # Binary classification head: 768-dimensional [CLS] embedding in,
        # one logit out
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        """Compute one logit per input sequence.

        Args:
            seq: tensor of shape [B, T] holding the token ids of the sequences.
            attn_masks: tensor of shape [B, T] with the attention masks that
                keep PAD tokens from contributing.

        Returns:
            The output of the linear head applied to the representation of the
            first token ([CLS]) of every sequence.
        """
        # Contextualised representations from the encoder
        encoded = self.bert_layer(seq, attention_mask=attn_masks)

        # Representation at position 0 of every sequence, i.e. the [CLS] token
        first_token_rep = encoded.last_hidden_state[:, 0]

        return self.cls_layer(first_token_rep)

In [None]:
# Index of the CUDA device used for the model and, later, for every batch.
gpu = 0 #gpu ID

# Instantiate the classifier (loads the pretrained BERT weights) and move all
# of its parameters to the selected GPU; a CUDA device is required from here on.
print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
net = SentimentClassifier()
net.cuda(gpu)  #Enable gpu support for the model
print("Done creating the sentiment classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


In [None]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy computed directly on raw logits (the sigmoid is applied
# inside the loss), matching the single-logit output of the classifier.
criterion = nn.BCEWithLogitsLoss()
# Adam over all parameters of the network (encoder and classification head).
opti = optim.Adam(net.parameters(), lr = 2e-5, eps=1e-8)

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune ``net`` and checkpoint it whenever the development F1 improves.

    Args:
        net: model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        opti: optimizer stepping the parameters of ``net``.
        train_loader: iterable of ``(seq, attn_masks, labels)`` training batches.
        dev_loader: batches handed to ``evaluate`` after every epoch.
        max_eps: number of epochs to run.
        gpu: CUDA device index every batch is moved to.

    Side effects:
        Prints progress every 100 iterations and after each epoch, and writes
        ``net.state_dict()`` to ``sstcls_<epoch>.dat`` when the development
        F1-score beats the best value seen so far.
    """
    # best_acc is initialised for symmetry but only best_f1 drives checkpointing
    best_acc = best_f1 = 0
    tick = time.time()

    for epoch in range(max_eps):
        # evaluate() leaves the model in eval mode, so re-enable training mode
        net.train()

        for step, batch in enumerate(train_loader):
            # Reset gradients accumulated by the previous step
            opti.zero_grad()

            # Move the three batch tensors to the GPU
            seq, attn_masks, labels = (tensor.cuda(gpu) for tensor in batch)

            # Forward pass, loss, backward pass, parameter update
            logits = net(seq, attn_masks)
            loss = criterion(logits.squeeze(-1), labels.float())
            loss.backward()
            opti.step()

            # Report on the current batch every 100 iterations only
            if step % 100 != 0:
                continue

            acc, f1 = get_accuracy_from_logits(logits, labels)
            print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; F1-score: {}; Time taken (s): {}".format(step, epoch, loss.item(), acc, f1, (time.time()-tick)))
            # Restart the timer so the next report covers the interval since this one
            tick = time.time()

        # End of epoch: score the model on the development set
        dev_acc, dev_loss, dev_f1 = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development F1-score: {}; Development Loss: {}".format(ep := epoch, dev_acc, dev_f1, dev_loss) if False else "Epoch {} complete! Development Accuracy: {}; Development F1-score: {}; Development Loss: {}".format(epoch, dev_acc, dev_f1, dev_loss))

        if dev_f1 > best_f1:
            print("Best development F1-score improved from {} to {}".format(best_f1, dev_f1))
            print("With development accuracy {}, saving model...".format(dev_acc))
            best_f1 = dev_f1
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(epoch))

In [None]:
from sklearn.metrics import f1_score

def get_accuracy_from_logits(logits, labels):
    """Compute accuracy and F1-score of thresholded logits against binary labels.

    Args:
        logits: tensor of raw model outputs, one per example (e.g. shape
            [B, 1] or [B]).
        labels: tensor of 0/1 targets with one entry per example.

    Returns:
        ``(acc, f1)`` where ``acc`` is a 0-d float tensor with the fraction of
        correct predictions and ``f1`` is the value returned by ``f1_score``.
    """
    probs = torch.sigmoid(logits)
    # Flatten explicitly to 1-D: a bare squeeze() would turn a batch of one
    # example into a 0-d tensor, which f1_score cannot handle
    preds = (probs > 0.5).long().reshape(-1)
    targets = labels.reshape(-1)

    acc = (preds == targets).float().mean()

    # Plain Python lists are accepted by f1_score and are always 1-D here
    f1 = f1_score(targets.detach().cpu().tolist(), preds.cpu().tolist())

    return acc, f1

def evaluate(net, criterion, dataloader, gpu):
    """Score ``net`` on ``dataloader`` without tracking gradients.

    Puts the model in eval mode (it is not switched back here) and, for every
    batch, computes the loss, accuracy and F1-score.

    Args:
        net: model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: loss applied to ``(logits.squeeze(-1), labels.float())``.
        dataloader: iterable of ``(seq, attn_masks, labels)`` batches.
        gpu: CUDA device index every batch is moved to.

    Returns:
        ``(mean_acc, mean_loss, mean_f1)``: each is the unweighted mean of the
        per-batch values, not a metric computed over the pooled predictions.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    net.eval()

    mean_acc, mean_loss, f1 = 0, 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            # One call yields both metrics; calling it per metric would repeat
            # the thresholding, the device-to-host copy and the F1 computation
            batch_acc, batch_f1 = get_accuracy_from_logits(logits, labels)
            mean_acc += batch_acc
            f1 += batch_f1
            count += 1

    return mean_acc / count, mean_loss / count, f1 / count

In [None]:
# Number of full passes over the training data.
num_epoch = 2

#fine-tune the model
# Runs the training loop; a checkpoint file is written after each epoch whose
# development F1-score improves on the best one so far.
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.6445541381835938; Accuracy: 0.75; F1-score: 0.0; Time taken (s): 2.1779251098632812
Epoch 0 complete! Development Accuracy: 0.8102205991744995; Development F1-score: 0.20372046254399193; Development Loss: 0.32829341204727397
Best development F1-score improved from 0 to 0.20372046254399193, saving model...
With development accuracy 0.8102205991744995
Iteration 0 of epoch 1 complete. Loss: 0.20121634006500244; Accuracy: 0.9375; F1-score: 0.8; Time taken (s): 87.30074381828308
Epoch 1 complete! Development Accuracy: 0.9304412007331848; Development F1-score: 0.8314167310065738; Development Loss: 0.17545407221597784
Best development F1-score improved from 0.20372046254399193 to 0.8314167310065738, saving model...
With development accuracy 0.9304412007331848
