In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import json
import csv
import re
import torch
import pandas as pd
import time
import operator
import math
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import AdamW
import torch.nn as nn
from transformers import BertModel, BertTokenizer, BertPreTrainedModel, AdamW
from sklearn import preprocessing
from sklearn.utils.class_weight import compute_class_weight

Task 1

In [None]:
DEV_FILE = '/kaggle/input/preprocessed-data/dev.csv'
TRAIN_FILE = '/kaggle/input/preprocessed-data/train.csv'
TEST_FILE = '/kaggle/input/preprocessed-data/test.csv'


def _oversample_rumours(events):
    """Return a copy of ``events`` with its 'rumour' rows appended several more times.

    The number of extra copies is floor(count of most frequent label / count of
    second most frequent label), taken from ``events['Label'].value_counts()``.

    Args:
        events: DataFrame with a 'Label' column holding 'rumour' / 'nonrumour' strings.

    Returns:
        A new DataFrame (fresh 0..n-1 index): the original rows followed by the
        repeated rumour rows. If fewer than two distinct labels are present the
        frame is returned as an unchanged copy, because no ratio can be computed.
    """
    label_counts = events['Label'].value_counts()
    if len(label_counts) < 2:
        return events.copy()

    # value_counts() sorts by frequency; .iloc makes the positional access explicit
    # (integer keys on a string-labelled Series rely on a deprecated fallback).
    copies = math.floor(label_counts.iloc[0] / label_counts.iloc[1])

    # NOTE(review): this assumes 'rumour' is the less frequent label - confirm against the data.
    rumour_rows = events.loc[events['Label'] == 'rumour']

    # a single concat instead of growing the frame inside a loop
    return pd.concat([events] + [rumour_rows] * copies, ignore_index=True, axis=0)


# over-sample the rumour rows of the training set and store the result in the working directory
train_set = _oversample_rumours(pd.read_csv(TRAIN_FILE))
TRAIN_FILE = '/kaggle/working/train.csv'
train_set.to_csv(TRAIN_FILE, index = False)  # write to exactly the path that is read back later

# same treatment for the development set
dev_set = _oversample_rumours(pd.read_csv(DEV_FILE))
DEV_FILE = '/kaggle/working/dev.csv'
dev_set.to_csv(DEV_FILE, index = False)

In [None]:
# Map the string labels to integers: 'nonrumour' -> 0 and 'rumour' -> 1 (LabelEncoder orders classes alphabetically).
label_encoder = preprocessing.LabelEncoder().fit(['nonrumour', 'rumour'])

# Re-read the (over-sampled) training and development files and replace their 'Label' column with the encoded values.
train_events = pd.read_csv(TRAIN_FILE)
train_events = train_events.assign(Label = label_encoder.transform(train_events['Label']))

dev_events = pd.read_csv(DEV_FILE)
dev_events = dev_events.assign(Label = label_encoder.transform(dev_events['Label']))

In [None]:
# compute the class weights for the loss function later on
#class_weights = compute_class_weight(class_weight = 'balanced', classes = list(set(train_events['Label'])), y = train_events['Label'])
#weights = torch.tensor(class_weights, dtype = torch.float)
#weights = weights.to('cuda')

In [None]:
def text_split(text):
    """Keep only the complete '[SEP]'-terminated segments of ``text``.

    The text is split on whitespace; every word up to and including the last
    '[SEP]' token is kept and re-joined with single spaces. Anything after the
    last '[SEP]' is dropped, and a text without any '[SEP]' yields ''.
    The '[CLS]' / '[SEP]' markers themselves are left in place.
    """
    words = text.split()

    # number of leading words to keep: position just past the last '[SEP]', 0 if there is none
    keep = 0
    for position, word in enumerate(words, start=1):
        if word == '[SEP]':
            keep = position

    return ' '.join(words[:keep])

# Work on copies so the encoded frames stay untouched, then trim every event's text with text_split.
split_train_events, split_dev_events = train_events.copy(), dev_events.copy()
for frame in (split_train_events, split_dev_events):
    frame['Event Tweets'] = frame['Event Tweets'].map(text_split)

In [None]:
# Find, for the training and the development set, the largest number of whitespace-separated words
# in any single '[SEP]'-terminated segment (the '[SEP]' token itself is counted).
train_max = 0
curr_count = 0

for event_text in split_train_events['Event Tweets']:
    for token in event_text.split():
        curr_count += 1
        if token != '[SEP]':
            continue
        # end of a segment: record its length and start counting the next one
        train_max = max(train_max, curr_count)
        curr_count = 0

dev_max = 0
curr_count = 0

for event_text in split_dev_events['Event Tweets']:
    for token in event_text.split():
        curr_count += 1
        if token != '[SEP]':
            continue
        dev_max = max(dev_max, curr_count)
        curr_count = 0

# these maxima are later passed to the tokenizer as max_length
print("Training set has a max sentence length of", train_max)
print("Development set has a max sentence length of", dev_max)

In [None]:
# our main BERT classifier model
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward head for two-class classification.

    Head: Linear(768, 512) -> ReLU -> Dropout(0.3) -> Linear(512, 2) -> LogSoftmax.
    The output therefore holds log-probabilities, which is what nn.NLLLoss expects.

    Args:
        bert_base: encoder module that, when called with ``return_dict=False``,
            returns a tuple whose second element is a (batch, 768) tensor
            (the pooled output for a Hugging Face ``BertModel``).
    """

    def __init__(self, bert_base):
        super(BertClassifier, self).__init__()
        self.bert = bert_base
        # `in` is a reserved keyword, so `self.in` is a SyntaxError; the layer is named fc_in instead.
        # NOTE(review): state_dict keys are derived from attribute names - a checkpoint saved under a
        # different name for this layer needs its keys remapped before load_state_dict.
        self.fc_in = nn.Linear(768,512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, input_ids, attention_mask):
        """Return a (batch, 2) tensor of class log-probabilities.

        Args:
            input_ids: token ids passed straight to the encoder.
            attention_mask: mask passed to the encoder as ``attention_mask``.
        """
        # only the second element of the encoder's tuple output is used
        _, inputs = self.bert(input_ids, attention_mask = attention_mask, return_dict = False)

        x = self.fc_in(inputs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.out(x)
        x = self.softmax(x)

        return x

In [None]:
# Load the pretrained encoder, wrap it in the classification head and move the whole model to the GPU.
bert_base = BertModel.from_pretrained('bert-base-uncased')
model = BertClassifier(bert_base).to('cuda')
model

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

# batch_encode_plus converts every event text into a list of vocabulary ids plus an attention mask.
# Each sequence is truncated / padded to max_length; padding = 'max_length' is the current spelling of the
# deprecated pad_to_max_length = True. No special tokens are added by the tokenizer (add_special_tokens = False).
# NOTE(review): train_max / dev_max count whitespace-separated words of one '[SEP]' segment, whereas max_length
# limits the tokenizer's own tokens for the whole text - confirm the resulting truncation is intended.
train_tokens = tokenizer.batch_encode_plus(
    list(split_train_events['Event Tweets']),
    add_special_tokens = False,
    max_length = train_max,
    padding = 'max_length',
    truncation = True
)

dev_tokens = tokenizer.batch_encode_plus(
    list(split_dev_events['Event Tweets']),
    add_special_tokens = False,
    max_length = dev_max,
    padding = 'max_length',
    truncation = True
)

In [None]:
# Batch size used by the loaders below. It is also assigned in the hyper-parameter cell further down, but that
# cell runs after this one, so without this assignment a fresh top-to-bottom run fails here with a NameError.
batch_size = 3

# Convert the lists of ids, attention masks and labels to tensors, bundle them into tensor datasets and
# build the samplers and data loaders on top of them.
train_ids = torch.tensor(train_tokens['input_ids'])
train_mask = torch.tensor(train_tokens['attention_mask'])
train_y = torch.tensor((list(split_train_events['Label'])))
train_data = TensorDataset(train_ids, train_mask, train_y)
train_sampler = RandomSampler(train_data)  # visit training examples in random order
train_loader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

dev_ids = torch.tensor(dev_tokens['input_ids'])
dev_mask = torch.tensor(dev_tokens['attention_mask'])
dev_y = torch.tensor(list(split_dev_events['Label']))
dev_data = TensorDataset(dev_ids, dev_mask, dev_y)
dev_sampler = RandomSampler(dev_data)
dev_loader = DataLoader(dev_data, sampler = dev_sampler, batch_size = batch_size)

In [None]:
def train():
    """Train ``model`` for one pass over ``train_loader``.

    Uses the notebook-level globals ``model``, ``train_loader``, ``criterion``,
    ``opti`` and ``l_lambda``; every batch is moved to the GPU before the forward pass.

    Returns:
        tuple: (loss averaged over the batches, L2 penalty included;
                fraction of training examples predicted correctly).
    """
    model.train()
    loss_sum = 0.0
    correct = 0.0
    seen = 0

    for ids, masks, labels in train_loader:
        ids = ids.to('cuda')
        masks = masks.to('cuda')
        labels = labels.to('cuda')

        # start every batch from cleared gradients
        model.zero_grad()

        preds = model(ids, masks)

        # data loss plus an explicit L2 penalty over every model parameter
        penalty = sum(param.pow(2.0).sum() for param in model.parameters())
        loss = criterion(preds, labels) + l_lambda * penalty

        loss.backward()
        loss_sum += loss.item()

        # cap the gradient norm at 1.0 to guard against exploding gradients, then update the weights
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opti.step()

        # predicted class = index of the largest log-probability
        predicts = preds.max(1)[1]
        seen += labels.size(0)
        correct += predicts.eq(labels).sum().item()

    return loss_sum / len(train_loader), correct / seen

In [None]:
def evaluate():
    """Evaluate ``model`` on ``dev_loader`` without updating any weights.

    Uses the notebook-level globals ``model``, ``dev_loader``, ``criterion`` and
    ``l_lambda``; every batch is moved to the GPU before the forward pass.

    Returns:
        tuple: (loss averaged over the development batches, with the same L2 penalty
                that is added during training; fraction of development examples
                predicted correctly).
    """
    model.eval()
    total_loss = 0.0
    total_acc = 0.0
    total = 0

    with torch.no_grad():
        # the weights do not change during evaluation, so the L2 penalty is computed once
        l2_norm = sum(param.pow(2.0).sum() for param in model.parameters())

        # for each batch
        for ids, masks, labels in dev_loader:
            ids, masks, labels = ids.to('cuda'), masks.to('cuda'), labels.to('cuda')

            preds = model(ids, masks)

            loss = criterion(preds, labels) + l_lambda * l2_norm
            total_loss = total_loss + loss.item()

            _, predicts = preds.max(1)
            total += labels.size(0)
            total_acc += predicts.eq(labels).sum().item()

    # average over the development batches (dividing by len(train_loader) scaled the loss by the wrong count)
    avg_loss = total_loss / len(dev_loader)
    total_acc = total_acc / total

    return avg_loss, total_acc

In [None]:
# Settings for the training and evaluation process.
batch_size = 3
max_epochs = 10
l_lambda = 0.0001                              # weight of the l1/l2 regularisation term
criterion = nn.NLLLoss()                       # negative log-likelihood loss; the model outputs log-probabilities
# NOTE(review): AdamW is imported from both torch.optim and transformers in the import cell; the later
# import (transformers) is the one bound to this name - confirm which optimiser is intended.
opti = AdamW(model.parameters(), lr = 2e-5)    # AdamW optimiser

In [None]:
# Per-epoch loss history and the lowest development loss seen so far.
train_loses, dev_loses = [], []
best_dev_loss = float('inf')

# one training pass followed by one evaluation pass per epoch
for ep in range(max_epochs):
    print("Epoch " + str(ep + 1) + ":")

    train_loss, train_acc = train()
    dev_loss, dev_acc = evaluate()

    # model selection is based on development loss rather than development accuracy
    improved = dev_loss < best_dev_loss
    if improved:
        best_dev_loss = dev_loss
        torch.save(model.state_dict(), 'best_weights.pt') # keep the weights of the best epoch so far

    train_loses.append(train_loss)
    dev_loses.append(dev_loss)

    print("Training Loss:   ", train_loss, "Training Accuracy:   ", train_acc)
    print("Development Loss:", dev_loss, "Development Accuracy:", dev_acc, "\n")

In [None]:
# Prepare the test set the same way as the training and development sets: trim each event's text
# with text_split, then measure the longest '[SEP]'-terminated segment in whitespace-separated words.
test_events = pd.read_csv(TEST_FILE)
split_test_events = test_events.copy()
split_test_events['Event Tweets'] = split_test_events['Event Tweets'].map(text_split)

test_max = 0
curr_count = 0

for event_text in split_test_events['Event Tweets']:
    for token in event_text.split():
        curr_count += 1
        if token != '[SEP]':
            continue
        # end of a segment: record its length and restart the count
        test_max = max(test_max, curr_count)
        curr_count = 0

print("Test set has a max sentence length of", test_max)

In [None]:
# Encode the test texts, truncated / padded to train_max (the same length used for the training set).
# padding = 'max_length' is the current spelling of the deprecated pad_to_max_length = True.
test_tokens = tokenizer.batch_encode_plus(
    list(split_test_events['Event Tweets']),
    add_special_tokens = False,
    max_length = train_max,
    padding = 'max_length',
    truncation = True
)

test_ids = torch.tensor(test_tokens['input_ids'])
test_mask = torch.tensor(test_tokens['attention_mask'])

test_data = TensorDataset(test_ids, test_mask)
# No explicit sampler: the loader then walks the dataset in its stored order. Predictions are later written
# out with the row number as Id, so a shuffling sampler would misalign them if this loader were used.
test_loader = DataLoader(test_data, batch_size = batch_size)
test_sampler = test_loader.sampler  # name kept for any code that still refers to it

In [None]:
weight_file = 'best_weights.pt'
model.load_state_dict(torch.load(weight_file)) # loads the weights saved at the epoch with the lowest development loss

# Make inference mode explicit so dropout is disabled even if this cell is run without a preceding evaluate() call.
model.eval()

with torch.no_grad():
    preds = model(test_ids.to('cuda'), test_mask.to('cuda'))
    preds = preds.detach().cpu().numpy() # move the log-probabilities from the GPU to the CPU as a numpy array
    preds = np.argmax(preds, axis = 1)   # predicted class index per row

In [None]:
# Write one row per prediction: the row number as Id and the predicted class as Predicted.
with open('/kaggle/working/final.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['Id', 'Predicted'])
    writer.writerows([row_id, is_rumour] for row_id, is_rumour in enumerate(preds))

Task 2

In [3]:
# To process the COVID data there is no need to run any of the cells in the 'Task 1' section; running the cells in this section is enough.
# Some of the cells below repeat cells from above because they are needed in this section as well.
# If the 'Task 1' cells have already been run, skip the cells labelled "SKIP".
# SKIP
def text_split(text):
    """Keep only the complete '[SEP]'-terminated segments of ``text``.

    The text is split on whitespace; every word up to and including the last
    '[SEP]' token is kept and re-joined with single spaces. Anything after the
    last '[SEP]' is dropped, and a text without any '[SEP]' yields ''.
    The '[CLS]' / '[SEP]' markers themselves are left in place.
    """
    words = text.split()

    # number of leading words to keep: position just past the last '[SEP]', 0 if there is none
    keep = 0
    for position, word in enumerate(words, start=1):
        if word == '[SEP]':
            keep = position

    return ' '.join(words[:keep])

In [4]:
# SKIP
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward head for two-class classification.

    Head: Linear(768, 512) -> ReLU -> Dropout(0.3) -> Linear(512, 2) -> LogSoftmax.
    The output therefore holds log-probabilities.

    Args:
        bert_base: encoder module that, when called with ``return_dict=False``,
            returns a tuple whose second element is a (batch, 768) tensor
            (the pooled output for a Hugging Face ``BertModel``).
    """

    def __init__(self, bert_base):
        super(BertClassifier, self).__init__()
        self.bert = bert_base
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        # `in` is a reserved keyword, so `self.in` is a SyntaxError; the layer is named fc_in instead.
        # NOTE(review): state_dict keys are derived from attribute names - a checkpoint saved under a
        # different name for this layer needs its keys remapped before load_state_dict.
        self.fc_in = nn.Linear(768,512)
        self.out = nn.Linear(512, 2)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, input_ids, attention_mask):
        """Return a (batch, 2) tensor of class log-probabilities.

        Args:
            input_ids: token ids passed straight to the encoder.
            attention_mask: mask passed to the encoder as ``attention_mask``.
        """
        # only the second element of the encoder's tuple output is used
        _, inputs = self.bert(input_ids, attention_mask = attention_mask, return_dict = False)

        x = self.fc_in(inputs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.out(x)
        x = self.softmax(x)

        return x

In [5]:
# SKIP
# Recreate the tokenizer and the classifier (pretrained encoder + head) and move the model to the GPU.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
bert_base = BertModel.from_pretrained('bert-base-uncased')
model = BertClassifier(bert_base).to('cuda')
model

In [6]:
# The COVID dataset was split into two files (covid_1.csv and covid_2.csv) so that each half can be
# processed within the 16GB memory limit. Switch the active line below to process the other half.
#COVID_FILE = '/kaggle/input/preprocessed-data/covid_1.csv'
COVID_FILE = '/kaggle/input/preprocessed-data/covid_2.csv'

# load the events and trim every event's text with text_split (working on a copy of the raw frame)
covid_events = pd.read_csv(COVID_FILE)
split_covid_events = covid_events.copy()
split_covid_events['Event Tweets'] = split_covid_events['Event Tweets'].map(text_split)

# Both values are set by hand to stay within memory:
#   batch_size - has to be 1, as 3 causes memory issues due to the size of the COVID data
#   covid_max  - padding length; the actual max length in the COVID dataset is 67
# The per-'[SEP]' word-count scan used for the other datasets is therefore not run here.
batch_size = 1
covid_max = 30

In [7]:
# Encode the COVID texts, truncated / padded to covid_max tokens.
# padding = 'max_length' is the current spelling of the deprecated pad_to_max_length = True.
covid_tokens = tokenizer.batch_encode_plus(
    list(split_covid_events['Event Tweets']),
    add_special_tokens = False,
    max_length = covid_max,
    padding = 'max_length',
    truncation = True
)

covid_ids = torch.tensor(covid_tokens['input_ids'])
covid_mask = torch.tensor(covid_tokens['attention_mask'])

covid_data = TensorDataset(covid_ids, covid_mask)
# No explicit sampler: the loader then walks the dataset in its stored order. Predictions are later written
# out with the row number as Id, so a shuffling sampler would misalign them if this loader were used.
covid_loader = DataLoader(covid_data, batch_size = batch_size)
covid_sampler = covid_loader.sampler  # name kept for any code that still refers to it

In [8]:
# Use 'best_weights.pt' (the commented-out line below) if the 'Task 1' cells were run in this session;
# otherwise load the previously saved weights of our best model from the input dataset.
#weight_file = f'best_weights.pt'
weight_file = '/kaggle/input/preprocessed-data/best_weights.pt'
# copies the saved parameters into `model`; the checkpoint is passed straight through without keeping a reference to it
model.load_state_dict(torch.load(weight_file))

In [9]:
# A freshly constructed nn.Module is in training mode, so without this call the head's Dropout(0.3)
# would stay active and make the predictions random.
model.eval()

# Run the model on slices of batch_size rows instead of the whole dataset at once, to bound GPU memory.
# Slices are taken in dataset order so that preds[i] still corresponds to row i.
covid_pred_chunks = []
with torch.no_grad():
    for start in range(0, len(covid_ids), batch_size):
        stop = start + batch_size
        log_probs = model(covid_ids[start:stop].to('cuda'), covid_mask[start:stop].to('cuda'))
        # move the log-probabilities to the CPU and keep only the predicted class index per row
        covid_pred_chunks.append(np.argmax(log_probs.detach().cpu().numpy(), axis = 1))

# np.concatenate rejects an empty list, so an empty dataset yields an empty prediction array
preds = np.concatenate(covid_pred_chunks) if covid_pred_chunks else np.empty(0, dtype = np.int64)

In [11]:
# Write one row per prediction: the row number as Id and the predicted class as Predicted.
# Switch the active `with` line to match the COVID half being processed.
#with open('/kaggle/working/covid_preds_1.csv', 'w', newline='') as out_file:
with open('/kaggle/working/covid_preds_2.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(['Id', 'Predicted'])
    writer.writerows([row_id, is_rumour] for row_id, is_rumour in enumerate(preds))