In [None]:
!pip install torch torchvision transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 9.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.50.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 54.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 3.7 MB/s 
Collecting click==8.0
  Downloading click-8.0.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 5.3 MB/s 
Bu

In [None]:
# Mount Google Drive into the Colab filesystem; the data paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import nltk
nltk.download("stopwords")
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from transformers import AdamW

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from LoadData import LoadDataAndProcessing, LoadTestDataAndProcessing
from RumourDataSet import RumourDataset
from torch.utils.data import DataLoader

In [None]:
# Locations of the train/dev data files and their label files on the mounted Drive.
train_file = "/content/drive/MyDrive/NLP/data/train_data_all.json"
train_label_file = "/content/drive/MyDrive/NLP/data/train.label.txt"
dev_file = "/content/drive/MyDrive/NLP/data/dev_data_all.json"
dev_label_file =  "/content/drive/MyDrive/NLP/data/dev.label.txt"

# One loader object per split, each built from a (data file, label file) pair.
# NOTE(review): LoadDataAndProcessing comes from the local LoadData module, which is not
# shown in this notebook — what the constructor reads or parses cannot be confirmed here.
load_twitter_train_data = LoadDataAndProcessing(train_file, train_label_file)
load_twitter_dev_data = LoadDataAndProcessing(dev_file, dev_label_file)

In [None]:
# Produce the per-split inputs that are handed to RumourDataset further down.
# NOTE(review): prepareDataset() is defined in the local LoadData module; the structure of
# its return value is not visible from this notebook.
train_input = load_twitter_train_data.prepareDataset()
dev_input = load_twitter_dev_data.prepareDataset()

In [None]:
# Parameters setting

BATCH_SIZE = 8              # samples per mini-batch for every DataLoader
NUM_WORKERS = 2             # DataLoader worker processes
MAX_LEN = 512               # maximum sequence length passed to RumourDataset
num_labels = 2              # number of output classes of the classifier head
hidden_dropout_prob = 0.3   # dropout probability written into the BERT config
learning_rate = 2e-5
weight_decay = 1e-2

# Seed torch's RNG so shuffling, dropout and the freshly initialised classifier head
# are repeatable from run to run (this does not promise bit-exact GPU determinism).
SEED = 42
torch.manual_seed(SEED)

gpu = 0 # gpu ID
# Honour the configured GPU id instead of a hard-coded "cuda:0"; fall back to CPU
# when CUDA is not available.
device = torch.device("cuda:{}".format(gpu) if torch.cuda.is_available() else "cpu")

In [None]:
# Create the training and development datasets.
# maxlen sets the maximum length a sentence can have;
# any sentence longer than this length is truncated to the maxlen size.
train_set = RumourDataset(data=train_input, maxlen=MAX_LEN)
dev_set = RumourDataset(data=dev_input, maxlen=MAX_LEN)

# Create the training and development dataloaders.
# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE,
                          collate_fn=train_set.create_mini_batch,
                          num_workers=NUM_WORKERS, shuffle=True)
# Evaluation only aggregates loss and accuracy over the whole split, so shuffling
# buys nothing; a fixed order keeps per-batch behaviour repeatable between epochs.
dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE,
                        collate_fn=dev_set.create_mini_batch,
                        num_workers=NUM_WORKERS, shuffle=False)

print("Done preprocessing training and development data.")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Done preprocessing training and development data.


In [None]:
# Build the sequence classifier on top of the pretrained "bert-base-uncased" checkpoint.
print("Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...")

# The config carries the notebook-level settings for the number of output classes
# and the hidden dropout probability.
config = BertConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    hidden_dropout_prob=hidden_dropout_prob,
)

# Module.to() moves the parameters in place and returns the module itself,
# so loading and device placement can be chained.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", config=config
).to(device)

print("Done creating the rumour classifier.")

Creating the rumour classifier, initialised with pretrained BERT-BASE parameters...


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Done creating the rumour classifier.


In [None]:
# Build the AdamW optimizer with two parameter groups (no LR scheduler is created here):
#   * parameters whose name contains none of the `no_decay` markers get `weight_decay`;
#   * parameters whose name contains 'bias' or 'LayerNorm.weight' get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [param for name, param in model.named_parameters()
                   if all(marker not in name for marker in no_decay)],
        'weight_decay': weight_decay,
    },
    {
        'params': [param for name, param in model.named_parameters()
                   if any(marker in name for marker in no_decay)],
        'weight_decay': 0.0,
    },
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Loss applied to the classifier logits inside train().
criterion = nn.CrossEntropyLoss()

In [None]:
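# Disabled alternative, kept for reference only (nothing in this cell is executed):
# a BCE-with-logits loss and a plain Adam optimizer. The active setup is the
# torch.optim.AdamW optimizer and nn.CrossEntropyLoss criterion defined in the cell above.
# import torch.nn as nn
# import torch.optim as optim

# criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr = 2e-5, eps=1e-8)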

In [None]:
def train(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels`` keyword arguments. Element 1 of
            its output is used as the per-class scores (logits).
        dataloader: Iterable of 4-element batches
            ``(tokens, segments, masks, labels)`` of tensors; must support ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        device: Device every tensor of a batch is moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of
        the per-batch losses and ``accuracy`` is the fraction of samples seen
        during this pass that were classified correctly.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    epoch_correct = 0
    samples_seen = 0
    for i, batch in enumerate(dataloader):

        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in batch]

        optimizer.zero_grad()

        output = model(input_ids=tokens_tensors,
                       token_type_ids=segments_tensors,
                       attention_mask=masks_tensors, labels=labels)

        # With labels supplied, output[0] is the model's own loss and output[1]
        # the raw class scores; the loss used for the update comes from `criterion`.
        logits = output[1]
        flat_labels = labels.view(-1)
        # Take the class count from the logits instead of hard-coding 2.
        flat_logits = logits.view(-1, logits.size(-1))

        loss = criterion(flat_logits, flat_labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_correct += (flat_logits.argmax(dim=1) == flat_labels).sum().item()
        # Count real samples so the running accuracy stays correct when batch
        # sizes differ (e.g. a shorter final batch).
        samples_seen += flat_labels.size(0)

        if i % 200 == 0:
            print("Iteration {} complete. Loss:{}, Accuracy:{}".format(
                i, epoch_loss / (i+1), epoch_correct / max(samples_seen, 1)))

    return epoch_loss / len(dataloader), epoch_correct / max(samples_seen, 1)

def evaluate(model, iterator, device):
    """Evaluate ``model`` on every batch of ``iterator`` without updating it.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels`` keyword arguments. Element 0 of
            its output is used as the batch loss and element 1 as the
            per-class scores (logits).
        iterator: Iterable of 4-element batches
            ``(tokens, segments, masks, labels)`` of tensors; must support ``len()``.
        device: Device every tensor of a batch is moved to.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of
        the per-batch losses and ``accuracy`` is the fraction of evaluated
        samples that were classified correctly.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    model.eval()
    epoch_loss = 0.0
    epoch_correct = 0
    samples_seen = 0
    with torch.no_grad():
        for batch in iterator:
            tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in batch]

            output = model(input_ids=tokens_tensors,
                           token_type_ids=segments_tensors,
                           attention_mask=masks_tensors, labels=labels)
            loss, logits = output[0], output[1]
            flat_labels = labels.view(-1)

            epoch_loss += loss.item()
            epoch_correct += (logits.argmax(dim=1) == flat_labels).sum().item()
            # Count the samples actually evaluated rather than trusting
            # len(iterator.dataset), which is off whenever the loader skips
            # or subsamples items (and absent on plain iterables).
            samples_seen += flat_labels.size(0)

    return epoch_loss / len(iterator), epoch_correct / max(samples_seen, 1)

In [None]:
# First training run: EPOCHS passes of train + dev evaluation. A checkpoint is
# written each time the dev accuracy beats the best value seen so far.
best_acc = 0
EPOCHS = 6
for ep in range(EPOCHS):
    print("EPOCH {} ---------------------------------------------------".format(ep+1))

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    print("EPOCH {}:".format(ep+1), "Mean train loss: ", train_loss, "\t", "Mean train acc:", train_acc)

    dev_loss, dev_acc = evaluate(model, dev_loader, device)
    print("EPOCH {}:".format(ep+1), "Mean dev loss: ", dev_loss, "\t", "Mean dev acc:", dev_acc)

    # Nothing to save unless this epoch set a new best dev accuracy.
    if not dev_acc > best_acc:
        continue

    print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
    print("-------------------------------------------------------------")
    best_acc = dev_acc
    # The file name carries the zero-based loop index, not the printed epoch number.
    torch.save(model.state_dict(), 'sstcls_{}.dat'.format(ep))

EPOCH 1 ---------------------------------------------------
Iteration 0 complete. Loss:0.9641666412353516, Accuracy:0.25
EPOCH 1: Mean train loss:  0.45388157446203486 	 Mean train acc: 0.7869481765834933
EPOCH 1: Mean dev loss:  0.433355724856035 	 Mean dev acc: 0.7854477611940298
Best development accuracy improved from 0 to 0.7854477611940298, saving model...
-----------------------------------------------------------------
EPOCH 2 ---------------------------------------------------
Iteration 0 complete. Loss:0.2010485827922821, Accuracy:0.875
EPOCH 2: Mean train loss:  0.31871604091221734 	 Mean train acc: 0.8522072936660269
EPOCH 2: Mean dev loss:  0.2418567283398736 	 Mean dev acc: 0.9123134328358209
Best development accuracy improved from 0.7854477611940298 to 0.9123134328358209, saving model...
-----------------------------------------------------------------
EPOCH 3 ---------------------------------------------------
Iteration 0 complete. Loss:0.19649329781532288, Accuracy:1.0


In [None]:
# Continue training for loop indices 6..9 (printed as epochs 7-10). This reuses the
# model, optimizer and best_acc left behind by the previous training cell.
for ep in range(6,10):
    print("EPOCH {} ---------------------------------------------------".format(ep+1))

    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    print("EPOCH {}:".format(ep+1), "Mean train loss: ", train_loss, "\t", "Mean train acc:", train_acc)

    dev_loss, dev_acc = evaluate(model, dev_loader, device)
    print("EPOCH {}:".format(ep+1), "Mean dev loss: ", dev_loss, "\t", "Mean dev acc:", dev_acc)

    # Nothing to save unless this epoch set a new best dev accuracy.
    if not dev_acc > best_acc:
        continue

    print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
    print("-------------------------------------------------------------")
    best_acc = dev_acc
    # The file name carries the zero-based loop index, not the printed epoch number.
    torch.save(model.state_dict(), 'sstcls_{}.dat'.format(ep))

EPOCH 7 ---------------------------------------------------
Iteration 0 complete. Loss:0.003511265851557255, Accuracy:1.0
EPOCH 7: Mean train loss:  0.025723624532289175 	 Mean train acc: 0.9929622520793346
EPOCH 7: Mean dev loss:  0.1221358551688965 	 Mean dev acc: 0.960820895522388
EPOCH 8 ---------------------------------------------------
Iteration 0 complete. Loss:0.007822192274034023, Accuracy:1.0
EPOCH 8: Mean train loss:  0.030913645420603608 	 Mean train acc: 0.9897632757517595
EPOCH 8: Mean dev loss:  0.19603450289048227 	 Mean dev acc: 0.9421641791044776
EPOCH 9 ---------------------------------------------------
Iteration 0 complete. Loss:0.0016880237963050604, Accuracy:1.0
EPOCH 9: Mean train loss:  0.00808192387743549 	 Mean train acc: 0.9987204094689699
EPOCH 9: Mean dev loss:  0.1615985656107616 	 Mean dev acc: 0.960820895522388
EPOCH 10 ---------------------------------------------------
Iteration 0 complete. Loss:0.0006720353267155588, Accuracy:1.0
EPOCH 10: Mean trai

In [None]:
import pickle

# Load the pre-processed test inputs from Drive. The context manager guarantees the
# file handle is closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load pickle files you produced yourself.
with open("/content/drive/MyDrive/NLP/data/test_input.pickle", "rb") as test_input_file:
    test_input = pickle.load(test_input_file)

In [None]:
# Wrap the test inputs in the same dataset class used for the train/dev splits.
test_set = RumourDataset(data = test_input, maxlen = MAX_LEN)
# No shuffle argument is passed, so the loader keeps DataLoader's default sequential
# order; predictions therefore come out in the order of the test dataset.
test_loader = DataLoader(test_set, collate_fn=test_set.create_mini_batch, 
                         batch_size = BATCH_SIZE, num_workers = NUM_WORKERS)

In [None]:
def predict(net, test_loader, model_file=None, device=None):
    """Return the predicted class index for every sample in ``test_loader``.

    Args:
        net: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments; element 0 of its output is
            used as the per-class scores (logits).
        test_loader: Iterable of 3-element batches
            ``(tokens, segments, masks)`` of tensors.
        model_file: Optional path to a saved ``state_dict``. When given, the
            weights are loaded into ``net`` before predicting.
        device: Device the batches are moved to. Defaults to the device of
            ``net``'s first parameter so inputs always follow the model.

    Returns:
        list[int]: One predicted class index per input sample, in loader order.
    """
    if device is None:
        device = next(net.parameters()).device

    # Load the weights onto the target device, so a checkpoint saved on GPU
    # can also be restored on a CPU-only machine.
    if model_file is not None:
        net.load_state_dict(torch.load(model_file, map_location=device))

    # Inference mode: disable dropout and skip gradient bookkeeping.
    net.eval()
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in batch]

            output = net(input_ids=tokens_tensors,
                         token_type_ids=segments_tensors,
                         attention_mask=masks_tensors)
            # argmax already yields the class index; no extra thresholding needed.
            predicted = output[0].argmax(dim=1)
            # tolist() always returns a list, even for a batch of one sample.
            predictions.extend(predicted.cpu().tolist())

    return predictions

In [None]:
# The training loops save checkpoints as 'sstcls_{ep}.dat' using the zero-based loop
# index, so "sstcls_9.dat" belongs to the epoch printed as "EPOCH 10".
# NOTE(review): a checkpoint is only written when dev accuracy improves, so this file
# exists only if that epoch set a new best — confirm before relying on it.
model_file = "sstcls_9.dat"
predictions = predict(model, test_loader, model_file)
# Last expression of the cell: shows how many predictions were produced.
len(predictions)

558

In [None]:
# Build the two-column submission table: 'Id' is the row position of each
# prediction, 'Predicted' the predicted class.
pred = (
    pd.DataFrame(data=predictions, columns=['Predicted'])
    .rename_axis('Id')   # name the default integer index ...
    .reset_index()       # ... and turn it into the leading 'Id' column
)
print(pred)

# Write the table without the (now redundant) DataFrame index.
pred.to_csv('test_prediction.csv', index = False)

      Id  Predicted
0      0          0
1      1          1
2      2          0
3      3          0
4      4          1
..   ...        ...
553  553          0
554  554          0
555  555          1
556  556          0
557  557          0

[558 rows x 2 columns]
