In [180]:
import json
import os
import re

import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset as TorchDataset
from transformers import BertModel, BertTokenizerFast, Trainer, TrainingArguments

# Directory of per-tweet '<id>.json' files; process() builds file paths by
# plain string concatenation, so the trailing slash is required.
train_data_path = './train_dev_data/train/'
# NOTE(review): presumably the matching directory for the dev split — confirm
# against the on-disk data layout.
dev_data_path = './train_dev_data/dev/'

In [129]:
# Class balance of the training labels (one label per line, one line per event).
events = 0
rumour = 0
non_rumour = 0

with open("./project-data/train.label.txt", "r") as f:
    for line in f:
        # strip() rather than line[:-1]: slicing chops a real character when the
        # final line has no trailing newline and leaves '\r' behind on CRLF files.
        label = line.strip()
        if not label:
            # A blank line is not an event; do not count it as non-rumour.
            continue
        events += 1
        if label == 'rumour':
            rumour += 1
        else:
            non_rumour += 1

print("There are", events, "events in the training set")
print(rumour, "rumour")
print(non_rumour, "non_rumour")

There are 1895 events in the training set
420 rumour
1475 non_rumour


In [130]:
# events_* = list of events; each event is a list of tweet ids and the first
# id is the source tweet.
events_train = []
events_dev = []

with open("./project-data/train.data.txt", "r") as f:
    for line in f:
        # rstrip the terminator instead of line[:-1], which would cut the last
        # digit off the final id when the file does not end with a newline.
        events_train.append(line.rstrip("\r\n").split(","))

with open("./project-data/dev.data.txt", "r") as f:
    for line in f:
        events_dev.append(line.rstrip("\r\n").split(","))

In [132]:
def _read_labels(path):
    """Return the label found on each line of `path`, line terminators removed."""
    with open(path, "r") as f:
        # rstrip instead of line[:-1] so an unterminated last line is not truncated.
        return [line.rstrip("\r\n") for line in f]


def _labels_by_source(events, labels):
    """Map each event's source tweet id (its first id) to the label on the same line.

    Raises:
        ValueError: if the number of events and labels differ, since pairing
            them by position would then attach labels to the wrong events.
    """
    if len(events) != len(labels):
        raise ValueError(
            "events/labels length mismatch: %d events vs %d labels"
            % (len(events), len(labels))
        )
    return {event[0]: label for event, label in zip(events, labels)}


label_train_all = _read_labels("./project-data/train.label.txt")
dic_train = _labels_by_source(events_train, label_train_all)

label_dev_all = _read_labels("./project-data/dev.label.txt")
dic_dev = _labels_by_source(events_dev, label_dev_all)

#dic = {source_tweet_id: label}

In [143]:
#use BERT
#[CLS] sentence1 [SEP] sentence2 ...
def _clean_tweet_text(text):
    """Normalise one tweet: drop @mentions, lower-case, drop URLs, flatten newlines."""
    text = re.sub(r'@[\S]+', '', text).lower() #remove @mention
    text = re.sub(r'https://[\S]+', '', text) #remove url
    text = re.sub(r'http://[\S]+', '', text)
    return re.sub(r'[\n]+', ' ', text).strip() #remove \n


def _read_tweet_text(tweet_id, directories):
    """Return the 'text' field of '<tweet_id>.json' from the first directory holding it.

    Raises:
        FileNotFoundError: if no directory contains the file.
    """
    for directory in directories:
        path = os.path.join(directory, tweet_id + '.json')
        if os.path.isfile(path):
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)['text']
    raise FileNotFoundError(tweet_id + '.json')


def process(events, dic, data_path=None):
    """Turn events into BERT-style input strings plus '0'/'1' labels.

    Args:
        events: list of events; each event is a list of tweet ids with the
            source tweet first.
        dic: mapping from source tweet id to its label string.
        data_path: directory (or iterable of directories) that holds the
            per-tweet '<id>.json' files. When omitted, both module-level
            directories ``train_data_path`` and ``dev_data_path`` are searched,
            so dev events are no longer looked up only in the training folder.

    Returns:
        (bert, label): two equally long lists. ``bert[i]`` is
        '[CLS] <tweet> [SEP]<tweet> [SEP]...' and ``label[i]`` is '0' when the
        event's label is 'nonrumour', otherwise '1'.

    Events are skipped as a whole (best effort) when a tweet file is missing
    or malformed, or when the source id has no label.
    """
    if data_path is None:
        # Resolved at call time so the function can be defined before the paths.
        directories = [train_data_path, dev_data_path]
    elif isinstance(data_path, (str, os.PathLike)):
        directories = [data_path]
    else:
        directories = list(data_path)

    bert = []
    label = []
    for event in events:
        try:
            # Resolve the label before touching `bert`, so a failed lookup can
            # never leave the two output lists with different lengths.
            #0: non-rumour
            #1: rumour
            event_label = '0' if dic[event[0]] == 'nonrumour' else '1'
            pieces = [
                _clean_tweet_text(_read_tweet_text(tweet_id, directories)) + ' [SEP]'
                for tweet_id in event
            ]
        except (OSError, LookupError, ValueError, TypeError):
            # Missing/unreadable file, invalid JSON, missing 'text' or label.
            continue
        bert.append('[CLS] ' + ''.join(pieces))
        label.append(event_label)
    return bert, label
        
# Build the BERT input strings and their '0'/'1' labels for the train and dev splits.
bert_train, label_train = process(events_train, dic_train)
bert_dev, label_dev = process(events_dev, dic_dev)

In [146]:
#BERT format
def _to_frame(labels, texts):
    """Pack one split into a two-column frame: 'label' first, then 'text'."""
    return pd.DataFrame({'label': labels, 'text': texts})


bert_train_f = _to_frame(label_train, bert_train)
bert_dev_f = _to_frame(label_dev, bert_dev)

In [82]:
# One checkpoint name shared by the encoder and its tokenizer so they cannot drift apart.
_BERT_CHECKPOINT = 'bert-base-uncased'

model = BertModel.from_pretrained(_BERT_CHECKPOINT)  # pretrained BERT encoder
tokenizer = BertTokenizerFast.from_pretrained(_BERT_CHECKPOINT)  # matching tokenizer

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [147]:
class Dataset(TorchDataset):
    """Map-style dataset over a frame with 'text' and 'label' columns.

    Inherits from the ``TorchDataset`` alias rather than the name ``Dataset``
    that this class itself rebinds, so re-running the cell does not make the
    class inherit from its own previous definition.

    Each item is ``(tokens, token_ids, attention_mask, label)`` where the
    first three are lists of exactly ``max_len`` entries.

    NOTE(review): items are plain Python lists and the label is returned as
    stored in the frame; confirm the batching code turns them into tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len=500):
        """
        Args:
            dataframe: frame with 'text' and 'label' columns.
            tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
            max_len: fixed sequence length every item is padded/truncated to.
        """
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len #can be adjust

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the padded/truncated tokens, their ids, the mask and the label of row `index`."""
        # Positional lookup: samplers hand out positions 0..len-1, whatever the frame's index is.
        row = self.df.iloc[index]
        tokens = self.tokenizer.tokenize(row['text']) #tokenize the text
        if len(tokens) < self.max_len:
            real = len(tokens)
            #add [PAD]
            tokens = tokens + ['[PAD]'] * (self.max_len - real)
        else:
            # Too long: keep max_len - 1 tokens and close the sequence with [SEP].
            tokens = tokens[:self.max_len - 1] + ['[SEP]']
            real = self.max_len
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens) #token to id
        # 1 for real tokens, 0 for padding; derived from lengths so it does not
        # depend on the numeric id the tokenizer assigns to [PAD].
        at_mask = [1] * real + [0] * (self.max_len - real)
        return tokens, token_ids, at_mask, row['label']

In [148]:
# Both splits are batched with the same settings.
_loader_opts = dict(batch_size=8, num_workers=2)

train_data = Dataset(bert_train_f, tokenizer)
dev_data = Dataset(bert_dev_f, tokenizer)

train_loader = DataLoader(train_data, **_loader_opts)
dev_loader = DataLoader(dev_data, **_loader_opts)

In [167]:
class RumourClassifier(nn.Module):
    """Encoder followed by a linear head that emits one logit per sequence."""

    def __init__(self, model):
        """
        Args:
            model: encoder module exposing ``config.hidden_size`` and accepting
                ``input_ids`` / ``attention_mask`` keyword arguments.
        """
        super().__init__()
        self.model = model
        self.out = nn.Linear(self.model.config.hidden_size, 1) #linear transformations

    def forward(self, ids, at_mast):
        """Return the head's output for the encoder's pooled representation.

        Args:
            ids: token ids passed to the encoder as ``input_ids``.
            at_mast: attention mask (parameter name kept for existing callers).
        """
        # The keyword must be `attention_mask`; the previous spelling raised TypeError.
        outputs = self.model(input_ids=ids, attention_mask=at_mast)
        # Index instead of tuple-unpacking: element 1 is the pooled output both
        # for a plain tuple and for a transformers ModelOutput, whereas
        # unpacking a ModelOutput iterates over its keys.
        pooler_output = outputs[1]
        return self.out(pooler_output)

In [187]:
# NOTE: despite its name, `trainer` is the RumourClassifier nn.Module itself,
# not a transformers.Trainer.
trainer = RumourClassifier(model)

# Choose the device explicitly instead of relying on a bare try/except, which
# also swallowed unrelated errors.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
trainer = trainer.to(device)

# Optimise every parameter of the classifier: model.parameters() covers only
# the BERT encoder and would leave the linear head `out` untrained.
optimizer = optim.AdamW(trainer.parameters(), lr=2e-5)

In [188]:
# Run configuration collected in one mapping, then handed to TrainingArguments.
# NOTE(review): no visible cell passes `training_args` to a Trainer — confirm it is used.
_training_kwargs = {
    'output_dir': './results',
    'num_train_epochs': 2,
    'per_device_train_batch_size': 7,
    'per_device_eval_batch_size': 7,
    'warmup_steps': 300,
    'weight_decay': 0.01,
    'logging_dir': './logs',
}
training_args = TrainingArguments(**_training_kwargs)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [189]:
# NOTE(review): `trainer` is the RumourClassifier nn.Module, so this call
# presumably only switches it to training mode and returns the module (the
# cell output is its repr); no optimisation loop runs here — confirm.
trainer.train()

RumourClassifier(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=