In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification, BertModel, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import random
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import random
from timeit import default_timer

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 17.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.1MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 53.5MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.get_device_name(torch.cuda.current_device()))

Tesla T4


In [5]:
#hyperparameters
batch_size = 5
epochs = 5
learning_rate = 5e-5
eps = 1e-7
bertinputmodel = 'bert-base-uncased'
bestmodelpath = "bertmodelbest.model"

#seed for model
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
#load data, drop na, remove unknowns
df = pd.read_csv("/content/drive/MyDrive/Team 9: RelAIability/FNC - Dataset/news1m.csv", nrows = 10000)
df = df.loc[:,["type","content"]].dropna()
df= df[df["type"] != "unknown"]

#map to num
list_of_tags = df.type.unique()
df['type_num'] = ""
label_dict = {}
for i, tag in enumerate(list_of_tags):
        df.loc[df.type == tag, 'type_num']= i
        label_dict[tag] = i

In [10]:
#weights for all classes
newsweights = compute_class_weight("balanced", np.unique(df.type_num), df.type_num)

In [11]:
list_of_tags = df.type.unique()
print(list_of_tags)

['rumor' 'hate' 'unreliable' 'conspiracy' 'clickbait' 'satire' 'fake'
 'reliable' 'bias' 'political' 'junksci']


In [12]:
#tokenize the testing and training data
train_data, validate_data, test_data = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
train_data = train_data.reset_index(drop=True)
validate_data = validate_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [13]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [14]:
#PAD: pad token to make sequence lengths even, 0
#SEP: separation token, 102
#CLS: classification token for text, 101
#UNK: unknown token aka no token ID, 100
#encode the text data
#input ids for encoding, attention masks to identify which encodings contain information,
#token type IDs for segment IDS (though it's not relevant here)
def text_to_enc(tokenizer, content_data, max_len, sptokens = True, mask = True, segIDs = False, tensor = 'pt'):
    encoding =  tokenizer.batch_encode_plus(
                    content_data, #input
                    max_length = max_len,#fixed length of sequence
                    add_special_tokens = sptokens, #add special tokens
                    padding = True, #make all sequences same length
                    return_attention_mask = mask, 
                    return_token_type_ids = segIDs,
                    truncation = True, #cutoff
                    return_tensors = tensor)
    return encoding

#encoding ID dictionaries
encoded_data_train = text_to_enc(bert_tokenizer, train_data.content.values, 512)
encoded_data_val = text_to_enc(bert_tokenizer, validate_data.content.values, 512)
encoded_data_test = text_to_enc(bert_tokenizer, test_data.content.values, 512)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_data.type_num)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(validate_data.type_num)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(test_data.type_num)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)


dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)
dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [30]:
#bertmodel
class NewsCLF(nn.Module):
        ## constructor
        """Bert Model for Classification"""
        def __init__(self, bert):
            super(NewsCLF, self).__init__()
            
            #create architecture
            self.bert = bert #input bert model as first layer
            
            self.dropout1 = nn.Dropout(0.2) #drop 20% of data set
            
            #add one hidden layer
            self.dense = nn.Linear(768, 512) 
            self.act1 = nn.ReLU() #activation function
            
            self.dropout2 = nn.Dropout(0.2) #drop another 20%
            
            #classification layer
            self.clf = nn.Linear(512, len(label_dict))
            self.act2 = nn.Softmax(dim = 1) #activation function
            
            
        def forward(self, sent_id, mask):
            """inputs into bert and return last layter outputs"""
            #extract last output layer 
            bert_outputs = self.bert(input_ids = sent_id, attention_mask = mask)
            
            #get pooled outputs
            pooled_outputs = bert_outputs[0]#[:,0,:] #last hidden state, (bs, seq_len, dim)
            
            #feed into model
            pooled_outputs = self.dropout1(pooled_outputs)
            
            #hidden layer
            hiddenlayer = self.dense(pooled_outputs)
            hiddenlayer = self.act1(hiddenlayer)
            hiddenlayeroutputs = self.dropout2(hiddenlayer)
            
            #classification/output layer
            clf = self.clf(hiddenlayeroutputs)
            clf = self.act2(clf)
            
            return clf

In [23]:
config = BertConfig(dropout= 0.2, 
                          attention_dropout= 0.2)


bertsequence = BertModel.from_pretrained(bertinputmodel, config = config)
#load model
model = NewsCLF(bertsequence)

#turn newsweights into a tensor and pass to gpu
model.to(device)

#loss function
newsweightstensor = torch.tensor(newsweights, dtype = torch.float)
newsweightstensor.to(device) #push to gpu
cross_entropy = nn.CrossEntropyLoss(weight = newsweightstensor)

In [24]:
optimizer = AdamW(model.parameters(),
                  lr=learning_rate, 
                  eps=eps)

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [25]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [26]:
def train(dataloader_train):
    model.train()
    
    loss_train_total = 0
    # empty list to save model predictions
    predictions, true_train = [], []
  
    # iterate over batches
    for step,batch in enumerate(dataloader_train):
        
        # clear previously calculated gradients 
        model.zero_grad()  
        
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader_train)))
            
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
        sent_id, mask, labels = batch

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        cross_entropy.cuda()
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        loss_train_total += loss.item()

        # backward pass to calculate the gradients
        loss.backward()
    
        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()
        scheduler.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds = preds.detach().cpu().numpy()
        labels = labels.cpu().numpy()

        # append the model predictions
        predictions.append(preds)
        true_train.append(labels)

    # compute the training loss of the epoch
    loss_train_avg = loss_train_total / len(dataloader_train)
  
    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    predictions  = np.concatenate(predictions, axis=0)
    true_train = np.concatenate(true_train, axis=0)
    
    return loss_train_avg, predictions, true_train

In [35]:
def evaluate(dataloader_val):
    print("\nEvaluating...")
    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    #iterate over batches
    for step,batch in enumerate(dataloader_val):
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(dataloader_val)))
        
        batch = [b.to(device) for b in batch]
        
        sent_id, mask, labels = batch

        with torch.no_grad():        
            preds = model(sent_id, mask)
        
        cross_entropy.cuda()
        loss = cross_entropy(preds,labels)
        loss_val_total += loss.item()

        preds = preds.detach().cpu().numpy()
        labels = labels.cpu().numpy()
        predictions.append(preds)
        true_vals.append(labels)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [54]:
#reset weights
def weight_reset(m):
    if isinstance(m, nn.Linear):
        m.reset_parameters()

In [None]:
model.apply(weight_reset)
#train model
best_valid_loss = float('inf')
train_loss, val_loss, trainf1, valf1, trainacc, valacc = [],[],[],[],[],[]
start_time=default_timer()
for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    loss_train_avg, predictions, true_train = train(dataloader_train)
        
    loss_val_avg, valpredictions, true_val = evaluate(dataloader_validation)
            
        
    if loss_val_avg < best_valid_loss:
        best_valid_loss = loss_val_avg
        torch.save(model.state_dict(), bestmodelpath)
        
    train_loss.append(loss_train_avg)
    val_loss.append(loss_val_avg)
    print(f'Training loss: {loss_train_avg}')
    print(f'Validation loss: {loss_val_avg}')
    
    val_f1 = f1_score_func(valpredictions, true_val)
    train_f1 = f1_score_func(predictions, true_train)
    valf1.append(val_f1)
    trainf1.append(train_f1)
    print(f'Train F1 Score (Weighted): {val_f1}')
    print(f'Val F1 Score (Weighted): {train_f1}')
    
    valpredvec = [np.argmax(valpredictions[i]) for i in range(len(valpredictions))]
    trainpredvec = [np.argmax(predictions[i]) for i in range(len(predictions))]
    val_acc = accuracy_score(true_val, valpredvec)
    train_acc = accuracy_score(true_train,trainpredvec)
    valacc.append(val_acc)
    trainacc.append(train_acc)
    print(f'Train Accuracy: {val_acc}')
    print(f'Val Accuracy: {train_acc}')

end_time=default_timer()
print ('Elapsed training time: {0}s'.format(end_time-start_time))
    


 Epoch 1 / 5
  Batch    50  of  1,187.
  Batch   100  of  1,187.
  Batch   150  of  1,187.
  Batch   200  of  1,187.


In [None]:
testloss, testpredictions, true_test = evaluate(dataloader_test)
accuracy_per_class(testpredictions, true_test)

In [57]:
train_acc

0.3518705763397371