In [3]:
import numpy as np 
import pandas as pd
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification
import torch 
import torch.nn as nn  
import torch.nn.functional as F 
from transformers import *
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import log_loss, auc
import random 
import time 
import datetime 
from tqdm import tqdm
import gc  
import seaborn as sns   
import re

## Predict using the simple chunking method

In [108]:
test = pd.read_csv('./storage/minds_fake_news/mindslab_test.csv') 
y_test = test['Label'].values    

In [109]:
def clean_text(s): 
    FILTERS = "([~.,!?\"':;(])"
    CHANGE_FILTER = re.compile(FILTERS)
    return re.sub(CHANGE_FILTER, " ", s) 

In [110]:
def split_text(s, overlap = 20, chunk_size = 50): 
    total = [] 
    partial = [] 
    if len(s.split()) // (chunk_size - overlap) > 0:  
        n = len(s.split()) // (chunk_size - overlap) 
    else: 
        n = 1 
    for w in range(n): 
        if w == 0: 
            partial = s.split()[:chunk_size] 
            total.append(" ".join(partial)) 
        else:  
            partial = s.split()[w*(chunk_size - overlap):w*(chunk_size - overlap) + chunk_size]
            total.append(" ".join(partial)) 
    return total 

In [111]:
contents = test['content'].values
titles = test['title'].values 
labels = test['Label'].values 

In [112]:
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [113]:
def electra_tokenizer_simple(sent1, sent2, MAX_LEN):  
    encoded_dict = tokenizer.encode_plus(
        text = sent1, 
        text_pair = sent2,  
        add_special_tokens = True, # add [CLS] and [SEP]
        pad_to_max_length = False, 
        return_attention_mask = True # constructing attention_masks 
    )  
    
    input_id = encoded_dict['input_ids'] 
    attention_mask = encoded_dict['attention_mask'] # differentiate padding from non padding 
    token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences    
    
    if len(input_id) > 512: 
        input_id = input_id[:129] + input_id[-383:] 
        attention_mask = attention_mask[:129] + attention_mask[-383:]  
        token_type_id = token_type_id[:129] + token_type_id[-383:]   
    elif len(input_id) < 512: 
        input_id = input_id + [0]*(512 - len(input_id)) 
        attention_mask = attention_mask + [0]*(512 - len(attention_mask))
        token_type_id = token_type_id + [0]*(512 - len(token_type_id))  
        
    return np.asarray(input_id), np.asarray(attention_mask), np.asarray(token_type_id) 

In [114]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator")    
checkpoint = torch.load('./storage/electra_chunked_2') 
model.load_state_dict(checkpoint)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

<All keys matched successfully>

In [115]:
MAX_LEN = 512
pred_labels = [] 
cnt = 1
for i in range(len(titles)): 
    print("Making prediction for datapoint {} ...".format(cnt))
    title = titles[i] 
    splitted_content = split_text(clean_text(contents[i])) 
    pred_sum = 0 
    for content in splitted_content: 
        input_id, attention_mask, token_type_id = electra_tokenizer_simple(title, content, MAX_LEN) 
        
        input_id = torch.tensor(input_id) 
        attention_mask = torch.tensor(attention_mask) 
        token_type_id = torch.tensor(token_type_id)   
        input_id = torch.reshape(input_id, (1,512)) 
        attention_mask = torch.reshape(attention_mask, (1,512)) 
        token_type_id = torch.reshape(token_type_id, (1,512))
        with torch.no_grad(): 
            yhat = model(input_ids=input_id, attention_mask=attention_mask, 
                               token_type_ids = token_type_id)  
        
        p = torch.sigmoid(yhat[0][:,1]) 
        pred_sum += p 
    pred_avg = pred_sum / len(splitted_content) 
    pred_avg_val = pred_avg.item() 
    label = (1 if pred_avg_val > 0.5 else 0)     
    pred_labels.append(label)    
    cnt += 1 

Making prediction for datapoint 1 ...
Making prediction for datapoint 2 ...
Making prediction for datapoint 3 ...
Making prediction for datapoint 4 ...
Making prediction for datapoint 5 ...
Making prediction for datapoint 6 ...
Making prediction for datapoint 7 ...
Making prediction for datapoint 8 ...
Making prediction for datapoint 9 ...
Making prediction for datapoint 10 ...
Making prediction for datapoint 11 ...
Making prediction for datapoint 12 ...
Making prediction for datapoint 13 ...
Making prediction for datapoint 14 ...
Making prediction for datapoint 15 ...
Making prediction for datapoint 16 ...
Making prediction for datapoint 17 ...
Making prediction for datapoint 18 ...
Making prediction for datapoint 19 ...
Making prediction for datapoint 20 ...
Making prediction for datapoint 21 ...
Making prediction for datapoint 22 ...
Making prediction for datapoint 23 ...
Making prediction for datapoint 24 ...
Making prediction for datapoint 25 ...
Making prediction for datapoint 26

In [116]:
pred_labels = np.asarray(pred_labels)

In [117]:
## compute accuracy 
accuracy = np.sum(pred_labels == y_test) / len(y_test) 

In [118]:
print("Accuracy = {}%".format(accuracy * 100))

Accuracy = 57.99999999999999%


## Predict using the head tail truncation method

In [97]:
predictions = [] 
def compute_accuracy(model, dataloader, device):
    tqdm()
    model.eval()
    correct_preds, num_samples = 0,0
    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader)):
            b_input_ids, b_input_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch) 
            loss, yhat = model(input_ids=b_input_ids, attention_mask=b_input_masks, 
                               token_type_ids = b_token_type_ids, labels=b_labels.long())
            prediction = (torch.sigmoid(yhat[:,1]) > 0.5).long() 
            predictions.append(prediction)
            num_samples += b_labels.size(0)
            correct_preds += (prediction==b_labels.long()).sum()
            del b_input_ids, b_input_masks, b_token_type_ids, b_labels #memory
        torch.cuda.empty_cache() #memory
        gc.collect() # memory 
        return correct_preds.float()/num_samples*100 


In [98]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 
device

device(type='cuda', index=0)

In [99]:
test = pd.read_csv('./storage/minds_fake_news/mindslab_test.csv') 
test = test.loc[100:]
y_test = test['Label'].values

In [100]:
def electra_tokenizer_empirical(sent1, sent2, MAX_LEN):  
    encoded_dict = tokenizer.encode_plus(
        text = sent1, 
        text_pair = sent2,  
        add_special_tokens = True, # add [CLS] and [SEP]
        pad_to_max_length = False, 
        return_attention_mask = True # constructing attention_masks 
    )  
    
    input_id = encoded_dict['input_ids'] 
    attention_mask = encoded_dict['attention_mask'] # differentiate padding from non padding 
    token_type_id = encoded_dict['token_type_ids'] # differentiate two sentences    
    
    if len(input_id) > 512: 
        input_id = input_id[:129] + input_id[-383:] 
        attention_mask = attention_mask[:129] + attention_mask[-383:]  
        token_type_id = token_type_id[:129] + token_type_id[-383:]   
    elif len(input_id) < 512: 
        input_id = input_id + [0]*(512 - len(input_id)) 
        attention_mask = attention_mask + [0]*(512 - len(attention_mask))
        token_type_id = token_type_id + [0]*(512 - len(token_type_id))  
        
    return np.asarray(input_id), np.asarray(attention_mask), np.asarray(token_type_id) 

In [101]:
################################################################################
## Data preprocessing step - save the data so that we can just load it next time
################################################################################
MAX_LEN = 512
input_ids = []
attention_masks = [] 
token_type_ids = [] 
cnt = 0
for sent1, sent2 in zip(test['title'], test['content']): 
    if cnt%1000 == 0 and cnt > 0: 
        print("Processed {} datapoints".format(cnt)) 
    cnt += 1
    try: 
        input_id, attention_mask, token_type_id = electra_tokenizer_empirical(sent1, sent2, MAX_LEN)
        input_ids.append(input_id) 
        attention_masks.append(attention_mask) 
        token_type_ids.append(token_type_id) 
    except Exception as e:  
        print(e)  
        print(sent1, sent2) 
        pass

In [102]:
input_ids = torch.tensor(input_ids) 
attention_masks = torch.tensor(attention_masks) 
token_type_ids = torch.tensor(token_type_ids)  
y_test = torch.tensor(y_test)

In [103]:
batch_size = 8

test_data = TensorDataset(input_ids, attention_masks, token_type_ids, y_test) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = batch_size, shuffle = False)  

In [104]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator")    
checkpoint = torch.load('./storage/minds_fake_news/electra_M_8') 
model.load_state_dict(checkpoint)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

<All keys matched successfully>

In [105]:
model.cuda()
print()




In [107]:
print("Test set accuracy = {}".format(compute_accuracy(model, test_dataloader, device)))

0it [00:00, ?it/s]
  0%|          | 0/13 [00:00<?, ?it/s][A
 15%|█▌        | 2/13 [00:00<00:01,  9.12it/s][A
 23%|██▎       | 3/13 [00:00<00:01,  6.89it/s][A
 31%|███       | 4/13 [00:00<00:01,  6.10it/s][A
 38%|███▊      | 5/13 [00:00<00:01,  5.68it/s][A
 46%|████▌     | 6/13 [00:01<00:01,  5.48it/s][A
 54%|█████▍    | 7/13 [00:01<00:01,  5.36it/s][A
 62%|██████▏   | 8/13 [00:01<00:00,  5.28it/s][A
 69%|██████▉   | 9/13 [00:01<00:00,  5.23it/s][A
 77%|███████▋  | 10/13 [00:01<00:00,  5.20it/s][A
 85%|████████▍ | 11/13 [00:01<00:00,  5.18it/s][A
 92%|█████████▏| 12/13 [00:02<00:00,  5.16it/s][A
100%|██████████| 13/13 [00:02<00:00,  5.48it/s][A
0it [00:02, ?it/s]

Test set accuracy = 74.0



