In [1]:
import sys
sys.path.append('./')
sys.path.append('../')

from authorship_identification import datasets
from authorship_identification import idf_utils
from authorship_identification import linear_models

import torch
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 100

from scipy import sparse
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DistilBertTokenizer, BertTokenizer, DebertaTokenizer

random_seed = 42
np.random.seed = 42
torch.random.seed = 42

  from .autonotebook import tqdm as notebook_tqdm


### Load C50 Dataset

- this (original dataset) is 50 train - 50 test split

In [17]:
test_frac = 0.5
train_df, test_df = datasets.get_C50(orig_split=True, split=None, remove_entities=False, seed=random_seed)

- this dataset is the original one with entity names removed

In [None]:
test_frac = 0.5
train_df, test_df = datasets.get_C50(orig_split=True, split=None, remove_entities=True, seed=random_seed)

- this is 90 train - 10 test split

In [None]:
test_frac = 0.1
train_df = pd.concat([train_df, test_df])
test_df = train_df.sample(frac=test_frac, random_state=random_seed)
train_df = train_df.drop(test_df.index)

### Load "All the News" 1 Dataset

- this (original) dataset is 85 train - 15 test split

In [None]:
test_frac = 0.15
train_df, test_df = datasets.get_AllTheNews(orig_split=True, split=None, remove_entities=False, seed=random_seed)

- this dataset is the original one with entity names removed

In [None]:
test_frac = 0.15
train_df, test_df = datasets.get_AllTheNews(orig_split=True, split=None, remove_entities=True, seed=random_seed)

In [10]:
train_df.head()

Unnamed: 0,text,label
0,"The Czech capital market, roundly criticised for a lack of transparency, needs an independent re...",1
1,"Sweden beat the Czech Republic 3-0 in a World Cup ice hockey game on Thursday, setting up a show...",1
2,The fall in the Czech trade deficit to 10.5 billion crowns in September from 14.5 billion in Aug...,1
3,Czech paper concern Sepap Group a.s. on Friday said its nine-month net profit fell as a shutdown...,1
4,Czech shares rallied on Monday following the coalition government's win in weekend Senate electi...,1


### Preprocessing

In [18]:
n_classes = train_df['label'].nunique()
le = preprocessing.LabelEncoder()
le.fit(train_df['label'])
train_df['label'] = le.transform(train_df['label']).astype(int)
test_df['label'] = le.transform(test_df['label']).astype(int)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print('The average length of text of an article in training set is', np.mean([len(content.split()) for content in train_df['text'].values]), 
      'and in test set is', np.mean([len(content.split()) for content in test_df['text'].values]))
valid_frac = 0.1 / (1 - test_frac)
valid_df = train_df.sample(frac=valid_frac, random_state=random_seed)
valid_idx = valid_df.index
train_df.drop(valid_idx).reset_index(drop=True)
valid_df.reset_index(drop=True)
train_df.head()

The average length of text of an article in training set is 501.77 and in test set is 509.5508


Unnamed: 0,text,label
0,"The Czech capital market, roundly criticised for a lack of transparency, needs an independent re...",1
1,"Sweden beat the Czech Republic 3-0 in a World Cup ice hockey game on Thursday, setting up a show...",1
2,The fall in the Czech trade deficit to 10.5 billion crowns in September from 14.5 billion in Aug...,1
3,Czech paper concern Sepap Group a.s. on Friday said its nine-month net profit fell as a shutdown...,1
4,Czech shares rallied on Monday following the coalition government's win in weekend Senate electi...,1


In [25]:
import re

def mask_contents(threshold, mask_token='[MASK]'):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=False) #, stop_words='english')
    vectorizer.fit(pd.concat([train_df['text'], valid_df['text']]))
    X_train = vectorizer.transform(train_df['text'])
    X_valid = vectorizer.transform(valid_df['text'])
    X_test = vectorizer.transform(test_df['text'])
    feat_names = vectorizer.get_feature_names_out()
    feat_idf = vectorizer.idf_

    def mask_content(X, df):
        mask_words = set(feat_names[feat_idf >= threshold])
        ret = []
        for i in range(df['text'].shape[0]):
            text = df['text'].iloc[i]
            splits = re.split(r"(?u)\b\w\w+\b", text)
            tokens = re.findall(r"(?u)\b\w\w+\b", text)
            tokens_masked = []
            i = 0
            while i < len(tokens):
                t = tokens[i]
                if t in mask_words: # 1-gram
                    tokens_masked.append(mask_token)
                    i += 1
                elif i < len(tokens_masked)-1 and t + ' ' + tokens[i+1] in mask_words: # 2-gram
                    tokens_masked.append(mask_token)
                    tokens_masked.append(mask_token)
                    i += 2
                else:
                    tokens_masked.append(t)
                    i += 1
            text_masked = ''.join([a + b for a, b in zip(splits, tokens_masked)])
            ret.append(text_masked)
        return ret

    train_df['masked_text'] = mask_content(X_train, train_df)
    valid_df['masked_text'] = mask_content(X_valid, valid_df)
    test_df['masked_text'] = mask_content(X_test, test_df)

In [None]:
# for visualization ONLY
mask_contents(threshold=2., mask_token='_')
train_df['masked_text']

### Dataset Class

In [20]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df['label'].values
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['masked_text']]
        # add attention mask
#         mask_val = tokenizer.vocab.get('[MASK]')
#         for text in self.texts:
#             text['attention_mask'] = torch.where(text['input_ids'] == mask_val, 0, text['attention_mask'])

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

### Model Class

In [21]:
from torch import nn
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, get_linear_schedule_with_warmup

class BertClassifier(nn.Module):

    def __init__(self):

        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_classes)

    def forward(self, input_id, labels=None):

        return self.bert(input_id, labels=labels)

### Training

In [26]:
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

def train(model, train_data, val_data, learning_rate, epochs, batch_size):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * epochs,
    )

    if use_cuda:
        
        print('cuda version', torch.__version__)
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        
        train_loss = 0
        train_pred = []
        train_truth = []
        model.train()

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, train_label)
            batch_loss = output.loss
            train_loss += batch_loss.item()
            train_pred.append(output.logits.argmax(dim=1))
            train_truth.append(train_label)

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()
                
        train_truth = torch.cat(train_truth).detach().cpu().numpy()
        train_pred = torch.cat(train_pred).detach().cpu().numpy()
        train_acc = accuracy_score(train_truth, train_pred)
        train_f1 = f1_score(train_truth, train_pred, average='macro')
            
        val_loss = 0
        val_pred = []
        val_truth = []
        model.eval()

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, val_label)
                batch_loss = output.loss
                val_loss += batch_loss.item()
                val_pred.append(output.logits.argmax(dim=1))
                val_truth.append(val_label)
            
        val_truth = torch.cat(val_truth).detach().cpu().numpy()
        val_pred = torch.cat(val_pred).detach().cpu().numpy()
        val_acc = accuracy_score(val_truth, val_pred)
        val_f1 = f1_score(val_truth, val_pred, average='macro')
            
        print('Epochs: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Train F1: {:.4f}'\
                .format(epoch_num + 1, train_loss, train_acc, train_f1))
        print('           | Val Loss:   {:.4f} | Val Accuracy:   {:.4f} | Val F1:   {:.4f}'\
                .format(val_loss, val_acc, val_f1))
        
    return train_acc, train_f1, val_acc, val_f1
   
EPOCHS = 15
LR = 2e-5
BATCH_SIZE = 16
model = BertClassifier()
print('There are', sum(p.numel() for p in model.parameters()), 'parameters')

mask_contents(threshold=10.)
print(train_df['masked_text'])

train(model, train_df, valid_df, LR, EPOCHS, BATCH_SIZE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

There are 109520690 parameters
0       The Czech capital market, roundly criticised for a lack of transparency, needs an independent re...
1       Sweden beat the Czech Republic 3-0 in a World Cup ice hockey game on Thursday, setting up a show...
2       The fall in the Czech trade deficit to 10.5 billion crowns in September from 14.5 billion in Aug...
3       Czech paper concern Sepap Group a.s. on Friday said its nine-month net profit fell as a shutdown...
4       Czech shares rallied on Monday following the coalition government's win in weekend Senate electi...
                                                       ...                                                 
2495    Veronika Hirsch, the flamboyant Canadian stock picker hired recently to spearhead Fidelity Inves...
2496    Toronto, Canada's biggest city and financial capital, is bracing for a near shutdown Friday when...
2497    Shares in Canadian waste and transportation firm Laidlaw Inc soared on Tuesday as investors and .

100%|█████████████████████████████████████████| 157/157 [01:16<00:00,  2.05it/s]


Epochs: 1 | Train Loss: 586.3629 | Train Accuracy: 0.0996 | Train F1: 0.0935
           | Val Loss:   98.1771 | Val Accuracy:   0.4260 | Val F1:   0.3402


100%|█████████████████████████████████████████| 157/157 [01:17<00:00,  2.03it/s]


Epochs: 2 | Train Loss: 404.4801 | Train Accuracy: 0.4948 | Train F1: 0.4579
           | Val Loss:   62.8701 | Val Accuracy:   0.6660 | Val F1:   0.6184


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 3 | Train Loss: 280.0474 | Train Accuracy: 0.6596 | Train F1: 0.6385
           | Val Loss:   44.9112 | Val Accuracy:   0.7640 | Val F1:   0.7373


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 4 | Train Loss: 209.8609 | Train Accuracy: 0.7300 | Train F1: 0.7158
           | Val Loss:   34.1325 | Val Accuracy:   0.8220 | Val F1:   0.8100


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 5 | Train Loss: 162.5655 | Train Accuracy: 0.8032 | Train F1: 0.7932
           | Val Loss:   25.7698 | Val Accuracy:   0.8640 | Val F1:   0.8538


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.00it/s]


Epochs: 6 | Train Loss: 128.9171 | Train Accuracy: 0.8532 | Train F1: 0.8460
           | Val Loss:   19.9386 | Val Accuracy:   0.9220 | Val F1:   0.9130


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.00it/s]


Epochs: 7 | Train Loss: 103.4435 | Train Accuracy: 0.8948 | Train F1: 0.8919
           | Val Loss:   16.0070 | Val Accuracy:   0.9300 | Val F1:   0.9238


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.00it/s]


Epochs: 8 | Train Loss: 82.7715 | Train Accuracy: 0.9308 | Train F1: 0.9280
           | Val Loss:   12.5726 | Val Accuracy:   0.9640 | Val F1:   0.9615


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.00it/s]


Epochs: 9 | Train Loss: 66.5151 | Train Accuracy: 0.9544 | Train F1: 0.9510
           | Val Loss:   9.9414 | Val Accuracy:   0.9840 | Val F1:   0.9822


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.00it/s]


Epochs: 10 | Train Loss: 53.9620 | Train Accuracy: 0.9768 | Train F1: 0.9752
           | Val Loss:   8.2817 | Val Accuracy:   0.9860 | Val F1:   0.9857


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.00it/s]


Epochs: 11 | Train Loss: 45.5782 | Train Accuracy: 0.9832 | Train F1: 0.9824
           | Val Loss:   7.0849 | Val Accuracy:   0.9880 | Val F1:   0.9864


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 12 | Train Loss: 39.3272 | Train Accuracy: 0.9912 | Train F1: 0.9909
           | Val Loss:   6.1203 | Val Accuracy:   0.9920 | Val F1:   0.9916


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 13 | Train Loss: 35.1003 | Train Accuracy: 0.9956 | Train F1: 0.9955
           | Val Loss:   5.5227 | Val Accuracy:   0.9940 | Val F1:   0.9939


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 14 | Train Loss: 32.9857 | Train Accuracy: 0.9960 | Train F1: 0.9960
           | Val Loss:   5.1843 | Val Accuracy:   0.9940 | Val F1:   0.9941


100%|█████████████████████████████████████████| 157/157 [01:18<00:00,  2.01it/s]


Epochs: 15 | Train Loss: 31.2410 | Train Accuracy: 0.9968 | Train F1: 0.9968
           | Val Loss:   5.0483 | Val Accuracy:   0.9960 | Val F1:   0.9963


(0.9968, 0.9967619766193428, 0.996, 0.9962539682539682)

### Evaluate

In [27]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    test_pred = []
    test_truth = []
    model.eval()
    
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            
            test_label = test_label.to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, test_label)
            test_pred.append(output.logits.argmax(dim=1))
            test_truth.append(test_label)
        
        test_truth = torch.cat(test_truth).detach().cpu().numpy()
        test_pred = torch.cat(test_pred).detach().cpu().numpy()
        test_acc = accuracy_score(test_truth, test_pred)
        test_f1 = f1_score(test_truth, test_pred, average='macro')
    
    print('Test Accuracy:  {:.4f} | Test F1:  {:.4f}'.format(test_acc, test_f1))
    return test_acc, test_f1
    
print(test_df['masked_text'])
evaluate(model, test_df)

0       The Prague Stock Exchange's 1997 bull run, as expected, ground to a halt on Thursday as investor...
1       Czech annual average consumer inflation eased slightly in 1996, not as much as original governme...
2       The Czech Republic has recorded its first budget deficit despite last ditch efforts by the gover...
3       The Prague Stock Exchange slipped on Monday from last week's year high on a round of profit taki...
4       The Prague Stock Exchange, as expected, backed off last week's year-high on a round of profit ta...
                                                       ...                                                 
2495    Ontario Finance Minister Ernie Eves is expected to deliver more good news on the province's defi...
2496    A Canadian exploration company's claim to have made the biggest gold find this century were cast...
2497    Canada's Barrick Gold Corp, the world's second largest gold producer, said on Thursday Indonesia...
2498    Royal Bank of Canada

(0.68, 0.678809450921158)

### Fine-Tuning

In [None]:
def fine_tuning(lr_list, epoch_list, batch_list, checkpoint_path='checkpoint_tuning'):
    
    index_list, index2_list, index3_list = [], [], []
    train_accs, train_f1s = [], []
    test_accs, test_f1s = [], []
    
    for lr in lr_list:
        for epoch in epoch_list:
            for batch_size in batch_list:
            
                model = BertClassifier()
                # mask_contents(threshold=...)
                print('lr at {}, epoch at {}, batch_size at {}'.format(lr, epoch, batch_size))
                train_acc, train_f1, val_acc, val_f1 = train(model, train_df, valid_df, lr, epoch, batch_size)
                test_acc, test_f1 = evaluate(model, test_df)

                train_accs.append(train_acc)
                train_f1s.append(train_f1)
                test_accs.append(test_acc)
                test_f1s.append(test_f1)
                index_list.append(lr)
                index2_list.append(epoch)
                index3_list.append(batch_size)

                result_df = pd.DataFrame(index_list, columns=['lr'])
                result_df['epoch'] = index2_list
                result_df['batch_size'] = index3_list
                result_df['train_accs'] = train_accs
                result_df['train_f1s'] = train_f1s
                result_df['test_accs'] = test_accs
                result_df['test_f1s'] = test_f1s
                result_df['model_name'] = 'bert_uncased'
                result_df.to_csv(checkpoint_path + '.csv', index=False)
                # break
    return index_list, index2_list, index3_list, train_accs, train_f1s, test_accs, test_f1s

In [None]:
# C50 [5e-5 + 10 + 16, 2e-5 + 15 + 16, 1e-5 + 15 + 8]
lr_list = [2e-5] # [5e-5, 2e-5, 1e-5]
epoch_list = [15] # [10, 15]
batch_list = [8, 16, 32] # [8, 16, 32]
fine_tuning(lr_list=lr_list, epoch_list=epoch_list, batch_list=batch_list)

### Grid Search IDF

In [None]:
import warnings

def grid_search_idf(threshold_list, checkpoint_path='checkpoint_IDF'):
    
    index_list = []
    train_accs, train_f1s = [], []
    test_accs, test_f1s = [], []
    
    for threshold in threshold_list:
        
        # warnings.filterwarnings(action='once')
        model = BertClassifier()
        mask_contents(threshold=threshold)
        print('idf removal threshold at', threshold)
        print(train_df['masked_text'])
        train_acc, train_f1, val_acc, val_f1 = train(model, train_df, valid_df, LR, EPOCHS, BATCH_SIZE)
        test_acc, test_f1 = evaluate(model, test_df)
        
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        test_accs.append(test_acc)
        test_f1s.append(test_f1)
        index_list.append(threshold)
        
        result_df = pd.DataFrame(index_list, columns=['idf_threshold'])
        result_df['train_accs'] = train_accs
        result_df['train_f1s'] = train_f1s
        result_df['test_accs'] = test_accs
        result_df['test_f1s'] = test_f1s
        result_df['model_name'] = 'bert_uncased'
        result_df.to_csv(checkpoint_path + '.csv', index=False)
        # break
    return index_list, train_accs, train_f1s, test_accs, test_f1s

In [None]:
EPOCHS = 15
LR = 2e-5
BATCH_SIZE = 16
grid_search_idf(threshold_list=np.arange(10., 1., -0.5))

### Saliency Map (has issue right now)

In [None]:
def get_saliency_map(model, df):
    input_text = [tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for text in df['text']]
    input = input_text[0]
    print(input.shape)
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    # input.unsqueeze_(0)
    input.requires_grad = True
    preds = model(input)
    score, indices = torch.max(preds, 1)
    #backward pass to get gradients of score predicted class w.r.t. input image
    score.backward()
    #get max along channel axis
    slc, _ = torch.max(torch.abs(input.grad[0]), dim=0)
    #normalize to [0..1]
    slc = (slc - slc.min())/(slc.max()-slc.min())
    return slc

input_slc = get_saliency_map(model, train_df)
print(input_slc.shape)
input_slc

### SHAP

In [28]:
import shap
import scipy as sp

def f(x):
    tv = torch.tensor([tokenizer.encode(v, padding='max_length', max_length=512, truncation=True) for v in x]).cuda()
    outputs = model(tv)[0].detach().cpu().numpy()
    scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
    val = sp.special.logit(scores[:,1]) # use one vs rest logit units
    return val

explainer = shap.Explainer(f, tokenizer)
shap_values = explainer(train_df['masked_text'][:2], fixed_context=1, batch_size=32)

Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
Partition explainer:  50%|████████████████                | 1/2 [00:00<?, ?it/s]
  0%|                                                   | 0/498 [00:00<?, ?it/s][A
 64%|█████████████████████████▌              | 318/498 [00:00<00:00, 453.17it/s][A
 77%|██████████████████████████████▋         | 382/498 [00:02<00:00, 151.63it/s][A
 83%|█████████████████████████████████▎      | 414/498 [00:02<00:00, 114.71it/s][A
 90%|████████████████████████████████████▋    | 446/498 [00:03<00:00, 91.68it/s][A
 96%|███████████████████████████████████████▎ | 478/498 [00:04<00:00, 76.87it/s][A
510it [00:04, 66.99it/s]                                                        [A
530it [00:05, 62.13it/s][A
Partition explainer: 3it [00:24, 12.44s/it]                                     


In [None]:
shap.plots.bar(shap_values.abs.mean(0))

In [None]:
shap.plots.bar(shap_values.abs.sum(0))

In [None]:
shap.plots.bar(shap_values.abs.max(0))

In [29]:
shap.plots.text(shap_values) # textual importance