In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir("drive/MyDrive/Colab Notebooks/Authorship Identification/")

import re
import torch
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DistilBertTokenizer, BertTokenizer, DebertaTokenizer

random_seed = 42
np.random.seed = 42
torch.random.seed = 42

  from .autonotebook import tqdm as notebook_tqdm


### Load C50 Dataset (if already loaded)

- this (original dataset) is 50 train - 50 test split

In [None]:
train_df = pd.read_csv('../authorship_identification/data/train.csv')
test_df = pd.read_csv('../authorship_identification/data/test.csv')
train_df.rename(columns={'text': 'content', 'label': 'author'}, inplace=True)
test_df.rename(columns={'text': 'content', 'label': 'author'}, inplace=True)

- this dataset is the original one with entity names removed

In [11]:
train_df = pd.read_csv('../authorship_identification/data/train_noents.csv')
test_df = pd.read_csv('../authorship_identification/data/test_noents.csv')
train_df.rename(columns={'text': 'content', 'label': 'author'}, inplace=True)
test_df.rename(columns={'text': 'content', 'label': 'author'}, inplace=True)

- this is 90 train - 10 test split

In [None]:
test_frac = 0.1
train_df = pd.concat([train_df, test_df])
test_df = train_df.sample(frac=test_frac, random_state=random_seed)
train_df = train_df.drop(test_df.index)

### Load "All the News" 1 Dataset

In [None]:
train_df = pd.read_csv('data/all_the_news_noents.csv')
train_df = train_df[(train_df.publication == 'Breitbart')]
top10_authors = train_df[['author','id']].groupby('author').count().sort_values(by='id', ascending=False).head(10).reset_index() # this == what the paper proposed
print('The authors are', top10_authors.author.values)
train_df = pd.merge(train_df, top10_authors[['author']], on='author', how='right')
train_df = train_df.groupby(by='author').sample(500, random_state=random_seed)
train_df.shape

### Preprocessing

In [12]:
# if C50 split 50-50
test_frac = 0

# if NOT C50 split 50-50
# test_frac = 0.15
# test_df = train_df.sample(frac=test_frac, random_state=random_seed)
# train_df = train_df.drop(test_df.index).reset_index()
# test_df = test_df.reset_index()

le = preprocessing.LabelEncoder()
le.fit(train_df['author'])
train_df['author_id'] = le.transform(train_df['author']).astype(int)
test_df['author_id'] = le.transform(test_df['author']).astype(int)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
print('The average length of text of an article in training set is', np.mean([len(content.split()) for content in train_df['content'].values]), 
      'and in test set is', np.mean([len(content.split()) for content in test_df['content'].values]))

The average length of text of an article in training set is 420.3584 and in test set is 427.668


In [14]:
valid_frac = 0.1 / (1 - test_frac)
valid_df = train_df.sample(frac=valid_frac, random_state=random_seed)
valid_index = valid_df.index
train_df = train_df.drop(valid_df.index).reset_index()
valid_df = valid_df.reset_index()

In [15]:
pd.options.display.max_colwidth = 100
tmp_df = pd.read_csv('../authorship_identification/data/train.csv').drop(valid_index).reset_index()
# tmp_df = tmp_df.merge(train_df[['Unnamed: 0']], how='right')
tmp_df['text']

0       The Czech capital market, roundly criticised for a lack of transparency, needs an independent re...
1       Sweden beat the Czech Republic 3-0 in a World Cup ice hockey game on Thursday, setting up a show...
2       The fall in the Czech trade deficit to 10.5 billion crowns in September from 14.5 billion in Aug...
3       Czech paper concern Sepap Group a.s. on Friday said its nine-month net profit fell as a shutdown...
4       Czech shares rallied on Monday following the coalition government's win in weekend Senate electi...
                                                       ...                                                 
2245    "The Times They Are a-Changin" -- Bob Dylan's counter-culture anthem of the 1960's  -- is being ...
2246    Veronika Hirsch, the flamboyant Canadian stock picker hired recently to spearhead Fidelity Inves...
2247    Toronto, Canada's biggest city and financial capital, is bracing for a near shutdown Friday when...
2248    Anti-government prot

In [None]:
#     text = tokenizer("Commonwealth [MASK] of Australia (_) managing", padding='max_length', max_length = 32, truncation=True, return_tensors="pt")
#     print(text)
#     mask_val = tokenizer.vocab.get('[MASK]')
#     text['attention_mask'] = torch.where(text['input_ids'] == mask_val, 0, text['attention_mask'])
#     print(text)
#     text = "Westpac Banking Corp Ltd is expected to report on Tuesday that its net profit growth was reined"
#     sub = "Banking"
#     i = text.find(sub)
#     print(i, text[i-1], text[i+len(sub)])
#     if i == -1 or (i > 0 and text[i-1].isalpha()) or (i+len(sub) < len(text) and text[i+len(sub)].isalpha()):
#         print('no')
#     else:
#         print('yes')
# tokenizer.mask_token_id, tokenizer.all_special_ids, tokenizer.special_tokens_map
# tokenizer("[MASK] fheou  hewoh iwo")
# tmp = tokenizer.tokenize("In a column intended to suggest the National Rifle Association (NRA) is exaggerating Hillary")
# tmp2 = tokenizer.convert_tokens_to_string(tmp)
# tmp, tmp2

In [6]:
import re

def mask_contents(threshold, mask_token='[MASK]'):
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=False) #, stop_words='english')
    vectorizer.fit(pd.concat([train_df['content'], valid_df['content']]))
    X_train = vectorizer.transform(train_df['content'])
    X_valid = vectorizer.transform(valid_df['content'])
    X_test = vectorizer.transform(test_df['content'])
    feat_names = vectorizer.get_feature_names_out()
    feat_idf = vectorizer.idf_

    def mask_content(X, df):
        mask_words = set(feat_names[feat_idf >= threshold])
        ret = []
        for i in range(df['content'].shape[0]):
            text = df['content'][i]
            splits = re.split(r"(?u)\b\w\w+\b", text)
            tokens = re.findall(r"(?u)\b\w\w+\b", text)
            tokens_masked = []
            i = 0
            while i < len(tokens):
                t = tokens[i]
                if t in mask_words: # 1-gram
                    tokens_masked.append(mask_token)
                    i += 1
                elif i < len(tokens_masked)-1 and t + ' ' + tokens[i+1] in mask_words: # 2-gram
                    tokens_masked.append(mask_token)
                    tokens_masked.append(mask_token)
                    i += 2
                else:
                    tokens_masked.append(t)
                    i += 1
            text_masked = ''.join([a + b for a, b in zip(splits, tokens_masked)])
            ret.append(text_masked)
        return ret

    train_df['masked_content'] = mask_content(X_train, train_df)
    valid_df['masked_content'] = mask_content(X_valid, valid_df)
    test_df['masked_content'] = mask_content(X_test, test_df)

In [7]:
# for visualization ONLY
pd.options.display.max_colwidth = 100
mask_contents(threshold=2., mask_token='_')
train_df['masked_content']

0       The _ market, _ _ for a _ of _, _ an _ _, but it is not a _ for all of the market's _, a _ _ sai...
1       _ the _ 3-0 in a _ _ _ on , _ up a _ for _ _ _ and _ the _ _ _ _ for _.\nThe _-_ _ _ of the _ fr...
2       The _ in the _ _ to _ in from in _ market _, and the _ _ _ _ _ is on _, analysts said on .\n() _...
3       _ _ a.s. on said its _ _ _ as a _ of its _ _ _, _ with a _ in the _ _ _ into its _ _.\nsaid its ...
4       _ _ on _ the _ _'s _ in _, but analysts said the _ was _ more to a _ _ than to a _ in _ _.\n_ _ ...
                                                       ...                                                 
2245    "" -- _-_ _ of the 's  -- is _ _ in a _ for a _ and some _ are up in _.\nis _ a _ _ _ the _'s _ ...
2246    , the _ _ _ _ _ to _ _ to _ the market, has been _ from _ _ as _ is _ by _.\nThe _ _ -- after 's...
2247    , 's _ _ and _ _, is _ for a _ _ when _ _ the _ _ _ _ _ by 's _ _.\nThe _-_ "" on and are _ up t...
2248    _-_ _ _ _ 's _ _ on 

### Dataset Class

In [8]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = df['author_id'].values
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['masked_content']]
        # add attention mask
#         mask_val = tokenizer.vocab.get('_')
#         for text in self.texts:
#             text['attention_mask'] = torch.where(text['input_ids'] == mask_val, 0, text['attention_mask'])

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

### Model Class

In [9]:
from torch import nn
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, DebertaForSequenceClassification, get_linear_schedule_with_warmup

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super().__init__()

        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))
        # self.dropout = nn.Dropout(dropout)
        # self.linear = nn.Linear(768, 50)
        # self.relu = nn.ReLU()

    def forward(self, input_id, labels=None):

        # pooled_output, = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        # pooled_output = pooled_output[:,0,:]
        # dropout_output = self.dropout(pooled_output)
        # linear_output = self.linear(dropout_output)
        # final_layer = self.relu(linear_output)
        # print(input_id.shape, labels.shape)
        final_layer = self.bert(input_id, labels=labels)

        return final_layer

### Training

In [None]:
# !git clone https://gist.github.com/NTT123/4596e5533e573c8ceab2f319ab5d36a2 jslog
# import random
# import math
# import time
# from jslog.jslogger import JSLogger

# logger = JSLogger('train/valid loss', ['train', 'valid'])
# logger_ = JSLogger('train/valid accuracy', ['train', 'valid'])

In [None]:
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

def train(model, train_data, val_data, learning_rate, epochs, batch_size):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=100,
        num_training_steps=len(train_dataloader) * epochs,
    )

    if use_cuda:
        
        print('cuda version', torch.__version__)
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        
        train_loss = 0
        train_pred = []
        train_truth = []
        model.train()

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            # mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, train_label)
                
            # batch_loss = criterion(output, train_label.long())
            batch_loss = output.loss
            train_loss += batch_loss.item()
            train_pred.append(output.logits.argmax(dim=1))
            train_truth.append(train_label)

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            lr_scheduler.step()
                
        train_truth = torch.cat(train_truth).detach().cpu().numpy()
        train_pred = torch.cat(train_pred).detach().cpu().numpy()
        train_acc = accuracy_score(train_truth, train_pred)
        train_f1 = f1_score(train_truth, train_pred, average='macro')
            
        val_loss = 0
        val_pred = []
        val_truth = []
        model.eval()

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                # mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, val_label)

                # batch_loss = criterion(output, val_label.long())
                batch_loss = output.loss
                val_loss += batch_loss.item()
                val_pred.append(output.logits.argmax(dim=1))
                val_truth.append(val_label)
            
        val_truth = torch.cat(val_truth).detach().cpu().numpy()
        val_pred = torch.cat(val_pred).detach().cpu().numpy()
        val_acc = accuracy_score(val_truth, val_pred)
        val_f1 = f1_score(val_truth, val_pred, average='macro')
            
        print('Epochs: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Train F1: {:.4f}'\
                .format(epoch_num + 1, train_loss, train_acc, train_f1))
        print('           | Val Loss:   {:.4f} | Val Accuracy:   {:.4f} | Val F1:   {:.4f}'\
                .format(val_loss, val_acc, val_f1))
            
        # logger.log(epoch_num + 1, {'train': total_loss_train / len(train_data), 'valid': total_loss_val / len(val_data)})
        # logger_.log(epoch_num + 1, {'train': total_acc_train / len(train_data), 'valid': total_acc_val / len(val_data)})
    return train_acc, train_f1, val_acc, val_f1

# logger.show()
# logger_.show()
EPOCHS = 15
LR = 2e-5
BATCH_SIZE = 16
model = BertClassifier()
print('There are', sum(p.numel() for p in model.parameters()), 'parameters')

mask_contents(threshold=3.)
print(train_df['masked_content'])
              
train(model, train_df, valid_df, LR, EPOCHS, BATCH_SIZE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

There are 109520690 parameters
0       The [MASK] market, [MASK] [MASK] for a [MASK] of [MASK], [MASK] an [MASK] [MASK], but it is not ...
1       [MASK] the [MASK] 3-0 in a [MASK] [MASK] [MASK] on , [MASK] up a [MASK] for group [MASK] against...
2       The [MASK] in the [MASK] [MASK] to [MASK] in from in [MASK] market [MASK], and the [MASK] [MASK]...
3       [MASK] [MASK] a.s. on said its [MASK] profit [MASK] as a [MASK] of its [MASK] [MASK] [MASK], [MA...
4       shares [MASK] on [MASK] the [MASK] government's [MASK] in [MASK], but analysts said the rise was...
                                                       ...                                                 
2245    "" -- [MASK]-[MASK] [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] ar...
2246    , the [MASK] stock [MASK] [MASK] [MASK] to [MASK] [MASK] to [MASK] the market, has been [MASK] f...
2247    , 's biggest [MASK] and financial [MASK], is [MASK] for a [MASK] [MASK] when [MASK] [MASK] the [.

### Evaluate

In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    test_pred = []
    test_truth = []
    model.eval()
    
    with torch.no_grad():

        for test_input, test_label in test_dataloader:
            
            test_label = test_label.to(device)
            # mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, test_label)

            test_pred.append(output.logits.argmax(dim=1))
            test_truth.append(test_label)
        
        test_truth = torch.cat(test_truth).detach().cpu().numpy()
        test_pred = torch.cat(test_pred).detach().cpu().numpy()
        test_acc = accuracy_score(test_truth, test_pred)
        test_f1 = f1_score(test_truth, test_pred, average='macro')
    
    print('Test Accuracy:  {:.4f} | Test F1:  {:.4f}'.format(test_acc, test_f1))
    return test_acc, test_f1
    
print(test_df['masked_content'])
evaluate(model, test_df)

### Fine-Tuning

In [None]:
def fine_tuning(lr_list, epoch_list, batch_list, checkpoint_path='tuning2'):
    
    index_list, index2_list, index3_list = [], [], []
    train_accs, train_f1s = [], []
    test_accs, test_f1s = [], []
    
    for lr in lr_list:
        for epoch in epoch_list:
            for batch_size in batch_list:
            
                model = BertClassifier()
                # mask_contents(threshold=threshold)
                print('lr at {}, epoch at {}, batch_size at {}'.format(lr, epoch, batch_size))
                train_acc, train_f1, val_acc, val_f1 = train(model, train_df, valid_df, lr, epoch, batch_size)
                test_acc, test_f1 = evaluate(model, test_df)

                train_accs.append(train_acc)
                train_f1s.append(train_f1)
                test_accs.append(test_acc)
                test_f1s.append(test_f1)
                index_list.append(lr)
                index2_list.append(epoch)
                index3_list.append(batch_size)

                result_df = pd.DataFrame(index_list, columns=['lr'])
                result_df['epoch'] = index2_list
                result_df['batch_size'] = index3_list
                result_df['train_accs'] = train_accs
                result_df['train_f1s'] = train_f1s
                result_df['test_accs'] = test_accs
                result_df['test_f1s'] = test_f1s
                result_df['model_name'] = 'bert_uncased'
                result_df.to_csv(checkpoint_path + '.csv', index=False)
                # break
    return index_list, index2_list, index3_list, train_accs, train_f1s, test_accs, test_f1s

In [None]:
# C50 [5e-5 + 10 + 16, 2e-5 + 15 + 16, 1e-5 + 15 + 8]
# All The News [2e-5 + 15 + 16]
lr_list = [2e-5] #[5e-5, 2e-5, 1e-5]
epoch_list = [15] #[10, 15]
batch_list = [16] #[16]
fine_tuning(lr_list=lr_list, epoch_list=epoch_list, batch_list=batch_list)

### Grid Search IDF

In [13]:
import warnings

def grid_search_idf(threshold_list, checkpoint_path='checkpoint_C502'):
    
    index_list = []
    train_accs, train_f1s = [], []
    test_accs, test_f1s = [], []
    
    for threshold in threshold_list:
        
        # warnings.filterwarnings(action='once')
        model = BertClassifier()
        mask_contents(threshold=threshold)
        print('idf removal threshold at', threshold)
        print(train_df['masked_content'])
        train_acc, train_f1, val_acc, val_f1 = train(model, train_df, valid_df, LR, EPOCHS, BATCH_SIZE)
        test_acc, test_f1 = evaluate(model, test_df)
        
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        test_accs.append(test_acc)
        test_f1s.append(test_f1)
        index_list.append(threshold)
        
        result_df = pd.DataFrame(index_list, columns=['idf_threshold'])
        result_df['train_accs'] = train_accs
        result_df['train_f1s'] = train_f1s
        result_df['test_accs'] = test_accs
        result_df['test_f1s'] = test_f1s
        result_df['model_name'] = 'bert_uncased'
        result_df.to_csv(checkpoint_path + '.csv', index=False)
        # break
    return index_list, train_accs, train_f1s, test_accs, test_f1s

In [None]:
pd.options.display.max_colwidth = 100
EPOCHS = 15
LR = 2e-5
BATCH_SIZE = 16
grid_search_idf(threshold_list=np.arange(7.5, 1, -0.5))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 7.5
0       The capital market, [MASK] criticised for a lack of transparency, needs an independent regulator...
1       beat the Republic 3-0 in a ice hockey game on , setting up a showdown for group [MASK] against a...
2       The fall in the trade deficit to crowns in from in buoyed market sentiment, and the goods import...
3       paper concern a.s. on said its net profit fell as a shutdown of its main paper mill, coupled wit...
4       shares rallied on following the coalition government's win in elections, but analysts said the r...
                                                       ...                                                 
2245    "" -- counter-culture [MASK] of the 's  -- is being used in a [MASK] for a bank and some fans ar...
2246    , the flamboyant stock picker hired recently to spearhead drive to dominate the market, has been...
2247    , 's biggest city and financial capital, is [MASK] for a near shutdown when protesters hit the s...

100%|█████████████████████████████████████████| 141/141 [01:54<00:00,  1.24it/s]


Epochs: 1 | Train Loss: 545.6220 | Train Accuracy: 0.0502 | Train F1: 0.0389
           | Val Loss:   58.0571 | Val Accuracy:   0.1560 | Val F1:   0.1045


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 447.7361 | Train Accuracy: 0.3311 | Train F1: 0.2949
           | Val Loss:   43.5375 | Val Accuracy:   0.4720 | Val F1:   0.4092


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 339.5466 | Train Accuracy: 0.5311 | Train F1: 0.4945
           | Val Loss:   34.4097 | Val Accuracy:   0.5560 | Val F1:   0.4879


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 265.6287 | Train Accuracy: 0.6440 | Train F1: 0.6180
           | Val Loss:   27.4112 | Val Accuracy:   0.6560 | Val F1:   0.6003


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 205.7659 | Train Accuracy: 0.7627 | Train F1: 0.7487
           | Val Loss:   23.2328 | Val Accuracy:   0.6920 | Val F1:   0.6503


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 162.2556 | Train Accuracy: 0.8289 | Train F1: 0.8228
           | Val Loss:   19.5558 | Val Accuracy:   0.7400 | Val F1:   0.7161


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 129.3913 | Train Accuracy: 0.8756 | Train F1: 0.8710
           | Val Loss:   17.4828 | Val Accuracy:   0.7560 | Val F1:   0.7263


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 103.1076 | Train Accuracy: 0.9142 | Train F1: 0.9111
           | Val Loss:   15.7156 | Val Accuracy:   0.7520 | Val F1:   0.7288


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 83.2489 | Train Accuracy: 0.9476 | Train F1: 0.9453
           | Val Loss:   14.3978 | Val Accuracy:   0.7880 | Val F1:   0.7717


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 69.4408 | Train Accuracy: 0.9600 | Train F1: 0.9574
           | Val Loss:   13.6299 | Val Accuracy:   0.7760 | Val F1:   0.7538


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 57.6751 | Train Accuracy: 0.9791 | Train F1: 0.9782
           | Val Loss:   12.7011 | Val Accuracy:   0.7960 | Val F1:   0.7570


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 50.4283 | Train Accuracy: 0.9867 | Train F1: 0.9861
           | Val Loss:   12.1603 | Val Accuracy:   0.8000 | Val F1:   0.7835


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 45.3443 | Train Accuracy: 0.9907 | Train F1: 0.9904
           | Val Loss:   11.6232 | Val Accuracy:   0.8160 | Val F1:   0.7990


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 41.9724 | Train Accuracy: 0.9942 | Train F1: 0.9940
           | Val Loss:   11.4854 | Val Accuracy:   0.8040 | Val F1:   0.7897


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 39.8608 | Train Accuracy: 0.9960 | Train F1: 0.9958
           | Val Loss:   11.4543 | Val Accuracy:   0.7960 | Val F1:   0.7774
Test Accuracy:  0.6124 | Test F1:  0.6095


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 7.0
0       The capital market, [MASK] criticised for a lack of transparency, needs an independent regulator...
1       beat the Republic 3-0 in a ice hockey game on , setting up a showdown for group [MASK] against a...
2       The fall in the trade deficit to crowns in from in buoyed market sentiment, and the goods import...
3       paper concern a.s. on said its net profit fell as a shutdown of its main paper mill, coupled wit...
4       shares rallied on following the coalition government's win in elections, but analysts said the r...
                                                       ...                                                 
2245    "" -- counter-culture [MASK] of the 's  -- is being used in a [MASK] for a bank and some fans ar...
2246    , the flamboyant stock picker hired recently to [MASK] drive to dominate the market, has been re...
2247    , 's biggest city and financial capital, is [MASK] for a near shutdown when protesters hit the s...

100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 549.6167 | Train Accuracy: 0.0347 | Train F1: 0.0200
           | Val Loss:   59.2433 | Val Accuracy:   0.1120 | Val F1:   0.0710


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 453.6790 | Train Accuracy: 0.2947 | Train F1: 0.2699
           | Val Loss:   43.1167 | Val Accuracy:   0.4720 | Val F1:   0.4320


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 336.3838 | Train Accuracy: 0.5511 | Train F1: 0.5204
           | Val Loss:   33.3452 | Val Accuracy:   0.5440 | Val F1:   0.5057


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 259.3028 | Train Accuracy: 0.6413 | Train F1: 0.6178
           | Val Loss:   26.8178 | Val Accuracy:   0.6840 | Val F1:   0.6484


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 204.7271 | Train Accuracy: 0.7280 | Train F1: 0.7154
           | Val Loss:   22.0298 | Val Accuracy:   0.7080 | Val F1:   0.6709


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 163.3233 | Train Accuracy: 0.8004 | Train F1: 0.7950
           | Val Loss:   19.5050 | Val Accuracy:   0.7440 | Val F1:   0.7300


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 131.3244 | Train Accuracy: 0.8627 | Train F1: 0.8593
           | Val Loss:   16.6603 | Val Accuracy:   0.8000 | Val F1:   0.7825


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 106.7543 | Train Accuracy: 0.9053 | Train F1: 0.9035
           | Val Loss:   15.1917 | Val Accuracy:   0.8000 | Val F1:   0.7890


100%|█████████████████████████████████████████| 141/141 [01:47<00:00,  1.31it/s]


Epochs: 9 | Train Loss: 87.1474 | Train Accuracy: 0.9387 | Train F1: 0.9376
           | Val Loss:   13.8188 | Val Accuracy:   0.8120 | Val F1:   0.7959


100%|█████████████████████████████████████████| 141/141 [01:13<00:00,  1.93it/s]


Epochs: 10 | Train Loss: 71.9550 | Train Accuracy: 0.9609 | Train F1: 0.9603
           | Val Loss:   13.2836 | Val Accuracy:   0.8080 | Val F1:   0.7983


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.25it/s]


Epochs: 11 | Train Loss: 59.7395 | Train Accuracy: 0.9778 | Train F1: 0.9775
           | Val Loss:   12.5529 | Val Accuracy:   0.8120 | Val F1:   0.7948


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 51.9792 | Train Accuracy: 0.9889 | Train F1: 0.9887
           | Val Loss:   12.2197 | Val Accuracy:   0.8240 | Val F1:   0.8184


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 46.1539 | Train Accuracy: 0.9947 | Train F1: 0.9946
           | Val Loss:   11.9587 | Val Accuracy:   0.8280 | Val F1:   0.8190


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 42.2235 | Train Accuracy: 0.9947 | Train F1: 0.9945
           | Val Loss:   11.7985 | Val Accuracy:   0.8280 | Val F1:   0.8243


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 40.3989 | Train Accuracy: 0.9973 | Train F1: 0.9973
           | Val Loss:   11.6726 | Val Accuracy:   0.8240 | Val F1:   0.8244
Test Accuracy:  0.6228 | Test F1:  0.6202


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 6.5
0       The capital market, [MASK] criticised for a lack of transparency, needs an independent regulator...
1       beat the [MASK] 3-0 in a ice [MASK] game on , setting up a [MASK] for group [MASK] against and l...
2       The fall in the trade deficit to crowns in from in buoyed market sentiment, and the goods import...
3       paper concern a.s. on said its net profit fell as a shutdown of its main paper mill, coupled wit...
4       shares rallied on following the coalition government's win in elections, but analysts said the r...
                                                       ...                                                 
2245    "" -- counter-culture [MASK] of the 's  -- is being used in a [MASK] for a bank and some fans ar...
2246    , the [MASK] stock [MASK] hired recently to [MASK] drive to dominate the market, has been remove...
2247    , 's biggest city and financial capital, is [MASK] for a near shutdown when protesters hit the s...

100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 544.1699 | Train Accuracy: 0.0418 | Train F1: 0.0343
           | Val Loss:   56.1584 | Val Accuracy:   0.1920 | Val F1:   0.1084


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 432.9175 | Train Accuracy: 0.3249 | Train F1: 0.2891
           | Val Loss:   41.9124 | Val Accuracy:   0.4280 | Val F1:   0.3591


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 330.7208 | Train Accuracy: 0.5209 | Train F1: 0.4858
           | Val Loss:   33.0773 | Val Accuracy:   0.5640 | Val F1:   0.5099


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 258.0788 | Train Accuracy: 0.6311 | Train F1: 0.6016
           | Val Loss:   27.1193 | Val Accuracy:   0.6360 | Val F1:   0.5821


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 202.8932 | Train Accuracy: 0.7378 | Train F1: 0.7224
           | Val Loss:   23.2317 | Val Accuracy:   0.6760 | Val F1:   0.6664


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 161.8172 | Train Accuracy: 0.8200 | Train F1: 0.8114
           | Val Loss:   19.6536 | Val Accuracy:   0.7000 | Val F1:   0.6755


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 128.8161 | Train Accuracy: 0.8769 | Train F1: 0.8709
           | Val Loss:   17.4003 | Val Accuracy:   0.7120 | Val F1:   0.6888


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 102.4966 | Train Accuracy: 0.9196 | Train F1: 0.9171
           | Val Loss:   15.5782 | Val Accuracy:   0.7600 | Val F1:   0.7537


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 82.4903 | Train Accuracy: 0.9498 | Train F1: 0.9473
           | Val Loss:   14.5040 | Val Accuracy:   0.7360 | Val F1:   0.7318


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 67.1494 | Train Accuracy: 0.9680 | Train F1: 0.9672
           | Val Loss:   13.4929 | Val Accuracy:   0.7680 | Val F1:   0.7678


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 56.9779 | Train Accuracy: 0.9796 | Train F1: 0.9790
           | Val Loss:   13.1498 | Val Accuracy:   0.7520 | Val F1:   0.7509


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 49.7417 | Train Accuracy: 0.9844 | Train F1: 0.9842
           | Val Loss:   12.7490 | Val Accuracy:   0.7600 | Val F1:   0.7349


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 44.2114 | Train Accuracy: 0.9889 | Train F1: 0.9886
           | Val Loss:   12.1109 | Val Accuracy:   0.7680 | Val F1:   0.7456


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 41.0691 | Train Accuracy: 0.9938 | Train F1: 0.9938
           | Val Loss:   12.0993 | Val Accuracy:   0.7840 | Val F1:   0.7581


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 39.1543 | Train Accuracy: 0.9924 | Train F1: 0.9923
           | Val Loss:   11.9292 | Val Accuracy:   0.7880 | Val F1:   0.7828
Test Accuracy:  0.5976 | Test F1:  0.5966


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 6.0
0       The capital market, [MASK] criticised for a lack of transparency, needs an independent regulator...
1       beat the [MASK] 3-0 in a ice [MASK] game on , setting up a [MASK] for group [MASK] against and l...
2       The fall in the trade deficit to crowns in from in buoyed market sentiment, and the goods import...
3       paper concern a.s. on said its net profit fell as a shutdown of its main paper [MASK], coupled w...
4       shares rallied on following the coalition government's win in elections, but analysts said the r...
                                                       ...                                                 
2245    "" -- counter-culture [MASK] of the 's  -- is being used in a [MASK] for a bank and some fans ar...
2246    , the [MASK] stock [MASK] hired recently to [MASK] drive to dominate the market, has been remove...
2247    , 's biggest city and financial capital, is [MASK] for a near shutdown when protesters hit the s...

100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 548.9754 | Train Accuracy: 0.0356 | Train F1: 0.0239
           | Val Loss:   59.7495 | Val Accuracy:   0.1240 | Val F1:   0.0684


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 466.5549 | Train Accuracy: 0.2773 | Train F1: 0.2369
           | Val Loss:   45.4791 | Val Accuracy:   0.3920 | Val F1:   0.3008


100%|█████████████████████████████████████████| 141/141 [01:43<00:00,  1.37it/s]


Epochs: 3 | Train Loss: 356.8728 | Train Accuracy: 0.4973 | Train F1: 0.4517
           | Val Loss:   36.3457 | Val Accuracy:   0.5160 | Val F1:   0.4452


100%|█████████████████████████████████████████| 141/141 [01:20<00:00,  1.76it/s]


Epochs: 4 | Train Loss: 279.6385 | Train Accuracy: 0.6169 | Train F1: 0.5863
           | Val Loss:   29.7333 | Val Accuracy:   0.6080 | Val F1:   0.5359


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 220.4230 | Train Accuracy: 0.7218 | Train F1: 0.7069
           | Val Loss:   25.2484 | Val Accuracy:   0.6800 | Val F1:   0.6374


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 175.0443 | Train Accuracy: 0.7960 | Train F1: 0.7879
           | Val Loss:   21.7051 | Val Accuracy:   0.6920 | Val F1:   0.6501


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 136.8756 | Train Accuracy: 0.8693 | Train F1: 0.8644
           | Val Loss:   19.5945 | Val Accuracy:   0.6880 | Val F1:   0.6595


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 109.1586 | Train Accuracy: 0.9164 | Train F1: 0.9143
           | Val Loss:   17.8169 | Val Accuracy:   0.7000 | Val F1:   0.6697


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 89.0146 | Train Accuracy: 0.9427 | Train F1: 0.9397
           | Val Loss:   16.7080 | Val Accuracy:   0.7400 | Val F1:   0.7126


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 73.2950 | Train Accuracy: 0.9658 | Train F1: 0.9655
           | Val Loss:   15.6824 | Val Accuracy:   0.7360 | Val F1:   0.7124


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 61.3197 | Train Accuracy: 0.9756 | Train F1: 0.9751
           | Val Loss:   14.4076 | Val Accuracy:   0.7840 | Val F1:   0.7471


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 52.8451 | Train Accuracy: 0.9889 | Train F1: 0.9888
           | Val Loss:   14.0333 | Val Accuracy:   0.7720 | Val F1:   0.7466


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 47.4702 | Train Accuracy: 0.9889 | Train F1: 0.9887
           | Val Loss:   13.9435 | Val Accuracy:   0.7760 | Val F1:   0.7470


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 43.3586 | Train Accuracy: 0.9964 | Train F1: 0.9963
           | Val Loss:   13.6656 | Val Accuracy:   0.7840 | Val F1:   0.7538


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 41.4762 | Train Accuracy: 0.9978 | Train F1: 0.9978
           | Val Loss:   13.6168 | Val Accuracy:   0.7880 | Val F1:   0.7624
Test Accuracy:  0.6076 | Test F1:  0.6043


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 5.5
0       The capital market, [MASK] criticised for a lack of transparency, needs an independent regulator...
1       beat the [MASK] 3-0 in a [MASK] [MASK] game on , setting up a [MASK] for group [MASK] against an...
2       The fall in the trade deficit to crowns in from in [MASK] market sentiment, and the goods import...
3       paper concern a.s. on said its net profit fell as a [MASK] of its main paper [MASK], [MASK] with...
4       shares [MASK] on following the coalition government's win in elections, but analysts said the ri...
                                                       ...                                                 
2245    "" -- counter-culture [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] ...
2246    , the [MASK] stock [MASK] hired recently to [MASK] drive to [MASK] the market, has been removed ...
2247    , 's biggest city and financial capital, is [MASK] for a near [MASK] when [MASK] hit the [MASK] ...

100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 547.8099 | Train Accuracy: 0.0231 | Train F1: 0.0124
           | Val Loss:   60.5135 | Val Accuracy:   0.0720 | Val F1:   0.0396


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 476.1477 | Train Accuracy: 0.2071 | Train F1: 0.1823
           | Val Loss:   46.0089 | Val Accuracy:   0.3760 | Val F1:   0.2948


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 366.5328 | Train Accuracy: 0.4520 | Train F1: 0.4053
           | Val Loss:   37.0279 | Val Accuracy:   0.4800 | Val F1:   0.3967


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 289.8503 | Train Accuracy: 0.5951 | Train F1: 0.5626
           | Val Loss:   30.2448 | Val Accuracy:   0.5880 | Val F1:   0.5189


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 233.2129 | Train Accuracy: 0.6804 | Train F1: 0.6598
           | Val Loss:   25.3220 | Val Accuracy:   0.6800 | Val F1:   0.6351


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 188.3929 | Train Accuracy: 0.7693 | Train F1: 0.7559
           | Val Loss:   22.4553 | Val Accuracy:   0.6640 | Val F1:   0.6250


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 149.4913 | Train Accuracy: 0.8418 | Train F1: 0.8337
           | Val Loss:   20.0157 | Val Accuracy:   0.7040 | Val F1:   0.6716


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 120.7579 | Train Accuracy: 0.8960 | Train F1: 0.8924
           | Val Loss:   18.3130 | Val Accuracy:   0.7400 | Val F1:   0.7017


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 98.2069 | Train Accuracy: 0.9316 | Train F1: 0.9298
           | Val Loss:   16.6134 | Val Accuracy:   0.7600 | Val F1:   0.7201


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 80.4475 | Train Accuracy: 0.9604 | Train F1: 0.9594
           | Val Loss:   16.3649 | Val Accuracy:   0.7520 | Val F1:   0.7348


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 67.9212 | Train Accuracy: 0.9769 | Train F1: 0.9762
           | Val Loss:   15.3137 | Val Accuracy:   0.7520 | Val F1:   0.7116


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 59.5063 | Train Accuracy: 0.9836 | Train F1: 0.9832
           | Val Loss:   15.2100 | Val Accuracy:   0.7680 | Val F1:   0.7511


100%|█████████████████████████████████████████| 141/141 [01:39<00:00,  1.42it/s]


Epochs: 13 | Train Loss: 52.8927 | Train Accuracy: 0.9884 | Train F1: 0.9882
           | Val Loss:   14.4009 | Val Accuracy:   0.7800 | Val F1:   0.7389


100%|█████████████████████████████████████████| 141/141 [01:22<00:00,  1.70it/s]


Epochs: 14 | Train Loss: 49.1406 | Train Accuracy: 0.9911 | Train F1: 0.9909
           | Val Loss:   14.2964 | Val Accuracy:   0.7800 | Val F1:   0.7583


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 47.0589 | Train Accuracy: 0.9920 | Train F1: 0.9918
           | Val Loss:   14.2722 | Val Accuracy:   0.7680 | Val F1:   0.7279
Test Accuracy:  0.5872 | Test F1:  0.5845


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 5.0
0       The capital market, [MASK] criticised for a lack of [MASK], needs an independent [MASK], but it ...
1       beat the [MASK] 3-0 in a [MASK] [MASK] game on , setting up a [MASK] for group [MASK] against an...
2       The fall in the trade deficit to crowns in from in [MASK] market sentiment, and the goods [MASK]...
3       paper concern a.s. on said its net profit fell as a [MASK] of its main paper [MASK], [MASK] with...
4       shares [MASK] on following the coalition government's win in elections, but analysts said the ri...
                                                       ...                                                 
2245    "" -- counter-[MASK] [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] a...
2246    , the [MASK] stock [MASK] [MASK] recently to [MASK] drive to [MASK] the market, has been [MASK] ...
2247    , 's biggest city and financial capital, is [MASK] for a near [MASK] when [MASK] hit the [MASK] ...

100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 545.7100 | Train Accuracy: 0.0427 | Train F1: 0.0252
           | Val Loss:   58.2900 | Val Accuracy:   0.1800 | Val F1:   0.0999


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 454.5500 | Train Accuracy: 0.2729 | Train F1: 0.2319
           | Val Loss:   44.3573 | Val Accuracy:   0.4200 | Val F1:   0.3269


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 352.0671 | Train Accuracy: 0.4911 | Train F1: 0.4614
           | Val Loss:   35.7943 | Val Accuracy:   0.5520 | Val F1:   0.4763


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 279.3149 | Train Accuracy: 0.6129 | Train F1: 0.5913
           | Val Loss:   29.3050 | Val Accuracy:   0.6280 | Val F1:   0.5853


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 222.3236 | Train Accuracy: 0.7302 | Train F1: 0.7232
           | Val Loss:   25.0981 | Val Accuracy:   0.6640 | Val F1:   0.6154


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 177.3865 | Train Accuracy: 0.7991 | Train F1: 0.7961
           | Val Loss:   21.8781 | Val Accuracy:   0.6960 | Val F1:   0.6462


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 143.9766 | Train Accuracy: 0.8591 | Train F1: 0.8560
           | Val Loss:   19.1461 | Val Accuracy:   0.7320 | Val F1:   0.7040


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 115.3475 | Train Accuracy: 0.9107 | Train F1: 0.9101
           | Val Loss:   17.1016 | Val Accuracy:   0.7560 | Val F1:   0.7295


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 95.1079 | Train Accuracy: 0.9413 | Train F1: 0.9407
           | Val Loss:   15.6445 | Val Accuracy:   0.7640 | Val F1:   0.7580


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 78.9620 | Train Accuracy: 0.9622 | Train F1: 0.9622
           | Val Loss:   15.0728 | Val Accuracy:   0.7680 | Val F1:   0.7383


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 66.9025 | Train Accuracy: 0.9773 | Train F1: 0.9773
           | Val Loss:   13.8849 | Val Accuracy:   0.7880 | Val F1:   0.7757


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 58.1027 | Train Accuracy: 0.9809 | Train F1: 0.9811
           | Val Loss:   13.0409 | Val Accuracy:   0.8000 | Val F1:   0.7843


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 52.7792 | Train Accuracy: 0.9871 | Train F1: 0.9871
           | Val Loss:   12.6830 | Val Accuracy:   0.8000 | Val F1:   0.7861


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 48.5319 | Train Accuracy: 0.9898 | Train F1: 0.9898
           | Val Loss:   12.5250 | Val Accuracy:   0.8120 | Val F1:   0.7938


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 46.0941 | Train Accuracy: 0.9942 | Train F1: 0.9942
           | Val Loss:   12.4411 | Val Accuracy:   0.8160 | Val F1:   0.7974
Test Accuracy:  0.5860 | Test F1:  0.5800


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 4.5
0       The capital market, [MASK] [MASK] for a lack of [MASK], needs an independent [MASK], but it is n...
1       [MASK] the [MASK] 3-0 in a [MASK] [MASK] [MASK] on , setting up a [MASK] for group [MASK] agains...
2       The fall in the trade [MASK] to [MASK] in from in [MASK] market [MASK], and the goods [MASK] sho...
3       [MASK] concern a.s. on said its net profit fell as a [MASK] of its main [MASK] [MASK], [MASK] wi...
4       shares [MASK] on following the [MASK] government's win in [MASK], but analysts said the rise was...
                                                       ...                                                 
2245    "" -- counter-[MASK] [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] a...
2246    , the [MASK] stock [MASK] [MASK] recently to [MASK] drive to [MASK] the market, has been [MASK] ...
2247    , 's biggest city and financial capital, is [MASK] for a near [MASK] when [MASK] hit the [MASK] ...

100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 541.7176 | Train Accuracy: 0.0418 | Train F1: 0.0217
           | Val Loss:   57.9537 | Val Accuracy:   0.1200 | Val F1:   0.0555


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 458.7334 | Train Accuracy: 0.2458 | Train F1: 0.2025
           | Val Loss:   45.6261 | Val Accuracy:   0.3960 | Val F1:   0.2888


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 364.8114 | Train Accuracy: 0.4604 | Train F1: 0.4113
           | Val Loss:   36.6449 | Val Accuracy:   0.4960 | Val F1:   0.4048


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 294.7287 | Train Accuracy: 0.5840 | Train F1: 0.5518
           | Val Loss:   30.6858 | Val Accuracy:   0.5640 | Val F1:   0.4974


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 239.0493 | Train Accuracy: 0.7009 | Train F1: 0.6840
           | Val Loss:   26.6210 | Val Accuracy:   0.6280 | Val F1:   0.5533


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 193.8860 | Train Accuracy: 0.7760 | Train F1: 0.7644
           | Val Loss:   23.3854 | Val Accuracy:   0.6720 | Val F1:   0.6208


100%|█████████████████████████████████████████| 141/141 [01:32<00:00,  1.53it/s]


Epochs: 7 | Train Loss: 158.0726 | Train Accuracy: 0.8360 | Train F1: 0.8271
           | Val Loss:   21.2787 | Val Accuracy:   0.7000 | Val F1:   0.6617


100%|█████████████████████████████████████████| 141/141 [01:31<00:00,  1.54it/s]


Epochs: 8 | Train Loss: 128.5365 | Train Accuracy: 0.8827 | Train F1: 0.8746
           | Val Loss:   19.0155 | Val Accuracy:   0.7080 | Val F1:   0.6774


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 106.9023 | Train Accuracy: 0.9142 | Train F1: 0.9095
           | Val Loss:   17.9724 | Val Accuracy:   0.7440 | Val F1:   0.7052


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 90.0413 | Train Accuracy: 0.9396 | Train F1: 0.9365
           | Val Loss:   16.4505 | Val Accuracy:   0.7640 | Val F1:   0.7194


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 77.5789 | Train Accuracy: 0.9578 | Train F1: 0.9552
           | Val Loss:   15.6234 | Val Accuracy:   0.7760 | Val F1:   0.7278


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 68.9398 | Train Accuracy: 0.9658 | Train F1: 0.9643
           | Val Loss:   15.0269 | Val Accuracy:   0.7840 | Val F1:   0.7435


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 62.9596 | Train Accuracy: 0.9724 | Train F1: 0.9711
           | Val Loss:   14.6032 | Val Accuracy:   0.7880 | Val F1:   0.7452


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 58.0641 | Train Accuracy: 0.9827 | Train F1: 0.9821
           | Val Loss:   14.5141 | Val Accuracy:   0.7880 | Val F1:   0.7409


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 55.5388 | Train Accuracy: 0.9840 | Train F1: 0.9836
           | Val Loss:   14.4432 | Val Accuracy:   0.7840 | Val F1:   0.7528
Test Accuracy:  0.5608 | Test F1:  0.5589


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 4.0
0       The capital market, [MASK] [MASK] for a [MASK] of [MASK], needs an independent [MASK], but it is...
1       [MASK] the [MASK] 3-0 in a [MASK] [MASK] [MASK] on , [MASK] up a [MASK] for group [MASK] against...
2       The fall in the trade [MASK] to [MASK] in from in [MASK] market [MASK], and the [MASK] [MASK] sh...
3       [MASK] concern a.s. on said its net profit fell as a [MASK] of its main [MASK] [MASK], [MASK] wi...
4       shares [MASK] on following the [MASK] government's win in [MASK], but analysts said the rise was...
                                                       ...                                                 
2245    "" -- [MASK]-[MASK] [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] ar...
2246    , the [MASK] stock [MASK] [MASK] recently to [MASK] [MASK] to [MASK] the market, has been [MASK]...
2247    , 's biggest city and financial capital, is [MASK] for a near [MASK] when [MASK] hit the [MASK] ...

100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 549.9063 | Train Accuracy: 0.0316 | Train F1: 0.0185
           | Val Loss:   59.1606 | Val Accuracy:   0.1280 | Val F1:   0.0702


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 482.3045 | Train Accuracy: 0.1676 | Train F1: 0.1376
           | Val Loss:   48.4814 | Val Accuracy:   0.3000 | Val F1:   0.2473


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 385.0199 | Train Accuracy: 0.3942 | Train F1: 0.3524
           | Val Loss:   40.2737 | Val Accuracy:   0.4200 | Val F1:   0.3531


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 306.7330 | Train Accuracy: 0.5511 | Train F1: 0.5231
           | Val Loss:   34.6659 | Val Accuracy:   0.5320 | Val F1:   0.4795


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 247.5588 | Train Accuracy: 0.6729 | Train F1: 0.6573
           | Val Loss:   30.6568 | Val Accuracy:   0.5400 | Val F1:   0.5105


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 198.9951 | Train Accuracy: 0.7742 | Train F1: 0.7684
           | Val Loss:   27.6614 | Val Accuracy:   0.6040 | Val F1:   0.5656


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 161.6145 | Train Accuracy: 0.8489 | Train F1: 0.8458
           | Val Loss:   25.4013 | Val Accuracy:   0.6200 | Val F1:   0.5860


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 131.2773 | Train Accuracy: 0.8929 | Train F1: 0.8918
           | Val Loss:   22.9831 | Val Accuracy:   0.6600 | Val F1:   0.6239


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 106.8936 | Train Accuracy: 0.9471 | Train F1: 0.9467
           | Val Loss:   21.3648 | Val Accuracy:   0.6960 | Val F1:   0.6504


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 88.1611 | Train Accuracy: 0.9693 | Train F1: 0.9689
           | Val Loss:   20.8369 | Val Accuracy:   0.6920 | Val F1:   0.6462


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 73.9095 | Train Accuracy: 0.9804 | Train F1: 0.9803
           | Val Loss:   19.5531 | Val Accuracy:   0.6920 | Val F1:   0.6476


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 64.2626 | Train Accuracy: 0.9853 | Train F1: 0.9853
           | Val Loss:   19.0821 | Val Accuracy:   0.7000 | Val F1:   0.6551


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 57.3878 | Train Accuracy: 0.9902 | Train F1: 0.9902
           | Val Loss:   18.7842 | Val Accuracy:   0.7160 | Val F1:   0.6717


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 52.5469 | Train Accuracy: 0.9947 | Train F1: 0.9947
           | Val Loss:   18.5915 | Val Accuracy:   0.7120 | Val F1:   0.6716


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 50.0411 | Train Accuracy: 0.9956 | Train F1: 0.9955
           | Val Loss:   18.5940 | Val Accuracy:   0.7120 | Val F1:   0.6724
Test Accuracy:  0.5212 | Test F1:  0.5173


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 3.5
0       The capital market, [MASK] [MASK] for a [MASK] of [MASK], [MASK] an [MASK] [MASK], but it is not...
1       [MASK] the [MASK] 3-0 in a [MASK] [MASK] [MASK] on , [MASK] up a [MASK] for group [MASK] against...
2       The [MASK] in the trade [MASK] to [MASK] in from in [MASK] market [MASK], and the [MASK] [MASK] ...
3       [MASK] [MASK] a.s. on said its net profit fell as a [MASK] of its main [MASK] [MASK], [MASK] wit...
4       shares [MASK] on [MASK] the [MASK] government's [MASK] in [MASK], but analysts said the rise was...
                                                       ...                                                 
2245    "" -- [MASK]-[MASK] [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] ar...
2246    , the [MASK] stock [MASK] [MASK] recently to [MASK] [MASK] to [MASK] the market, has been [MASK]...
2247    , 's biggest [MASK] and financial capital, is [MASK] for a [MASK] [MASK] when [MASK] hit the [MA...

100%|█████████████████████████████████████████| 141/141 [01:27<00:00,  1.62it/s]


Epochs: 1 | Train Loss: 547.0889 | Train Accuracy: 0.0338 | Train F1: 0.0175
           | Val Loss:   60.6153 | Val Accuracy:   0.0520 | Val F1:   0.0234


100%|█████████████████████████████████████████| 141/141 [01:35<00:00,  1.47it/s]


Epochs: 2 | Train Loss: 494.0034 | Train Accuracy: 0.1271 | Train F1: 0.0991
           | Val Loss:   53.2623 | Val Accuracy:   0.1800 | Val F1:   0.1581


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 420.6854 | Train Accuracy: 0.2938 | Train F1: 0.2533
           | Val Loss:   45.2726 | Val Accuracy:   0.3280 | Val F1:   0.2724


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 354.3292 | Train Accuracy: 0.4391 | Train F1: 0.4042
           | Val Loss:   38.7722 | Val Accuracy:   0.4280 | Val F1:   0.3666


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 299.3403 | Train Accuracy: 0.5716 | Train F1: 0.5440
           | Val Loss:   34.6691 | Val Accuracy:   0.4840 | Val F1:   0.4109


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 250.5143 | Train Accuracy: 0.6756 | Train F1: 0.6572
           | Val Loss:   30.6782 | Val Accuracy:   0.5720 | Val F1:   0.4955


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 210.2275 | Train Accuracy: 0.7667 | Train F1: 0.7547
           | Val Loss:   28.7212 | Val Accuracy:   0.5800 | Val F1:   0.5279


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 176.7877 | Train Accuracy: 0.8347 | Train F1: 0.8271
           | Val Loss:   26.3846 | Val Accuracy:   0.6160 | Val F1:   0.5684


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 150.6214 | Train Accuracy: 0.8791 | Train F1: 0.8725
           | Val Loss:   24.7050 | Val Accuracy:   0.6400 | Val F1:   0.6079


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 127.9251 | Train Accuracy: 0.9191 | Train F1: 0.9147
           | Val Loss:   23.4515 | Val Accuracy:   0.6240 | Val F1:   0.5782


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 111.0971 | Train Accuracy: 0.9444 | Train F1: 0.9409
           | Val Loss:   22.3239 | Val Accuracy:   0.6600 | Val F1:   0.6065


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 98.0472 | Train Accuracy: 0.9640 | Train F1: 0.9624
           | Val Loss:   21.6013 | Val Accuracy:   0.6720 | Val F1:   0.6236


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 88.5133 | Train Accuracy: 0.9698 | Train F1: 0.9671
           | Val Loss:   21.1635 | Val Accuracy:   0.6800 | Val F1:   0.6360


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 82.5992 | Train Accuracy: 0.9729 | Train F1: 0.9705
           | Val Loss:   20.9410 | Val Accuracy:   0.6720 | Val F1:   0.6259


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 78.5644 | Train Accuracy: 0.9787 | Train F1: 0.9775
           | Val Loss:   20.7751 | Val Accuracy:   0.6720 | Val F1:   0.6248
Test Accuracy:  0.5052 | Test F1:  0.5000


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 3.0
0       The [MASK] market, [MASK] [MASK] for a [MASK] of [MASK], [MASK] an [MASK] [MASK], but it is not ...
1       [MASK] the [MASK] 3-0 in a [MASK] [MASK] [MASK] on , [MASK] up a [MASK] for group [MASK] against...
2       The [MASK] in the [MASK] [MASK] to [MASK] in from in [MASK] market [MASK], and the [MASK] [MASK]...
3       [MASK] [MASK] a.s. on said its [MASK] profit [MASK] as a [MASK] of its [MASK] [MASK] [MASK], [MA...
4       shares [MASK] on [MASK] the [MASK] government's [MASK] in [MASK], but analysts said the rise was...
                                                       ...                                                 
2245    "" -- [MASK]-[MASK] [MASK] of the 's  -- is being used in a [MASK] for a bank and some [MASK] ar...
2246    , the [MASK] stock [MASK] [MASK] [MASK] to [MASK] [MASK] to [MASK] the market, has been [MASK] f...
2247    , 's biggest [MASK] and financial [MASK], is [MASK] for a [MASK] [MASK] when [MASK] [MASK] the [...

100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 551.2820 | Train Accuracy: 0.0302 | Train F1: 0.0163
           | Val Loss:   62.0649 | Val Accuracy:   0.0320 | Val F1:   0.0126


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 528.5446 | Train Accuracy: 0.0707 | Train F1: 0.0463
           | Val Loss:   57.2686 | Val Accuracy:   0.1200 | Val F1:   0.0780


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 462.4399 | Train Accuracy: 0.2018 | Train F1: 0.1557
           | Val Loss:   49.5421 | Val Accuracy:   0.2280 | Val F1:   0.1698


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 393.0654 | Train Accuracy: 0.3480 | Train F1: 0.3152
           | Val Loss:   43.8207 | Val Accuracy:   0.3640 | Val F1:   0.3439


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 331.0932 | Train Accuracy: 0.5071 | Train F1: 0.4763
           | Val Loss:   39.4798 | Val Accuracy:   0.3880 | Val F1:   0.3650


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 280.0661 | Train Accuracy: 0.6196 | Train F1: 0.5973
           | Val Loss:   34.4112 | Val Accuracy:   0.4640 | Val F1:   0.4259


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 233.8610 | Train Accuracy: 0.7298 | Train F1: 0.7190
           | Val Loss:   31.1638 | Val Accuracy:   0.5320 | Val F1:   0.4990


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 198.2557 | Train Accuracy: 0.8071 | Train F1: 0.8018
           | Val Loss:   28.6500 | Val Accuracy:   0.5760 | Val F1:   0.5375


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 166.4290 | Train Accuracy: 0.8676 | Train F1: 0.8652
           | Val Loss:   26.5200 | Val Accuracy:   0.5960 | Val F1:   0.5580


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 140.5103 | Train Accuracy: 0.9080 | Train F1: 0.9067
           | Val Loss:   24.7068 | Val Accuracy:   0.6440 | Val F1:   0.6133


100%|█████████████████████████████████████████| 141/141 [01:27<00:00,  1.62it/s]


Epochs: 11 | Train Loss: 120.5554 | Train Accuracy: 0.9458 | Train F1: 0.9454
           | Val Loss:   23.1640 | Val Accuracy:   0.6560 | Val F1:   0.6225


100%|█████████████████████████████████████████| 141/141 [01:38<00:00,  1.44it/s]


Epochs: 12 | Train Loss: 106.0102 | Train Accuracy: 0.9596 | Train F1: 0.9595
           | Val Loss:   22.9754 | Val Accuracy:   0.6400 | Val F1:   0.6137


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 94.9433 | Train Accuracy: 0.9751 | Train F1: 0.9750
           | Val Loss:   22.0794 | Val Accuracy:   0.6560 | Val F1:   0.6239


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 14 | Train Loss: 87.8684 | Train Accuracy: 0.9813 | Train F1: 0.9813
           | Val Loss:   21.5223 | Val Accuracy:   0.6600 | Val F1:   0.6346


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 15 | Train Loss: 83.9098 | Train Accuracy: 0.9844 | Train F1: 0.9843
           | Val Loss:   21.5298 | Val Accuracy:   0.6600 | Val F1:   0.6228
Test Accuracy:  0.4668 | Test F1:  0.4609


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

idf removal threshold at 2.5
0       The [MASK] market, [MASK] [MASK] for a [MASK] of [MASK], [MASK] an [MASK] [MASK], but it is not ...
1       [MASK] the [MASK] 3-0 in a [MASK] [MASK] [MASK] on , [MASK] up a [MASK] for group [MASK] [MASK] ...
2       The [MASK] in the [MASK] [MASK] to [MASK] in from in [MASK] market [MASK], and the [MASK] [MASK]...
3       [MASK] [MASK] a.s. on said its [MASK] [MASK] [MASK] as a [MASK] of its [MASK] [MASK] [MASK], [MA...
4       shares [MASK] on [MASK] the [MASK] government's [MASK] in [MASK], but analysts said the [MASK] w...
                                                       ...                                                 
2245    "" -- [MASK]-[MASK] [MASK] of the 's  -- is [MASK] [MASK] in a [MASK] for a [MASK] and some [MAS...
2246    , the [MASK] stock [MASK] [MASK] [MASK] to [MASK] [MASK] to [MASK] the market, has been [MASK] f...
2247    , 's [MASK] [MASK] and [MASK] [MASK], is [MASK] for a [MASK] [MASK] when [MASK] [MASK] the [MASK...

100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 1 | Train Loss: 550.3019 | Train Accuracy: 0.0302 | Train F1: 0.0128
           | Val Loss:   61.4204 | Val Accuracy:   0.0640 | Val F1:   0.0222


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 2 | Train Loss: 517.8528 | Train Accuracy: 0.0929 | Train F1: 0.0594
           | Val Loss:   55.4452 | Val Accuracy:   0.1520 | Val F1:   0.0913


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 3 | Train Loss: 459.5212 | Train Accuracy: 0.2027 | Train F1: 0.1467
           | Val Loss:   50.3324 | Val Accuracy:   0.2040 | Val F1:   0.1523


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 4 | Train Loss: 406.3584 | Train Accuracy: 0.3378 | Train F1: 0.2868
           | Val Loss:   45.5399 | Val Accuracy:   0.3000 | Val F1:   0.2276


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 5 | Train Loss: 353.1768 | Train Accuracy: 0.4604 | Train F1: 0.4229
           | Val Loss:   40.9750 | Val Accuracy:   0.4080 | Val F1:   0.3543


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 6 | Train Loss: 302.8521 | Train Accuracy: 0.5796 | Train F1: 0.5469
           | Val Loss:   37.3131 | Val Accuracy:   0.4680 | Val F1:   0.4279


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 7 | Train Loss: 259.7113 | Train Accuracy: 0.6862 | Train F1: 0.6621
           | Val Loss:   33.8603 | Val Accuracy:   0.4960 | Val F1:   0.4577


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 8 | Train Loss: 221.4255 | Train Accuracy: 0.7729 | Train F1: 0.7576
           | Val Loss:   31.7443 | Val Accuracy:   0.5240 | Val F1:   0.4878


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 9 | Train Loss: 189.8386 | Train Accuracy: 0.8307 | Train F1: 0.8199
           | Val Loss:   29.0415 | Val Accuracy:   0.5600 | Val F1:   0.5021


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 10 | Train Loss: 162.8583 | Train Accuracy: 0.8813 | Train F1: 0.8749
           | Val Loss:   27.7113 | Val Accuracy:   0.5720 | Val F1:   0.5378


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 11 | Train Loss: 143.3459 | Train Accuracy: 0.9098 | Train F1: 0.9061
           | Val Loss:   27.0041 | Val Accuracy:   0.6000 | Val F1:   0.5683


100%|█████████████████████████████████████████| 141/141 [01:51<00:00,  1.26it/s]


Epochs: 12 | Train Loss: 126.5472 | Train Accuracy: 0.9351 | Train F1: 0.9331
           | Val Loss:   25.8264 | Val Accuracy:   0.5960 | Val F1:   0.5683


100%|█████████████████████████████████████████| 141/141 [01:52<00:00,  1.26it/s]


Epochs: 13 | Train Loss: 116.3483 | Train Accuracy: 0.9480 | Train F1: 0.9467
           | Val Loss:   25.1467 | Val Accuracy:   0.6160 | Val F1:   0.5917


 81%|█████████████████████████████████▏       | 114/141 [01:30<00:21,  1.26it/s]

### Saliency Map

In [None]:
def get_saliency_map(model, df):
    input_text = [tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for text in df['content']]
    input = input_text[0]
    print(input.shape)
    for param in model.parameters():
        param.requires_grad = False
    model.eval()
    # input.unsqueeze_(0)
    input.requires_grad = True
    preds = model(input)
    score, indices = torch.max(preds, 1)
    #backward pass to get gradients of score predicted class w.r.t. input image
    score.backward()
    #get max along channel axis
    slc, _ = torch.max(torch.abs(input.grad[0]), dim=0)
    #normalize to [0..1]
    slc = (slc - slc.min())/(slc.max()-slc.min())
    return slc

input_slc = get_saliency_map(model, train_df)
print(input_slc.shape)
input_slc

### SHAP

In [None]:
import shap
import scipy as sp

def f(x):
    tv = torch.tensor([tokenizer.encode(v, padding='max_length', \
        max_length=512, truncation=True) for v in x]).cuda()
    outputs = model(tv)[0].detach().cpu().numpy()
    scores = (np.exp(outputs).T / np.exp(outputs).sum(-1)).T
    val = sp.special.logit(scores[:,1]) # use one vs rest logit units
    return val

explainer = shap.Explainer(f, tokenizer)
shap_values = explainer(train_df['masked_content'][:3], fixed_context=1, batch_size=32)

In [None]:
shap.plots.bar(shap_values.abs.mean(0)) # default

In [None]:
shap.plots.bar(shap_values.abs.sum(0))

In [None]:
shap.plots.bar(shap_values.abs.max(0))

In [None]:
shap.plots.text(shap_values)