In [1]:
import pytorch_lightning as pl
import os
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AlbertTokenizer, AlbertModel, AlbertConfig
from transformers import AdamW
from transformers import get_constant_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup
import transformers
from tqdm import tqdm
import numpy as np
import argparse
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning import loggers as pl_logger
from sklearn.metrics import f1_score, precision_score, recall_score
from pytorch_lightning.metrics import F1, Precision, Recall

seed_everything(42)


def encoding_text(d):
    inputs = tokenizer.encode(d, return_tensors="pt", truncation=True, padding='max_length', max_length=256)
    return inputs.squeeze()


class BertClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.cls = transformers.BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')
        self.softmax = torch.nn.Softmax(dim=-1)
        self.crossentropy = torch.nn.CrossEntropyLoss()
        self.f1 = F1()
        self.p = Precision()
        self.r = Recall()

    def forward(self, x):
        return self.softmax(self.cls(x).logits)

    def training_step(self, batch, batch_idx):
        x, label = batch
        pred = self.forward(x)
        loss = self.crossentropy(pred, label)
        self.log("loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, label = batch
        logits = self.forward(x)
        return label, logits[:,1]
    
    def evaluate_outs(self, outs):
        labels = torch.cat([label for label, pred in outs])
        preds = torch.cat([pred for label, pred in outs])
        f1 = self.f1(preds, labels)
        p = self.p(preds, labels)
        r = self.r(preds, labels)
        return f1, p, r
    
    def validation_epoch_end(self, outs):
        f1, p, r = self.evaluate_outs(outs)
        print('f1', f1)
        print('p', p)
        print('r', r)
        self.log("val_f1", f1)
        self.log("val_p", p)
        self.log("val_r", r)
        return f1
    
    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)
    
    def test_epoch_end(self, outs):
        f1, p, r = self.evaluate_outs(outs)
        self.log("test_f1", f1)
        self.log("test_p", p)
        self.log("test_r", r)
        return f1

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps,
        )
        return [optimizer], [scheduler]

Global seed set to 42


In [8]:
import pytorch_lightning as pl
import os
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset
lr = 1e-4
warmup_steps = 50
batch_size = 80
maximum_training = 1000000
epochs = 8
dataset = binary_dataset('tweet_text_for_relevance.json', maximum_training)


print(len(dataset))
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size
print(val_size)

434357
43435


In [4]:

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

import json
class binary_dataset(Dataset):
    def __init__(self, fn, maximum_training=-1):
        self.X = []
        self.Y = []
        with open(fn) as fin:
            for line in fin:
                js = json.loads(line)
                self.Y.append(js['label'])
                self.X.append(js['text'])
                if maximum_training != -1 and len(self.X) > maximum_training:
                    break
    def __getitem__(self, idx):
        return encoding_text(self.X[idx]), int(self.Y[idx])
    def __len__(self):
        return len(self.X)


Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

In [2]:
from pytorch_lightning.callbacks import ModelCheckpoint

lr = 1e-4
warmup_steps = 50
batch_size = 80
maximum_training = 1000000
epochs = 8

dataset = binary_dataset('tweet_text_for_relevance.json', maximum_training)
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])
num_training_steps = len(train_set) // batch_size * epochs

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=128, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=128)
checkpoint_callback = ModelCheckpoint(
    monitor='val_f1',
    dirpath='./',
    filename='relevance_model_tweet_best_f1',
    save_top_k=1,
    mode='max',
)
trainer = pl.Trainer(max_epochs=epochs, gpus=[0], precision=16, default_root_dir='checkpoints', callbacks=[checkpoint_callback])

model = BertClassifier()
# trainer.fit(model, train_loader, val_loader)

model = BertClassifier.load_from_checkpoint('./relevance_model_tweet_best_f1.ckpt')
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=128)
trainer.test(model, test_dataloaders=test_loader)

NameError: name 'binary_dataset' is not defined

In [4]:
from pytorch_lightning.callbacks import ModelCheckpoint

lr = 5e-5
warmup_steps = 50
batch_size = 64
maximum_training = 5000000
epochs = 20

dataset = binary_dataset('tweet_news_text_for_relevance.json', maximum_training)
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])
num_training_steps = len(train_set) // batch_size * epochs

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=128, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=128, pin_memory=True)
checkpoint_callback = ModelCheckpoint(
    monitor='val_f1',
    dirpath='./',
    filename='relevance_model_best_f1',
    save_top_k=1,
    mode='max',
)
trainer = pl.Trainer(max_epochs=epochs, gpus=[0], precision=16, default_root_dir='checkpoints', callbacks=[checkpoint_callback])

model = BertClassifier.load_from_checkpoint('./relevance_model_tweet_best_f1.ckpt')
trainer.fit(model, train_loader, val_loader)

model = BertClassifier.load_from_checkpoint('./relevance_model_best_f1.ckpt')
test_loader = DataLoader(test_set, batch_size=256, num_workers=128, pin_memory=True)
print('split test', trainer.test(model, test_dataloaders=test_loader))

test_loader2 = DataLoader(binary_dataset('manual_news_text_for_relevance.json'), batch_size=256, num_workers=128, pin_memory=True)
print('manual test', trainer.test(model, test_dataloaders=test_loader2))

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClas

HBox(children=(HTML(value='Validation sanity check'), FloatProgress(value=1.0, bar_style='info', layout=Layout…

f1 tensor(0., device='cuda:0')
p tensor(0., device='cuda:0')
r tensor(0., device='cuda:0')




HBox(children=(HTML(value='Training'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max…

HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.0718, device='cuda:0')
p tensor(0.7188, device='cuda:0')
r tensor(0.0378, device='cuda:0')




HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.4244, device='cuda:0')
p tensor(0.6290, device='cuda:0')
r tensor(0.3202, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6169, device='cuda:0')
p tensor(0.6375, device='cuda:0')
r tensor(0.5977, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6423, device='cuda:0')
p tensor(0.7031, device='cuda:0')
r tensor(0.5911, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6442, device='cuda:0')
p tensor(0.7224, device='cuda:0')
r tensor(0.5813, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6608, device='cuda:0')
p tensor(0.7173, device='cuda:0')
r tensor(0.6125, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6540, device='cuda:0')
p tensor(0.7293, device='cuda:0')
r tensor(0.5928, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6667, device='cuda:0')
p tensor(0.7114, device='cuda:0')
r tensor(0.6273, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6706, device='cuda:0')
p tensor(0.6849, device='cuda:0')
r tensor(0.6568, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6655, device='cuda:0')
p tensor(0.7173, device='cuda:0')
r tensor(0.6207, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6697, device='cuda:0')
p tensor(0.7536, device='cuda:0')
r tensor(0.6026, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6667, device='cuda:0')
p tensor(0.6991, device='cuda:0')
r tensor(0.6371, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6661, device='cuda:0')
p tensor(0.7347, device='cuda:0')
r tensor(0.6092, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6725, device='cuda:0')
p tensor(0.7140, device='cuda:0')
r tensor(0.6355, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6516, device='cuda:0')
p tensor(0.7667, device='cuda:0')
r tensor(0.5665, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6719, device='cuda:0')
p tensor(0.7106, device='cuda:0')
r tensor(0.6371, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6643, device='cuda:0')
p tensor(0.7399, device='cuda:0')
r tensor(0.6026, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6563, device='cuda:0')
p tensor(0.7453, device='cuda:0')
r tensor(0.5862, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6497, device='cuda:0')
p tensor(0.7644, device='cuda:0')
r tensor(0.5649, device='cuda:0')


HBox(children=(HTML(value='Validating'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), m…

f1 tensor(0.6860, device='cuda:0')
p tensor(0.7120, device='cuda:0')
r tensor(0.6617, device='cuda:0')



Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': tensor(0.6537, device='cuda:0'),
 'test_p': tensor(0.6862, device='cuda:0'),
 'test_r': tensor(0.6242, device='cuda:0')}
--------------------------------------------------------------------------------
split test [{'test_f1': 0.6537162065505981, 'test_p': 0.686170220375061, 'test_r': 0.624193549156189}]




HBox(children=(HTML(value='Testing'), FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': tensor(0.7067, device='cuda:0'),
 'test_p': tensor(0.9636, device='cuda:0'),
 'test_r': tensor(0.5579, device='cuda:0')}
--------------------------------------------------------------------------------
manual test [{'test_f1': 0.7066667079925537, 'test_p': 0.9636363387107849, 'test_r': 0.557894766330719}]


split test [{'test_f1': 0.6537162065505981, 'test_p': 0.686170220375061, 'test_r': 0.624193549156189}]
manual test [{'test_f1': 0.7066667079925537, 'test_p': 0.9636363387107849, 'test_r': 0.557894766330719}]

In [6]:
# Find threshold
model = BertClassifier.load_from_checkpoint('./relevance_model_best_f1.ckpt')
test_loader2 = DataLoader(binary_dataset('future_manual_news_text_for_relevance.json'), batch_size=256, num_workers=128, pin_memory=True)
model.eval()
model.to('cuda')
Y = []
pred = []
with torch.no_grad():
    for x, y in test_loader2:
        tmp = model(x.to('cuda'), return_dict=True)[:,1].detach().cpu().numpy()
        Y.extend(y.numpy())
        pred.extend(tmp)
print(len(Y), len(pred))

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

TypeError: forward() got an unexpected keyword argument 'return_dict'

In [14]:
best_threshold = 4.1355447e-06
pred = np.array(pred)
# best_f1 = -1
# print(sorted(pred))
# for threshold in pred:
#     f1 = f1_score(Y, pred >= threshold)
#     if best_f1 < f1:
#         best_f1 = f1
#         best_threshold = threshold

print(best_threshold)
threshold = best_threshold
f1 = f1_score(Y, pred >= threshold)
p = precision_score(Y, pred >= threshold)
r = recall_score(Y, pred >= threshold)
print(f1, p, r)

# loader1 : 0.6362968405584128 0.6983870967741935 0.5843454790823212
# loader2 : 0.811764705882353 0.7263157894736842 0.92

# 4.1355447e-06
# 0.8148148148148148 0.9428571428571428 0.717391304347826

4.1355447e-06
0.8148148148148148 0.9428571428571428 0.717391304347826


In [20]:
# generate relevance score for all news
model = BertClassifier.load_from_checkpoint('./relevance_model_best_f1.ckpt')
class inference_dataset(Dataset):
    def __init__(self):
        self.X = []
        with open('../news_text_raw.json') as fin:
            for line in fin:
                i = json.loads(line)
                text = (i['title'] if i['title'] else ' ') + ' ' + (i['article'] if i['article'] else ' ')
                self.X.append(text)
    def __getitem__(self, idx):
        return encoding_text(self.X[idx])
    def __len__(self):
        return len(self.X)
inference_set = inference_dataset()
inference_loader = DataLoader(inference_set, batch_size=256, num_workers=128, shuffle=False, drop_last=False)
relevance_scores = []
relevance_predictions = []
with torch.no_grad():
    model.eval().to('cuda')
    for batch in tqdm(inference_loader):
        tmp = model(batch.to('cuda'))[:,1].detach().cpu().numpy()
        relevance_scores.extend(tmp)
        relevance_predictions.extend(tmp >= best_threshold)
all_data = []
with open('../news_text_raw.json') as fin:
    for line in fin:
        i = json.loads(line)
        all_data.append(i)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [23]:
url2gt = dict()
with open('tweet_news_text_for_relevance.json') as fin:
    for line in fin:
        js = json.loads(line)
        url2gt[js['url']] = js['label']
with open('manual_news_text_for_relevance.json') as fin:
    for line in fin:
        js = json.loads(line)
        url2gt[js['url']] = js['label']
assert len(all_data) == len(relevance_scores)
with open('url2relevance.json', 'w') as fout:
    for js, score, pred in zip(all_data, relevance_scores, relevance_predictions):
        if js['url'] in url2gt:
            js['relevance_prediction'] = url2gt[js['url']]
            js['relevance_score'] = 1.0 if js['relevance_prediction'] else 0.0
        else:
            js['relevance_score'] = float(score)
            js['relevance_prediction'] = bool(pred)
        fout.write(json.dumps({'url': js['url'], 'relevance_score': js['relevance_score'], 'relevance_prediction': js['relevance_prediction']}) + '\n')
        
cnt = 0
for js in all_data:
    if js['relevance_prediction'] and js['lang'] == 'en':
        cnt += 1
print(cnt)    

36417
