In [1]:
import pytorch_lightning as pl
import os
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader, IterableDataset
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AlbertTokenizer, AlbertModel, AlbertConfig
from transformers import AdamW
from transformers import get_constant_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup
import transformers
from tqdm import tqdm
import numpy as np
import argparse
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning import loggers as pl_logger
from sklearn.metrics import f1_score
from pytorch_lightning.metrics import F1, Precision, Recall

seed_everything(42)


def encoding_text(d):
    inputs = tokenizer.encode(d, return_tensors="pt", truncation=True, padding='max_length', max_length=256)
    return inputs.squeeze()


class BertClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.cls = transformers.BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')
        self.softmax = torch.nn.Softmax(dim=-1)
        self.crossentropy = torch.nn.CrossEntropyLoss()
        self.f1 = F1()
        self.p = Precision()
        self.r = Recall()

    def forward(self, x):
        return self.softmax(self.cls(x).logits)

    def training_step(self, batch, batch_idx):
        x, label = batch
        pred = self.forward(x)
        loss = self.crossentropy(pred, label)
        self.log("loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, label = batch
        logits = self.forward(x)
        return label, logits[:,1]
    
    def evaluate_outs(self, outs):
        labels = torch.cat([label for label, pred in outs])
        preds = torch.cat([pred for label, pred in outs])
        f1 = self.f1(preds, labels)
        p = self.p(preds, labels)
        r = self.r(preds, labels)
        return f1, p, r
    
    def validation_epoch_end(self, outs):
        f1, p, r = self.evaluate_outs(outs)
        print('f1', f1)
        print('p', p)
        print('r', r)
        self.log("val_f1", f1)
        self.log("val_p", p)
        self.log("val_r", r)
        return f1
    
    def test_step(self, batch, batch_idx):
        return self.validation_step(batch, batch_idx)
    
    def test_epoch_end(self, outs):
        f1, p, r = self.evaluate_outs(outs)
        self.log("test_f1", f1)
        self.log("test_p", p)
        self.log("test_r", r)
        return f1

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps,
        )
        return [optimizer], [scheduler]

Global seed set to 42


In [None]:

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

import json
class binary_dataset(Dataset):
    def __init__(self, fn, maximum_training=-1):
        self.X = []
        self.Y = []
        with open(fn) as fin:
            for line in fin:
                js = json.loads(line)
                self.Y.append(js['label'])
                self.X.append(js['text'])
                if maximum_training != -1 and len(self.X) > maximum_training:
                    break
    def __getitem__(self, idx):
        return encoding_text(self.X[idx]), int(self.Y[idx])
    def __len__(self):
        return len(self.X)


In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

lr = 1e-4
warmup_steps = 50
batch_size = 80
maximum_training = 1000000
epochs = 8

dataset = binary_dataset('tweet_text_for_relevance.json', maximum_training)
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])
num_training_steps = len(train_set) // batch_size * epochs

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=128, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=128)
checkpoint_callback = ModelCheckpoint(
    monitor='val_f1',
    dirpath='./',
    filename='relevance_model_best_f1',
    save_top_k=1,
    mode='max',
)
trainer = pl.Trainer(max_epochs=epochs, gpus=[0], precision=16, default_root_dir='checkpoints', callbacks=[checkpoint_callback])

model = BertClassifier()
# trainer.fit(model, train_loader, val_loader)

model = BertClassifier.load_from_checkpoint('./relevance_model_best_f1.ckpt')
test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=128)
trainer.test(model, test_dataloaders=test_loader)

finetuned on tweet
validation:
[{'test_f1': 0.8882,
  'test_p': 0.9177,
  'test_r': 0.8605}]

TEST RESULTS (tweet)
 [{'test_f1': 0.8901,
  'test_p': 0.9180,
  'test_r': 0.8637}]
 
 TEST RESULTS (news, direct apply)
[{'test_f1': 0.2075,
  'test_p': 1.0,
  'test_r': 0.1158}]
  
  
news, finetuned, validation
{'test_f1': tensor(0.8485, device='cuda:0'),
 'test_p': tensor(0.9333, device='cuda:0'),
 'test_r': tensor(0.7778, device='cuda:0')}
 
news, finetuned, testing (29, 145 in total)
{'test_f1': tensor(0.8718, device='cuda:0'),
 'test_p': tensor(0.8947, device='cuda:0'),
 'test_r': tensor(0.8500, device='cuda:0')}

In [None]:
# directly apply tweet model for news
model = BertClassifier.load_from_checkpoint('./relevance_model_best_f1.ckpt')
news_set = binary_dataset('news_text_for_relevance.json')
news_loader = DataLoader(news_set, batch_size=72, num_workers=128)
trainer.test(model, test_dataloaders=news_loader)

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

lr = 1e-4
warmup_steps = 3
batch_size = 12
batch_size_eval = 256
epochs = 100

news_set = binary_dataset('news_text_for_relevance.json')
print(len(news_set))

val_size = int(0.2 * len(news_set))
test_size = int(0.2 * len(news_set))
train_size = len(news_set) - val_size - test_size
train_set, val_set, test_set = torch.utils.data.random_split(news_set, [train_size, val_size, test_size])
print('#test', len(test_set))
num_training_steps = len(news_set) // batch_size * epochs

train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=128, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size_eval, num_workers=128)
checkpoint_callback = ModelCheckpoint(
    monitor='val_f1',
    dirpath='./',
    filename='relevance_model_news_best_f1',
    save_top_k=1,
    mode='max',
)
trainer = pl.Trainer(max_epochs=epochs, gpus=[0], precision=16, default_root_dir='checkpoints_news', callbacks=[checkpoint_callback])

model = BertClassifier.load_from_checkpoint('./relevance_model_best_f1.ckpt')
trainer.fit(model, train_loader, val_loader)

model = BertClassifier.load_from_checkpoint('./relevance_model_news_best_f1.ckpt')
test_loader = DataLoader(test_set, batch_size=batch_size_eval, num_workers=128)
trainer.test(model, test_dataloaders=test_loader)

In [None]:

model = BertClassifier.load_from_checkpoint('./relevance_model_news_best_f1.ckpt')
test_loader = DataLoader(test_set, batch_size=batch_size_eval, num_workers=128)
trainer.test(model, test_dataloaders=val_loader)
trainer.test(model, test_dataloaders=test_loader)

In [None]:
# generate relevance score for all news
model = BertClassifier.load_from_checkpoint('./relevance_model_news_best_f1.ckpt')
class inference_dataset(Dataset):
    def __init__(self):
        self.X = []
        with open('../news_text_raw.json') as fin:
            for line in fin:
                i = json.loads(line)
                text = (i['title'] if i['title'] else ' ') + ' ' + (i['article'] if i['article'] else ' ')
                self.X.append(text)
    def __getitem__(self, idx):
        return encoding_text(self.X[idx])
    def __len__(self):
        return len(self.X)
inference_set = inference_dataset()
inference_loader = DataLoader(inference_set, batch_size=256, num_workers=128, shuffle=False, drop_last=False)
relevance_scores = []
relevance_predictions = []
with torch.no_grad():
    model.eval().to('cuda')
    for batch in tqdm(inference_loader):
        tmp = model(batch.to('cuda'))[:,1].detach().cpu().numpy()
        relevance_scores.extend(tmp)
        relevance_predictions.extend(tmp > 0.5)
all_data = []
with open('../news_text_raw.json') as fin:
    for line in fin:
        i = json.loads(line)
        all_data.append(i)

In [None]:
url2gt = dict()
with open('news_text_for_relevance.json') as fin:
    for line in fin:
        js = json.loads(line)
        url2gt[js['url']] = js['label']
assert len(all_data) == len(relevance_scores)
with open('news_relevance.json', 'w') as fout:
    for js, score, pred in zip(all_data, relevance_scores, relevance_predictions):
        if js['url'] in url2gt:
            js['relevance_prediction'] = url2gt[js['url']]
            js['relevance_score'] = 1.0 if js['relevance_prediction'] else 0.0
        else:
            js['relevance_score'] = float(score)
            js['relevance_prediction'] = bool(pred)
        fout.write(json.dumps(js) + '\n')

In [None]:
cnt = 0
for js in all_data:
    if js['relevance_prediction'] and js['lang'] == 'en':
        cnt += 1
print(cnt)