<a href="https://colab.research.google.com/github/pranshurastogi29/News-summarization-Topic-Prediction.ipynb/blob/main/training_super_fast_roberta_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

import os
from glob import glob

import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.autograd import Variable
from torch.utils.data.sampler import SequentialSampler, RandomSampler
import sklearn

import time
import random
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule
from transformers import AutoTokenizer, AutoModel
import transformers
import gc
import re

# !pip install nltk > /dev/null
import nltk
nltk.download('punkt')

from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Seed handed to seed_everything() further down in this cell.
SEED = 42

# Passed to the tokenizer as `max_length` in DatasetRetriever.get_tokens.
MAX_LENGTH = 512
# Model identifier loaded by both AutoTokenizer and AutoModel.
BACKBONE_PATH = "distilroberta-base"


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), exports `PYTHONHASHSEED`, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the determinism requested on the line above.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before the cells below build datasets and the model.
seed_everything(SEED)

In [None]:
from nltk import sent_tokenize
from random import shuffle
import random
import albumentations
from albumentations.core.transforms_interface import DualTransform, BasicTransform


# Maps a short language code to the language name passed to nltk's
# sent_tokenize (see get_sentences).
LANGS = {
    'en': 'english',
}

def get_sentences(text, lang='en'):
    """Split `text` into sentences with nltk's `sent_tokenize`.

    `lang` is looked up in `LANGS`; codes missing from that table fall back
    to 'english'.
    """
    language = LANGS.get(lang, 'english')
    return sent_tokenize(text, language)

def exclude_duplicate_sentences(text, lang='en'):
    """Return `text` with repeated sentences removed.

    The text is split with `get_sentences`, each sentence is stripped of
    surrounding whitespace, and only the first occurrence of every distinct
    sentence is kept. Survivors are joined with single spaces in their
    original order.

    Args:
        text: Input string.
        lang: Language code forwarded to `get_sentences`.

    Returns:
        The de-duplicated text as one string.
    """
    stripped = (sentence.strip() for sentence in get_sentences(text, lang))
    # dict keys keep insertion order, so fromkeys drops later duplicates in
    # linear time instead of rescanning a list for every sentence.
    return ' '.join(dict.fromkeys(stripped))

def clean_text(text, lang='en'):
    """Normalise one document: coerce to `str`, apply the regex clean-up,
    drop duplicate sentences and strip surrounding whitespace.
    """
    # NOTE(review): the pattern r' r' deletes every space that is followed by
    # the letter "r" (together with that "r"), e.g. "the red" -> "theed".
    # Behaviour is preserved here; confirm whether removing carriage returns
    # (r'\r') was the actual intent.
    without_pattern = re.sub(r' r', '', str(text))
    deduplicated = exclude_duplicate_sentences(without_pattern, lang)
    return deduplicated.strip()

In [None]:
import pandas as pd
# Load the news CSV from its relative input path.
df = pd.read_csv('../input/scraping-news-and-create-dataset/data.csv')
# Rename the columns positionally; later cells read 'topic' and 'news'.
df.columns = ['date','topic','news']

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode each distinct `topic` value as an integer class id, stored in a new
# 'labels' column. The fitted encoder is kept in `labelencoder`.
labelencoder = LabelEncoder()
df['labels'] = labelencoder.fit_transform(df['topic'])

In [None]:
# Show the distinct class ids produced by the label encoder.
df['labels'].unique()

array([3, 5, 4, 7, 6, 0, 1, 2])

In [None]:
# Module-level tokenizer for the backbone; DatasetRetriever.get_tokens reads
# this global.
tokenizer = AutoTokenizer.from_pretrained(BACKBONE_PATH)

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
def onehot(size, target):
    """Build a float32 vector of length `size` holding 1.0 at index `target`
    and 0.0 everywhere else.
    """
    encoded = torch.zeros(size, dtype=torch.float32)
    encoded[target] = 1.0
    return encoded

class DatasetRetriever(Dataset):
    """Map-style dataset that tokenizes one text per item.

    In training mode (`test=False`) an item is
    `(one_hot_target, input_ids, attention_mask)`; in test mode the first
    element is the raw entry of `labels_or_ids` instead.

    Args:
        labels_or_ids: Array of integer class labels (training) or row ids
            (test), indexable by position.
        comment_texts: Array of texts; must expose `.shape[0]`.
        lang: Language code, stored on the instance.
        test: When `False`, items carry one-hot targets.
        num_classes: Length of the one-hot target vector. Defaults to 8, the
            value that used to be hard-coded.
    """

    def __init__(self, labels_or_ids, comment_texts, lang='en', test=False, num_classes=8):
        self.test = test
        self.lang = lang
        self.labels_or_ids = labels_or_ids
        self.comment_texts = comment_texts
        self.num_classes = num_classes

    def get_tokens(self, text):
        """Tokenize `text` to exactly MAX_LENGTH ids plus an attention mask."""
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            # `pad_to_max_length` is deprecated in favour of `padding`.
            padding='max_length',
            # Ask for truncation explicitly: relying on the implicit default
            # emits a warning on the first encoded sample.
            truncation=True,
        )
        return encoded['input_ids'], encoded['attention_mask']

    def __len__(self):
        return self.comment_texts.shape[0]

    def __getitem__(self, idx):
        tokens, attention_mask = self.get_tokens(str(self.comment_texts[idx]))
        tokens, attention_mask = torch.tensor(tokens), torch.tensor(attention_mask)

        if self.test is False:
            target = onehot(self.num_classes, self.labels_or_ids[idx])
            return target, tokens, attention_mask
        return self.labels_or_ids[idx], tokens, attention_mask

    def get_labels(self):
        """Return the labels/ids as a list of strings."""
        return list(np.char.add(self.labels_or_ids.astype(str), ''))

In [None]:
# Dataset over the encoded labels and the raw news texts; `test` keeps its
# default (False), so items carry one-hot targets.
train_dataset = DatasetRetriever(
    labels_or_ids=df['labels'].values, 
    comment_texts=df['news'].values, 
    lang='en')

In [None]:
class AucMeter(object):
    """Running classification accuracy over all batches seen since `reset()`.

    The class name is kept for compatibility with its callers; the metric
    it reports is accuracy, not AUC.

    Attributes:
        score: Fraction of correctly classified samples so far (0 before the
            first update).
        y_true / y_pred: Class ids of the most recent batch.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget every batch seen so far."""
        self.y_true = np.array([], dtype=np.int64)
        self.y_pred = np.array([], dtype=np.int64)
        self.correct = 0
        self.total = 0
        self.score = 0

    def update(self, y_true, y_pred):
        """Fold one batch into the running accuracy.

        Args:
            y_true: Tensor of per-class target scores, one row per sample;
                the arg-max of each row is taken as the true class.
            y_pred: Tensor of logits with the same layout.
        """
        # Softmax is monotonic, so the arg-max of the logits is already the
        # predicted class.
        self.y_true = y_true.detach().cpu().numpy().argmax(axis=1)
        self.y_pred = y_pred.detach().cpu().numpy().argmax(axis=1)
        # Accumulate counts so the score covers every batch, not just the
        # last one.
        self.correct += int((self.y_true == self.y_pred).sum())
        self.total += int(self.y_true.shape[0])
        if self.total:
            self.score = self.correct / self.total

    @property
    def avg(self):
        """Accuracy over every sample seen since the last reset."""
        return self.score

class AverageMeter(object):
    """Keeps the most recent value together with a weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the running mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
class LabelSmoothing(nn.Module):
    """Cross-entropy against per-class target weights, blended with a
    uniform-distribution term weighted by `smoothing`.
    """

    def __init__(self, smoothing = 0):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        """Return the mean loss over the batch.

        `x` holds logits and `target` per-class weights; both are cast to
        float and reduced over the last dimension.
        """
        log_probs = torch.nn.functional.log_softmax(x.float(), dim=-1)
        # Target-weighted negative log-likelihood, one value per sample.
        target_term = (-log_probs * target.float()).sum(-1)
        # Negative mean log-probability over classes (the uniform term).
        uniform_term = -log_probs.mean(dim=-1)
        per_sample = self.confidence * target_term + self.smoothing * uniform_term
        return per_sample.mean()

In [None]:
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")
from catalyst.data.sampler import DistributedSamplerWrapper, BalanceClassSampler

class TPUFitter:
    """Training, validation and inference driver for a classification model.

    Despite the name, nothing here is TPU-specific: tensors are moved to
    whatever `device` is passed in.

    Args:
        model: Module called as `model(input_ids, attention_masks)` that
            returns logits.
        device: Device the batches are moved to.
        config: Object exposing `lr`, `n_epochs`, `verbose`, `verbose_step`,
            `step_scheduler`, `validation_scheduler`, `SchedulerClass`,
            `scheduler_params` and `criterion`.
    """

    def __init__(self, model, device, config):
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs('node_submissions', exist_ok=True)

        self.config = config
        self.epoch = 0
        self.log_path = 'log.txt'

        self.model = model
        self.device = device

        # Two parameter groups: biases and LayerNorm parameters get no weight
        # decay, everything else gets 0.001.
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        self.optimizer = AdamW(optimizer_grouped_parameters, lr=config.lr)
        self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)

        self.criterion = config.criterion
        print(f'Fitter prepared. Device is {self.device}')

    def fit(self, train_loader, validation_loader):
        """Run `config.n_epochs` rounds of training followed by validation.

        When `config.validation_scheduler` is set, the scheduler is stepped
        once per epoch with the validation score.
        """
        for _ in range(self.config.n_epochs):
            if self.config.verbose:
                lr = self.optimizer.param_groups[0]['lr']
                timestamp = datetime.utcnow().isoformat()
                print(f'\n{timestamp}\nLR: {lr}')

            t = time.time()
            losses, final_scores = self.train_one_epoch(train_loader)

            print(f'[RESULT]: Train. Epoch: {self.epoch}, loss: {losses.avg:.5f}, final_score: {final_scores.avg:.5f}, time: {(time.time() - t):.5f}')

            t = time.time()
            losses, final_scores = self.validation(validation_loader)

            print(f'[RESULT]: Validation. Epoch: {self.epoch}, loss: {losses.avg:.5f}, final_score: {final_scores.avg:.5f}, time: {(time.time() - t):.5f}')

            if self.config.validation_scheduler:
                self.scheduler.step(metrics=final_scores.avg)

            self.epoch += 1

    def run_tuning_and_inference(self, test_loader, validation_tune_loader):
        """Fine-tune for two extra epochs, predicting after each one.

        Epoch `e` (0-based) trains on `validation_tune_loader` with learning
        rate `config.lr / (e + 1)` and then predicts on `test_loader`.

        Returns:
            The predictions of the final `run_inference` call.
        """
        predictions = None
        for e in range(2):
            tuned_lr = self.config.lr / (e + 1)
            # Reset every parameter group; changing only group 0 would leave
            # the no-decay group at whatever rate the scheduler last set.
            for group in self.optimizer.param_groups:
                group['lr'] = tuned_lr
            self.train_one_epoch(validation_tune_loader)
            predictions = self.run_inference(test_loader)
        return predictions

    def validation(self, val_loader):
        """Evaluate on `val_loader` without gradients.

        Returns:
            `(losses, final_scores)`: an AverageMeter of the loss and an
            AucMeter of the score.
        """
        self.model.eval()
        losses = AverageMeter()
        final_scores = AucMeter()

        t = time.time()
        # Wrapping the loader (not the enumerate object) lets tqdm show a total.
        for step, (targets, inputs, attention_masks) in enumerate(tqdm(val_loader)):
            if self.config.verbose and step % self.config.verbose_step == 0:
                print(
                    f'Valid Step {step}, loss: '
                    f'{losses.avg:.5f}, final_score: {final_scores.avg:.5f}, '
                    f'time: {(time.time() - t):.5f}'
                )
            with torch.no_grad():
                inputs = inputs.to(self.device, dtype=torch.long)
                attention_masks = attention_masks.to(self.device, dtype=torch.long)
                targets = targets.to(self.device, dtype=torch.float)

                outputs = self.model(inputs, attention_masks)
                loss = self.criterion(outputs, targets)

                batch_size = inputs.size(0)

                final_scores.update(targets, outputs)
                losses.update(loss.detach().item(), batch_size)

        return losses, final_scores

    def train_one_epoch(self, train_loader):
        """Train for one pass over `train_loader`, then save a checkpoint.

        The scheduler is stepped after every optimizer step when
        `config.step_scheduler` is set. The model is left in eval mode and
        written to 'last-checkpoint.bin' before returning.

        Returns:
            `(losses, final_scores)`: an AverageMeter of the loss and an
            AucMeter of the score.
        """
        self.model.train()

        losses = AverageMeter()
        final_scores = AucMeter()
        t = time.time()
        for step, (targets, inputs, attention_masks) in enumerate(tqdm(train_loader)):
            if self.config.verbose and step % self.config.verbose_step == 0:
                print(
                    f'Train Step {step}, loss: '
                    f'{losses.avg:.5f}, final_score: {final_scores.avg:.5f}, '
                    f'time: {(time.time() - t):.5f}'
                )

            inputs = inputs.to(self.device, dtype=torch.long)
            attention_masks = attention_masks.to(self.device, dtype=torch.long)
            targets = targets.to(self.device, dtype=torch.float)

            self.optimizer.zero_grad()

            outputs = self.model(inputs, attention_masks)
            loss = self.criterion(outputs, targets)

            batch_size = inputs.size(0)

            final_scores.update(targets, outputs)
            losses.update(loss.detach().item(), batch_size)

            loss.backward()
            self.optimizer.step()

            if self.config.step_scheduler:
                self.scheduler.step()

        self.model.eval()
        self.save('last-checkpoint.bin')
        return losses, final_scores

    def run_inference(self, test_loader, top_k=3):
        """Predict the `top_k` highest-scoring class indices for every sample.

        Args:
            test_loader: Iterable of `(ids, input_ids, attention_masks)`
                batches; the first element is ignored.
            top_k: Number of class indices returned per sample (default 3).

        Returns:
            Integer array of shape `(num_samples, top_k)`, best class first,
            or an empty array when the loader yields nothing.
        """
        self.model.eval()
        batches = []
        t = time.time()
        for step, (ids, inputs, attention_masks) in enumerate(test_loader):
            if self.config.verbose and step % self.config.verbose_step == 0:
                print(f'Prediction Step {step}, time: {(time.time() - t):.5f}')

            with torch.no_grad():
                inputs = inputs.to(self.device, dtype=torch.long)
                attention_masks = attention_masks.to(self.device, dtype=torch.long)
                outputs = self.model(inputs, attention_masks)
                # Ranking the raw logits gives the same order as ranking any
                # monotonic squashing of them, so no sigmoid is needed.
                _, top_indices = torch.topk(outputs, k=top_k, dim=1)
            batches.append(top_indices.cpu().numpy())
        return np.concatenate(batches) if batches else np.array([])

    def save(self, path):
        """Write the model's state dict to `path`."""
        torch.save(self.model.state_dict(), path)

### Model

In [None]:
class TopicSimpleNNModel(nn.Module):
    """Transformer backbone with a linear head over mean- and max-pooled
    token states.

    Args:
        num_classes: Number of output logits. Defaults to 8, the value that
            used to be hard-coded.
    """

    def __init__(self, num_classes=8):
        super(TopicSimpleNNModel, self).__init__()
        self.backbone = AutoModel.from_pretrained(BACKBONE_PATH)
        self.dropout = nn.Dropout(0.3)
        # Read the width from the config so the head does not depend on the
        # backbone exposing a pooler layer.
        hidden_size = self.backbone.config.hidden_size
        self.linear = nn.Linear(in_features=hidden_size * 2, out_features=num_classes)

    def forward(self, input_ids, attention_masks):
        """Return logits of shape `(batch, num_classes)`.

        Args:
            input_ids: Token ids, one row per sample.
            attention_masks: Same layout as `input_ids`; 1 marks a real token
                and 0 marks padding.
        """
        # Index the tuple instead of unpacking it, so a backbone that returns
        # only the sequence output also works.
        seq_x = self.backbone(input_ids=input_ids, attention_mask=attention_masks, return_dict=False)[0]

        # Pool over real tokens only. Inputs are padded to a fixed length, so
        # unmasked pooling would mix padding states into both features.
        mask = attention_masks.unsqueeze(-1).to(seq_x.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard all-padding rows
        apool = (seq_x * mask).sum(dim=1) / token_counts
        masked_states = seq_x.masked_fill(mask == 0, torch.finfo(seq_x.dtype).min)
        mpool, _ = torch.max(masked_states, 1)

        x = torch.cat((apool, mpool), 1)
        x = self.dropout(x)
        return self.linear(x)

### Custom Config

In [None]:
# Instantiate the classifier; its constructor loads the pretrained backbone.
net = TopicSimpleNNModel()

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class TrainGlobalConfig:
    # Hyper-parameters and switches read by TPUFitter and the DataLoaders.

    # --- data loading / optimisation ---
    num_workers = 0
    batch_size = 16
    n_epochs = 4
    lr = 1e-5

    # --- logging: print progress every `verbose_step` batches ---
    verbose = True
    verbose_step = 50

    # --- learning-rate schedule ---
    step_scheduler = False  # do scheduler.step after optimizer.step
    validation_scheduler = True  # do scheduler.step after validation stage loss
    SchedulerClass = torch.optim.lr_scheduler.ReduceLROnPlateau
    # Keyword arguments forwarded to SchedulerClass after the optimizer.
    scheduler_params = {
        'mode': 'max',
        'factor': 0.7,
        'patience': 0,
        'verbose': False,
        'threshold': 0.0001,
        'threshold_mode': 'abs',
        'cooldown': 0,
        'min_lr': 1e-8,
        'eps': 1e-08,
    }

    # --- loss; the default LabelSmoothing() applies no smoothing ---
    criterion = LabelSmoothing()
    # -------------------

### Main method

In [None]:
# Fall back to the CPU so the cell still runs on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
net.to(device)
from torch.utils.data.distributed import DistributedSampler

# NOTE(review): all three loaders wrap `train_dataset`, so the "validation"
# and "test" numbers are measured on the training data, and `train_loader`
# iterates in dataset order (no shuffle). Consider a held-out split and
# shuffled training batches.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TrainGlobalConfig.batch_size,
    pin_memory=False,
    drop_last=True,
    num_workers=TrainGlobalConfig.num_workers,
)

validation_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TrainGlobalConfig.batch_size,
    pin_memory=False,
    drop_last=False,
    num_workers=TrainGlobalConfig.num_workers
)

test_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TrainGlobalConfig.batch_size,
    pin_memory=False,
    drop_last=False,
    num_workers=TrainGlobalConfig.num_workers
)

fitter = TPUFitter(model=net, device=device, config=TrainGlobalConfig)
# Use each loader in the role it was built for. Passing `train_loader`
# everywhere left the other two unused and, because of drop_last=True,
# skipped the final partial batch during validation and inference.
fitter.fit(train_loader, validation_loader)
fitter.run_tuning_and_inference(test_loader, validation_loader)

Fitter prepared. Device is cuda:0

2022-01-20T21:23:29.032498
LR: 1e-05


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Train Step 0, loss: 0.00000, final_score: 0.00000, time: 0.04076


50it [00:23,  2.24it/s]

Train Step 50, loss: 1.24328, final_score: 1.00000, time: 23.29675


100it [00:45,  2.21it/s]

Train Step 100, loss: 1.20798, final_score: 0.18750, time: 45.84786


150it [01:08,  2.20it/s]


[RESULT]: Train. Epoch: 0, loss: 0.96394, final_score: 1.00000, time: 69.01419


1it [00:00,  6.11it/s]

Valid Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02062


51it [00:08,  6.02it/s]

Valid Step 50, loss: 0.36435, final_score: 1.00000, time: 8.34846


101it [00:16,  5.84it/s]

Valid Step 100, loss: 0.31907, final_score: 0.87500, time: 16.82298


150it [00:25,  5.94it/s]


[RESULT]: Validation. Epoch: 0, loss: 0.29542, final_score: 1.00000, time: 25.26708

2022-01-20T21:25:03.314243
LR: 1e-05


0it [00:00, ?it/s]

Train Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02948


50it [00:22,  2.22it/s]

Train Step 50, loss: 0.30566, final_score: 1.00000, time: 22.44093


100it [00:44,  2.20it/s]

Train Step 100, loss: 0.26425, final_score: 0.87500, time: 45.01461


150it [01:07,  2.22it/s]


[RESULT]: Train. Epoch: 1, loss: 0.23031, final_score: 1.00000, time: 68.47971


1it [00:00,  6.07it/s]

Valid Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02350


51it [00:08,  6.03it/s]

Valid Step 50, loss: 0.25812, final_score: 1.00000, time: 8.35039


101it [00:16,  5.84it/s]

Valid Step 100, loss: 0.19495, final_score: 0.93750, time: 16.77101


150it [00:25,  5.97it/s]


[RESULT]: Validation. Epoch: 1, loss: 0.17273, final_score: 1.00000, time: 25.13326

2022-01-20T21:26:36.928097
LR: 7e-06


0it [00:00, ?it/s]

Train Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02170


50it [00:22,  2.24it/s]

Train Step 50, loss: 0.20220, final_score: 1.00000, time: 22.46948


100it [00:44,  2.21it/s]

Train Step 100, loss: 0.17099, final_score: 0.93750, time: 44.98788


150it [01:07,  2.22it/s]


[RESULT]: Train. Epoch: 2, loss: 0.15413, final_score: 1.00000, time: 68.68927


1it [00:00,  5.67it/s]

Valid Step 0, loss: 0.00000, final_score: 0.00000, time: 0.04031


51it [00:08,  6.02it/s]

Valid Step 50, loss: 0.18103, final_score: 1.00000, time: 8.37000


101it [00:16,  5.91it/s]

Valid Step 100, loss: 0.13299, final_score: 0.93750, time: 16.83754


150it [00:25,  5.96it/s]


[RESULT]: Validation. Epoch: 2, loss: 0.11658, final_score: 1.00000, time: 25.19764

2022-01-20T21:28:10.815714
LR: 4.9e-06


0it [00:00, ?it/s]

Train Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02106


50it [00:22,  2.23it/s]

Train Step 50, loss: 0.15727, final_score: 1.00000, time: 22.45092


100it [00:44,  2.21it/s]

Train Step 100, loss: 0.12673, final_score: 0.93750, time: 44.98556


150it [01:07,  2.22it/s]


[RESULT]: Train. Epoch: 3, loss: 0.11331, final_score: 1.00000, time: 68.41001


1it [00:00,  6.08it/s]

Valid Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02791


51it [00:08,  5.95it/s]

Valid Step 50, loss: 0.12938, final_score: 1.00000, time: 8.48131


101it [00:17,  5.90it/s]

Valid Step 100, loss: 0.09174, final_score: 0.93750, time: 16.97485


150it [00:25,  5.92it/s]


[RESULT]: Validation. Epoch: 3, loss: 0.08144, final_score: 1.00000, time: 25.35253


0it [00:00, ?it/s]

Train Step 0, loss: 0.00000, final_score: 0.00000, time: 0.02383


50it [00:22,  2.24it/s]

Train Step 50, loss: 0.13401, final_score: 1.00000, time: 22.48259


100it [00:45,  2.21it/s]

Train Step 100, loss: 0.10570, final_score: 0.93750, time: 45.06919


150it [01:07,  2.22it/s]


Prediction Step 0, time: 0.01899
Prediction Step 50, time: 8.31959
Prediction Step 100, time: 16.67188


0it [00:00, ?it/s]

Train Step 0, loss: 0.00000, final_score: 0.00000, time: 0.01831


50it [00:22,  2.23it/s]

Train Step 50, loss: 0.08088, final_score: 1.00000, time: 22.46386


100it [00:44,  2.20it/s]

Train Step 100, loss: 0.06110, final_score: 0.93750, time: 44.99648


150it [01:07,  2.23it/s]


Prediction Step 0, time: 0.01582
Prediction Step 50, time: 8.36469
Prediction Step 100, time: 16.67850


In [None]:
# Predict with `test_loader` (drop_last=False) so every row of the dataset
# gets a prediction; `train_loader` drops the final partial batch.
result = fitter.run_inference(test_loader)

Prediction Step 0, time: 0.01764
Prediction Step 50, time: 8.19521
Prediction Step 100, time: 16.59983


In [None]:
# `result` holds, per row, the top-ranked class indices returned by
# TPUFitter.run_inference; write them to CSV without the index column.
w = pd.DataFrame(result)
w.to_csv('out.csv',index = False)