# 🤗 Bert for Question Answering Baseline: Inference

This code is adapted from my work in the Tweet Sentiment Extraction Competition.

It tackles the task as a Question Answering one, where the question is implicit and can be understood as: "Which datasets are mentioned?"


The approach is quite naïve and has a lot of flaws. Feel free to ask any questions in the comments.

Training Kernel : https://www.kaggle.com/theoviel/bert-for-question-answering-baseline-training

# Initialization

## Imports

In [None]:
import re
import os
import gc
import glob
import json
import torch
import datetime
import tokenizers
import numpy as np
import transformers
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt

from tokenizers import *
from functools import partial
from tqdm.notebook import tqdm
from torch.nn import functional as F
from sklearn.model_selection import StratifiedKFold

## Params

In [None]:
# Module-level seed constant.
SEED = 2020

# Competition data: root folder plus the train / test article folders.
DATA_PATH = '../input/coleridgeinitiative-show-us-the-data/'
DATA_PATH_TRAIN = f'{DATA_PATH}train/'
DATA_PATH_TEST = f'{DATA_PATH}test/'

# Folder holding the fine-tuned checkpoints (*.pt).
CP_PATH = '../input/coleridge-bert-qa/'

# Number of worker processes handed to the DataLoader.
NUM_WORKERS = 4

# Vocabulary file per model name.
VOCABS = {
    'bert-base-uncased': '../input/vocabs/bert-base-uncased-vocab.txt',
}

# Folder containing the config / vocab files for each supported model name.
MODEL_PATHS = {
    'bert-base-uncased':
        '../input/bertconfigs/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/',
    'bert-large-uncased-whole-word-masking-finetuned-squad':
        '../input/bertconfigs/wwm_uncased_L-24_H-1024_A-16/wwm_uncased_L-24_H-1024_A-16/',
    'albert-large-v2': '../input/albert-configs/albert-large-v2/albert-large-v2/',
    'albert-base-v2': '../input/albert-configs/albert-base-v2/albert-base-v2/',
    'distilbert': '../input/albert-configs/distilbert/distilbert/',
}

## Config

In [None]:
class Config:
    """
    Static configuration holder; values are read as class attributes.
    """
    # --- General ---
    k = 5
    seed = 2021

    # --- Texts ---
    max_len = 256

    # --- Architecture ---
    selected_model = 'bert-base-uncased'
    lowercase = True

    # --- Batching ---
    batch_size = 16
    # Validation / inference batches are twice the training batch size.
    batch_size_val = 2 * batch_size

# Data

## Tokenizer

In [None]:
class EncodedText:
    """
    Plain container pairing token ids with their offsets.
    """
    def __init__(self, ids, offsets):
        self.ids, self.offsets = ids, offsets


def create_tokenizer_and_tokens(config):
    """
    Builds the WordPiece tokenizer for config.selected_model and looks up
    the ids of its special tokens.

    Arguments:
        config -- Object exposing `selected_model` and `lowercase`

    Returns:
        tokenizer -- BertWordPieceTokenizer built from the model's vocab.txt
        tokens {dict} -- Ids of the special tokens, keyed 'cls', 'sep', 'pad'

    Raises:
        NotImplementedError -- For roberta and albert model names
    """
    model_name = config.selected_model

    # Only BERT-style vocabularies are supported here.
    for unsupported in ("roberta", "albert"):
        if unsupported in model_name:
            raise NotImplementedError

    vocab_file = MODEL_PATHS[model_name] + 'vocab.txt'
    tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=config.lowercase)

    special_tokens = {'cls': '[CLS]', 'sep': '[SEP]', 'pad': '[PAD]'}
    tokens = {
        key: tokenizer.token_to_id(symbol)
        for key, symbol in special_tokens.items()
    }

    return tokenizer, tokens

## Utils

In [None]:
import re
import os
import json
import numpy as np


def load_text(id_, root=""):
    """
    Loads the JSON file of an article.

    Arguments:
        id_ {str} -- File name without the ".json" extension

    Keyword Arguments:
        root {str} -- Folder containing the file (default: {""})

    Returns:
        Parsed JSON content of the file
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(os.path.join(root, id_ + ".json"), encoding="utf-8") as f:
        text = json.load(f)
    return text


def clean_text(txt):
    """
    Lowercases the input, replaces every run of non-alphanumeric characters
    with a single space and strips the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()


def locate_label_string(text, label):
    """
    Finds the first occurrence of the label in the text.

    The label is expected to start with a single space (as produced by
    process_data); the search is done on the label without that space.

    Arguments:
        text {str} -- Text to search in
        label {str} -- Label to locate, prefixed with a space

    Returns:
        idx_start {int} -- Index of the first character of the label in text
        idx_end {int} -- Index one past the last character of the label
        char_targets {numpy array} -- Array of len(text), 1 on label characters

    Raises:
        ValueError -- If the label is malformed or cannot be found in text
    """
    if len(label) < 2 or not label.startswith(" "):
        raise ValueError(
            f'Label "{label}" must start with a space followed by at least one character'
        )

    needle = label[1:]
    idx_start = text.find(needle)
    if idx_start < 0:
        # Fail with an explicit message instead of an unbound local variable.
        raise ValueError(f'Label "{label}" not found in "{text}"')
    idx_end = idx_start + len(needle)

    char_targets = np.zeros(len(text))
    char_targets[idx_start:idx_end] = 1

    return idx_start, idx_end, char_targets


def locate_label_tokens(offsets, char_targets):
    """
    Finds the tokens corresponding to the found labels.

    A token is selected when at least one of the characters it spans is
    marked in char_targets.

    Arguments:
        offsets {list of (int, int)} -- Character span of each token
        char_targets {sequence of numbers} -- Per-character label mask

    Returns:
        int -- Index of the first selected token
        int -- Index of the last selected token

    Raises:
        ValueError -- If no token overlaps the marked characters
    """
    target_idx = [
        idx
        for idx, (offset1, offset2) in enumerate(offsets)
        if sum(char_targets[offset1:offset2]) > 0
    ]

    if not target_idx:
        raise ValueError("No token overlaps the labelled characters")

    return target_idx[0], target_idx[-1]

## Process sample

In [None]:
def process_data(
    text,
    label,
    tokenizer,
    tokens,
    max_len=100,
    model_name="bert",
):
    """
    Prepares the data for the question answering task.
    Adapted from Abhishek's work on the Tweet Sentiment Extraction competition,
    check his work for more details!

    Arguments:
        text {str} -- Text to encode; whitespace is normalized and a leading space is added
        label {str} -- Label to locate in the text, "" when there is none
        tokenizer -- Object whose encode(text) result exposes `ids` and `offsets`
        tokens {dict} -- Ids of the special tokens, keyed 'cls', 'sep', 'pad'

    Keyword Arguments:
        max_len {int} -- Length of the returned sequences (default: {100})
        model_name {str} -- Model name, only checked for "roberta" (default: {"bert"})

    Returns:
        dict -- Keys "ids", "token_type_ids", "targets_start", "targets_end",
                "text", "label" and "offsets"
    """
    target_start, target_end = 0, 0
    text = " " + " ".join(str(text).split())
    label = " " + " ".join(str(label).split())

    # A label of " " means that no label was provided.
    if label != " ":
        _, _, char_targets = locate_label_string(text, label)

    tokenized = tokenizer.encode(text)
    # The first and last entries are dropped; presumably these are the special
    # tokens added by the tokenizer, they are re-added explicitly below.
    input_ids_text = tokenized.ids[1:-1]
    offsets = tokenized.offsets[1:-1]

    if label != " ":
        target_start, target_end = locate_label_tokens(offsets, char_targets)

    if target_end >= max_len - 2:  # target is too far in the sentence, we crop its beginning.
        # Never crop a negative number of tokens: a negative value would index
        # from the end of the sequence and drop the target instead of keeping it.
        n_tok_to_crop = max(0, target_start - max_len // 2)
        new_str_start = offsets[n_tok_to_crop][0]

        input_ids_text = input_ids_text[n_tok_to_crop:]

        # Shift the offsets so that they index into the cropped text.
        offsets = [tuple(t) for t in np.array(offsets[n_tok_to_crop:]) - new_str_start]
        text = text[new_str_start:]

        target_start -= n_tok_to_crop
        target_end -= n_tok_to_crop

    input_ids = (
        [tokens["cls"]]
        + input_ids_text[:max_len - 2]
        + [tokens["sep"]]
    )

    if "roberta" in model_name:
        token_type_ids = [0] * len(input_ids)
    else:
        token_type_ids = [1] * len(input_ids)

    text_offsets = [(0, 0)] + offsets[:max_len - 2] + [(0, 0)]

    # Account for the cls token inserted in first position.
    target_start += 1
    target_end += 1

    assert len(input_ids) == len(token_type_ids) and len(input_ids) == len(text_offsets), (len(input_ids), len(text_offsets))  # noqa

    # Right-pad everything up to max_len.
    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([tokens["pad"]] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        text_offsets = text_offsets + ([(0, 0)] * padding_length)

    return {
        "ids": input_ids,
        "token_type_ids": token_type_ids,
        "targets_start": target_start,
        "targets_end": target_end,
        "text": text,
        "label": label,
        "offsets": text_offsets,
    }

## Dataset

In [None]:
from torch.utils.data import Dataset

class ArticleDataset(Dataset):
    """
    Inference dataset built from a single article.
    Each item is one chunk of the article, encoded without any label.
    """
    def __init__(
        self,
        id_,
        tokenizer,
        tokens,
        max_len=512,
        words_per_split=300,
        margin=10,
        model_name="bert",
        root=""
    ):
        # Encoding settings
        self.tokenizer = tokenizer
        self.tokens = tokens
        self.max_len = max_len
        self.model_name = model_name

        # Chunking settings
        self.words_per_split = words_per_split
        self.margin = margin

        # Load the article, then split it into chunks of text
        self.article = load_text(id_, root=root)
        self.texts = self.article_to_texts()

    def __len__(self):
        return len(self.texts)

    def article_to_texts(self):
        """
        Splits every section of the article into chunks of
        self.words_per_split words, extended by self.margin words on each side.
        Only the first 5000 words of a section are used.
        """
        step = self.words_per_split
        pad = self.margin

        chunks = []
        for section in self.article:
            words = clean_text(section['text']).split(' ')[:5000]  # only keep first 5k words
            n_chunks = len(words) // step + 1

            for k in range(n_chunks):
                lo = max(0, step * k - pad)
                hi = step * (k + 1) + pad
                chunks.append(" ".join(words[lo:hi]))

        return chunks

    def __getitem__(self, idx):
        # The label is empty: there is no target at inference time.
        sample = process_data(
            self.texts[idx],
            "",
            self.tokenizer,
            self.tokens,
            max_len=self.max_len,
            model_name=self.model_name,
        )

        def to_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            "ids": to_long(sample["ids"]),
            "token_type_ids": to_long(sample["token_type_ids"]),
            "target_start": to_long(sample["targets_start"]),
            "target_end": to_long(sample["targets_end"]),
            "text": sample["text"],
            "label": sample["label"],
            "offsets": to_long(sample["offsets"]),
        }


# Model

In [None]:
from transformers import BertModel, BertConfig

# Maps a model name to its (model class, pretrained name, config class).
TRANSFORMERS = {
    name: (BertModel, name, BertConfig)
    for name in ("bert-base-uncased",)
}


class QATransformer(nn.Module):
    """
    Simple model for Question Answering:
    a transformer followed by a small head producing start and end logits.
    """
    def __init__(self, model):
        """
        Arguments:
            model {str} -- Model name, must be a key of TRANSFORMERS and MODEL_PATHS
        """
        super().__init__()
        self.name = model

        self.pad_idx = 1 if "roberta" in self.name else 0

        model_class, _, config_class = TRANSFORMERS[model]

        # The config file is named either bert_config.json or config.json.
        # Only a missing / unreadable file triggers the fallback, any other
        # error is propagated.
        try:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        except OSError:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'config.json')
        config.output_hidden_states = True

        # Built from the config only, no pretrained weights are loaded here.
        self.transformer =  model_class(config)

        self.nb_features = self.transformer.pooler.dense.out_features

        self.logits = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features),
            nn.Tanh(),
            nn.Linear(self.nb_features, 2),
        )

    def forward(self, tokens, token_type_ids):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens
            token_type_ids {torch tensor} -- Token type ids of the sentence tokens

        Returns:
            torch tensor -- Start logits, one per token
            torch tensor -- End logits, one per token
        """

        # Positions equal to the padding index are masked out.
        hidden_states = self.transformer(
            tokens,
            attention_mask=(tokens != self.pad_idx).long(),
            token_type_ids=token_type_ids,
        )[-1]

        # Features of the last layer.
        features = hidden_states[-1]
        logits = self.logits(features)

        start_logits, end_logits = logits[:, :, 0], logits[:, :, 1]

        return start_logits, end_logits

# Inference


## Utils

In [None]:
def load_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Loads the weights of a PyTorch model.
    The checkpoint is mapped to the cpu when read, which handles cpu/gpu incompatibilities.

    Args:
        model (torch model): Model to load the weights to.
        filename (str): Name of the checkpoint.
        verbose (int, optional): Whether to display infos. Defaults to 1.
        cp_folder (str, optional): Folder to load from. Defaults to "".

    Returns:
        torch model: Model with loaded weights.
    """
    path = os.path.join(cp_folder, filename)

    if verbose:
        print(f"\n -> Loading weights from {path}\n")

    # load_state_dict expects a state dict, not a path: the checkpoint has to
    # be deserialized with torch.load first.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict, strict=True)
    return model

## Predict

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader


def predict(model, dataset, batch_size=32):
    """
    Usual predict torch function

    Arguments:
        model {torch model} -- Model to predict with
        dataset {torch dataset} -- Dataset to get predictions from,
            its items must contain the "ids" and "token_type_ids" keys

    Keyword Arguments:
        batch_size {int} -- Batch size (default: {32})

    Returns:
        list of lists -- Start probabilities, one list per sample
        list of lists -- End probabilities, one list per sample
    """

    model.eval()
    start_probas = []
    end_probas = []

    # Run on whichever device the model lives on instead of assuming cuda.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS
    )

    with torch.no_grad():
        for data in loader:
            ids = data["ids"].to(device)
            token_type_ids = data["token_type_ids"].to(device)

            start_logits, end_logits = model(ids, token_type_ids)

            # Softmax over dim 1 of the logits, computed separately for each sample.
            start_probs = torch.softmax(start_logits, dim=1).cpu().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().numpy()

            for s, e in zip(start_probs, end_probs):
                start_probas.append(list(s))
                end_probas.append(list(e))

    return start_probas, end_probas

## Predicted strings from probas

In [None]:
def get_string_from_idx(text, idx_start, idx_end, offsets):
    """
    Rebuilds the predicted string from the token offsets, for the tokens
    going from idx_start to idx_end (included).
    A space is added after a token when there is a gap before the next one.
    """
    # An end located before the start collapses to a single-token span.
    last = idx_start if idx_end < idx_start else idx_end

    n_offsets = len(offsets)
    pieces = []
    for pos in range(idx_start, last + 1):
        pieces.append(text[offsets[pos][0]: offsets[pos][1]])

        has_next = pos + 1 < n_offsets
        if has_next and offsets[pos][1] < offsets[pos + 1][0]:
            pieces.append(" ")

    return "".join(pieces)


def get_pred_from_probas(dataset, start_probas, end_probas, threshold=0.):
    """
    Converts start / end probabilities into predicted strings.
    A sample yields a prediction when one of its maximum probabilities is above
    the threshold, and the most likely start is strictly before the most likely
    end and less than 10 tokens away from it.
    """
    predicted_strings = []

    for sample_idx in range(len(dataset)):
        starts, ends = start_probas[sample_idx], end_probas[sample_idx]

        confident = starts.max() > threshold or ends.max() > threshold
        if not confident:
            continue

        first_tok = np.argmax(starts)
        last_tok = np.argmax(ends)

        valid_span = first_tok < last_tok and last_tok - first_tok < 10
        if not valid_span:
            continue

        sample = dataset[sample_idx]
        predicted_strings.append(
            get_string_from_idx(sample["text"], first_tok, last_tok, sample["offsets"])
        )

    return predicted_strings

## $k$-fold

In [None]:
def post_process(preds):
    """
    Naive processing of the predictions:
    keeps the sorted unique values and joins them with "|".
    """
    unique_preds = np.unique(preds)
    submission_string = "|".join(unique_preds)
    return submission_string


def k_fold_inference(config, df, tokenizer, tokens, weights, threshold=0.):
    """
    Predicts every article of df with all the checkpoints in weights,
    averages the probabilities over the models and converts them to strings.

    Returns:
        list of str -- One "|"-separated prediction string per article
    """
    def build_model(checkpoint):
        # One model per checkpoint, moved to the gpu before loading the weights.
        net = QATransformer(config.selected_model).cuda()
        net.zero_grad()
        load_model_weights(net, checkpoint)
        return net

    models = [build_model(checkpoint) for checkpoint in weights]

    all_preds = []
    for text_id in tqdm(df['Id']):
        dataset = ArticleDataset(
            text_id,
            tokenizer,
            tokens,
            max_len=512,
            model_name="bert",
            root=DATA_PATH_TEST
        )

        per_model_starts = []
        per_model_ends = []
        for model in models:
            starts, ends = predict(model, dataset, batch_size=config.batch_size_val)
            per_model_starts.append(starts)
            per_model_ends.append(ends)

        # Average over the models (axis 0).
        avg_starts = np.mean(per_model_starts, 0)
        avg_ends = np.mean(per_model_ends, 0)

        article_preds = get_pred_from_probas(
            dataset, avg_starts, avg_ends, threshold=threshold
        )
        all_preds.append(post_process(article_preds))

    return all_preds

# Main

In [None]:
# The Config class itself is used (not an instance): its values are read as class attributes.
config = Config
# Sample submission file, its 'Id' column lists the articles to predict.
df = pd.read_csv(DATA_PATH + 'sample_submission.csv')

In [None]:
# Tokenizer and special token ids for the model selected in the config.
tokenizer, tokens = create_tokenizer_and_tokens(config)

In [None]:
# Dataset built for the first article only.
# NOTE(review): k_fold_inference builds its own dataset per article, so this one
# looks like a sanity check - confirm before removing.
dataset = ArticleDataset(
    df['Id'][0],
    tokenizer,
    tokens,
    max_len=512,
    model_name="bert",
    root=DATA_PATH_TEST,
)

In [None]:
# Checkpoints found in CP_PATH, sorted by name; only the first one is kept.
weights = sorted(glob.glob(CP_PATH + "*.pt"))[:1]

In [None]:
# Model predictions: one "|"-separated string per article of df.
preds_model = k_fold_inference(
    config,
    df,
    tokenizer,
    tokens,
    weights,
    threshold=0.5,
)

# Merge with 0.702 notebook
> https://www.kaggle.com/prashansdixit/coleridge-initiative-eda-baseline-model

## Compute

In [None]:
def read_append_return(filename, root=DATA_PATH_TRAIN, output='text'):
    """
    Reads the json file of an article and returns its text data as one string.

    Arguments:
        filename {str} -- File name without the '.json' extension

    Keyword Arguments:
        root {str} -- Folder containing the file (default: {DATA_PATH_TRAIN})
        output {str} -- 'text' for the section texts joined with spaces,
            'head' for the section titles joined with spaces,
            anything else for titles and texts interleaved and joined with '. '
            (default: {'text'})

    Returns:
        str -- The requested text
    """
    json_path = os.path.join(root, (filename + '.json'))
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # Missing or null fields are treated as empty strings so that join does not fail.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Title followed by text, section after section.
    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
# Training annotations.
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

# Lowercased unique values of the three label columns.
temp_1 = [x.lower() for x in train_df['dataset_label'].unique()]
temp_2 = [x.lower() for x in train_df['dataset_title'].unique()]
temp_3 = [x.lower() for x in train_df['cleaned_label'].unique()]

# Every known label, whichever column it comes from.
existing_labels = set(temp_1 + temp_2 + temp_3)

In [None]:
# Test articles: the 'text' column holds the section texts of each article joined together.
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
sample_sub['text'] = sample_sub['Id'].apply(partial(read_append_return, root=DATA_PATH_TEST))

In [None]:
# Naive string matching: a known label is predicted for an article as soon as
# it appears in the lowercased text of the article.
preds_naive = []
for sample_text in sample_sub['text']:
    # Lowercase once per article rather than once per known label.
    lowered_text = sample_text.lower()

    cleaned_labels = {
        clean_text(known_label)
        for known_label in existing_labels
        if known_label in lowered_text
    }

    # Sorted so that the output does not depend on the set iteration order.
    preds_naive.append('|'.join(sorted(cleaned_labels)))

## Merge

In [None]:
def merge_preds(preds_naive, preds_model):
    """
    Merges the string matching predictions with the model predictions.
    A model prediction is only kept when it neither contains nor is contained
    in one of the string matching predictions of the same article.

    Arguments:
        preds_naive {list of str} -- "|"-separated string matching predictions
        preds_model {list of str} -- "|"-separated model predictions

    Returns:
        list of str -- "|"-separated merged predictions, one per article
    """
    preds = []
    for i, naive_string in enumerate(preds_naive):
        # Empty strings are dropped: "" is a substring of everything, so keeping
        # it would discard every model prediction when nothing was matched.
        pred_naive = [p for p in naive_string.split('|') if p]
        pred_model = [p for p in preds_model[i].split('|') if p]

        pred_model_kept = [
            pred_m
            for pred_m in pred_model
            if not any(pred_m in pred_n or pred_n in pred_m for pred_n in pred_naive)
        ]

        preds.append("|".join(pred_naive + pred_model_kept))
    return preds

In [None]:
# Final predictions: string matching results completed with the model ones.
preds = merge_preds(preds_naive, preds_model)

## Submit

In [None]:
# Fill the submission dataframe with the merged predictions.
df['PredictionString'] = preds

# Written without the index column.
df.to_csv('submission.csv', index=False)

# Displays the first rows in the notebook.
df.head()

Thanks for reading !

Hopefully this work proves helpful in beating the public LB baselines...

Don't forget to upvote :)