This notebook is a template for using Literal Matching + Masked Language Modeling to identify datasets in papers.

The training of the Bert model was done in another notebook: [\[Coleridge\] BERT - Masked Dataset Modeling.](https://www.kaggle.com/tungmphung/coleridge-bert-masked-dataset-modeling)

The approach is:
- Locate all the sequences of capitalized words (these sequences may contain some stopwords),
- Replace each sequence with one of two special symbols (e.g. $ and #), indicating whether or not that sequence represents a dataset name.
- Have the model learn the MLM task.

# Install packages

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
from functools import partial
import string

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline, AutoModelForSequenceClassification

sns.set()
# Seed each random number generator that is imported here (stdlib, NumPy, PyTorch)
# so that repeated runs start from the same state.
random.seed(123)
np.random.seed(456)
torch.manual_seed(2021)
# Ask cuDNN for deterministic algorithms and switch off its benchmark-based
# algorithm selection, which can otherwise vary between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from nltk.corpus import stopwords
from unidecode import unidecode

# English stopwords; used to reject known labels that are plain stopwords.
STOPWORDS = set(stopwords.words('english'))
# When True, model predictions that are close (Jaccard > 0.5) to an already known
# label are dropped, and the literal-match result is not used in the final output.
REMOVE_MATCH = True
# True  -> load a masked-LM model and use the 'fill-mask' pipeline;
# False -> load a sequence-classification model instead.
MASKEDLM = False

# Load data

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
# Also serves as the default folder argument of json_to_text().
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
# NOTE(review): `train` is rebound to a different CSV in the "Create a knowledge bank"
# cell, so this frame is replaced before predict() ever reads it.
train = pd.read_csv(train_path)

In [None]:
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

# Folder holding one <Id>.json file per row of the sample submission.
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
# paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
# Parse every paper once and keep it in memory, keyed by paper id; later cells
# iterate each parsed paper as a sequence of sections and read section['text'].
papers = {}
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

# Literal Matching

### Create a knowledge bank

In [None]:
# NOTE(review): allow_pickle=True makes np.load unpickle Python objects, which can
# execute arbitrary code -- only load this file if its origin is trusted.
existing_labels = set(np.load('../input/showdata-labels1/existing_labels.npy', allow_pickle = True).tolist())
# Rebinds `train` (earlier read from train.csv) to the pre-cleaned frame; predict()
# reads its 'text' and 'cleaned_label' columns.
train = pd.read_csv('../input/showdata-labels1/cleaned_train.csv')
    
print(f'No. different labels: {len(existing_labels)}')

train.head()

In [None]:
existing_labels

### Matching on test data

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and reduce it to space-separated ASCII alphanumerics.

    ``txt`` is converted with ``str()`` first, so non-string values are accepted.
    Every run of characters outside ``A-Za-z0-9`` becomes one space, and
    leading/trailing whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Run ``clean_text`` on ``txt`` and squeeze any run of spaces to one space."""
    return re.sub(' +', ' ', clean_text(txt))

## Functions for literal matching

def json_to_text(filename, train_files_path=train_files_path, output='text'):
    """Read one paper JSON file and flatten it into a single string.

    Parameters
    ----------
    filename : str
        Paper id; ``'.json'`` is appended to form the file name.
    train_files_path : str
        Folder that contains the JSON file.
    output : str
        ``'text'`` -> the ``text`` of every section, joined by single spaces;
        ``'head'`` -> the ``section_title`` of every section, joined by spaces;
        any other value -> title and text of every section interleaved and
        joined by ``'. '``.

    Returns
    -------
    str
        The joined string selected by ``output``.
    """
    json_path = os.path.join(train_files_path, filename + '.json')
    with open(json_path, 'r') as f:
        sections = json.load(f)

    # dict.get() yields None for a missing key (and JSON null also loads as None);
    # str.join() would raise TypeError on it, so fall back to an empty string.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    # Only build the string that was actually requested.
    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.extend((heading, content))
    return '. '.join(combined)
    
def text_cleaning(text):
    """Normalise ``text`` to lower-case, space-separated ASCII alphanumerics.

    Steps, in order:
      1. delete every character listed in ``string.punctuation`` (no space is
         inserted, so ``"U.S."`` becomes ``"US"``);
      2. lower-case and turn each remaining non-alphanumeric run into one space;
      3. strip the ends and squeeze repeated spaces;
      4. remove characters in the listed emoji ranges.
    """
    without_punct = ''.join(ch for ch in text if ch not in string.punctuation)

    normalized = re.sub('[^A-Za-z0-9]+', ' ', str(without_punct).lower())
    normalized = normalized.strip()
    normalized = re.sub(' +', ' ', normalized)

    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', normalized)

# Register `progress_apply` on pandas objects, then read every test paper with
# json_to_text (default output='text') into a new 'text' column.
tqdm.pandas()
sample_submission['text'] = sample_submission['Id'].progress_apply(partial(json_to_text, train_files_path=paper_test_folder))

In [None]:
literal_preds = []

def predict(sample_sub):
    """Literal-match known dataset labels against each paper's text.

    For every row of ``sample_sub`` (the ``'Id'`` and ``'text'`` columns are
    read) the labels are the union of

    * the ``cleaned_label`` values of rows in the module-level ``train`` frame
      whose ``text`` equals ``text_cleaning(<row text>)``, and
    * every entry of ``existing_labels`` that occurs as a substring of the
      lower-cased row text, is not in ``STOPWORDS`` and is longer than one
      character.

    All labels are normalised with ``clean_text`` and de-duplicated.

    Returns
    -------
    tuple(list, list)
        ``(id_list, labels_list)``: parallel lists; each entry of
        ``labels_list`` is the labels of one paper joined by ``'|'`` in sorted
        order, so the output does not depend on set iteration order.
    """
    id_list = []
    labels_list = []
    # Passing `total` lets tqdm show a proper progress bar for the generator.
    for _, row in tqdm(sample_sub.iterrows(), total=len(sample_sub)):
        sample_text = row['text']
        # Lower-case once per paper instead of once per known label.
        lowered_text = sample_text.lower()

        exact_rows = train[train['text'] == text_cleaning(sample_text)]
        cleaned_labels = {clean_text(label) for label in exact_rows['cleaned_label'].to_list()}

        for known_label in existing_labels:
            if (len(known_label) > 1
                    and known_label not in STOPWORDS
                    and known_label in lowered_text):
                cleaned_labels.add(clean_text(known_label))

        labels_list.append('|'.join(sorted(cleaned_labels)))
        id_list.append(row['Id'])
    return (id_list, labels_list)

# predict() returns (ids, label strings); only the label strings are kept, in the
# same row order as sample_submission.
Z=predict(sample_submission)
literal_preds = Z[1]

# Masked Dataset Modeling

### Paths and Hyperparameters

In [None]:
#PRETRAINED_PATH = '../input/coleridge-bert-masked-dataset-modeling-edit/mlm-model'
# Model weights and tokenizer are loaded from these two folders.
PRETRAINED_PATH = '../input/coleridgebertsequenceclassification/seqClass-model'
TOKENIZER_PATH = '../input/coleridgebertsequenceclassification/model_tokenizer'

# Sentences longer than MAX_LENGTH words are cut into windows of MAX_LENGTH words
# whose start positions are MAX_LENGTH - OVERLAP words apart (see shorten_sentences).
MAX_LENGTH = 64
OVERLAP = 20

PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

DATASET_SYMBOL = '$' # this symbol represents a dataset name
NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# Transform data to MLM format

### Load model and tokenizer

In [None]:
# During training, the suspect phrases (i.e. uppercase/ connection words) are replaced with either 
# $ denoting dataset or # denoting not dataset, and the phrases are randomly masked - the model is
# then tasked to predict the masked words. In the inference phase, the suspect phrases (this time # 
# or $ is unknown) are masked and the model infers either # or $. 

# The fill-mask pipeline has an argument target_tokens when called which indicates what the valid 
# tokens for prediction are.

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)


# Exactly one of `mlm` / `snt` is created, depending on MASKEDLM; both run on GPU 0
# when CUDA is available and on CPU (device=-1) otherwise.
if MASKEDLM:
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)
    mlm = pipeline(
        'fill-mask', 
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
        )
else:
    # Sequence-classification mode: the 'sentiment-analysis' task name is used to get a
    # text-classification pipeline; the prediction cell treats 'LABEL_1' as "dataset".
    model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_PATH)
    snt = pipeline(
        'sentiment-analysis', 
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
        )

### Auxiliary functions

In [None]:
def jaccard_similarity(s1, s2):
    """Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces and compared as sets of words:
    ``|A & B| / |A | B|``.

    Parameters
    ----------
    s1, s2 : str
        Space-separated strings.

    Returns
    -------
    float
        Value in ``[0, 1]``; ``1.0`` when both strings contain the same words.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    shared = len(words1 & words2)
    # |A union B| = |A| + |B| - |A intersect B| is only valid for set sizes;
    # using the raw list lengths over-counts repeated words and deflates the score.
    union = len(words1) + len(words2) - shared
    # split(" ") never returns an empty list, so union is always >= 1.
    return float(shared) / union

def clean_paper_sentence(s):
    """
    Case-preserving counterpart of clean_text: every run of characters outside
    A-Za-z0-9 becomes one space, the ends are stripped and repeated spaces are
    squeezed. The input is converted with str() first.
    """
    text = str(s)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    return re.sub(' +', ' ', text.strip())

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping windows of words.

    A sentence with more than ``max_length`` whitespace-separated words is
    replaced by windows of up to ``max_length`` words whose start positions are
    ``max_length - overlap`` words apart. Windows are emitted until the start
    position passes the end of the sentence, so a trailing window can be short.
    Shorter sentences are passed through unchanged.

    Parameters
    ----------
    sentences : iterable of str
    max_length : int, optional
        Maximum words per window; defaults to the module-level MAX_LENGTH.
    overlap : int, optional
        Words shared by consecutive windows; defaults to the module-level OVERLAP.

    Returns
    -------
    list of str

    Raises
    ------
    ValueError
        If ``overlap >= max_length``: the stride would be zero or negative, and a
        negative stride would silently drop every long sentence.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            short_sentences.extend(
                ' '.join(words[p:p + max_length])
                for p in range(0, len(words), step))
        else:
            short_sentences.append(sentence)
    return short_sentences

# connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates_old(sentence):
    """
    Earlier version of the candidate finder.

    $sentence is indexed word by word; the word at index 0 is never examined.
    A run is a maximal stretch of words that either start with an uppercase
    letter or are members of $connection_tokens. A run is kept when, after
    peeling connection words (compared in lowercase) off both of its ends, at
    least 2 words remain.

    Returns a list of (start, end) index pairs, both inclusive, that cover the
    whole run, i.e. the peeled connection words are still inside the span.
    """
    def candidate_qualified(words):
        # Shrink a [lo, hi) window past connection words on either side.
        lo, hi = 0, len(words)
        while lo < hi and words[lo].lower() in connection_tokens:
            lo += 1
        while hi > lo and words[hi - 1].lower() in connection_tokens:
            hi -= 1
        return hi - lo >= 2

    spans = []
    span_first = span_last = -1  # -1 means "no run is open"

    for pos in range(1, len(sentence)):
        token = sentence[pos]
        if token[0].isupper() or token in connection_tokens:
            # Open a run if necessary, then extend it to this word.
            if span_first == -1:
                span_first = pos
            span_last = pos
            continue

        # Any other word closes the run that is currently open, if any.
        if span_first != -1:
            if candidate_qualified(sentence[span_first:span_last + 1]):
                spans.append((span_first, span_last))
            span_first = span_last = -1

    # A run that reaches the last word is closed here.
    if span_first != -1 and candidate_qualified(sentence[span_first:span_last + 1]):
        spans.append((span_first, span_last))

    return spans

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset', 'survey', 'study','sequence'}
prep_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'this', 'we', 'their', 'it', 'to'}
def find_mask_candidates(sentence):
    """
    Locate candidate phrases in $sentence (indexed word by word).

    The word at index 0 is never examined. A run is opened and extended by
    words that start with an uppercase letter, or that start with a numeric
    character and are longer than 2 characters. Words in $connection_tokens
    neither extend nor close the open run, so they end up inside a span only
    when a qualifying word follows them. Any other word closes the run.

    A closed run is kept when, after dropping leading and trailing words found
    in $prep_tokens (compared in lowercase), it
      - has more than 3 words, more than 2 of them non-numeric, and
      - is not a 4-word "X and Y <number>" pattern (citation-like).

    Returns a list of (start, end) index pairs into $sentence, both inclusive,
    with the dropped leading/trailing words excluded.
    """
    def candidate_qualified(words):
        # Count prep tokens at the front ...
        lead = 0
        while lead < len(words) and words[lead].lower() in prep_tokens:
            lead += 1
        # ... and at the back of what is left.
        trail = 0
        while trail < len(words) - lead and words[len(words) - 1 - trail].lower() in prep_tokens:
            trail += 1
        core = words[lead:len(words) - trail]

        if len(core) <= 3:
            return False, []
        if sum(1 for w in core if not w.isnumeric()) <= 2:
            return False, []
        if len(core) == 4 and core[-1].isnumeric() and core[1] == 'and':
            # to get rid of references, e.g. Johnson and Johnson 2018
            return False, []
        return True, [lead, trail]

    spans = []

    def close_run(first, last):
        # Record the run [first, last] if it qualifies, shrunk by the trimmed words.
        keep, trims = candidate_qualified(sentence[first:last + 1])
        if keep:
            spans.append((first + trims[0], last - trims[1]))

    run_first = run_last = -1  # -1 means "no run is open"
    for pos in range(1, len(sentence)):
        token = sentence[pos]
        if token[0].isupper() or (token[0].isnumeric() and len(token) > 2):
            if run_first == -1:
                run_first = pos
            run_last = pos
            continue
        if token in connection_tokens:
            # Connection words are skipped without touching the open run.
            continue
        if run_first != -1:
            close_run(run_first, run_last)
            run_first = run_last = -1

    # to deal with case where phrase end is last word
    if run_first != -1:
        close_run(run_first, run_last)

    return spans

def pre_tokenize(sentence):
    """Replace acronym-like and long capitalised words with placeholder symbols.

    Parameters
    ----------
    sentence : str or iterable of str
        A string is split on whitespace; any other iterable is taken as the
        list of words.

    Returns
    -------
    list of str
        A new list (the argument is not modified) in which
        * words that are entirely uppercase become ``'#'``;
        * words that start with an uppercase letter, are longer than 8
          characters and are not in the small exception list become ``'$'``;
        * all other words are kept as they are.
    """
    # Explicit type check instead of a bare ``except`` around ``.split()``, which
    # would also have hidden unrelated errors.
    if isinstance(sentence, str):
        words = sentence.split()
    else:
        words = list(sentence)

    wordlist = ['university', 'initiative','international','information']
    tokens = []
    for word in words:
        if word.isupper():
            tokens.append('#')
        elif word[0].isupper() and len(word)>8 and word.lower() not in wordlist:
            tokens.append('$')
        else:
            tokens.append(word)
    return tokens

### Transform

In [None]:
# `mlm` only exists in masked-LM mode, and `mask` is only used by that mode's
# branch of the transform cell.
if MASKEDLM:
    mask = mlm.tokenizer.mask_token

In [None]:
# all_test_data[i] holds the model inputs of the i-th paper in sample_submission as a
# list of (model input text, candidate phrase) string pairs.
all_test_data = []
dN = 5  # classification mode: context words kept on each side of the candidate phrase
for paper_id in sample_submission['Id']:
    # load paper
    paper = papers[paper_id]

    # extract sentences (split on '.', cleaned, de-duplicated)
    sentences = {clean_paper_sentence(sentence) for section in paper
                 for sentence in section['text'].split('.')}
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
    # keep only sentences that mention 'data' or 'study' somewhere
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
    sentences = [sentence.split() for sentence in sentences] # sentence = list of words

    # build one model input per candidate phrase
    test_data = []
    for sentence in sentences:
        for phrase_start, phrase_end in find_mask_candidates(sentence):
            # Always store the phrase as a string. The classification branch used to
            # store a list of words, which only worked because clean_text() calls
            # str() on its input and then strips the list punctuation.
            phrase = ' '.join(sentence[phrase_start:phrase_end+1])
            if MASKEDLM:
                # whole sentence with the phrase replaced by the mask token
                dt_point = sentence[:phrase_start] + [mask] + sentence[phrase_end+1:]
            else:
                # phrase plus up to dN words of context per side, pre-tokenized
                dt_point = pre_tokenize(sentence[max(phrase_start-dN,0):phrase_end+1+dN])
            test_data.append((' '.join(dt_point), phrase))

    all_test_data.append(test_data)

### Predict

In [None]:
# pred_labels[i] is the '|'-joined prediction string for the i-th entry of all_test_data.
pred_labels = []

# NOTE(review): the progress bar is never closed in this cell.
pbar = tqdm(total = len(all_test_data))
# each iteration is one ID, test_data contains a list of tuples, 1st member of tuple
# is the masked sentence, second member is the phrase
for i,test_data in enumerate(all_test_data):
    pred_bag = set()
    
    # NOTE(review): texts/phrases/preds/batch_pred are only (re)assigned for papers with
    # at least one candidate, so after the loop they hold the values of the last such paper.
    if len(test_data):
        texts, phrases = list(zip(*test_data))
        preds = []
        # iterate through sentences for that ID
        for p_id in range(0, len(texts), PREDICT_BATCH):
            batch_texts = texts[p_id:p_id+PREDICT_BATCH]
            # since there are two target tokens, batch_pred is a list of two dicts, each
            # containing the token identity and the corresponding prob score
            if MASKEDLM:
                batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])
            else:
                batch_pred = snt(list(batch_texts))
            
            # presumably the fill-mask pipeline returns an un-nested result for a single
            # input, so it is re-wrapped to keep one entry per text -- TODO confirm
            if len(batch_texts) == 1 and MASKEDLM:
                batch_pred = [batch_pred]
            
            preds.extend(batch_pred)
        
        if MASKEDLM:
            # append phrase to pred_bag if probability score corresponding to DATASET_SYMBOL is
            # sufficiently high - pred_bag contains all predictions for a particular ID
            for (result1, result2), phrase in zip(preds, phrases):
                # print(result1)
                # accept only when the DATASET_SYMBOL score is more than twice the other score
                if (result1['score'] > result2['score']*2 and result1['token_str'] == DATASET_SYMBOL) or\
                   (result2['score'] > result1['score']*2 and result2['token_str'] == DATASET_SYMBOL):
                    pred_bag.add(clean_text(phrase))
        else:
            # each prediction is a dict read via its 'label' key; 'LABEL_1' is treated as
            # "this phrase is a dataset". clean_text() stringifies its argument, so `phrase`
            # may be a plain string or a list of words.
            for lbl, phrase in zip(preds,phrases):
                if lbl['label']=='LABEL_1':
                    pred_bag.add(clean_text(phrase))
    # filter labels by jaccard score 
    filtered_labels = []
    # starting with longest labels, label is accepted only if there are no previously
    # accepted labels or if label is sufficiently dissimilar to other accepted labels
    for label in sorted(pred_bag, key=len, reverse=True):
        if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered_labels):
            # with REMOVE_MATCH on, also drop labels that resemble an already known label
            if any(jaccard_similarity(label, got_label) > 0.5 for got_label in existing_labels) and REMOVE_MATCH:
                continue
            filtered_labels.append(label)
            
    pred_labels.append('|'.join(filtered_labels))
    pbar.update(1)

In [None]:
batch_pred

In [None]:
preds

In [None]:
# Debug view: print label, score and cleaned phrase for the predictions left over from
# the prediction loop (i.e. the last paper that had candidates).
# NOTE(review): `mlm_pred` is only assigned by the aggregation loop further down, so on
# a fresh top-to-bottom run the MASKEDLM branch would raise NameError -- confirm what
# was meant to be shown here.
if MASKEDLM:
    mlm_pred
else:
    for lbl, phrase in zip(preds,phrases):
        print('Label: '+ str(lbl['label'])+ '-> '+str(lbl['score']) +'; string:'+ clean_text(phrase))

In [None]:
# pred_labels[:5]

## Aggregate final predictions and write submission file

In [None]:
# Combine both predictors per paper: the literal-match string is used only when it is
# non-empty and REMOVE_MATCH is off; in every other case the model prediction is used.
final_predictions = []
for literal_match, mlm_pred in zip(literal_preds, pred_labels):
    if literal_match and not REMOVE_MATCH:
        final_predictions.append(literal_match)
    else:
        final_predictions.append(mlm_pred)

In [None]:
# Attach the predictions, keep only the two submission columns (this drops the 'text'
# column that was added for literal matching) and write the CSV without the index.
sample_submission['PredictionString'] = final_predictions
sample_submission = sample_submission[['Id','PredictionString']]
sample_submission.to_csv('submission.csv', index=False)

In [None]:
sample_submission.head()

In [None]:
preds