This notebook illustrates how to use Masked Language Modeling for this competition.

Observation: most dataset names consist only of capitalized words plus a few stopwords such as `on`, `in`, `and` (e.g. `Early Childhood Longitudinal Study`, `Trends in International Mathematics and Science Study`). 

Thus, one approach to find the datasets is: 
- Locate all the sequences of capitalized words (these sequences may contain some stopwords), 
- Replace each sequence with one of 2 special symbols (e.g. `$` and `#`), indicating whether that sequence represents a dataset name or not.
- Have the model learn the MLM task.

The code below shows how to train a model for that purpose with the help of the Hugging Face `transformers` and `datasets` libraries.

In [None]:
# Row cap applied to the raw CSV frames via `df[:MAX_SAMPLE]` (None keeps every row).
MAX_SAMPLE = None # set a small number (e.g. 50) for experimentation, set None for production.
SEED = 42 # value handed to set_seed()

# Checkpoint name used to load the tokenizer, the masked-LM model and the saved config.
model_checkpoint = "bert-base-cased"

MAX_LENGTH = 64 # max words per sentence; longer sentences are split by shorten_sentences()
OVERLAP = 20 # number of words shared by consecutive chunks of a split sentence
LENGTH = 1 # a cleaned sentence is kept only if it has more than this many characters
FL_TH = 0.75 # a label is dropped when its Jaccard similarity to an already kept label is >= this

DATASET_SYMBOL = '$' # this symbol represents a dataset name
NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# Install packages

In [None]:
# Offline installs from locally attached files: `datasets` is resolved from a local
# directory (`--no-index --find-links`), the other packages from explicit wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl


import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline, AutoConfig

from IPython.display import clear_output


# Wipe the output printed so far by this cell (the pip install logs).
clear_output()

In [None]:
# https://huggingface.co/transformers/_modules/transformers/trainer_utils.html
def set_seed(seed: int):
    """
    Helper function for reproducible behavior: seeds ``random``, ``numpy`` and ``torch``
    (CPU and every CUDA device) and switches cuDNN to deterministic, non-benchmark mode.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Report the seed that was actually applied (the argument), not the global SEED constant.
    print(f'Setted Pipeline SEED = {seed}')


# Seed every RNG once, using the SEED constant from the configuration cell.
set_seed(SEED)

In [None]:
def text_cleaning(text):
    '''
    Normalise a piece of text so that two labels can be compared word by word.

    The input is converted with ``str`` and lower-cased; every run of characters
    outside ``[A-Za-z0-9]`` (punctuation, emojis, whitespace, ...) is treated as a
    separator, and the remaining alphanumeric tokens are joined by single spaces.

    text - Sentence that needs to be cleaned
    Returns the cleaned string (no leading, trailing or repeated spaces).
    '''
    # Lower-casing happens before tokenising, exactly as in the previous implementation.
    # Keeping only the alphanumeric runs already removes emojis and collapses spaces, so no
    # separate emoji pattern or second whitespace pass is needed.
    return ' '.join(re.findall('[A-Za-z0-9]+', str(text).lower()))


def jaccard_similarity(s1, s2):
    """
    Return the Jaccard similarity of two strings, compared as sets of words.

    Both strings are split on single spaces. The score is
    ``|words1 & words2| / |words1 | words2|``, so repeated words inside one string
    no longer inflate the union (the former list-length based union did).
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    # str.split(" ") always yields at least one element, so the union is never empty.
    return float(len(words1 & words2)) / len(words1 | words2)


def find_data_sets_id(publication_id: int) -> list:
    """
    Return the ``data_set_id`` of every record in the module-level
    ``data_set_citations`` whose ``publication_id`` equals the given one,
    in the order the records appear.
    """
    return [
        citation['data_set_id']
        for citation in data_set_citations
        if citation['publication_id'] == publication_id
    ]


def find_data_set_citations_mention_list(publication_id: int) -> str:
    """
    Build one ``'|'``-separated string with all mentions cited by a publication.

    For every record of the module-level ``data_set_citations`` that belongs to
    ``publication_id`` its ``mention_list`` is joined with ``'|'``; records that
    yield an empty string are skipped and the rest are joined with ``'|'`` again.
    """
    per_citation = (
        '|'.join(citation['mention_list'])
        for citation in data_set_citations
        if citation['publication_id'] == publication_id
    )
    return '|'.join(chunk for chunk in per_citation if chunk != '')


def find_data_sets_title(data_set_id: int) -> str:
    """
    Return the ``title`` of the first record in the module-level ``data_sets``
    whose ``data_set_id`` matches, or ``None`` when no record matches.
    """
    matching_titles = (
        entry['title'] for entry in data_sets if entry['data_set_id'] == data_set_id
    )
    return next(matching_titles, None)
        

def RichContextDF(publications: '.json') -> pd.DataFrame:
    """
    Build one row per publication with its cited mentions and dataset titles.

    Args:
        publications: iterable of dicts, each providing ``publication_id`` and
            ``text_file_name`` (the parsed ``publications.json``).

    Returns:
        DataFrame with the columns ``publication_id``, ``text_file_name``,
        ``citations_mention_list`` (``'|'``-joined mentions) and
        ``data_sets_title`` (``'||'``-joined titles of the cited datasets).
    """
    columns = ['publication_id', 'text_file_name', 'citations_mention_list', 'data_sets_title']
    rows = []
    for publication in publications:
        pub_id = publication['publication_id']  # key into data_set_citations
        titles = (find_data_sets_title(ds_id) for ds_id in find_data_sets_id(pub_id))
        rows.append({
            'publication_id': pub_id,
            'text_file_name': publication['text_file_name'],  # to get text_file
            'citations_mention_list': find_data_set_citations_mention_list(pub_id),
            # find_data_sets_title returns None for an unknown id; skipping those avoids a
            # TypeError in str.join instead of aborting the whole frame construction.
            'data_sets_title': '||'.join(title for title in titles if title is not None),
        })

    return pd.DataFrame(rows, columns=columns)


def clean_paper_sentence(s):
    """
    Case-preserving counterpart of text_cleaning: keep only the runs of
    ASCII letters and digits found in ``str(s)`` and join them with single spaces.
    """
    alnum_tokens = re.findall('[A-Za-z0-9]+', str(s))
    return ' '.join(alnum_tokens)


def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping word windows.

    Sentences with at most ``max_length`` words are passed through unchanged.
    Longer ones are cut into windows of ``max_length`` words, where consecutive
    windows share ``overlap`` words. Windowing stops as soon as a window reaches
    the end of the sentence, so no trailing fragment is emitted that is already
    fully contained in the previous window.

    Args:
        sentences: iterable of whitespace-separated strings.
        max_length: window size in words; defaults to the global ``MAX_LENGTH``.
        overlap: words shared by neighbouring windows; defaults to the global ``OVERLAP``.

    Returns:
        list of strings.

    Raises:
        ValueError: if ``overlap`` is not smaller than ``max_length``.
    """
    max_length = MAX_LENGTH if max_length is None else max_length
    overlap = OVERLAP if overlap is None else overlap
    step = max_length - overlap
    if step <= 0:
        # A non-positive step would either raise inside range() or silently drop the sentence.
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue
        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            if start + max_length >= len(words):
                # This window already covers the tail; later windows would be pure duplicates.
                break
    return short_sentences


def find_sublist(big_list, small_list):
    """
    Return every start index at which $small_list occurs as a contiguous
    slice of $big_list (ascending order, overlapping matches included).
    """
    width = len(small_list)
    last_start = len(big_list) - width
    return [
        start
        for start in range(last_start + 1)
        if big_list[start:start + width] == small_list
    ]


def jaccard_similarity_list(l1, l2):
    """
    Return the Jaccard Similarity score of 2 lists, compared as sets.

    The score is ``|set(l1) & set(l2)| / |set(l1) | set(l2)|``; duplicated items in
    either list therefore do not inflate the union. Two empty lists have no
    items in common and yield ``0.0`` instead of raising ZeroDivisionError.
    """
    items1, items2 = set(l1), set(l2)
    union = items1 | items2
    if not union:
        return 0.0
    return float(len(items1 & items2)) / len(union)


connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_negative_candidates(sentence, labels):
    """
    Locate phrases of $sentence that look like dataset names but are not labels.

    $sentence is a list of words and $labels a list of word lists. Starting at
    index 1 (the first word is never inspected), every maximal run of words that
    either start with an uppercase letter or are one of $connection_tokens is
    examined. A run qualifies when, after stripping connection tokens
    (case-insensitively) from both ends, at least 2 words remain and their Jaccard
    similarity to every label is below 0.75.

    Returns a list of (start, end) index pairs, both inclusive. The pair spans the
    whole run, i.e. including any leading/trailing connection tokens that were
    stripped only for the qualification check.
    """
    def candidate_qualified(words, labels):
        first, stop = 0, len(words)
        while first < stop and words[first].lower() in connection_tokens:
            first += 1
        while stop > first and words[stop - 1].lower() in connection_tokens:
            stop -= 1
        core = words[first:stop]
        if len(core) < 2:
            return False
        return all(jaccard_similarity_list(core, label) < 0.75 for label in labels)

    spans = []
    run_start = None  # index where the current run began, None while outside a run
    for idx in range(1, len(sentence)):
        token = sentence[idx]
        if token[0].isupper() or token in connection_tokens:
            if run_start is None:
                run_start = idx
            continue
        # The current token breaks the run (if any) that ended at idx - 1.
        if run_start is not None:
            if candidate_qualified(sentence[run_start:idx], labels):
                spans.append((run_start, idx - 1))
            run_start = None

    # A run may extend to the very last word of the sentence.
    if run_start is not None and candidate_qualified(sentence[run_start:], labels):
        spans.append((run_start, len(sentence) - 1))

    return spans

# Load data

In [None]:
# train
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
# Folder with one `<Id>.json` file per publication (read later when building the corpus).
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

train = pd.read_csv(train_path)
print('train size before agg.:', len(train))

# NOTE: the cap is applied to raw CSV rows, i.e. before the per-publication aggregation below.
train = train[:MAX_SAMPLE]
# Group by publication, training labels should have the same form as expected output.
train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()    
print('train size after agg.:', len(train))

train = train.sort_values(by=['Id'])
train.head()

## Pseudo

In [None]:
# Pseudo labels: a submission-style CSV with an `Id` and a `PredictionString` column
# (both columns are used by the cells below).
pseudo_train_path = '../input/coleridge-pseudolabelsv2-0585/submission.csv'
# Re-assigned here with the same value as in the train-loading cell.
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

pseudo_train = pd.read_csv(pseudo_train_path)
print('pseudo_train size before drop_duplicates.:', len(pseudo_train))

# Cap the raw rows, then keep only the first row of each publication Id.
pseudo_train = pseudo_train[:MAX_SAMPLE]
pseudo_train = pseudo_train.drop_duplicates(subset='Id')
print('pseudo_train size after drop_duplicates:', len(pseudo_train), '\n')

pseudo_train = pseudo_train.sort_values(by=['Id']).reset_index(drop=True)
pseudo_train.head()

## Original Add Pseudo

In [None]:
# Attach the pseudo labels by publication Id. Adding the two Series directly aligns them on
# the row index, which pairs the right publications only when both frames hold exactly the
# same Ids in the same order; otherwise labels land on the wrong paper or become NaN.
pseudo_labels_by_id = pseudo_train.drop_duplicates(subset='Id').set_index('Id')['PredictionString']
# Publications without a (non-null) pseudo prediction contribute an empty string.
pseudo_labels = train['Id'].map(pseudo_labels_by_id).fillna('').astype(str)
# strip('|') removes the dangling separator left behind by an empty pseudo label.
train['dataset_label'] = (train['dataset_label'] + '|' + pseudo_labels).str.strip('|')
train.head()

In [None]:
# Per publication, keep a label only if it is sufficiently different (Jaccard similarity of
# the cleaned texts below FL_TH) from every label already kept; the first occurrence wins.
train_temp = []
for labels in tqdm(train['dataset_label']):
    filtered_labels = []
    kept_cleaned = []  # cleaned form of each kept label, so it is computed only once
    for label in labels.split('|'):
        cleaned = text_cleaning(label)
        is_new = all(jaccard_similarity(cleaned, seen) < FL_TH for seen in kept_cleaned)
        if is_new:
            filtered_labels.append(label)
            kept_cleaned.append(cleaned)
    train_temp.append('|'.join(filtered_labels))
train['dataset_label'] = train_temp
train.head()

## External

In [None]:
# Load the three metadata files that RichContextDF and its lookup helpers read.
with open('../input/rich-context-competition-train-testtargz/train_test/publications.json', 'r') as publications_file:
    publications = json.load(publications_file)

with open('../input/rich-context-competition-train-testtargz/train_test/data_set_citations.json', 'r') as citations_file:
    data_set_citations = json.load(citations_file)

with open('../input/rich-context-competition-train-testtargz/train_test/data_sets.json', 'r') as data_sets_file:
    data_sets = json.load(data_sets_file)

In [None]:
# Keep only the publications that have at least one dataset mention.
RichContext_train = RichContextDF(publications)
has_mentions = RichContext_train['citations_mention_list'] != ''
RichContext_train = RichContext_train[has_mentions]
RichContext_train

# Prepare data for train MLM

### Extract positive and negative samples

In [None]:
# Shared accumulators filled by the two corpus-building cells below.
# NOTE: re-running one of those cells without re-running this one appends its samples again.
corpus = []   # masked training sentences (plain strings)
cnt_pos = 0   # number of samples containing DATASET_SYMBOL
cnt_neg = 0   # number of samples containing NONDATA_SYMBOL

In [None]:
pbar = tqdm(total = len(RichContext_train))
for paper_id, dataset_labels in RichContext_train[['publication_id', 'citations_mention_list']].itertuples(index=False):
    labels = [clean_paper_sentence(label).split() for label in dataset_labels.split('|')]
    # A label that cleans to nothing is an empty word list, which find_sublist "finds" at
    # every position of every sentence; drop such labels so they cannot flood the corpus.
    labels = [label for label in labels if label]
    # papers preparation
    with open(f'../input/rich-context-competition-train-testtargz/train_test/files/text/{paper_id}.txt', 'r') as f:
        paper = f.readlines()
    # Strip only the newline; slicing off the last character would eat a real character
    # from a final line that has no trailing newline.
    paper = [line.rstrip('\n') for line in paper]
    content = ' '.join(paper)
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so the corpus order
    # does not depend on string-hash randomisation between runs.
    sentences = list(dict.fromkeys(clean_paper_sentence(sentence) for sentence in content.split('.')))
    sentences = shorten_sentences(sentences)
    sentences = [sentence for sentence in sentences if len(sentence) > LENGTH]
    sentences = [sentence.split() for sentence in sentences]

    # positive samples
    for sentence in sentences:
        for label in labels:
            for pos in find_sublist(sentence, label):
                dt_point = sentence[:pos] + [DATASET_SYMBOL] + sentence[pos+len(label):]
                corpus.append(' '.join(dt_point))
                cnt_pos += 1

    # negative samples (only from sentences that mention 'data' or 'study')
    for sentence in sentences:
        sentence_str = ' '.join(sentence)
        if all(w not in sentence_str for w in {'data', 'study'}):
            continue
        for phrase_start, phrase_end in find_negative_candidates(sentence, labels):
            dt_point = sentence[:phrase_start] + [NONDATA_SYMBOL] + sentence[phrase_end+1:]
            corpus.append(' '.join(dt_point))
            cnt_neg += 1

    # process bar
    pbar.update(1)
    pbar.set_description(f'Training data size: {cnt_pos} postives + {cnt_neg} negatives')
pbar.close()

In [None]:
pbar = tqdm(total = len(train))
for paper_id, dataset_labels in train[['Id', 'dataset_label']].itertuples(index=False):
    labels = [clean_paper_sentence(label).split() for label in dataset_labels.split('|')]
    # A label that cleans to nothing is an empty word list, which find_sublist "finds" at
    # every position of every sentence; drop such labels so they cannot flood the corpus.
    labels = [label for label in labels if label]
    # papers preparation
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
    content = '. '.join(section['text'] for section in paper)
    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so the corpus order
    # does not depend on string-hash randomisation between runs.
    sentences = list(dict.fromkeys(clean_paper_sentence(sentence) for sentence in content.split('.')))
    sentences = shorten_sentences(sentences)
    sentences = [sentence for sentence in sentences if len(sentence) > LENGTH]
    sentences = [sentence.split() for sentence in sentences]
    
    # positive samples
    for sentence in sentences:
        for label in labels:
            for pos in find_sublist(sentence, label):
                dt_point = sentence[:pos] + [DATASET_SYMBOL] + sentence[pos+len(label):]
                corpus.append(' '.join(dt_point))
                cnt_pos += 1
    
    # negative samples (only from sentences that mention 'data' or 'study')
    for sentence in sentences:
        sentence_str = ' '.join(sentence)
        if all(w not in sentence_str for w in {'data', 'study'}):
            continue
        for phrase_start, phrase_end in find_negative_candidates(sentence, labels):
            dt_point = sentence[:phrase_start] + [NONDATA_SYMBOL] + sentence[phrase_end+1:]
            corpus.append(' '.join(dt_point))
            cnt_neg += 1
    
    # process bar
    pbar.update(1)
    pbar.set_description(f'Training data size: {cnt_pos} postives + {cnt_neg} negatives')
pbar.close()

### Save data to a file

In [None]:
# Write the corpus as JSON Lines: one {"text": ...} object per line.
with open('train_mlm.json', 'w') as f:
    f.writelines(json.dumps({'text': sentence}) + '\n' for sentence in corpus)

# Fine-tune the Transformer

In [None]:
# Read the JSON-lines corpus written above back in as the 'train' split.
datasets = load_dataset('json',
            data_files={'train' : 'train_mlm.json'},
            )

# Peek at the first five records.
datasets["train"][:5]

### Tokenize and collate data

In [None]:
# Tokenizer matching the base checkpoint; `use_fast=True` requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
def tokenize_function(examples):
    """
    Tokenize a batch of corpus sentences for masked language modeling.

    Args:
        examples: batch mapping with a ``"text"`` entry holding the sentences.

    Returns:
        The tokenizer output for the batch.
    """
    # Sentences are capped in *words*, not in tokens, so a sentence made of long or rare
    # words can still exceed the model's maximum input length. `truncation=True` cuts such
    # sequences to the tokenizer's maximum length instead of failing inside the model.
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=1, remove_columns=["text"])

In [None]:
# MLM collator configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

### Load pre-trained model and fine-tune

In [None]:
# Start from the pre-trained weights of the base checkpoint, with a masked-LM head.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
training_args = TrainingArguments(
    output_dir="output-mlm",
    evaluation_strategy = "no",   # no evaluation split is used in this notebook
    learning_rate=2e-5,
    weight_decay=0.01,
    save_steps=12000,
    num_train_epochs=2,
    report_to="none",
    # TrainingArguments carries its own seed, which the Trainer uses; pass the notebook's
    # SEED explicitly so that changing SEED in the config cell also changes the training seed.
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

### Save model

In [None]:
# Persist the fine-tuned model to the 'mlm-model' directory.
trainer.model.save_pretrained('mlm-model')

### Save tokenizer

In [None]:
# The config is loaded from the base checkpoint name (not from the fine-tuned model) and
# stored next to the tokenizer files in the 'model_tokenizer' directory.
config = AutoConfig.from_pretrained(model_checkpoint)

tokenizer.save_pretrained('model_tokenizer')
config.save_pretrained('model_tokenizer')