# Setting

In [None]:
# When True, predictions are made for the papers listed in train.csv and a CV
# score is computed at the end of the notebook; when False, the papers listed
# in sample_submission.csv are used and only submission.csv is produced.
COMPUTE_CV = False

# Strategy used to assemble the final predictions from the literal-matching
# labels and/or the MLM labels.
HOW = 'MLM_ONLY' # 'MATCH_ONLY', 'MLM_ONLY', 'UNION_MERGE', 'LAM', 'ALL_BLENDED_PP', 'ORIGINAL'
# CSV with the additional dataset titles used for literal matching (read from its 'title' column).
ADNL_GOVT_LABELS_PATH = '../input/bigger-govt-dataset-list/data_set_800.csv'
# Preprocessing applied to the title list before literal matching.
MATCHING_DATA = 'original' # 'original', 'drop_duplicates', 'text_cleaning'
# Starting score-ratio threshold for accepting an MLM prediction; it is lowered
# in steps of 0.05 toward 1.0 until a paper yields at least one label.
PRED_TH = 2.0
# An MLM label is kept only if its Jaccard similarity to every already-kept label is below this value.
FL_TH = 1.0 # 0.75
# Same kind of Jaccard threshold, used by the 'LAM' strategy when adding MLM labels to literal matches.
LAM_FL_TH = 0.5

# Optional final filter: keep only labels whose Jaccard similarity to every label in train.csv is below PP_TH.
POST_PROCESSING = False; PP_TH = 0.5
SEED = 42

# The settings below are only needed (and only defined) when the MLM model is used.
if HOW != 'MATCH_ONLY':
    # Directory holding the fine-tuned model and tokenizer sub-folders.
    MODEL_PATH_PREFIX = '../input/coleridge-bert-mlm-external-pseudo-labels'
    MLM_PRETRAINED_PATH = 'mlm-model'
    TOKENIZER = 'model_tokenizer'
    
    # Sentences must be longer than LENGTH characters to be considered.
    LENGTH = 1
    # Sentences with more than MAX_LENGTH words are split into windows of
    # MAX_LENGTH words whose starts are MAX_LENGTH - OVERLAP words apart.
    MAX_LENGTH = 64
    OVERLAP = 20

    PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

    DATASET_SYMBOL = '$' # this symbol represents a dataset name
    NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl


import os, sys
import re
import json
import random

import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial

from IPython.display import clear_output


clear_output()

In [None]:
# https://huggingface.co/transformers/_modules/transformers/trainer_utils.html
def set_seed(seed: int):
    """
    Seed ``random``, ``numpy`` and ``torch`` (CPU and all CUDA devices) and put
    cuDNN into deterministic mode for reproducible behavior.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # ^^ safe to call this function even if cuda is not available

    # Report the seed that was actually applied (the argument), not a global.
    print(f'Setted Pipeline SEED = {seed}')


# Seed all random number generators once, up front, with the notebook-level SEED.
set_seed(SEED)

In [None]:
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
# A sample submission with more than 4 rows forces CV scoring off.
# NOTE(review): presumably the public sample has 4 rows and a larger file means
# the hidden test set is mounted (a real submission run) - confirm.
if len(sample_submission) > 4: COMPUTE_CV = False
if COMPUTE_CV: 
    print('this submission notebook will compute CV score but commit notebook will not')
else:
    print('this submission notebook will only be used to submit result')

# Load data

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
train = pd.read_csv(train_path)

# In CV mode the training rows stand in for the submission frame and papers are
# read from the train folder; otherwise the real sample submission and the test
# folder are used.
if COMPUTE_CV: 
    sample_submission = train
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
    test_files_path = paper_test_folder
else:
    sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
    test_files_path = paper_test_folder
    
# Lower-case alias of the setting, used by the literal-matching cell.
adnl_govt_labels_path = ADNL_GOVT_LABELS_PATH

In [None]:
# Parse every paper's JSON once and keep it in memory, keyed by paper Id.
# If sample_submission['Id'] contains repeated Ids the file is simply re-read
# and the dict entry overwritten.
papers = {}
for paper_id in tqdm(sample_submission['Id']):
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

# Literal Matching

## Auxiliary Functions

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and reduce it to ASCII letters/digits separated by single spaces.

    ``txt`` is converted with ``str()`` first, so non-string input is accepted.
    Every run of characters outside ``A-Za-z0-9`` becomes one space and the
    result is stripped of leading/trailing whitespace.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()


def totally_clean_text(txt):
    """Apply ``clean_text`` to ``txt`` and then collapse any runs of spaces into one."""
    return re.sub(' +', ' ', clean_text(txt))


def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special characters, emojis and multiple spaces
    text - Sentence that needs to be cleaned

    Returns the cleaned string: lower-case ASCII letters/digits separated by
    single spaces, with no leading or trailing whitespace.
    '''
    # Replacing every run of non-alphanumeric characters with one space already
    # removes emojis and can never leave two consecutive spaces, so no separate
    # emoji pattern (previously recompiled on every call) or space-collapsing
    # pass is needed; the output is unchanged.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()


def read_json_pub(filename, train_data_path=train_files_path, output='text'):
    """Read one publication JSON file and return its text as a single string.

    The file ``<train_data_path>/<filename>.json`` is expected to hold a list of
    section dicts with ``section_title`` and ``text`` entries.

    Args:
        filename: Publication id (file name without the ``.json`` extension).
        train_data_path: Directory containing the JSON files.
        output: ``'text'`` returns all section texts joined by spaces,
            ``'head'`` returns all section titles joined by spaces, any other
            value returns titles and texts interleaved and joined by ``'. '``.

    Returns:
        The requested string.
    """
    json_path = os.path.join(train_data_path, filename + '.json')
    with open(json_path, 'r') as f:
        sections = json.load(f)

    # ``.get`` yields None for a missing or null entry, which would make
    # ``str.join`` raise TypeError; treat such entries as empty strings.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    # Build only the representation that was asked for.
    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

## Ken Matching

In [None]:
# Literal matching: for every paper, collect the external dataset titles that
# occur verbatim in the cleaned paper text. Produces ``literal_preds``, one
# '|'-joined string of cleaned titles per row of ``sample_submission``.
if HOW != 'MLM_ONLY':
    adnl_govt_labels = pd.read_csv(adnl_govt_labels_path)
    print('adnl_govt_labels size =', len(adnl_govt_labels))

    if MATCHING_DATA in ['drop_duplicates', 'text_cleaning']:
        adnl_govt_labels = adnl_govt_labels.drop_duplicates()
        print('adnl_govt_labels size after drop_duplicates =', len(adnl_govt_labels))

        if MATCHING_DATA == 'text_cleaning':
            adnl_govt_labels['title'] = adnl_govt_labels['title'].apply(text_cleaning)
            adnl_govt_labels = adnl_govt_labels.drop_duplicates()
            print('adnl_govt_labels size after text_cleaning =', len(adnl_govt_labels))

    # The title list is the same for every paper, so convert and clean it once
    # instead of re-running iterrows()/clean_text() inside the per-paper loop.
    # NOTE(review): the raw title is searched in the *cleaned* (lower-cased)
    # paper text, so with MATCHING_DATA != 'text_cleaning' the titles are
    # presumably already lower-case in the CSV - verify.
    query_strings = [str(title) for title in adnl_govt_labels['title']]
    cleaned_queries = [clean_text(query_string) for query_string in query_strings]

    literal_preds = []
    for paper_id in tqdm(sample_submission['Id']):
        clean_string = text_cleaning(str(read_json_pub(paper_id, test_files_path)))

        matched = ''
        for query_string, cleaned_query in zip(query_strings, cleaned_queries):
            if query_string in clean_string:
                if matched == '':
                    matched = cleaned_query
                # Substring test against the joined string: a title that is
                # contained in an already matched title is not added again.
                elif cleaned_query not in matched:
                    matched = matched + '|' + cleaned_query
        literal_preds.append(matched)

    print(literal_preds[:4])

else: print('Matching is not used.')

# Masked Dataset Modeling

## Load model and tokenizer

In [None]:
# Load the fine-tuned masked-language model and its tokenizer, and wrap them in
# a fill-mask pipeline (only when the MLM is part of the chosen strategy).
if HOW != 'MATCH_ONLY':
    TOKENIZER_PATH = os.path.join(MODEL_PATH_PREFIX, TOKENIZER)
    PRETRAINED_PATH = os.path.join(MODEL_PATH_PREFIX, MLM_PRETRAINED_PATH)
    
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)

    mlm = pipeline(
        'fill-mask', 
        model=model,
        tokenizer=tokenizer,
        # device index 0 when CUDA is available, otherwise -1
        device=0 if torch.cuda.is_available() else -1
    )

## Auxiliary functions

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces and compared as *sets* of words:
    ``|A & B| / |A | B|``. Repeated words therefore do not lower the score,
    e.g. ``"a a"`` vs ``"a"`` is 1.0.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    intersection = len(words1 & words2)
    # The union must be derived from the set sizes; using the raw word counts
    # (duplicates included) inflates the denominator and under-reports
    # similarity. ``split(" ")`` never returns an empty list, so union >= 1.
    union = len(words1) + len(words2) - intersection
    return float(intersection) / union

def clean_paper_sentence(s):
    """
    Case-preserving cleaner: every run of characters outside A-Za-z0-9 is
    replaced by one space, the ends are stripped and repeated spaces collapsed.
    """
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', str(s))
    trimmed = alnum_only.strip()
    return re.sub(' +', ' ', trimmed)

def shorten_sentences(sentences):
    """
    Break over-long sentences into overlapping word windows.

    A sentence with at most MAX_LENGTH whitespace-separated words is passed
    through unchanged. A longer one is replaced by windows of MAX_LENGTH words
    whose start positions are MAX_LENGTH - OVERLAP words apart (the last
    windows may be shorter). Returns the resulting list of strings.
    """
    result = []
    for sentence in sentences:
        tokens = sentence.split()
        if len(tokens) <= MAX_LENGTH:
            result.append(sentence)
            continue
        stride = MAX_LENGTH - OVERLAP
        result.extend(
            ' '.join(tokens[start:start + MAX_LENGTH])
            for start in range(0, len(tokens), stride)
        )
    return result

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Find masking candidates for Masked Dataset Modeling in $sentence, which is
    a list of words.

    Starting from the second word (index 0 is never examined), maximal runs of
    words that either begin with an uppercase letter or are one of the
    $connection_tokens are collected. A run is reported when at least 2 words
    remain after ignoring connection tokens (compared case-insensitively) at
    its beginning and end. The reported span is the whole run, i.e. leading or
    trailing connection tokens are still included in it.

    Returns a list of (start, end) index pairs, both inclusive.
    """
    def _has_two_core_words(span):
        lo, hi = 0, len(span)
        while lo < hi and span[lo].lower() in connection_tokens:
            lo += 1
        while hi > lo and span[hi - 1].lower() in connection_tokens:
            hi -= 1
        return hi - lo >= 2

    spans = []
    run_start = run_end = None

    for pos, token in enumerate(sentence[1:], start=1):
        if token[0].isupper() or token in connection_tokens:
            # open a new run or extend the current one
            if run_start is None:
                run_start = pos
            run_end = pos
            continue

        # an ordinary word closes the current run, if any
        if run_start is not None:
            if _has_two_core_words(sentence[run_start:run_end + 1]):
                spans.append((run_start, run_end))
            run_start = run_end = None

    # a run that reaches the end of the sentence
    if run_start is not None and _has_two_core_words(sentence[run_start:run_end + 1]):
        spans.append((run_start, run_end))

    return spans

## Transform

In [None]:
# Build the MLM inputs: for every paper, a list of (masked sentence, phrase)
# pairs in which one candidate phrase has been replaced by the mask token.
if HOW != 'MATCH_ONLY':
    mask = mlm.tokenizer.mask_token
    all_test_data = []
    
    for paper_id in tqdm(sample_submission['Id']):
        # load paper
        paper = papers[paper_id]

        # extract sentences
        # (naive split on '.'; the set removes duplicate sentences, so the
        # iteration order of the sentences is not the order in the paper)
        sentences = set([clean_paper_sentence(sentence) for section in paper 
                         for sentence in section['text'].split('.')
                        ])
        sentences = shorten_sentences(sentences) # make sentences short
        sentences = [sentence for sentence in sentences if len(sentence) > LENGTH] # only accept sentences longer than LENGTH characters
        # keep only sentences that mention 'data' or 'study' (case-insensitive substring test)
        sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
        sentences = [sentence.split() for sentence in sentences] # sentence = list of words

        # mask
        test_data = []
        for sentence in sentences:
            for phrase_start, phrase_end in find_mask_candidates(sentence):
                # replace the whole candidate span by a single mask token
                dt_point = sentence[:phrase_start] + [mask] + sentence[phrase_end+1:]
                test_data.append((' '.join(dt_point), ' '.join(sentence[phrase_start:phrase_end+1]))) # (masked text, phrase)

        all_test_data.append(test_data)

## Predict

In [None]:
# test_data = all_test_data[0]
# pred_bag = set()
# if len(test_data):
#     texts, phrases = list(zip(*test_data))
#     mlm_pred = []
#     for p_id in range(0, len(texts), PREDICT_BATCH):
#         batch_texts = texts[p_id:p_id+PREDICT_BATCH]
#         batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])

#         if len(batch_texts) == 1:
#             batch_pred = [batch_pred]

#         mlm_pred.extend(batch_pred)

#     for find_one_th in np.arange(FIND_ONE_TH, 1.0, -0.05): # abandoned this one
#         if len(pred_bag) == 0:
#             score1_temp = []; score2_temp = []
#             phrase_temp = []
#             token_str1 = []; token_str2 = []
#             for (result1, result2), phrase in zip(mlm_pred, phrases):
#                 score1_temp.append(result1['score']); score2_temp.append(result2['score'])
#                 phrase_temp.append(phrase)
#                 token_str1.append(result1['token_str']); token_str2.append(result2['token_str'])
#                 if (result1['score'] > result2['score'] * find_one_th and result1['token_str'] == DATASET_SYMBOL) or\
#                    (result2['score'] > result1['score'] * find_one_th and result2['token_str'] == NONDATA_SYMBOL):
#                     pred_bag.add(clean_text(phrase))
                    
            
            
# #             if len(pred_bag) == 0:
#             if True:
#                 score_df = pd.DataFrame({'score1': score1_temp, 'score2': score2_temp,
#                                         'phrase': phrase_temp,
#                                         'token_str1': token_str1, 'token_str2': token_str2})
#                 score_df['score_diff'] = abs(score_df.score1 - score_df.score2)
#                 row = score_df['score_diff'].argmax()
#                 pred_bag.add(score_df['phrase'][row])
# pred_bag

In [None]:
# Run the fill-mask pipeline on every masked sentence and turn the scores into
# ``pred_mlm_labels``: one '|'-joined string of cleaned phrases per paper.
if HOW != 'MATCH_ONLY':
    pred_mlm_labels = []

    for test_data in tqdm(all_test_data):
        pred_bag = set()

        if len(test_data):
            texts, phrases = list(zip(*test_data))
            mlm_pred = []
            for p_id in range(0, len(texts), PREDICT_BATCH):
                batch_texts = texts[p_id:p_id+PREDICT_BATCH]
                # score only the two marker symbols as fillers for the mask
                batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])
                
                # a batch of one is wrapped so that every element of mlm_pred
                # is the pair of results for one text (presumably the pipeline
                # returns a flat result list for a single input - confirm)
                if len(batch_texts) == 1:
                    batch_pred = [batch_pred]
                    
                mlm_pred.extend(batch_pred)

            # Start with the strict threshold PRED_TH and relax it in steps of
            # 0.05 toward 1.0; stop at the first threshold that yields a label.
            for pred_th in np.arange(PRED_TH, 1.0, -0.05): # find-more PP
                if len(pred_bag) == 0:
                    for (result1, result2), phrase in zip(mlm_pred, phrases):
                        # accept the phrase when the first result is the dataset
                        # symbol and beats the second by the factor pred_th, or
                        # when the second result is the non-dataset symbol and
                        # beats the first by the factor pred_th
                        if (result1['score'] > result2['score'] * pred_th and result1['token_str'] == DATASET_SYMBOL) or\
                           (result2['score'] > result1['score'] * pred_th and result2['token_str'] == NONDATA_SYMBOL):
                            pred_bag.add(clean_text(phrase))
                else: break

        # filter labels by jaccard score 
        filtered_labels = []
        for label in sorted(pred_bag, key=len, reverse=True): # long to short so that we keep the potential best
            if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < FL_TH for got_label in filtered_labels):
                filtered_labels.append(label)

        pred_mlm_labels.append('|'.join(filtered_labels))
        
    print(f'pred_mlm_labels[:4] w/ PT{PRED_TH} FL{FL_TH}: \n{pred_mlm_labels[:4]}')
        
else: print('MLM is not used.')

# Final Predictions

In [None]:
# Combine the literal-matching labels (``literal_preds``) and the MLM labels
# (``pred_mlm_labels``) into ``final_predictions`` according to HOW. Every
# entry is a '|'-joined label string for one row of ``sample_submission``.
final_predictions = []


if HOW == 'MATCH_ONLY':
    final_predictions = literal_preds

elif HOW == 'MLM_ONLY':
    final_predictions = pred_mlm_labels

elif HOW == 'UNION_MERGE':
    # All literal matches, plus the MLM labels that neither contain nor are
    # contained in any literal match.
    for literal_pred, model_pred in zip(literal_preds, pred_mlm_labels):
        # ''.split('|') is [''], and '' is a substring of every string, so
        # empty entries must be dropped or a paper without literal matches
        # would lose all of its MLM labels.
        pred_naive = [pred_n for pred_n in literal_pred.split('|') if pred_n]
        pred_model = [pred_m for pred_m in model_pred.split('|') if pred_m]
        pred_model_kept = [
            pred_m for pred_m in pred_model
            if not any(pred_m in pred_n or pred_n in pred_m for pred_n in pred_naive)
        ]
        final_predictions.append("|".join(pred_naive + pred_model_kept))

elif HOW == 'LAM':
    # Literal matches first; an MLM label is added only if its Jaccard
    # similarity to every label collected so far is below LAM_FL_TH.
    for pred_match, pred_mlm in tqdm(zip(literal_preds, pred_mlm_labels)):
        if pred_match and pred_mlm:
            filtered_labels = pred_match.split('|')
            for label_mlm in pred_mlm.split('|'):
                # compared against literal matches AND previously added MLM labels
                if all(jaccard_similarity(label_mlm, got_label) < LAM_FL_TH for got_label in filtered_labels):
                    filtered_labels.append(label_mlm)
        elif pred_match:
            filtered_labels = pred_match.split('|')
        elif pred_mlm:
            filtered_labels = pred_mlm.split('|')
        else:
            filtered_labels = []

        final_predictions.append('|'.join(filtered_labels))
    print(f'final_predictions[:4] w/ LAM_FL_TH{LAM_FL_TH}:')

elif HOW == 'ALL_BLENDED_PP':
    # Pool both label sources, then keep labels from longest to shortest,
    # dropping any whose Jaccard similarity to a kept label reaches FL_TH.
    for pred_match, pred_mlm in tqdm(zip(literal_preds, pred_mlm_labels)):
        # Use whichever sources are non-empty. Earlier versions used the empty
        # literal prediction when only the MLM had labels; the "ABPP(bug)"
        # leaderboard entry (0.482) was produced with that bug.
        labels = '|'.join(source for source in (pred_match, pred_mlm) if source)
        labels = labels.split('|')
        filtered_labels = []
        for label in sorted(labels, key=len, reverse=True):
            if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < FL_TH for got_label in filtered_labels):
                filtered_labels.append(label)
        final_predictions.append('|'.join(filtered_labels))
    print(f'final_predictions[:4] w/ FL{FL_TH}:')

elif HOW == 'ORIGINAL':
    # Literal match when there is one, otherwise fall back to the MLM labels.
    for literal_match, mlm_pred in zip(literal_preds, pred_mlm_labels):
        final_predictions.append(literal_match if literal_match else mlm_pred)

else:
    # Fail here with a clear message instead of producing an empty list that
    # only breaks later when it is assigned to the submission frame.
    raise ValueError(f'Unknown HOW setting: {HOW!r}')


final_predictions[:4]

[What is your best score without string matching?](https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/232964)

In [None]:
# Optional post-processing: remove every predicted label that is similar to a
# label already present in train.csv, so that only "new" labels remain.
if POST_PROCESSING:
    # all known labels from the three label columns of train.csv, lower-cased and de-duplicated
    temp_1 = [x.lower() for x in train['dataset_label'].unique()]
    temp_2 = [x.lower() for x in train['dataset_title'].unique()]
    temp_3 = [x.lower() for x in train['cleaned_label'].unique()]
    existing_labels = list(set(temp_1 + temp_2 + temp_3))
    
    final_predictions_temp = []
    for labels in final_predictions:
        labels = labels.split('|')
        final_predictions_temp_temp = []
        for label in labels:
            # keep the label only if its Jaccard similarity to EVERY known label is below PP_TH
            if all(jaccard_similarity(label, existing_label) < PP_TH for existing_label in existing_labels):
                final_predictions_temp_temp.append(label)
        final_predictions_temp.append('|'.join(final_predictions_temp_temp))
    final_predictions = final_predictions_temp
    
    # str(PP_TH)[1:] drops the first character, e.g. 0.5 is shown as ".5"
    print(f'final_predictions[:4] w/ PP{str(PP_TH)[1:]}:')
    
else:
    print('final_predictions[:4] w/o PP:')
    

final_predictions[:4]

In [None]:
# Attach the predictions (one per row, in row order) and write the submission
# file with just the Id and PredictionString columns.
sample_submission['PredictionString'] = final_predictions
sample_submission[['Id', 'PredictionString']].to_csv('submission.csv', index=False)

sample_submission.head()

# Evaluation Metric

In [None]:
# https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/230091
def compute_fbeta(y_true: List[List[str]],
                  y_pred: List[List[str]],
                  beta: float = 0.5) -> float:
    """Compute the Jaccard-based micro FBeta score.

    For every sample, each ground-truth string (in sorted order) is matched to
    the remaining predicted string with the highest word-level Jaccard
    similarity. A similarity of at least 0.5 is a true positive and consumes
    that prediction; otherwise the ground truth is a false negative.
    Predictions left unmatched are false positives.

    Args:
        y_true: Ground-truth strings, one list per sample.
        y_pred: Predicted strings, one list per sample.
        beta: Weight of recall relative to precision.

    Returns:
        The micro-averaged FBeta score, or 0.0 when there is nothing to score
        (no ground truths and no predictions).

    References
    ----------
    - https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation
    """

    def _jaccard_similarity(str1: str, str2: str) -> float:
        a = set(str1.split())
        b = set(str2.split())
        c = a.intersection(b)
        union_size = len(a) + len(b) - len(c)
        # Two strings without any words have an empty union; treat them as
        # not matching instead of dividing by zero.
        if union_size == 0:
            return 0.0
        return float(len(c)) / union_size

    tp = 0  # true positive
    fp = 0  # false positive
    fn = 0  # false negative
    for ground_truth_list, predicted_string_list in zip(y_true, y_pred):
        predicted_string_list_sorted = sorted(predicted_string_list)
        for ground_truth in sorted(ground_truth_list):
            if len(predicted_string_list_sorted) == 0:
                fn += 1
            else:
                similarity_scores = [
                    _jaccard_similarity(ground_truth, predicted_string)
                    for predicted_string in predicted_string_list_sorted
                ]
                matched_idx = np.argmax(similarity_scores)
                if similarity_scores[matched_idx] >= 0.5:
                    # a prediction can satisfy only one ground truth
                    predicted_string_list_sorted.pop(matched_idx)
                    tp += 1
                else:
                    fn += 1
        fp += len(predicted_string_list_sorted)

    tp *= (1 + beta ** 2)
    fn *= beta ** 2
    denominator = tp + fp + fn
    # Empty input (or only empty lists) would otherwise divide by zero.
    if denominator == 0:
        return 0.0
    fbeta_score = tp / denominator
    return fbeta_score

In [None]:
# Score the predictions against train.csv (CV mode only) and print the
# settings this run used.
if COMPUTE_CV:
    y_true = sample_submission['cleaned_label'].apply(lambda label: [label])
    # ''.split('|') is [''], which would be counted as one (false positive)
    # prediction; drop empty strings so "no prediction" is an empty list.
    y_pred = sample_submission['PredictionString'].apply(
        lambda prediction: [label for label in prediction.split('|') if label])
    COMPUTE_CV_SCORE = compute_fbeta(y_true, y_pred)
    print('COMPUTE_CV_SCORE =', COMPUTE_CV_SCORE)
else:
    print(f'COMPUTE_CV = {COMPUTE_CV}')

print(f'HOW = {HOW}')
print(f'ADNL_GOVT_LABELS_PATH = {ADNL_GOVT_LABELS_PATH}')
print(f'MATCHING_DATA = {MATCHING_DATA}')
print(f'PRED_TH = {PRED_TH}')
print(f'FL_TH = {FL_TH}')
if HOW == 'LAM': print(f'LAM_FL_TH = {LAM_FL_TH}')

print(f'POST_PROCESSING = {POST_PROCESSING}')
if POST_PROCESSING: print(f'PP_TH = {PP_TH}')

# These names are only defined when the MLM model is used; printing them
# unconditionally raises NameError for HOW == 'MATCH_ONLY'.
if HOW != 'MATCH_ONLY':
    print(f'PRETRAINED_PATH = {PRETRAINED_PATH}')
    print(f'TOKENIZER_PATH = {TOKENIZER_PATH}')
    print(f'LENGTH = {LENGTH}')

|   | CV | LB |
| --- | --- | --- |
| v1 length10 PLv1 only |   | 0.001 |
| v2 length8 |   | 0.575 |
| v2 length8 UM |   | 0.554 |
| v2 length8 only |   | 0.170 |
| v2 length8 36 og |   | 0.573 |
| v2 length8 36 add |   | 0.574 |
| v2 length8 48 |   | 0.574 |
| v3 length5 |   | 0.572 |
| v3 length5 36 |   | 0.573 |
| v4 length1 |   | 0.574 |
| v4 length1 36 |   | 0.573 |
| **v4 length1 48** | **0.514** | **0.575** |
| v4 length1 48 PP.8 |   |   |
| v4 length1 48 only |   | 0.106 |
| v4 length1 48 after drop_duplicates |   | 0.575 |
| v4 length1 48 after text_cleaning |   | 0.573 |
| v4 length1 48 UM |   | 0.561 |
| v4 length1 48 2000 |   | 0.532 |
| v4 length1 48 2000 text_cleaning |   | 0.531 |
| v4 length1 48 2000 UM |   | 0.521 |
| v4 length1 48 4000 UM |   | 0.486 |
| v4 length1 48 4000 |   | 0.496 |
| v4 length1 48 26897 |   | 0.244 |
| v4 length1 60 |   | 0.575 |
| NER w/ External_Datasets_Matching |   | 0.573 |
| NER v1 length1 |   | 0.573 |
| NER v1 length1 45 |   | 0.573 |
| external-data v1 length1 |   | 0.573 |
| external-data v1 length1 only |   | 0.118 |
| external-data v2 length1 |   | 0.574 |
| external-data v2 length1 only |   | 0.207 |
| external-data v2 length1 84 |   | 0.573 |
| external-data v2 length1 72 |   | 0.574 |
| external-data v2 length1 60 |   | 0.574 |
| external-data v2 length1 48 |   | 0.570 |
| external-data v2 length1 36 |   | 0.572 |
| external-pseudo v1 length1 |   | 0.572 |
| external-pseudo v1 length1 FOPP FL1.0 |   | 0.571 |
| external-pseudo v1 length1 FOPP FL1.0 LAM0.5 |   |   |
| external-pseudo v1 length1 only FL1.0 |   | 0.215 |
| external-pseudo v3 length1 (FL1.0) only FOPP FL1.0 |   | 0.216 |
| external-pseudo v4 length8 only FOPP FL1.0 |   | 0.196 |
| **external-pseudo v1 length1 only FOPP FL1.0** |   | **0.227** |
| external-pseudo v5 length1 only FOPP FL1.0 |   |   |
| external-pseudo v1 length1 only PT2.5 FOPP FL1.0 |   | 0.215 |
| external-pseudo v2 length1 (FL0.5) only FOPP FL1.0 |   | 0.225 |
| external-pseudo v1 length1 only |   | 0.213 |
| external-pseudo v1 length1 only FOPP |   |   |
| external-pseudo v1 length1 only FL0.5 |   | 0.206|
| external-pseudo v1 length1 FL0.5 |   | 0.572 |
| external-pseudo v1 length1 FL0.5 ABPP(bug) |   | 0.482 |
| external-pseudo v1 length1 only PP.5 |   |   |