# 1. Introduction and Context

This notebook uses masked-language modeling (MLM) with Transformer-based models.

As a first step, add the following inputs to the notebook (the train data, test data and sample submission files are already included upfront):

* bigger-govt-dataset-list (for additional datasets)
* coleridge-mlm-model (pre-trained model)
* coleridge-packages (transformer packages)

We initially ran into some issues with fsspec, so the following input is also added:
* icevision-080 (to have fsspec utils related issues fixed)

# 2. Install Packages

In [None]:
# Install the packages from the local wheel files attached as notebook inputs.
# `datasets` is resolved from a local directory only (--no-index / --find-links).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl
# Disabled: the fsspec 0.9.0 wheel; the 2021.5.0 wheel from icevision-080 is installed instead.
#!pip install ../input/fsspec/fsspec-0.9.0-py3-none-any.whl
!pip install ../input/icevision-080/fsspec-2021.5.0-py3-none-any.whl

# 3. Configuration Params

In [None]:
# Run-mode switches read by the cells below.

# True: predict on the train papers and print an F-beta score at the end.
# Forced to False later when the sample submission has more than 4 rows.
COMPUTE_CV = False
# True: join literal-match, MLM and baseline predictions for every paper.
ALL_BLENDED = False
# True: per paper, use the literal match, else the baseline match, else the MLM prediction.
BASELINE_HELPING = False
# True: skip the MLM model entirely and use the literal matches as the final predictions.
MATCH_ONLY = False
# True: literal matching uses only the titles of the additional govt dataset list;
# False: literal matching uses every entry of `all_labels`.
KEN_MATCHING = True
# Selects which `text_cleaning` variant is defined in the literal-matching section.
BS_CLEANING = False
# Seed passed to `random`, `numpy.random` and `torch`.
SEED = 12456

# 4. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import json
import time
import datetime
import random
import glob
import importlib

from tqdm.autonotebook import tqdm

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial

# Default seaborn styling for any plots.
sns.set()

# Seed every random number generator used in the notebook with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# A sample submission with more than 4 rows switches cross-validation off
# (presumably this indicates the full hidden test set -- confirm).
if len(sample_submission) > 4:
    COMPUTE_CV = False

print(
    'The notebook will compute CV score but commit notebook will not'
    if COMPUTE_CV
    else 'This submission notebook will only be used to submit result'
)

In [None]:
#!pip install --upgrade fsspec
#!pip install fsspec==0.9.0

# 5. Load Dataset

In [None]:
# Fixed input locations.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
adnl_govt_labels_path = '../input/bigger-govt-dataset-list/data_set_800.csv'

train = pd.read_csv(train_path)

# In CV mode the train frame and the train papers stand in for the test set.
if not COMPUTE_CV:
    sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
else:
    sample_submission = train
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Both names point at the same folder; later cells use either one.
test_files_path = paper_test_folder

In [None]:
# Parse every paper listed in `sample_submission` once and keep it in memory,
# keyed by paper id.
publications = {}
for paper_id in tqdm(sample_submission['Id']):
    json_file = f'{paper_test_folder}/{paper_id}.json'
    with open(json_file, 'r') as fh:
        publications[paper_id] = json.load(fh)

# 6. Literal Matching

### We will create a knowledge bank.

In [None]:
# Lower-cased string form of every title / label / cleaned label in the train frame.
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']
all_labels = {
    str(value).lower()
    for record in train[label_columns].itertuples(index=False)
    for value in record
}

print(f'No. different labels: {len(all_labels)}')

### Let's use additional Govt datasets

In [None]:
adnl_govt_labels = pd.read_csv(adnl_govt_labels_path)

# Extend the knowledge bank with the extra titles, added exactly as stored in the CSV.
all_labels = all_labels | set(adnl_govt_labels.title)
print(f'No. different labels: {len(all_labels)}')

### Now, let's match on test data

In [None]:
def clean_text(txt):
    """Return `txt` lower-cased, with each run of characters outside
    A-Z, a-z and 0-9 replaced by one space and the ends trimmed.

    `txt` is converted with `str()` first, so any object is accepted.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()


def totally_clean_text(txt):
    """Apply `clean_text` to `txt`, then squeeze any run of spaces to one space."""
    return re.sub(' +', ' ', clean_text(txt))

if BS_CLEANING:
    def text_cleaning(text):
        '''
        Drop every character listed in string.punctuation (without inserting
        a space), then lower-case the text, replace each run of characters
        outside A-Z, a-z and 0-9 with one space and trim the ends.
        text - Sentence that needs to be cleaned (must be iterable, e.g. a str)
        '''
        without_punct = ''.join(ch for ch in text if ch not in string.punctuation)
        return re.sub('[^A-Za-z0-9]+', ' ', str(without_punct).lower()).strip()
else:
    def text_cleaning(text):
        '''
        Lower-case the text, replace each run of characters outside
        A-Z, a-z and 0-9 with one space, trim the ends and squeeze spaces.
        text - Sentence that needs to be cleaned

        The emoji pattern is applied last; by then every non-alphanumeric
        character has already been replaced by the first substitution.
        '''
        lowered = str(text).lower()
        squeezed = re.sub(' +', ' ', re.sub('[^A-Za-z0-9]+', ' ', lowered).strip())
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', squeezed)


def read_json_pub(filename, train_data_path=train_files_path, output='text'):
    """Read one publication JSON file and return its text as a single string.

    Parameters
    ----------
    filename : str
        Paper id; the file read is ``<train_data_path>/<filename>.json``.
    train_data_path : str
        Folder that contains the JSON file.
    output : str
        'text' -> section texts joined by a space,
        'head' -> section titles joined by a space,
        anything else -> title, text, title, text, ... joined by '. '.

    Returns
    -------
    str
        The joined string selected by `output`.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    json_path = os.path.join(train_data_path, filename + '.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # A missing or null field becomes '' so that str.join never receives None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title and text of each section in document order.
    combined = [part for pair in zip(headings, contents) for part in pair]
    return '. '.join(combined)

In [None]:
if not KEN_MATCHING:
    # For every paper, collect each known label that occurs as a substring of
    # either the lower-cased raw text or its fully cleaned form.
    literal_preds = []
    for paper_id in tqdm(sample_submission['Id']):
        raw_text = '. '.join(section['text'] for section in publications[paper_id]).lower()
        cleaned_text = totally_clean_text(raw_text)

        matched = {
            clean_text(label)
            for label in all_labels
            if label in raw_text or label in cleaned_text
        }
        literal_preds.append('|'.join(matched))
    literal_preds[:5]

### Ken Matching

In [None]:
if KEN_MATCHING:
    # The titles do not change between papers: convert and clean them once
    # instead of once per paper.
    query_strings = [str(title) for title in adnl_govt_labels['title']]
    cleaned_queries = [clean_text(query) for query in query_strings]

    literal_preds = []
    for _, row in tqdm(sample_submission.iterrows(), total=len(sample_submission)):
        large_string = str(read_json_pub(row['Id'], test_files_path))
        clean_string = text_cleaning(large_string)

        # '|'-joined cleaned titles found in this paper ('' when none matched).
        matched = ''
        for query_string, cleaned_query in zip(query_strings, cleaned_queries):
            if query_string not in clean_string:
                continue
            if matched == '':
                matched = cleaned_query
            # Substring test against the joined string: a title that is
            # contained in an already collected title is not added again.
            elif cleaned_query not in matched:
                matched = matched + '|' + cleaned_query
        literal_preds.append(matched)

# 7. Masked Dataset Modeling

### Let's set paths and hyperparams

In [None]:
if not MATCH_ONLY:
    # Locations of the model checkpoint and its tokenizer inside the attached input.
    PRETRAINED_PATH = '../input/coleridge-mlm-model/output-mlm/checkpoint-48000'
    TOKENIZER_PATH = '../input/coleridge-mlm-model/model_tokenizer'

    # Sentences with more than MAX_LENGTH words are split into windows of
    # MAX_LENGTH words; consecutive windows start MAX_LENGTH - OVERLAP words apart.
    MAX_LENGTH = 64
    OVERLAP = 20

    PREDICT_BATCH = 128 # a higher value requires higher GPU memory usage

    DATASET_SYMBOL = '$' # this symbol represents a dataset name
    NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# 8. Transform Data into MLM format

### Load Model and Tokenizer

In [None]:
if not MATCH_ONLY:
    # Load the tokenizer and the masked-LM checkpoint from the attached input.
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)

    # Fill-mask pipeline used for prediction; device 0 when CUDA is available,
    # otherwise -1.
    mlm = pipeline(
        'fill-mask', 
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1
    )

### Auxiliary Functions

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard index of the word sets of two strings.

    Both strings are split on single spaces and compared as sets, so a
    word repeated inside one string counts once. The result is
    ``|A & B| / |A | B|`` as a float in [0, 1].

    ``str.split(" ")`` always yields at least one element (the empty
    string for empty input), so the union is never empty.
    """
    tokens_1 = set(s1.split(" "))
    tokens_2 = set(s2.split(" "))
    shared = len(tokens_1 & tokens_2)
    # Set sizes, not list lengths: duplicates must not inflate the union.
    union = len(tokens_1) + len(tokens_2) - shared
    return float(shared) / union

def clean_paper_sentence(s):
    """
    Same cleaning as clean_text but the letter case is kept: each run of
    characters outside A-Z, a-z and 0-9 becomes one space, the ends are
    trimmed and repeated spaces are squeezed.
    """
    replaced = re.sub('[^A-Za-z0-9]+', ' ', str(s))
    return re.sub(' +', ' ', replaced.strip())

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split sentences that have more than `max_length` words into several
    overlapping windows; shorter sentences are returned unchanged.

    sentences  - iterable of strings
    max_length - maximum number of words per window; defaults to the
                 module-level MAX_LENGTH
    overlap    - number of words shared by consecutive windows; defaults
                 to the module-level OVERLAP

    Returns a list of strings. Windows start every `max_length - overlap`
    words and each window holds up to `max_length` words joined by a
    single space.

    Raises ValueError when `overlap` is not smaller than `max_length`,
    because the window start would then never advance.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            # Kept verbatim, including its original spacing.
            short_sentences.append(sentence)
            continue
        short_sentences.extend(
            ' '.join(words[start:start + max_length])
            for start in range(0, len(words), step)
        )
    return short_sentences

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Return the (start, end) index pairs (both inclusive) of the phrases in
    $sentence (a list of words) that may be masked for Masked Dataset Modeling.

    A phrase is a maximal run of consecutive words where every word either
    starts with an uppercase letter or is one of $connection_tokens. The
    scan starts at index 1, so the first word of the sentence is never part
    of a phrase. A phrase is kept only if at least 2 words remain after the
    connection words at its beginning and end are ignored; the returned
    indices still cover the whole run, including those connection words.
    """
    def candidate_qualified(words):
        # Shrink the window from both ends while it is bounded by connection words.
        lo, hi = 0, len(words)
        while lo < hi and words[lo].lower() in connection_tokens:
            lo += 1
        while lo < hi and words[hi - 1].lower() in connection_tokens:
            hi -= 1
        return hi - lo >= 2

    spans = []
    run_start = None
    run_end = None

    for pos in range(1, len(sentence)):
        token = sentence[pos]
        if token[0].isupper() or token in connection_tokens:
            if run_start is None:
                run_start = pos
            run_end = pos
            continue

        # The current word ends the run (if one is open).
        if run_start is not None:
            if candidate_qualified(sentence[run_start:run_end + 1]):
                spans.append((run_start, run_end))
            run_start = run_end = None

    # A run that reaches the end of the sentence is still open here.
    if run_start is not None and candidate_qualified(sentence[run_start:run_end + 1]):
        spans.append((run_start, run_end))

    return spans

### Transform

In [None]:
if not MATCH_ONLY:
    mask = mlm.tokenizer.mask_token
    all_test_data = []

    for paper_id in tqdm(sample_submission['Id']):
        paper = publications[paper_id]

        # Unique cleaned sentences of the paper (text split on '.').
        unique_sentences = {
            clean_paper_sentence(sentence)
            for section in paper
            for sentence in section['text'].split('.')
        }

        # Shorten, keep chunks longer than 10 characters that mention
        # 'data' or 'study', then turn each chunk into a list of words.
        word_lists = [
            chunk.split()
            for chunk in shorten_sentences(unique_sentences)
            if len(chunk) > 10
            and any(word in chunk.lower() for word in ['data', 'study'])
        ]

        # One (masked text, phrase) pair per candidate phrase.
        test_data = [
            (
                ' '.join(words[:phrase_start] + [mask] + words[phrase_end+1:]),
                ' '.join(words[phrase_start:phrase_end+1]),
            )
            for words in word_lists
            for phrase_start, phrase_end in find_mask_candidates(words)
        ]

        all_test_data.append(test_data)

# 9. Prediction

In [None]:
if not MATCH_ONLY:
    # One '|'-joined prediction string per paper, in the order of all_test_data.
    pred_mlm_labels = []

    for test_data in tqdm(all_test_data):
        # Cleaned phrases accepted for this paper.
        pred_bag = set()

        if len(test_data):
            texts, phrases = list(zip(*test_data))
            mlm_pred = []
            # Run the fill-mask pipeline in slices of PREDICT_BATCH texts,
            # scoring only the two symbol tokens.
            for p_id in range(0, len(texts), PREDICT_BATCH):
                batch_texts = texts[p_id:p_id+PREDICT_BATCH]
                batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])

                # Re-wrap the result of a one-text batch so it can be extended like
                # the others (presumably the pipeline returns an un-nested result
                # for a single input -- confirm against the installed version).
                if len(batch_texts) == 1:
                    batch_pred = [batch_pred]

                mlm_pred.extend(batch_pred)

            # Each prediction holds two results (one per target symbol).
            # NOTE(review): the second clause accepts the phrase when result2 is
            # NONDATA_SYMBOL and scores more than twice result1, which looks
            # inconsistent with the first clause -- confirm the intent.
            for (result1, result2), phrase in zip(mlm_pred, phrases):
                if (result1['score'] > result2['score']*2 and result1['token_str'] == DATASET_SYMBOL) or\
                   (result2['score'] > result1['score']*2 and result2['token_str'] == NONDATA_SYMBOL):
                    pred_bag.add(clean_text(phrase))

        # filter labels by jaccard score 
        filtered_labels = []

        # Longest labels first; a label is kept only when its similarity to
        # every label kept so far is below 0.75.
        for label in sorted(pred_bag, key=len, reverse=True):
            if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered_labels):
                filtered_labels.append(label)

        pred_mlm_labels.append('|'.join(filtered_labels))
    
    pred_mlm_labels[:5]

# 10. Baseline Model

In [None]:
def read_append_return(filename, train_files_path=train_files_path, output='text'):
    """
    Read the JSON file of one paper and return its text as a single string.

    filename         - paper id; the file read is <train_files_path>/<filename>.json
    train_files_path - folder that contains the JSON file
    output           - 'text' for the section texts, 'head' for the section
                       titles, anything else for titles and texts interleaved

    This is a thin wrapper around read_json_pub, which implements exactly
    the same reading and joining logic, so the two cannot drift apart.
    """
    return read_json_pub(filename, train_data_path=train_files_path, output=output)
    
    
def text_cleaning(text):
    '''
    Removes punctuation, converts the text to lower case, replaces every
    remaining run of characters outside A-Z, a-z and 0-9 with one space
    and trims the ends.
    text - Sentence that needs to be cleaned; converted with str() first,
           so non-string values are accepted as well

    Punctuation characters are deleted without inserting a space, so
    "U.S." becomes "us". Being unconditional, this definition replaces the
    variant selected by BS_CLEANING for all code that runs after it.
    '''
    # Coerce before filtering: iterating a non-string value would raise TypeError.
    text = str(text).translate(str.maketrans('', '', string.punctuation))
    text = re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()
    return text

In [None]:
if ALL_BLENDED or BASELINE_HELPING:
    # Enables Series.progress_apply.
    tqdm.pandas()

    # Attach the full text of every train paper.
    train['text'] = train['Id'].progress_apply(read_append_return)

    # Outside CV mode the test papers are read from the test folder instead.
    if not COMPUTE_CV:
        read_test_paper = partial(read_append_return, train_files_path=test_files_path)
        sample_submission['text'] = sample_submission['Id'].progress_apply(read_test_paper)

    train.head()

In [None]:
if ALL_BLENDED or BASELINE_HELPING:
    tqdm.pandas()

    # Replace the raw train text by its cleaned form.
    cleaned_train_text = train['text'].progress_apply(text_cleaning)
    train['text'] = cleaned_train_text

In [None]:
if BASELINE_HELPING or ALL_BLENDED:
    # Lower-cased unique values of the three label columns of the train frame.
    temp_1 = [x.lower() for x in train['dataset_label'].unique()]
    temp_2 = [x.lower() for x in train['dataset_title'].unique()]
    temp_3 = [x.lower() for x in train['cleaned_label'].unique()]

    existing_labels = set(temp_1 + temp_2 + temp_3)

    print(f'len(temp_1) = {len(temp_1)}')
    print(f'len(temp_2) = {len(temp_2)}')
    print(f'len(temp_3) = {len(temp_3)}')
    print(f'len(existing_labels) = {len(existing_labels)}')

    id_list = []
    lables_list = []
    for index, row in tqdm(sample_submission.iterrows()):
        sample_text = row['text']
        row_id = row['Id']

        # Lower-case the paper once per row, not once per known label.
        lowered_text = sample_text.lower()

        # Labels of train rows whose cleaned text equals this paper's cleaned text.
        temp_df = train[train['text'] == text_cleaning(sample_text)]
        cleaned_labels = temp_df['cleaned_label'].to_list()

        # Known labels that occur as a substring of the lower-cased paper text.
        cleaned_labels.extend(
            known_label for known_label in existing_labels
            if known_label in lowered_text
        )

        # clean_text is idempotent, so one pass over all collected labels is enough.
        cleaned_labels = set([clean_text(x) for x in cleaned_labels])
        lables_list.append('|'.join(cleaned_labels))
        id_list.append(row_id)

# 11. Aggregate Final Predictions & Write into CSV format

In [None]:
# Build one prediction string per row of sample_submission, according to
# the configured run mode.
if ALL_BLENDED:
    # Join every non-empty source with '|'.
    final_predictions = [
        ('|').join([pred for pred in (literal_match, mlm_pred, lables_match) if pred])
        for literal_match, mlm_pred, lables_match
        in zip(literal_preds, pred_mlm_labels, lables_list)
    ]

elif BASELINE_HELPING:
    # First non-empty of: literal match, baseline match; otherwise the MLM prediction.
    final_predictions = [
        literal_match or lables_match or mlm_pred
        for literal_match, mlm_pred, lables_match
        in zip(literal_preds, pred_mlm_labels, lables_list)
    ]

elif MATCH_ONLY:
    final_predictions = literal_preds

else:
    # Literal match when there is one, otherwise the MLM prediction.
    final_predictions = [
        literal_match or mlm_pred
        for literal_match, mlm_pred in zip(literal_preds, pred_mlm_labels)
    ]

sample_submission['PredictionString'] = final_predictions

In [None]:
# Assign the predictions (again) and write the Id / PredictionString columns
# to submission.csv without the index.
sample_submission['PredictionString'] = final_predictions
sample_submission[['Id', 'PredictionString']].to_csv('submission.csv', index=False)

# Show the first rows as the cell output.
sample_submission.head()

In [None]:
# Show (rows, columns) of the submission frame as the cell output.
sample_submission.shape

# 12. Evaluation

In [None]:
# Reference from - https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/discussion/230091

def compute_fbeta(y_true: List[List[str]],
                  y_pred: List[List[str]],
                  beta: float = 0.5) -> float:
    """Compute the Jaccard-based micro FBeta score.

    For every sample, each ground-truth string (in sorted order) is
    matched with the remaining predicted string of highest word-level
    Jaccard similarity. A similarity of at least 0.5 counts as a true
    positive and consumes that prediction; otherwise the ground truth is
    a false negative. Predictions left over are false positives.

    Parameters
    ----------
    y_true : list of lists of ground-truth strings, one list per sample.
    y_pred : list of lists of predicted strings, aligned with `y_true`.
    beta : weight of recall relative to precision.

    Returns
    -------
    float
        ``(1 + beta^2) * tp / ((1 + beta^2) * tp + fp + beta^2 * fn)``,
        or 0.0 when there is nothing to score (all three counts are 0).

    References
    ----------
    - https://www.kaggle.com/c/coleridgeinitiative-show-us-the-data/overview/evaluation
    """

    def _jaccard_similarity(str1: str, str2: str) -> float:
        a = set(str1.split())
        b = set(str2.split())
        c = a.intersection(b)
        union_size = len(a) + len(b) - len(c)
        # Two strings without any token share nothing; avoid dividing 0 by 0.
        if union_size == 0:
            return 0.0
        return float(len(c)) / union_size

    tp = 0  # true positive
    fp = 0  # false positive
    fn = 0  # false negative
    for ground_truth_list, predicted_string_list in zip(y_true, y_pred):
        # Sorted copy: predictions are removed from it as they get matched.
        remaining_predictions = sorted(predicted_string_list)
        for ground_truth in sorted(ground_truth_list):
            if not remaining_predictions:
                fn += 1
                continue

            similarity_scores = [
                _jaccard_similarity(ground_truth, predicted_string)
                for predicted_string in remaining_predictions
            ]
            matched_idx = np.argmax(similarity_scores)
            if similarity_scores[matched_idx] >= 0.5:
                remaining_predictions.pop(matched_idx)
                tp += 1
            else:
                fn += 1
        fp += len(remaining_predictions)

    tp *= (1 + beta ** 2)
    fn *= beta ** 2
    denominator = tp + fp + fn
    # No ground truth and no prediction at all: define the score as 0.0.
    if denominator == 0:
        return 0.0
    return tp / denominator

In [None]:
if not COMPUTE_CV:
    print(f'COMPUTE_CV = {COMPUTE_CV}')
else:
    # One ground-truth label per row; predictions are split on '|'.
    cv_truth = sample_submission['cleaned_label'].apply(lambda x: [x])
    cv_pred = sample_submission['PredictionString'].apply(lambda x: x.split('|'))
    COMPUTE_CV_SCORE = compute_fbeta(cv_truth, cv_pred)
    print('COMPUTE_CV_SCORE =', COMPUTE_CV_SCORE)

# Echo the configuration this run used.
print(f'ALL_BLENDED = {ALL_BLENDED}')
print(f'BASELINE_HELPING = {BASELINE_HELPING}')
print(f'MATCH_ONLY = {MATCH_ONLY}')
print(f'KEN_MATCHING = {KEN_MATCHING}')
print(f'BS_CLEANING = {BS_CLEANING}')
print(f'SEED = {SEED}')