# Settings

In [None]:
# Notebook-wide switches. In the cells visible in this notebook only
# COMPUTE_CV, KEN_MATCHING, MLM_ONLY, MATCH_ONLY, BS_CLEANING and SEED are read;
# the remaining flags are not referenced here.
COMPUTE_CV = False  # when True, predictions are made on train.csv instead of the sample submission
EDA_DEMO = True
ALL_BLENDED = False
BASELINE_HELPING = False
MATCH_ONLY = False  # when True, the masked-language-model setup cells are skipped
MLM_ONLY = False  # when True, the literal title-matching stage is skipped
KEN_MATCHING = True  # enables literal matching against the additional govt dataset titles
BS_CLEANING = False  # selects which text_cleaning implementation gets defined
THEO_MERGE = False
SEED = 42  # seed passed to random, numpy and torch

# Install packages

In [None]:
# Install the required packages from files shipped as notebook inputs
# (`--no-index` / local wheel paths, so pip never needs the package index).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Wipe the pip output from the cell once installation has finished.
from IPython.display import clear_output
clear_output()

# Import

In [None]:
import os
import re
import json
import time
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm.autonotebook import tqdm

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

# Seed every random number generator in use and pin cuDNN to deterministic
# kernels so that repeated runs behave the same way.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# A sample submission with more than 4 rows switches CV off
# (presumably this is the real scoring run -- TODO confirm).
if len(sample_submission) > 4:
    COMPUTE_CV = False

print(
    'this submission notebook will compute CV score but commit notebook will not'
    if COMPUTE_CV else
    'this submission notebook will only be used to submit result'
)

# Load data

In [None]:
# Training data: the label table and the folder holding the paper JSON files.
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)

# Pick the set of papers to predict on: the training papers when computing a
# CV score, otherwise the papers listed in the sample submission.
if not COMPUTE_CV:
    sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
else:
    sample_submission = train
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = paper_test_folder

adnl_govt_labels_path = '../input/bigger-govt-dataset-list/data_set_800.csv'

In [None]:
# Parse every paper to be predicted once and keep it in memory, keyed by id.
papers = {}
for paper_id in tqdm(sample_submission['Id']):
    json_file = f'{paper_test_folder}/{paper_id}.json'
    with open(json_file, 'r') as handle:
        papers[paper_id] = json.load(handle)

In [None]:
# Collect the lowercased string form of every value found in the three label
# columns of the training table.
all_labels = set()
for label_column in ('dataset_title', 'dataset_label', 'cleaned_label'):
    all_labels.update(str(value).lower() for value in train[label_column])

print(f'No. different labels: {len(all_labels)}')

#### Additional Govt Datasets

In [None]:
# Extend the label set with the titles of the additional govt dataset list
# (added as-is, without lowercasing).
adnl_govt_labels = pd.read_csv(adnl_govt_labels_path)
all_labels = all_labels.union(adnl_govt_labels.title)
print(f'No. different labels: {len(all_labels)}')

### Matching on test data

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` and replace each run of non-alphanumeric characters by one space.

    The input is converted with ``str`` first; leading and trailing
    whitespace is removed from the result.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()


def totally_clean_text(txt):
    """Apply ``clean_text`` to ``txt`` and squeeze any repeated spaces into one."""
    return re.sub(' +', ' ', clean_text(txt))

if not BS_CLEANING:
    def text_cleaning(text):
        '''
        Converts all text to lower case, removes special characters (emojis included) and multiple spaces
        text - Sentence that needs to be cleaned
        '''
        # Every run of characters outside [A-Za-z0-9] becomes a single space,
        # so emojis and repeated spaces cannot survive this one substitution;
        # no separate emoji or space-squeezing pass is needed.
        return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
else:
    def text_cleaning(text):
        '''
        Converts all text to lower case, removes special characters (emojis included) and multiple spaces
        text - Sentence that needs to be cleaned

        ASCII punctuation is deleted outright (not replaced by a space) before
        the remaining non-alphanumeric runs are turned into single spaces.
        '''
        # Coerce first so that non-string input (e.g. NaN) does not fail on iteration.
        text = ''.join([k for k in str(text) if k not in string.punctuation])
        return re.sub('[^A-Za-z0-9]+', ' ', text.lower()).strip()


def read_json_pub(filename, train_data_path=train_files_path, output='text'):
    """Read one publication JSON file and return its sections as a single string.

    filename        - file name without the '.json' extension
    train_data_path - directory that contains the file
    output          - 'text' returns the section texts joined by spaces,
                      'head' returns the section titles joined by spaces,
                      any other value returns titles and texts interleaved
                      (title, text, title, text, ...) and joined by '. '

    The file is expected to hold a list of objects with the keys
    'section_title' and 'text'.
    """
    json_path = os.path.join(train_data_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    # A section may lack a key (or hold null); use an empty string so that
    # str.join does not fail on None.
    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.extend((heading, content))
    return '. '.join(combined)

#### Ken Matching

In [None]:
# One prediction string per row of sample_submission, in row order.
literal_preds = []

if KEN_MATCHING and not MLM_ONLY:
    # Hoist the titles out of the per-paper loop instead of re-iterating the
    # DataFrame with iterrows() for every paper.
    govt_titles = [str(title) for title in adnl_govt_labels['title']]

    for _, row in tqdm(sample_submission.iterrows()):
        clean_string = text_cleaning(str(read_json_pub(row['Id'], test_files_path)))

        matched = ''
        for query_string in govt_titles:
            # Titles are looked up verbatim in the cleaned paper text.
            if query_string not in clean_string:
                continue
            label = clean_text(query_string)
            if matched == '':
                matched = label
            elif label not in matched:
                # A label that already occurs (as a substring) in the
                # accumulated prediction string is not added again.
                matched = matched + '|' + label
        literal_preds.append(matched)
else:
    if MLM_ONLY:
        print('This kernel will only use MLM model to predict.')
    # Keep one (empty) entry per paper so that later cells can index
    # literal_preds[i] even though literal matching was skipped.
    literal_preds = [''] * len(sample_submission)

# Masked Dataset Modeling

### Paths and Hyperparameters

In [None]:
# Paths and hyperparameters of the masked-language-model stage; they are only
# defined when that stage is enabled.
if not MATCH_ONLY:
    PRETRAINED_PATH = '../input/coleridge-bert-mlmv4/output-mlm/checkpoint-48000'  # masked-LM checkpoint
    TOKENIZER_PATH = '../input/coleridge-bert-mlmv4/model_tokenizer'  # tokenizer for that checkpoint

    MAX_LENGTH = 64  # maximum number of words per sentence chunk (see shorten_sentences)
    OVERLAP = 20  # number of words shared by consecutive chunks of a long sentence

    PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

    DATASET_SYMBOL = '$' # this symbol represents a dataset name
    NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

# Transform data to MLM format

### Load model and tokenizer

In [None]:
if not MATCH_ONLY:
    # Load the masked-LM weights and the matching tokenizer.
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)

    # Device index handed to the pipeline: 0 when CUDA is available, else -1.
    if torch.cuda.is_available():
        mlm_device = 0
    else:
        mlm_device = -1

    mlm = pipeline('fill-mask', model=model, tokenizer=tokenizer, device=mlm_device)

### Auxiliary functions

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces. The score is
    ``|A & B| / |A | B|`` over the resulting *sets* of words, so a word that
    is repeated inside one string counts only once and identical word sets
    always score 1.0.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    # str.split(" ") never returns an empty list, so the union is never empty.
    return len(words1 & words2) / len(words1 | words2)

def clean_paper_sentence(s):
    """
    Case-preserving counterpart of clean_text: every run of non-alphanumeric
    characters becomes a single space and the ends are trimmed.
    """
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', str(s))
    return re.sub(' +', ' ', alnum_only.strip())

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split sentences that have more than ``max_length`` words into several
    overlapping chunks; shorter sentences are passed through unchanged.

    sentences  - iterable of strings
    max_length - maximum number of words per chunk (defaults to MAX_LENGTH)
    overlap    - number of words shared by consecutive chunks (defaults to OVERLAP)

    Returns a list of strings. Raises ValueError when ``overlap`` is not
    smaller than ``max_length``, because the window could never advance.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            # Slide a window of max_length words forward by `step` words.
            for p in range(0, len(words), step):
                short_sentences.append(' '.join(words[p:p+max_length]))
        else:
            short_sentences.append(sentence)
    return short_sentences

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Locate the phrases of $sentence (a list of words) that may be masked for
    Masked Dataset Modeling.

    A phrase is a maximal run of consecutive words, each of which either
    starts with an uppercase letter or is one of $connection_tokens. The very
    first word of the sentence is never part of a phrase. A run is kept only
    if at least 2 words remain once connection words have been trimmed from
    both of its ends (the trimming ignores case).

    Returns a list of (first_index, last_index) tuples, both inclusive.
    """
    def candidate_qualified(words):
        # Shrink the window from both sides past connection words.
        lo, hi = 0, len(words)
        while lo < hi and words[lo].lower() in connection_tokens:
            lo += 1
        while hi > lo and words[hi - 1].lower() in connection_tokens:
            hi -= 1
        return hi - lo >= 2

    spans = []
    run_start = None
    for pos in range(1, len(sentence)):
        token = sentence[pos]
        if token[0].isupper() or token in connection_tokens:
            if run_start is None:
                run_start = pos
            continue

        # An ordinary word closes the current run, if there is one.
        if run_start is not None:
            if candidate_qualified(sentence[run_start:pos]):
                spans.append((run_start, pos - 1))
            run_start = None

    # A run may extend to the last word of the sentence.
    if run_start is not None and candidate_qualified(sentence[run_start:]):
        spans.append((run_start, len(sentence) - 1))

    return spans

In [None]:
if not MATCH_ONLY:
    mask = mlm.tokenizer.mask_token
    keywords = ['data', 'study']

    # all_test_data[k] holds the (masked text, phrase) pairs of the k-th paper.
    all_test_data = []
    for paper_id in tqdm(sample_submission['Id']):
        # Distinct cleaned sentences taken from every section of the paper.
        raw_sentences = {
            clean_paper_sentence(chunk)
            for section in papers[paper_id]
            for chunk in section['text'].split('.')
        }

        # Shorten, then keep only sentences longer than 1 character that
        # mention one of the keywords; each survivor becomes a list of words.
        word_lists = [
            short.split()
            for short in shorten_sentences(raw_sentences)
            if len(short) > 1 and any(key in short.lower() for key in keywords)
        ]

        # One data point per candidate phrase: the sentence with the phrase
        # replaced by the mask token, paired with the phrase itself.
        test_data = [
            (' '.join(words[:first] + [mask] + words[last+1:]),
             ' '.join(words[first:last+1]))
            for words in word_lists
            for first, last in find_mask_candidates(words)
        ]

        all_test_data.append(test_data)

### Transform

### Predict

In [None]:
import transformers
import torch.nn as nn
import torch

In [None]:
class DatasetFinder(nn.Module):
    """Per-token scorer: frozen pretrained encoder -> linear + ReLU -> BiLSTM -> linear -> sigmoid.

    ``params`` must provide 'dropout', 'lstm_inp_size' and 'hid_size'.
    The first linear layer expects 768 input features from the encoder.
    """

    def __init__(self, params):
        super().__init__()
        backbone = transformers.AutoModel.from_pretrained("../input/scibert-huggingface/coleridge-scibert-models/output")
        # The encoder is used as a fixed feature extractor.
        for weight in backbone.parameters():
            weight.requires_grad = False
        self.model = backbone

        lstm_width = params['lstm_inp_size']
        hidden = params['hid_size']
        self.dr = nn.Dropout(params['dropout'])
        self.fc1 = nn.Linear(768, lstm_width)
        self.relu = nn.ReLU(inplace=True)
        self.lstm = nn.LSTM(input_size=lstm_width, hidden_size=hidden,
                            bidirectional=True, batch_first=True)
        # The LSTM is bidirectional, hence twice the hidden size going in.
        self.fc = nn.Linear(2 * hidden, 1)
        self.e = 0

    def forward(self, inp):
        """Return one sigmoid score per token.

        ``inp`` is unpacked as keyword arguments into the encoder; the
        trailing size-1 dimension of the final linear layer is squeezed away.
        """
        hidden_states = self.model(**inp).last_hidden_state
        features = self.relu(self.fc1(self.dr(hidden_states)))
        sequence_out, _ = self.lstm(features)
        logits = self.fc(sequence_out).squeeze(2)
        return torch.sigmoid(logits)

In [None]:
# NOTE: this rebinds the global `tokenizer` to the tokenizer of the
# annotation models loaded below.
tokenizer = transformers.AutoTokenizer.from_pretrained('../input/scibert-huggingface/coleridge-scibert-models/output')
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
M_COUNT=5  # number of saved model states that form the ensemble

params = {
        'lr':0.000150074,
        'loss_func':torch.nn.BCELoss(),
        'lstm_inp_size': 2**10,
        'dropout': 0.0024323,
        'hid_size':2**12
}

# Build the ensemble: one DatasetFinder per saved state, frozen, switched to
# eval mode and moved to DEVICE.
models = []
for fold in range(M_COUNT):
    finder = DatasetFinder(params)
    # map_location keeps loading possible on a host without the device the
    # state was saved from; the model is moved to DEVICE afterwards.
    state = torch.load(
        f'../input/processed-train-data-sentence-segmentaion/model_best_{fold}.state',
        map_location='cpu',
    )
    finder.load_state_dict(state)
    # A dedicated loop name is used here so that the `params` dict above is
    # not overwritten by a model parameter.
    for weight in finder.parameters():
        weight.requires_grad = False
    finder.eval()
    models.append(finder.to(DEVICE))

In [None]:
def find_dataset_ann(text):
    """Extract dataset-name candidates from ``text`` with the model ensemble.

    The text is split on '.', every piece is tokenized to exactly 30 tokens
    (padded or truncated), and the per-token scores of all models are
    averaged. Tokens scoring above 0.45 are kept; each uninterrupted run of
    kept tokens is decoded, and runs that decode to more than two
    whitespace-separated words are cleaned with ``clean_text`` and collected.

    Returns the collected phrases joined by '|', or '' when there are none.
    """
    input_ids = []
    attention_mask = []
    for sentence in text.split('.'):
        encoded = tokenizer(sentence, max_length=30, padding="max_length", truncation=True)
        input_ids.append(encoded["input_ids"])
        attention_mask.append(encoded["attention_mask"])

    inp = {"input_ids": torch.tensor(input_ids, dtype=torch.long).to(DEVICE),
           "attention_mask": torch.tensor(attention_mask, dtype=torch.long).to(DEVICE)}

    # Inference only: no autograd graph is needed, and the running sum stays
    # a torch tensor throughout.
    total = None
    with torch.no_grad():
        for model in models:
            scores = model(inp).detach().cpu()
            total = scores if total is None else total + scores
    mean_scores = total / len(models)

    # Zero out the ids of rejected tokens; a zero therefore ends a run.
    kept_ids = (inp['input_ids'].detach().cpu() * (mean_scores > 0.45)).tolist()

    answers = []
    for row in kept_ids:
        if sum(row) <= 0:
            continue
        run = []
        # The trailing 0 acts as a sentinel that flushes the last run.
        for token_id in row + [0]:
            if token_id == 0:
                phrase = tokenizer.decode(run)
                if len(phrase.split()) > 2:
                    answers.append(clean_text(phrase))
                run = []
            else:
                run.append(token_id)

    return "|".join(answers)

In [None]:
# Folder from which the next cell reads the paper JSON files.
test_path = "../input/coleridgeinitiative-show-us-the-data/test"

In [None]:
# For every paper whose literal-matching prediction is (almost) empty, fall
# back to the annotation-model ensemble.
#   unused - row indices of the papers handled by the fallback
#   ids / dsets - parallel lists: row index and one predicted label
dsets = []
ids = []
unused = []
for i, row in tqdm(sample_submission.iterrows()):
    if len(literal_preds[i]) > 1:
        continue
    unused.append(i)

    # Reuse the papers parsed earlier: no second read from disk, no dangling
    # file handle, and the right folder is used when COMPUTE_CV is on.
    sections = papers[row['Id']]
    entire = "".join(f"{section['section_title']} {section['text']}" for section in sections)

    try:
        pred = find_dataset_ann(entire)
    except Exception as err:
        # Best effort: a failing paper (e.g. out of GPU memory) keeps its
        # current prediction instead of aborting the whole run.
        torch.cuda.empty_cache()
        print("error", row['Id'], repr(err))
        continue

    literal_preds[i] = pred
    for label in pred.split("|"):
        dsets.append(label)
        ids.append(i)

# NOTE(review): this table is a snapshot of ids/dsets taken at this point;
# labels appended to those lists by later cells are not part of it -- confirm
# that this is intended.
datasets = pd.DataFrame({'ids': ids, 'dsets': dsets})

In [None]:
# Run the masked-language-model stage on the papers handled by the fallback
# and merge its labels into literal_preds.
for i in unused:
    # all_test_data and the mlm pipeline only exist when MATCH_ONLY is off.
    test_data = [] if MATCH_ONLY else all_test_data[i]
    pred_bag = set()

    if len(test_data):
        texts, phrases = list(zip(*test_data))
        mlm_pred = []
        for p_id in range(0, len(texts), PREDICT_BATCH):
            batch_texts = texts[p_id:p_id+PREDICT_BATCH]
            batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])

            # For a single input the result is wrapped so that mlm_pred keeps
            # one entry per masked text.
            if len(batch_texts) == 1:
                batch_pred = [batch_pred]

            mlm_pred.extend(batch_pred)

        # NOTE(review): the second clause accepts a phrase when the non-data
        # symbol dominates; it presumably never fires if the pipeline returns
        # its results sorted by score -- confirm.
        for (result1, result2), phrase in zip(mlm_pred, phrases):
            if (result1['score'] > result2['score']*2 and result1['token_str'] == DATASET_SYMBOL) or\
               (result2['score'] > result1['score']*2 and result2['token_str'] == NONDATA_SYMBOL):
                pred_bag.add(clean_text(phrase))

    # filter labels by jaccard score: longest first, dropping any label that
    # is too similar to one already kept
    filtered_labels = []

    for label in sorted(pred_bag, key=len, reverse=True):
        if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered_labels):
            filtered_labels.append(label)
    for s in filtered_labels:
        dsets.append(s)
        ids.append(i)

    if len(literal_preds[i])>3:
        # Only extend when there is something to add, so that no trailing
        # '|' (an empty label) ends up in the prediction string.
        if filtered_labels:
            literal_preds[i] = literal_preds[i]+'|'+ '|'.join(filtered_labels)
    else:
        literal_preds[i] = '|'.join(filtered_labels)

In [None]:
# Keep a predicted label for paper i only if the `datasets` table lists the
# same label for at least one *other* paper.

# label -> set of paper indices it was recorded for (built once instead of
# scanning the DataFrame for every label).
label_owners = {}
for owner, label in zip(datasets['ids'], datasets['dsets']):
    label_owners.setdefault(label, set()).add(owner)

for i in set(ids):
    kept = []
    for s in literal_preds[i].split("|"):
        # Empty pieces would turn into stray '|' separators in the output.
        if not s:
            continue
        if label_owners.get(s, set()) - {i}:
            kept.append(s)
    # dict.fromkeys drops duplicates while keeping a deterministic order.
    literal_preds[i] = "|".join(dict.fromkeys(kept))

In [None]:
# Attach the final prediction strings and write the two submission columns.
submission_columns = ['Id', 'PredictionString']
sample_submission['PredictionString'] = literal_preds
sample_submission[submission_columns].to_csv('submission.csv', index=False)

# Preview of the first rows.
sample_submission.head()

In [None]:
# Display the full table, including the PredictionString column.
sample_submission