This notebook uses simple string matching: if a dataset title appears in the document, it "predicts" that title. It uses the list of 180 datasets from the training data and adds some hand-curated government dataset titles.

In [None]:
# CFG

# Notebook-wide seed; passed to the Python, NumPy and PyTorch random number generators after the imports.
SEED = 42

In [None]:
# MLM related scripts are written between #### MLM and #### MLMMLM
# all other scripts by Ryosuke Horiuchi will be written between #### RIOW and #### RIOWRIOW

# MLM related scripts are copied from:
# https://www.kaggle.com/chienhsianghung/external-datasets-matching-mlmv4


#### MLM
# Offline installs from files shipped as Kaggle inputs (IPython shell magics, not plain Python).
# `--no-index` together with `--find-links` restricts pip to the local package directory for `datasets`;
# the remaining packages are installed directly from their wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl
#### MLMMLM

In [None]:
import os
import re
import json
import time
import random
import glob
import importlib
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

#### MLM
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial
import warnings
# Ignore warnings whose message begins with 'This pattern has match groups'.
warnings.filterwarnings("ignore", 'This pattern has match groups')

# Seed the Python, NumPy and PyTorch random number generators with the notebook-wide SEED.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic algorithms and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
#### MLMMLM

# Sample submission table; its 'Id' column is iterated later to load the test papers.
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0.0, 1.0]. When neither string contains a word the
        union is empty and 0.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Nothing to compare: treat as "no overlap" rather than raising.
        return 0.0
    return len(a & b) / union_size

def get_count_tp_fp_fn(prediction, verbose=True):
    """Count the "TP", "FP" and "FN" tags in a space-separated tag string.

    Args:
        prediction: String of tags separated by single spaces.
        verbose: When true, print the list of tags before counting.

    Returns:
        A three-element list ``[tp_count, fp_count, fn_count]``; any other
        tag is ignored.
    """
    tags = prediction.split(" ")
    if verbose:
        print(tags)
    return [tags.count(kind) for kind in ("TP", "FP", "FN")]

def make_col_tp_fp_fn(df, col):
    """Split a column of ``[tp, fp, fn]`` sequences into three columns.

    Args:
        df: DataFrame that is modified in place.
        col: Name of the column whose values are indexable with 0, 1 and 2.

    Returns:
        The same DataFrame, now carrying 'TP', 'FP' and 'FN' columns.
    """
    for position, target in enumerate(('TP', 'FP', 'FN')):
        # Bind `position` as a default so each lambda keeps its own index.
        df[target] = df[col].apply(lambda counts, position=position: counts[position])
    return df

def get_precision_recall(tp, fp, fn):
    """Compute precision and recall from confusion counts.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        A ``(precision, recall)`` tuple. A ratio whose denominator is zero
        (nothing predicted, or nothing to find) is reported as 0.0 instead
        of dividing by zero.
    """
    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted else 0.0
    recall = tp / actual if actual else 0.0
    return precision, recall

def fbeta_score(precision, recall, beta):
    """Compute the F-beta score from precision and recall.

    Args:
        precision: Precision value.
        recall: Recall value.
        beta: Weight of recall relative to precision.

    Returns:
        ``(1 + beta^2) * precision * recall / (beta^2 * precision + recall)``,
        or 0.0 when that denominator is zero (e.g. precision and recall are
        both 0).
    """
    beta_sq = beta * beta
    denominator = (beta_sq * precision) + recall
    if denominator == 0:
        return 0.0
    return (1 + beta_sq) * ((precision * recall) / denominator)

def coleridge_initiative_jaccard(ground_truth, prediction, verbose=True):
    '''
    Tag the predictions and ground truths of one row as TP / FP / FN.

    Arguments -
    ground_truth - '|'-separated ground truth labels.
    prediction - '|'-separated predicted labels (empty labels are skipped).
    verbose - print the split ground truth and prediction lists when True.

    Returns -
    (js_scores, tags) where js_scores is the list of best Jaccard scores, one
    per emitted tag, and tags is a space-separated string of "TP"/"FP"/"FN".

    A prediction is a TP when its best Jaccard score against any ground truth
    is >= 0.5, otherwise an FP. A ground truth is an FN when no prediction
    reaches 0.5 against it (previously only an exact score of 0 counted, which
    silently dropped partially-overlapping misses).
    '''
    gts = ground_truth.split('|')
    preds = sorted(prediction.split('|'))
    if verbose:
        print("Ground truth : " , gts)
        print("Prediction : ", preds)

    js_scores = []
    cf_matrix = []

    #### Counting True Positives (TP) and False Positives (FP)
    for pred in preds:
        if len(pred) > 0:
            # -1 is the floor so the first comparison always wins.
            score = max((jaccard(pred, gt) for gt in gts), default=-1)
            js_scores.append(score)
            cf_matrix.append("TP" if score >= 0.5 else "FP")

    #### Counting False Negatives (FN)
    for gt in gts:
        score = max((jaccard(gt, pred) for pred in preds), default=-1)
        # Same 0.5 threshold as the TP test: anything below it is unmatched.
        if score < 0.5:
            js_scores.append(score)
            cf_matrix.append("FN")

    return js_scores, " ".join(cf_matrix)
    

def score_df_coleridge_initiative(output, gt_col, pred_col, beta=0.5, verbose=True):

    '''
    This function will calculate the FBeta score for Coleridge Initiative competition
    if given appropriate arguments

    Arguments -
    output - Your submission dataframe that has both ground truth and prediction columns.
             It is modified in place: the helper columns 'evaluation', 'js_scores',
             'pred_type', 'tp_fp_fn', 'TP', 'FP' and 'FN' are added.
    gt_col - This is the column name of ground truth column.
    pred_col - This is the column name of predictions column.
    beta - Beta value to calculate FBeta score (default 0.5).

    Returns -
    This function will return the FBeta score for the given beta, computed from
    the TP / FP / FN counts summed over all rows.

    ## Set verbose = True to print logs
    '''

    ### Jaccard Similarity
    output['evaluation'] = output.apply(lambda x: coleridge_initiative_jaccard(x[gt_col], x[pred_col], verbose=False), axis=1)
    output['js_scores'] = output['evaluation'].apply(lambda x : x[0])
    output['pred_type'] = output['evaluation'].apply(lambda x : x[1])

    ### TP, FP and FN
    output['tp_fp_fn'] = output['pred_type'].apply(lambda x : get_count_tp_fp_fn(x, verbose=False))
    output = make_col_tp_fp_fn(output, 'tp_fp_fn')

    tp = sum(output['TP'])
    fp = sum(output['FP'])
    fn = sum(output['FN'])
    precision, recall = get_precision_recall(tp, fp, fn)
    # Honour the caller's beta; it used to be hard-coded to 0.5 here.
    fbeta = fbeta_score(precision, recall, beta)

    if verbose:

        print("TP_FP_FN : ", tp,fp,fn)

    return fbeta

In [None]:
#### RIOW
# Competition inputs used by the title-matching part of the notebook.
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_data_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
#### RIOWRIOW

#### MLM
# The MLM part reads the same files again under its own variable names.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train_files_path = '../input/coleridgeinitiative-show-us-the-data/train'
train = pd.read_csv(train_path)


sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
test_files_path = paper_test_folder


# Parsed JSON of every test paper, keyed by the paper Id from the sample submission.
papers = {}
for paper_id in tqdm(sample_submission['Id']):
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper


# Lower-cased union of the three label columns of the training table.
all_labels = set()
for label_1, label_2, label_3 in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    all_labels.add(str(label_1).lower())
    all_labels.add(str(label_2).lower())
    all_labels.add(str(label_3).lower())
    
print(f'No. different labels: {len(all_labels)}')
#### MLMMLM

In [None]:
def read_json_pub(filename, train_data_path=train_data_path, output='text'):
    """Load one publication JSON file and flatten its sections to a string.

    The file ``<train_data_path>/<filename>.json`` is expected to hold a list
    of section dicts read through their 'section_title' and 'text' keys.

    Args:
        filename: Publication id, i.e. the file name without '.json'.
        train_data_path: Directory containing the JSON files.
        output: 'text' joins all section texts with spaces, 'head' joins all
            section titles with spaces, anything else interleaves title and
            text of every section joined with '. '.

    Returns:
        The requested string. A missing or null title/text is treated as an
        empty string instead of making ``str.join`` fail.
    """
    def as_text(value):
        # dict.get returns None for absent keys; join needs real strings.
        return '' if value is None else str(value)

    json_path = os.path.join(train_data_path, (filename+'.json'))
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # Only build the representation that was asked for.
    if output == 'text':
        return ' '.join(as_text(section.get('text')) for section in sections)
    if output == 'head':
        return ' '.join(as_text(section.get('section_title')) for section in sections)

    combined = []
    for section in sections:
        combined.append(as_text(section.get('section_title')))
        combined.append(as_text(section.get('text')))
    return '. '.join(combined)

In [None]:
def text_cleaning(text):
    '''
    Converts all text to lower case and replaces every run of characters
    other than ASCII letters and digits (punctuation, emojis, whitespace, ...)
    with a single space, then trims the ends.
    text - Sentence that needs to be cleaned

    A single substitution is enough: because each maximal run of non
    alphanumeric characters becomes exactly one space, no repeated spaces or
    emojis can survive it, so the former extra passes never changed anything.
    '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [None]:
def clean_text(txt):
    """Lower-case ``str(txt)`` and squash each run of characters other than
    ASCII letters and digits into one space, trimming both ends."""
    lowered = str(txt).lower()
    squashed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return squashed.strip()

In [None]:
#### MLM
# def clean_text(txt):
#     return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

def totally_clean_text(txt):
    """Run ``clean_text`` on ``txt`` and then collapse any run of spaces in
    the result to a single space."""
    return re.sub(' +', ' ', clean_text(txt))
#### MLMMLM

In [None]:
#### MLM
# Literal matching: for every test paper collect the known training labels
# that occur verbatim in its text.
# NOTE(review): literal_preds is not referenced again in the visible notebook - confirm
# whether this cell is still needed.
literal_preds = []
for paper_id in sample_submission['Id']:
    paper = papers[paper_id]
    # text_1 keeps punctuation (lower-cased only); text_2 is the fully cleaned variant.
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)
    
    labels = set()
    for label in all_labels:
        # Substring test against either variant; the stored label is always the cleaned form.
        if label in text_1 or label in text_2:
            labels.add(clean_text(label))
    
    literal_preds.append('|'.join(labels))
#### MLMMLM

In [None]:
#### MLM
# Paths handed to AutoModelForMaskedLM / AutoTokenizer when the pipeline is built.
PRETRAINED_PATH = '../input/coleridge-bert-mlmv4/output-mlm/checkpoint-48000'
TOKENIZER_PATH = '../input/coleridge-bert-mlmv4/model_tokenizer'

# Chunking parameters read by shorten_sentences: sentences longer than MAX_LENGTH words
# are cut into windows of MAX_LENGTH words that start every MAX_LENGTH - OVERLAP words.
MAX_LENGTH = 64
OVERLAP = 20

PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

DATASET_SYMBOL = '$' # this symbol represents a dataset name
NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name
#### MLMMLM

In [None]:
#### MLM
# Load the tokenizer and the masked-LM weights from the configured local paths.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)

# Fill-mask pipeline; device 0 (first GPU) when CUDA is available, otherwise -1 (CPU).
mlm = pipeline(
    'fill-mask', 
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)
#### MLMMLM

In [None]:
#### MLM
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Words are obtained with ``split(" ")``. Both the intersection and the
    union are computed on sets, so a word repeated inside one string no
    longer inflates the denominator (e.g. "a a" vs "a" is 1.0, not 0.5).

    Args:
        s1: First space-separated string.
        s2: Second space-separated string.

    Returns:
        A float in [0.0, 1.0]. ``split(" ")`` always yields at least one
        element, so the union is never empty.
    """
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    intersection = len(words_1 & words_2)
    union = len(words_1 | words_2)
    return float(intersection) / union

def clean_paper_sentence(s):
    """
    Case-preserving counterpart of clean_text: every run of characters other
    than ASCII letters and digits becomes one space, the ends are trimmed and
    repeated spaces are collapsed.
    """
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', str(s)).strip()
    return re.sub(' +', ' ', alnum_only)

def shorten_sentences(sentences):
    """
    Return the sentences with every over-long one replaced by overlapping
    windows.

    A sentence of at most MAX_LENGTH whitespace-separated words is kept
    unchanged. A longer one is replaced by windows of up to MAX_LENGTH words
    (re-joined with single spaces) starting every MAX_LENGTH - OVERLAP words.
    """
    stride = MAX_LENGTH - OVERLAP
    pieces = []
    for text in sentences:
        tokens = text.split()
        if len(tokens) <= MAX_LENGTH:
            pieces.append(text)
            continue
        pieces.extend(
            ' '.join(tokens[offset:offset + MAX_LENGTH])
            for offset in range(0, len(tokens), stride)
        )
    return pieces

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Extract masking candidates for Masked Dataset Modeling from a given $sentence.
    A candidate should be a continuous sequence of at least 2 words, 
    each of these words either has the first letter in uppercase or is one of
    the connection words ($connection_tokens). Furthermore, the connection 
    tokens are not allowed to appear at the beginning and the end of the
    sequence.
    """
    def candidate_qualified(words):
        while len(words) and words[0].lower() in connection_tokens:
            words = words[1:]
        while len(words) and words[-1].lower() in connection_tokens:
            words = words[:-1]
        
        return len(words) >= 2
    
    candidates = []
    
    phrase_start, phrase_end = -1, -1
    for id in range(1, len(sentence)):
        word = sentence[id]
        if word[0].isupper() or word in connection_tokens:
            if phrase_start == -1:
                phrase_start = phrase_end = id
            else:
                phrase_end = id
        else:
            if phrase_start != -1:
                if candidate_qualified(sentence[phrase_start:phrase_end+1]):
                    candidates.append((phrase_start, phrase_end))
                phrase_start = phrase_end = -1
    
    if phrase_start != -1:
        if candidate_qualified(sentence[phrase_start:phrase_end+1]):
            candidates.append((phrase_start, phrase_end))
    
    return candidates

In [None]:
#### MLM
# Mask token string of the pipeline's tokenizer; it replaces each candidate phrase below.
mask = mlm.tokenizer.mask_token

# One list per test paper of (masked sentence, original phrase) pairs.
all_test_data = []
for paper_id in tqdm(sample_submission['Id']):
    # load paper
    paper = papers[paper_id]

    # extract sentences
    # Split section texts on '.', clean each piece and de-duplicate through a set.
    # NOTE(review): iterating a set gives no guaranteed order, so the order of `sentences`
    # (and of test_data) may differ between runs - confirm this does not matter downstream.
    sentences = set([clean_paper_sentence(sentence) for section in paper 
                     for sentence in section['text'].split('.')
                    ])
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 1] # only accept sentences with length > 1 chars
    # Keep only sentences whose lower-cased text contains 'data' or 'study' as a substring.
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
    sentences = [sentence.split() for sentence in sentences] # sentence = list of words

    # mask
    test_data = []
    for sentence in sentences:
        for phrase_start, phrase_end in find_mask_candidates(sentence):
            # Replace the whole candidate span (inclusive bounds) with a single mask token.
            dt_point = sentence[:phrase_start] + [mask] + sentence[phrase_end+1:]
            test_data.append((' '.join(dt_point), ' '.join(sentence[phrase_start:phrase_end+1]))) # (masked text, phrase)

    all_test_data.append(test_data)
#### MLMMLM

In [None]:
#### MLM
# For every paper, score each masked sentence with the fill-mask pipeline and keep the
# phrases that pass the score test below; one '|'-joined prediction string per paper.
pred_mlm_labels = []
for test_data in tqdm(all_test_data):
    pred_bag = set()

    if len(test_data):
        texts, phrases = list(zip(*test_data))
        mlm_pred = []
        for p_id in range(0, len(texts), PREDICT_BATCH):
            batch_texts = texts[p_id:p_id+PREDICT_BATCH]
            # Restrict the fill-mask candidates to the two marker symbols.
            batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])

            # With a single input the result is wrapped so that it has one entry per text.
            if len(batch_texts) == 1:
                batch_pred = [batch_pred]

            mlm_pred.extend(batch_pred)

        # Each prediction unpacks into two result dicts (one per target) with 'score' and 'token_str'.
        for (result1, result2), phrase in zip(mlm_pred, phrases):
            # NOTE(review): the second clause accepts the phrase when result2 carries
            # NONDATA_SYMBOL and scores more than twice result1, i.e. when the non-dataset
            # symbol wins - this looks unintended (DATASET_SYMBOL expected?); confirm.
            # NOTE(review): presumably token_str comes back without the leading space used
            # in `targets`; verify against the installed transformers version.
            if (result1['score'] > result2['score']*2 and result1['token_str'] == DATASET_SYMBOL) or\
               (result2['score'] > result1['score']*2 and result2['token_str'] == NONDATA_SYMBOL):
                pred_bag.add(clean_text(phrase))

    # filter labels by jaccard score 
    filtered_labels = []

    # Greedy de-duplication, longest label first: a label is kept only if its
    # jaccard_similarity to every label kept so far is below 0.75.
    for label in sorted(pred_bag, key=len, reverse=True):
        if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered_labels):
            filtered_labels.append(label)

    pred_mlm_labels.append('|'.join(filtered_labels))
    
# Store the MLM predictions in the sample submission frame (same row order as its 'Id' column).
final_predictions = pred_mlm_labels
sample_submission['PredictionString'] = final_predictions
#### MLMMLM

In [None]:
# External list of dataset titles; its 'title' column is matched against the paper texts below.
# The commented-out lines are alternative title lists.
df2=pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
#df2=pd.read_csv("../input/coleridge-additional-gov-datasets-22000popular/additional_gov_datasets_22000popular.csv")
#df2=pd.read_csv("../input/add-dataset-coloridge/data_set_800_with2000popular.csv")

In [None]:
# Start of the timed section; the elapsed seconds are printed after submission.csv is written.
start_time = time.time()


#### remove >.5 jaccard matches from predictions
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Words are obtained with ``split(" ")``. Both the intersection and the
    union are computed on sets, so a word repeated inside one string no
    longer inflates the denominator (e.g. "a a" vs "a" is 1.0, not 0.5).

    Args:
        s1: First space-separated string.
        s2: Second space-separated string.

    Returns:
        A float in [0.0, 1.0]. ``split(" ")`` always yields at least one
        element, so the union is never empty.
    """
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    intersection = len(words_1 & words_2)
    union = len(words_1 | words_2)
    return float(intersection) / union


#############################
# Directory the publication JSON files are read from (test set for submission).
#path=train_data_path
path=test_data_path

#for training use train_sample

#for submission use sample_sub

#############

column_names = ["Id", "PredictionString"]

# Title-matching predictions, one row per publication.
submission = pd.DataFrame(columns = column_names)
# fn_list / fn_text: ids and raw texts of publications for which no title matched.
fn_list=[]
fn_text=[]
# all_list / all_text: ids and raw texts of every processed publication.
all_list=[]
all_text=[]
to_append=[]
for index, row in sample_sub.iterrows():
#for index, row in tqdm(train_df.iterrows()):
    # to_append is the row being built: [Id, '|'-joined matched titles].
    to_append=[row['Id'],'']
    large_string = str(read_json_pub(row['Id'],path))
    clean_string=text_cleaning(large_string)
    # NOTE(review): the inner loop reuses the name `index`, overwriting the outer loop's
    # variable; harmless here because the outer `index` is not read afterwards.
    for index, row2 in df2.iterrows():
        query_string = str(row2['title'])
        # Plain substring test of the raw title against the cleaned text.
        # NOTE(review): presumably the titles in df2 are already in cleaned form - confirm.
        if query_string in clean_string:
            # NOTE(review): `not in to_append[1]` is a substring test on the joined string,
            # so a title contained in an already added longer title is skipped as well.
            if to_append[1]!='' and clean_text(query_string) not in to_append[1]:
                to_append[1]=to_append[1]+'|'+clean_text(query_string)
            if to_append[1]=='':
                to_append[1]=clean_text(query_string)

                
    if to_append[1]=='':
        fn_list+=[row['Id']]
        fn_text+=[large_string]
    all_list+=[row['Id']]
    all_text+=[large_string]


    # Append the finished row at the next integer label.
    df_length = len(submission)
    submission.loc[df_length] = to_append
# This file is written again at the end of the notebook from the merged predictions.
submission.to_csv('submission.csv', index = False)
print("--- %s seconds ---" % (time.time() - start_time))
submission

In [None]:
#%%time
#!pip uninstall fastai en-core-web-sm en-core-web-lg spacy -y -q
#!pip install ../input/spacy3/catalogue-2.0.3-py3-none-any.whl ../input/spacy3/typer-0.3.2-py3-none-any.whl ../input/spacy3/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/pathy-0.5.2-py3-none-any.whl ../input/spacy3/smart_open-3.0.0-py3-none-any.whl ../input/spacy3/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_legacy-3.0.5-py2.py3-none-any.whl -q
#!pip install ../input/spacy3/en_core_web_lg-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_md-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_sm-3.0.0-py3-none-any.whl -q
#!pip install ../input/spacy3/spacy_alignments-0.8.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_transformers-1.0.2-py2.py3-none-any.whl ../input/spacy3/en_core_web_trf-3.0.0-py3-none-any.whl -q
#import spacy
#assert spacy.__version__ == '3.0.6'
#import en_core_web_trf
#import torch 
#if torch.cuda.is_available():
#    spacy.prefer_gpu()
#nlp = spacy.load("../input/spacy-cv-4-model/output/model-best") #load the best model
#nlp2 = spacy.load("../input/spacy-train-set/cv0-model-best") #load the best model

In [None]:
"""
%%time


existing_labels = set(df2["title"])
def nlp_label_cv(Id,text,existing_labels,nlp_list):
    c_label=[]
    for nlp_er0 in nlp_list:
        doc = nlp_er0(text)
        ent_d=set([doc.ents[i].text  for i in range(len(doc.ents)) if (doc.ents[i].label_ == 'DB_label') & (clean_text(doc.ents[i].text) != "")] )
       

        for ent in ent_d:
            j_val=[jaccard(clean_text(ent.lower()), clean_text(list(existing_labels)[i]))>0.7  for i in range(len(existing_labels)) ]
            #c_label+=set(pd.Series(list(existing_labels))[j_val] )
            #j_val=[jaccard(clean_text(ent.lower()), clean_text(list(existing_labels)[i]))  for i in range(len(existing_labels)) ]
            #if np.max(j_val) > 0.7:
            #    c_label+=set(pd.Series(list(existing_labels)).iloc[np.argmax(j_val)] )
            if sum(j_val)==0:
                c_label+=[clean_text(str(ent).lower())]
                #if nlp_qa0(question="dataset?", context=str(ent))["score"] > 0.7:
                #    c_label+=[clean_text(nlp_qa0(question="dataset?", context=str(ent))['answer'].lower())  ]



        del nlp_er0
    #del nlp_qa0

    
    return ["|".join(list(set(c_label)))]
    """

In [None]:
######################################################################################
#############   NER inference part: pred_ner ends up in the same format as the submission file   ##############
######################################################################################

"""
pred_ner=pd.DataFrame(columns=["Id",'PredictionString'])#
tex_df=pd.DataFrame({"Id":fn_list,"raw_text":fn_text}).drop_duplicates()#train
#tex_df=sample_submission_df[["Id","raw_text"]].drop_duplicates()#test
Id_list=[]
pred_list=[]
for Id in tqdm(fn_list):
    if torch.cuda.is_available():
        spacy.prefer_gpu()
        torch.cuda.empty_cache()
        cupy.get_default_memory_pool().free_all_blocks()
    nlp_er = nlp
    nlp_er2 = nlp2
    #nlp_qa0=nlp_qa
    #nlp_er.get_pipe("transformer").model.attrs["flush_cache_chance"] = 1
    text = tex_df.set_index("Id").loc[Id,"raw_text"]
    if len(text) > 200_000:
        text=text[0:200_000]
    Id_list+=[Id]
    #pred_list+=["|".join(set([clean_text(doc.ents[i].text)  for i in range(len(doc.ents)) if doc.ents[i].label_ == 'DB_label' ] ))]
    #pred_ner=pd.concat([pred_ner,nlp_lable(Id,text,existing_labels,nlp_er)],axis=0)
    pred_list+=nlp_label_cv(Id,text,existing_labels,[nlp_er,nlp_er2])


pred_ner=pd.DataFrame({"Id":Id_list,'PredictionString':pred_list})   
sum(pred_ner["PredictionString"]=="")
"""



#### RIOW
# The NER code above is disabled (kept inside a string literal); pred_ner is a copy of
# sample_submission, whose 'PredictionString' column was filled with the MLM predictions earlier.
pred_ner = sample_submission.copy()
#### RIOWRIOW

In [None]:
#### RIOW
# for i in range(4):
#     print(pred_ner.loc[i, "PredictionString"])
#     print()
#### RIOWRIOW

In [None]:
# Split every prediction string on '|', concatenate the per-row lists (`.sum()` on lists)
# and count how often each label occurs over all rows.
name=pd.Series(pred_ner["PredictionString"].str.split("|").sum()).value_counts()

In [None]:
# Raw text per publication id, as collected by the title-matching loop.
tex_df=pd.DataFrame({"Id":all_list,"raw_text":all_text}).drop_duplicates()
# Labels that occur more than 100 times in pred_ner's predictions.
use_name=name[name>100].index
column_names = ["Id", "PredictionString"]
# Second round of string matching, this time with the frequent predicted labels.
pred_match = pd.DataFrame(columns = column_names)
to_append=[]
for Id in tqdm(all_list):
#for index, row in tqdm(train_df.iterrows()):
    # to_append is the row being built: [Id, '|'-joined matched labels].
    to_append=[Id,'']
    # NOTE(review): set_index("Id") is recomputed for every Id although tex_df does not change.
    large_string = str(tex_df.set_index("Id").loc[Id,"raw_text"])
    clean_string=text_cleaning(large_string)
    for row2 in use_name:
        query_string = str(row2)
        if query_string in clean_string:
            # NOTE(review): `not in to_append[1]` is a substring test on the joined string,
            # so a label contained in an already added longer label is skipped as well.
            if to_append[1]!='' and clean_text(query_string) not in to_append[1]:
                to_append[1]=to_append[1]+'|'+clean_text(query_string)
            if to_append[1]=='':
                to_append[1]=clean_text(query_string)
    #pred_match+=to_append
    # Append the finished row at the next integer label.
    df_length = len(pred_match)
    pred_match.loc[df_length] = to_append

In [None]:
# Merge both matching results: stack the two frames, turn each prediction string into a
# list, and let the group-wise sum concatenate the lists that share an Id.
sub=pd.concat([submission,pred_match])
sub["PredictionString"]=sub["PredictionString"].str.split("|")
sub=sub.groupby("Id").sum()
#### RIOW
#sub["PredictionString"]=["|".join(list(set(sub["PredictionString"][i]))) for i in range(sub.shape[0]) ]
# De-duplicate the labels through a set, re-join with '|' and strip separators at both ends.
# NOTE(review): an empty prediction splits to [''], so '' can be a set member; strip("|")
# only removes it at the ends - if it lands between two labels a '||' would remain. Set
# iteration order is also not guaranteed, so label order may vary between runs. Confirm.
sub["PredictionString"] = sub["PredictionString"].apply(lambda x: "|".join(list(set(x))).strip("|"))
#### RIOWRIOW
sub=sub.reset_index()
sub

In [None]:
#### RIOW
# for i in range(4):
#     print(sub.loc[i, "PredictionString"])
#     print()
#### RIOWRIOW

In [None]:
# Write the merged predictions; this replaces the submission.csv written by the title-matching cell.
sub.to_csv('submission.csv', index=False)