This notebook uses simple string matching: if a known dataset title appears in a document, it "predicts" that title. It uses the list of 180 datasets from the training data and adds some hand-curated government dataset titles.

In [None]:
# huggingface related scripts are written between #### HF and #### HFHF
# all other scripts by Ryosuke Horiuchi will be written between #### RIOW and #### RIOWRIOW

# huggingface related scripts are copied from:
# https://github.com/riow1983/Kaggle-Coleridge-Initiative/blob/main/notebooks/kagglenb008-pytorch-bert-for-ner-inference.ipynb



#### HF
# Upper bound on the number of train.csv rows kept (used as a slice end when the
# training table is loaded); None keeps every row.
MAX_SAMPLE = None # set a small number for experimentation, set None for production.

# Install the required packages from files under ../input (the first command
# disables the package index explicitly via --no-index; the rest install wheel files directly).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl
#### HFHF

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
import pickle
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

#### RIOW
# import torch 
# if torch.cuda.is_available():
#     import cupy
#### RIOWRIOW

In [None]:
#### RIOW
# Seed Python's `random` module and NumPy's global RNG so that any use of them
# is repeatable across runs.
random.seed(123)
np.random.seed(456)
#### RIOWRIOW

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the lower-cased word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A & B| / |A | B|.

    Returns 0.0 when neither string contains a word (the union is empty),
    instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Two empty/whitespace-only strings: nothing to compare, treat as no match.
        return 0.0
    return float(len(a & b)) / union_size

def get_count_tp_fp_fn(prediction, verbose=True):
    """Count the "TP", "FP" and "FN" tokens in a space-separated string.

    Returns a list ``[tp_count, fp_count, fn_count]``. When ``verbose`` is
    true the split token list is printed first.
    """
    tokens = prediction.split(" ")
    if verbose:
        print(tokens)
    return [tokens.count(kind) for kind in ("TP", "FP", "FN")]

def make_col_tp_fp_fn(df, col):
    """Unpack a column of ``[tp, fp, fn]`` triples into 'TP', 'FP' and 'FN' columns.

    The columns are added to ``df`` in place and the same frame is returned.
    """
    for position, target in enumerate(('TP', 'FP', 'FN')):
        # Bind the position as a default so each lambda reads its own index.
        df[target] = df[col].apply(lambda counts, i=position: counts[i])
    return df

def get_precision_recall(tp, fp, fn):
    """Compute precision and recall from true/false positive and false negative counts.

    Returns ``(precision, recall)``. A ratio whose denominator is zero
    (no predictions, or no ground truth) is reported as 0.0 instead of
    raising ZeroDivisionError.
    """
    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted else 0.0
    recall = tp / actual if actual else 0.0
    return precision, recall

def fbeta_score(precision, recall, beta):
    """Return the F-beta score for the given precision and recall.

    F_beta = (1 + beta^2) * P * R / (beta^2 * P + R).

    Returns 0.0 when the denominator is zero (precision and recall both 0)
    instead of raising ZeroDivisionError.
    """
    beta_sq = beta * beta
    denominator = (beta_sq * precision) + recall
    if denominator == 0:
        return 0.0
    fbeta = (1 + beta_sq) * ((precision * recall) / denominator)
    return fbeta

def coleridge_initiative_jaccard(ground_truth, prediction, verbose=True):
    """Classify predictions / ground truths of one row as TP, FP or FN.

    Arguments -
    ground_truth - '|'-separated ground-truth labels.
    prediction - '|'-separated predicted labels (empty entries are ignored).
    verbose - print the split label lists when true.

    Returns -
    (js_scores, pred_types): the best Jaccard score recorded for every
    TP/FP/FN entry, and the matching space-separated "TP"/"FP"/"FN" string.

    A prediction is a TP when its best Jaccard score against any ground truth
    is >= 0.5, otherwise an FP. A ground truth is an FN when no non-empty
    prediction reaches 0.5 against it (previously only an exact score of 0
    was counted, so partially-overlapping misses were dropped).
    """
    gts = ground_truth.split('|')
    preds = sorted(prediction.split('|'))
    if verbose:
        print("Ground truth : " , gts)
        print("Prediction : ", preds)

    # Empty strings come from an empty prediction or stray separators; they are
    # not real predictions and must not take part in any comparison.
    real_preds = [pred for pred in preds if len(pred) > 0]

    js_scores = []
    cf_matrix = []

    #### Counting True Positives (TP) and False Positives (FP)
    for pred in real_preds:
        score = max(jaccard(pred, gt) for gt in gts)
        js_scores.append(score)
        cf_matrix.append("TP" if score >= 0.5 else "FP")

    #### Counting False Negatives (FN)
    for gt in gts:
        # With no predictions at all the best score is 0, i.e. a miss.
        score = max((jaccard(gt, pred) for pred in real_preds), default=0.0)
        if score < 0.5:
            js_scores.append(score)
            cf_matrix.append("FN")

    return js_scores, " ".join(cf_matrix)
    

def score_df_coleridge_initiative(output, gt_col, pred_col, beta=0.5, verbose=True):
    
    '''
    This function will calculate the FBeta score for Coleridge Initiative competition 
    if given appropriate arguments
    
    Arguments - 
    output - Your submission dataframe that has both ground truth and prediction columns.
             Helper columns ('evaluation', 'js_scores', 'pred_type', 'tp_fp_fn',
             'TP', 'FP', 'FN') are added to it in place.
    gt_col - This is the column name of ground truth column.
    pred_col - This is the column name of predictions column.
    beta - Beta value to calculate FBeta score (default 0.5).
    
    Returns - 
    This function will return the FBeta score for the given beta.
    
    ## Set verbose = True to print logs    
    '''
    
    ### Jaccard Similarity
    output['evaluation'] = output.apply(lambda x: coleridge_initiative_jaccard(x[gt_col], x[pred_col], verbose=False), axis=1)
    output['js_scores'] = output['evaluation'].apply(lambda x : x[0])
    output['pred_type'] = output['evaluation'].apply(lambda x : x[1])
    
    ### TP, FP and FN 
    output['tp_fp_fn'] = output['pred_type'].apply(lambda x : get_count_tp_fp_fn(x, verbose=False))
    output = make_col_tp_fp_fn(output, 'tp_fp_fn')
    
    tp = sum(output['TP'])
    fp = sum(output['FP'])
    fn = sum(output['FN'])
    precision, recall = get_precision_recall(tp, fp, fn)
    # Honour the caller's beta; it used to be ignored in favour of a hard-coded 0.5.
    fbeta = fbeta_score(precision, recall, beta)
    
    if verbose:

        print("TP_FP_FN : ", tp,fp,fn)

    return fbeta

In [None]:
#### RIOW
# Competition tables and folders used by the string-matching part of the notebook.
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
train_data_path = '../input/coleridgeinitiative-show-us-the-data/train'
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'
train_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
#### RIOWRIOW

#### HF
# Training table, optionally truncated to the first MAX_SAMPLE rows.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)[:MAX_SAMPLE]

# papers maps a publication Id to its parsed JSON document (train and test alike).
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train['Id'].unique():
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
        papers[paper_id] = json.load(f)

sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        papers[paper_id] = json.load(f)

# Every lower-cased variant of a known label (title, label, cleaned label).
label_table = train[['dataset_title', 'dataset_label', 'cleaned_label']]
all_labels = {
    str(label).lower()
    for label_row in label_table.itertuples(index=False)
    for label in label_row
}

print(f'No. different labels: {len(all_labels)}')

#### HFHF

In [None]:
def read_json_pub(filename, train_data_path=train_data_path, output='text'):
    """Read a publication JSON file and return its text as a single string.

    Arguments -
    filename - publication id; '<filename>.json' is opened inside ``train_data_path``.
    train_data_path - folder that contains the JSON file.
    output - 'text' returns the section texts joined by spaces,
             'head' returns the section titles joined by spaces,
             anything else returns titles and texts interleaved, joined by '. '.

    The JSON document is iterated as a sequence of sections, each read through
    its 'section_title' and 'text' keys. A missing or null title/text is
    treated as an empty string, so one incomplete section no longer makes the
    join fail with TypeError.
    """
    json_path = os.path.join(train_data_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    # Interleave title, text, title, text, ... in document order.
    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

In [None]:
def text_cleaning(text):
    '''
    Converts all text to lower case and replaces every run of characters other
    than ASCII letters and digits (punctuation, emojis, whitespace, ...) with a
    single space; leading/trailing spaces are stripped.
    text - Sentence that needs to be cleaned (converted with str() first)

    The single substitution already collapses repeated spaces and removes
    emojis, so the former extra space-collapsing and emoji-removal passes were
    no-ops and have been dropped.
    '''
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()


In [None]:
def clean_text(txt):
    """Lower-case ``str(txt)`` and replace each run of non-alphanumeric
    (non A-Z, a-z, 0-9) characters with one space, then strip the ends."""
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [None]:
#### HF
# def clean_text(txt):
#     return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

def totally_clean_text(txt):
    """Apply ``clean_text`` and then collapse any run of spaces to one space."""
    return re.sub(' +', ' ', clean_text(txt))
#### HFHF

In [None]:
#### HF
# For every test publication, collect the known labels that occur verbatim in
# its lower-cased text or in the fully cleaned version of that text.
literal_preds = []
for paper_id in sample_submission['Id']:
    paper = papers[paper_id]
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)

    labels = {
        clean_text(label)
        for label in all_labels
        if label in text_1 or label in text_2
    }

    literal_preds.append('|'.join(labels))
#### HFHF

In [None]:
#### HF
# Sentence windowing and batching settings for the NER step.
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping
PREDICT_BATCH = 64000 


# Input dataset holding the pretrained model and the fold files.
inputfile = "nb005-pytorch-bert-for-ner"
PRETRAINED_PATH = f'../input/{inputfile}'

TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'

TRAIN_PATH = f'../input/{inputfile}/fold_2_train_ner.json'
VAL_PATH = f'../input/{inputfile}/fold_2_valid_ner.json'

PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'


# Collapse the training table to one row per publication Id: keep the first
# pub_title and join every label column with '|'.
train_aggregations = {'pub_title': 'first'}
for label_column in ('dataset_title', 'dataset_label', 'cleaned_label'):
    train_aggregations[label_column] = '|'.join
train = train.groupby('Id').agg(train_aggregations).reset_index()

print(f'No. grouped training rows: {len(train)}')






#### HFHF

In [None]:
#### HF
def clean_training_text(txt):
    """
    Case-preserving cleaner: every run of characters other than A-Z, a-z, 0-9
    in ``str(txt)`` becomes a single space and the result is stripped.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences):
    """Split sentences longer than MAX_LENGTH words into overlapping windows.

    A sentence with at most MAX_LENGTH whitespace-separated words is kept
    unchanged. A longer one is replaced by windows of up to MAX_LENGTH words
    whose start positions advance by MAX_LENGTH - OVERLAP words.
    """
    result = []
    for sentence in sentences:
        tokens = sentence.split()
        if len(tokens) <= MAX_LENGTH:
            result.append(sentence)
            continue
        window_starts = range(0, len(tokens), MAX_LENGTH - OVERLAP)
        result.extend(
            ' '.join(tokens[start:start + MAX_LENGTH]) for start in window_starts
        )
    return result
#### HFHF

In [None]:
#### HF
test_rows = []     # NER-formatted rows: {'tokens': [...], 'tags': [...]}
paper_length = []  # number of rows contributed by each publication, in order

for paper_id in sample_submission['Id']:
    paper = papers[paper_id]

    # Split each section on '.', clean every piece, then window long sentences.
    candidates = shorten_sentences([
        clean_training_text(piece)
        for section in paper
        for piece in section['text'].split('.')
    ])

    # Keep sentences longer than 10 characters that mention 'data' or 'study'.
    sentences = [
        sentence
        for sentence in candidates
        if len(sentence) > 10
        and any(word in sentence.lower() for word in ['data', 'study'])
    ]

    # Every token gets the dummy tag 'O'; the real tags are predicted later.
    for sentence in sentences:
        sentence_words = sentence.split()
        test_rows.append({'tokens' : sentence_words, 'tags' : ['O'] * len(sentence_words)})

    # Remember how many rows belong to this publication.
    paper_length.append(len(sentences))

print(f'total number of sentences: {len(test_rows)}')
#### HFHF

In [None]:
#### HF
# Publish the model/data locations as environment variables; the shell command
# in bert_predict() below reads them as $MODEL_PATH, $TRAIN_FILE, etc.
os.environ["MODEL_PATH"] = f"{PRETRAINED_PATH}"
os.environ["TRAIN_FILE"] = f"{TRAIN_PATH}"
os.environ["VALIDATION_FILE"] = f"{VAL_PATH}"
os.environ["TEST_FILE"] = f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}"
os.environ["OUTPUT_DIR"] = f"{PREDICTION_SAVE_PATH}"



# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make necessary directories and files
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)


# Runs the external kaggle_run_ner.py script in prediction mode (--do_predict)
# with a fixed seed, on the file named by $TEST_FILE.
# NOTE(review): the caller reads predictions back from PREDICTION_SAVE_PATH, so
# the script presumably writes them under $OUTPUT_DIR - confirm against the script.
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path $MODEL_PATH \
    --train_file $TRAIN_FILE \
    --validation_file $VALIDATION_FILE \
    --test_file $TEST_FILE \
    --output_dir $OUTPUT_DIR \
    --report_to 'none' \
    --seed 123 \
    --do_predict
#### HFHF

In [None]:
#### HF
# Predict tags for all test rows in batches of PREDICT_BATCH rows.
# bert_outputs collects one list of tags per row, in the same order as test_rows.
bert_outputs = []
for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file (one JSON object per line)
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # remove output dir (left over from the previous batch)
    !rm -r $OUTPUT_DIR
    
    # do predict
    bert_predict()
    
    # read predictions: one line per row, tags separated by whitespace.
    # [:-1] drops the last element of the split - assumes the file ends with a
    # newline so that element is empty (TODO confirm).
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]


#### HFHF

In [None]:
#### HF
# Token lists of all test rows; the row dicts themselves are no longer needed.
test_sentences = [row['tokens'] for row in test_rows]
del test_rows



bert_dataset_labels = [] # one set of extracted phrases per publication
for length in paper_length:
    labels = set()
    # The first `length` sentences/predictions belong to the current publication.
    for sentence, pred in zip(test_sentences[:length], bert_outputs[:length]):
        curr_phrase = ''
        for word, tag in zip(sentence, pred):
            if tag == 'B':
                # A new phrase starts: store the one in progress, if any.
                if curr_phrase:
                    labels.add(curr_phrase)
                curr_phrase = word
            elif tag == 'I' and curr_phrase:
                # Continuation of the phrase in progress.
                curr_phrase = curr_phrase + ' ' + word
            elif curr_phrase:
                # Any other tag (or an 'I' with nothing open) closes the phrase.
                labels.add(curr_phrase)
                curr_phrase = ''
        # A phrase that runs to the end of the sentence is still open here.
        if curr_phrase:
            labels.add(curr_phrase)

    bert_dataset_labels.append(labels)

    # Drop the consumed rows so the next publication starts at index 0.
    del test_sentences[:length], bert_outputs[:length]
#### HFHF

In [None]:
# Table of additional dataset titles; the string-matching cell below reads its
# `title` column and searches for each title in the cleaned publication text.
df2=pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
# Alternative title lists (disabled):
#df2=pd.read_csv("../input/coleridge-additional-gov-datasets-22000popular/additional_gov_datasets_22000popular.csv")
#df2=pd.read_csv("../input/add-dataset-coloridge/data_set_800_with2000popular.csv")

In [None]:
start_time = time.time()


#### remove >.5 jaccard matches from predicitons
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces. The score is
    |A & B| / |A | B| over the *sets* of words, so a repeated word no longer
    inflates the union (e.g. "a a b" vs "a b" now scores 1.0, not 0.667).
    ``str.split(" ")`` always yields at least one element, so the union is
    never empty.
    """
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    intersection = len(words_1 & words_2)
    union = len(words_1) + len(words_2) - intersection
    return float(intersection) / union


#### HF
# def jaccard_similarity(s1, s2):
#     l1 = s1.split(" ")
#     l2 = s2.split(" ")    
#     intersection = len(list(set(l1).intersection(l2)))
#     union = (len(l1) + len(l2)) - intersection
#     return float(intersection) / union
#### HFHF

#############################
#path=train_data_path
path=test_data_path

#for training use train_sample

#for submission use sample_sub

#############

column_names = ["Id", "PredictionString"]

# The title list does not change between publications, so read it once
# instead of re-iterating the DataFrame for every publication.
dataset_titles = [str(title) for title in df2['title']]

fn_list=[]   # Ids of publications with no literal match
fn_text=[]   # raw text of those publications
all_list=[]  # Ids of all processed publications
all_text=[]  # raw text of all processed publications
submission_rows=[]
to_append=[]
for _, row in sample_sub.iterrows():
#for index, row in tqdm(train_df.iterrows()):
    to_append=[row['Id'],'']
    large_string = str(read_json_pub(row['Id'],path))
    clean_string=text_cleaning(large_string)
    for query_string in dataset_titles:
        if query_string in clean_string:
            cleaned_title = clean_text(query_string)
            if to_append[1]=='':
                to_append[1]=cleaned_title
            elif cleaned_title not in to_append[1]:
                # Substring test against the joined prediction string: a title
                # contained in an already-recorded title is not added again.
                to_append[1]=to_append[1]+'|'+cleaned_title

    if to_append[1]=='':
        fn_list+=[row['Id']]
        fn_text+=[large_string]
    all_list+=[row['Id']]
    all_text+=[large_string]

    submission_rows.append(to_append)

# Build the frame once; appending row by row with .loc is quadratic.
submission = pd.DataFrame(submission_rows, columns = column_names)
submission.to_csv('submission.csv', index = False)
print("--- %s seconds ---" % (time.time() - start_time))
submission


In [None]:
#%%time
#!pip uninstall fastai en-core-web-sm en-core-web-lg spacy -y -q
#!pip install ../input/spacy3/catalogue-2.0.3-py3-none-any.whl ../input/spacy3/typer-0.3.2-py3-none-any.whl ../input/spacy3/srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/pathy-0.5.2-py3-none-any.whl ../input/spacy3/smart_open-3.0.0-py3-none-any.whl ../input/spacy3/pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/thinc-8.0.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_legacy-3.0.5-py2.py3-none-any.whl -q
#!pip install ../input/spacy3/en_core_web_lg-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_md-3.0.0-py3-none-any.whl ../input/spacy3/en_core_web_sm-3.0.0-py3-none-any.whl -q
#!pip install ../input/spacy3/spacy_alignments-0.8.3-cp37-cp37m-manylinux2014_x86_64.whl ../input/spacy3/spacy_transformers-1.0.2-py2.py3-none-any.whl ../input/spacy3/en_core_web_trf-3.0.0-py3-none-any.whl -q
#import spacy
#assert spacy.__version__ == '3.0.6'
#import en_core_web_trf
#import torch 
#if torch.cuda.is_available():
#    spacy.prefer_gpu()
#nlp = spacy.load("../input/spacy-cv-4-model/output/model-best") #load the best model
#nlp2 = spacy.load("../input/spacy-train-set/cv0-model-best") #load the best model

In [None]:
"""
%%time


existing_labels = set(df2["title"])
def nlp_label_cv(Id,text,existing_labels,nlp_list):
    c_label=[]
    for nlp_er0 in nlp_list:
        doc = nlp_er0(text)
        ent_d=set([doc.ents[i].text  for i in range(len(doc.ents)) if (doc.ents[i].label_ == 'DB_label') & (clean_text(doc.ents[i].text) != "")] )
       

        for ent in ent_d:
            j_val=[jaccard(clean_text(ent.lower()), clean_text(list(existing_labels)[i]))>0.7  for i in range(len(existing_labels)) ]
            #c_label+=set(pd.Series(list(existing_labels))[j_val] )
            #j_val=[jaccard(clean_text(ent.lower()), clean_text(list(existing_labels)[i]))  for i in range(len(existing_labels)) ]
            #if np.max(j_val) > 0.7:
            #    c_label+=set(pd.Series(list(existing_labels)).iloc[np.argmax(j_val)] )
            if sum(j_val)==0:
                c_label+=[clean_text(str(ent).lower())]
                #if nlp_qa0(question="dataset?", context=str(ent))["score"] > 0.7:
                #    c_label+=[clean_text(nlp_qa0(question="dataset?", context=str(ent))['answer'].lower())  ]



        del nlp_er0
    #del nlp_qa0

    
    return ["|".join(list(set(c_label)))]
    """

In [None]:
######################################################################################
#############   NER inference section: pred_ner ends up in the same format as the submission file   ##############
######################################################################################

"""
pred_ner=pd.DataFrame(columns=["Id",'PredictionString'])#
tex_df=pd.DataFrame({"Id":fn_list,"raw_text":fn_text}).drop_duplicates()#train
#tex_df=sample_submission_df[["Id","raw_text"]].drop_duplicates()#test
Id_list=[]
pred_list=[]
for Id in tqdm(fn_list):
    if torch.cuda.is_available():
        spacy.prefer_gpu()
        torch.cuda.empty_cache()
        cupy.get_default_memory_pool().free_all_blocks()
    nlp_er = nlp
    nlp_er2 = nlp2
    #nlp_qa0=nlp_qa
    #nlp_er.get_pipe("transformer").model.attrs["flush_cache_chance"] = 1
    text = tex_df.set_index("Id").loc[Id,"raw_text"]
    if len(text) > 200_000:
        text=text[0:200_000]
    Id_list+=[Id]
    #pred_list+=["|".join(set([clean_text(doc.ents[i].text)  for i in range(len(doc.ents)) if doc.ents[i].label_ == 'DB_label' ] ))]
    #pred_ner=pd.concat([pred_ner,nlp_lable(Id,text,existing_labels,nlp_er)],axis=0)
    pred_list+=nlp_label_cv(Id,text,existing_labels,[nlp_er,nlp_er2])


pred_ner=pd.DataFrame({"Id":Id_list,'PredictionString':pred_list})   
sum(pred_ner["PredictionString"]=="")
"""


#### HF
    
# def jaccard_similarity(s1, s2):
#     l1 = s1.split(" ")
#     l2 = s2.split(" ")    
#     intersection = len(list(set(l1).intersection(l2)))
#     union = (len(l1) + len(l2)) - intersection
#     return float(intersection) / union

# Per publication: clean the extracted phrases and drop near-duplicates.
# Phrases are visited shortest first; a phrase is kept only if its Jaccard
# similarity to every phrase kept so far is below 0.75.
filtered_bert_labels = []
for raw_labels in bert_dataset_labels:
    kept = []

    for candidate in map(clean_text, sorted(raw_labels, key=len)):
        near_duplicate = any(
            jaccard_similarity(candidate, seen) >= 0.75 for seen in kept
        )
        if not near_duplicate:
            kept.append(candidate)

    filtered_bert_labels.append('|'.join(kept))
    

#### RIOW
# Use the BERT predictions as-is. The literal-match fallback below is disabled:
# for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels):
#     if literal_match:
#         final_predictions.append(literal_match)
#     else:
#         final_predictions.append(bert_pred)
final_predictions = list(filtered_bert_labels)
#### RIOWRIOW
        
sample_submission['PredictionString'] = final_predictions
#### RIOW
# Independent copy of the submission-shaped NER predictions.
pred_ner = sample_submission.copy()
#### RIOWRIOW
#### HFHF

In [None]:
# Count in how many prediction entries each NER label occurs. explode() puts
# every label of every '|'-separated prediction on its own row, which avoids
# the quadratic list concatenation of summing the per-row lists.
name=pred_ner["PredictionString"].str.split("|").explode().value_counts()

In [None]:
tex_df=pd.DataFrame({"Id":all_list,"raw_text":all_text}).drop_duplicates()
# Only NER labels predicted more than 100 times are searched for as literals.
use_name=name[name>100].index
column_names = ["Id", "PredictionString"]
# Index the texts by Id once; rebuilding the index for every Id is quadratic.
raw_text_by_id = tex_df.set_index("Id")["raw_text"]
pred_match_rows=[]
to_append=[]
for Id in tqdm(all_list):
#for index, row in tqdm(train_df.iterrows()):
    to_append=[Id,'']
    large_string = str(raw_text_by_id.loc[Id])
    clean_string=text_cleaning(large_string)
    for row2 in use_name:
        query_string = str(row2)
        if query_string in clean_string:
            cleaned_label = clean_text(query_string)
            if to_append[1]=='':
                to_append[1]=cleaned_label
            elif cleaned_label not in to_append[1]:
                # Substring test against the joined prediction string: a label
                # contained in an already-recorded label is not added again.
                to_append[1]=to_append[1]+'|'+cleaned_label
    pred_match_rows.append(to_append)

# Build the frame once instead of appending row by row with .loc.
pred_match = pd.DataFrame(pred_match_rows, columns = column_names)

In [None]:
# Merge the title-list matches and the frequent-NER-label matches per Id.
sub=pd.concat([submission,pred_match])
sub["PredictionString"]=sub["PredictionString"].str.split("|")
# Summing lists concatenates them, giving every label predicted for an Id.
sub=sub.groupby("Id").sum()
#### RIOW
# De-duplicate the labels, drop empty entries (an empty prediction from one
# source used to leave artefacts such as "a||b"), and sort so the output is
# the same on every run.
sub["PredictionString"] = sub["PredictionString"].apply(
    lambda labels: "|".join(sorted({label for label in labels if label}))
)
#### RIOWRIOW
sub=sub.reset_index()
sub

In [None]:
#### RIOW
# for i in range(4):
#     print(sub.loc[i, "PredictionString"])
#     print()
#### RIOWRIOW

In [None]:
# Write the merged predictions; this overwrites the submission.csv written
# earlier from the title-list matches alone.
sub.to_csv('submission.csv', index=False)