# Abstract

Extract the dataset names mentioned in each paper using named-entity recognition (NER). <br>

Since dataset names rarely span sentence boundaries, the model is trained on individual sentences rather than on whole papers.

# Reference

https://www.kaggle.com/tungmphung/coleridge-matching-bert-ner <br>
https://github.com/huggingface/transformers/blob/master/examples/README.md

# Setup

In [None]:
# ============== Setup Module ============== #
# Install packages from the local `coleridge-packages` input dataset
# (wheel files / a local --find-links directory; `datasets` is installed with --no-index).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl
# Copy the seqeval helper script into the working directory.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

In [None]:
# ============== Import Module ============== #
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
import gc

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the stdlib and NumPy random generators so that random draws are repeatable.
random.seed(123)
np.random.seed(456)

In [None]:
# ============== Set Constants ============== #
# Competition metadata: training labels and the sample submission.
TRAIN_DATA_PATH = '../input/coleridgeinitiative-show-us-the-data/train.csv'
SUBMISSION_PATH = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'

# Folders holding one `<paper_id>.json` file per paper, plus the external Rich Context data.
PAPER_TRAIN_FOLDER = '../input/coleridgeinitiative-show-us-the-data/train'
PAPER_TEST_FOLDER = '../input/coleridgeinitiative-show-us-the-data/test'
RICH_CONTEXT = '../input/coleridge-intiative-rich-context/train_test'

# Cached pickle of labelled training sentences; when this file does not exist
# the labelled sentences are rebuilt from the raw papers.
PAPER_DEVIDE_SENT_PATH = '../input/coleride/train_literal_label.pickle'
# PAPER_DEVIDE_SENT_PATH = ''

# Training-mode flag. NOTE(review): its only visible use (`# if IS_TRAIN:`) is
# commented out, so it currently has no effect.
IS_TRAIN = False

# Preprocess

In [None]:
# ============== NLP Preprocess Helper ============== #
def paper_to_concate_json(id_list, folder):
    """Load the JSON file of every paper in ``id_list``.

    Args:
        id_list: iterable of paper ids; each id names a ``<id>.json`` file.
        folder: directory that contains the JSON files.

    Returns:
        dict mapping each paper id to its parsed JSON content.
    """
    loaded = {}
    for pid in tqdm(id_list):
        json_path = f'{folder}/{pid}.json'
        with open(json_path, 'r') as handle:
            loaded[pid] = json.load(handle)
    return loaded

def clean_text(txt, is_lower=True):
    """Normalise ``txt`` to space-separated ASCII letters and digits.

    The value is converted with ``str()``, optionally lower-cased, every run
    of characters other than A-Z, a-z and 0-9 is replaced by a single space,
    and leading/trailing whitespace is stripped.

    Args:
        txt: value to clean (any object; it is stringified first).
        is_lower: lower-case the text before cleaning when True.

    Returns:
        The cleaned string.
    """
    raw = str(txt)
    if is_lower:
        raw = raw.lower()
    return re.sub('[^A-Za-z0-9]+', ' ', raw).strip()

def totally_clean_text(txt, is_lower=True):
    """Clean ``txt`` with ``clean_text`` and collapse runs of spaces.

    Args:
        txt: value to clean.
        is_lower: forwarded to ``clean_text``.

    Returns:
        The cleaned string with every run of spaces reduced to one space.
    """
    return re.sub(' +', ' ', clean_text(txt, is_lower))

def devide_sentence(id_list, papers, out_file):
    """Split every paper into cleaned sentences and pickle the result.

    Each section's "text" is split on ". "; non-empty pieces are cleaned with
    ``totally_clean_text`` and stored one per row.

    Args:
        id_list: iterable of paper ids; every id must be a key of ``papers``.
        papers: mapping of paper id -> iterable of sections, each section
            being a mapping with a "text" entry.
        out_file: output path without extension; ".pickle" is appended.

    Returns:
        DataFrame with the columns "sent" and "Id" (one row per sentence).
        When ``id_list`` is empty an empty DataFrame is returned.
    """
    frames = []
    for paper_id in tqdm(id_list):
        sentences = [
            totally_clean_text(sent)
            for section in papers[paper_id]
            for sent in section["text"].split(". ")
            if len(sent) > 0
        ]
        frame = pd.DataFrame(sentences, columns=["sent"])
        frame["Id"] = paper_id
        frames.append(frame)

    # Concatenate once at the end: growing a DataFrame with pd.concat inside
    # the loop re-copies all previously collected rows on every iteration.
    literal_pdf = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    literal_pdf.to_pickle(f"{out_file}.pickle")
    return literal_pdf
    
def is_in_label(labels, in_file, out_file):
    """Collect every (sentence, label) pair where the sentence contains the label.

    Args:
        labels: iterable of label strings to search for.
        in_file: path (without ".pickle") of the sentence frame to read; the
            frame must have the columns "Id" and "sent".
        out_file: path (without ".pickle") the matches are written to.

    Returns:
        DataFrame with the columns "Id", "sent" and "Label"; a sentence that
        contains several labels appears once per label. When ``labels`` is
        empty an empty DataFrame is returned.
    """
    # NOTE(review): read_pickle must only be used on files produced by this
    # pipeline; unpickling untrusted data can execute arbitrary code.
    literal_pdf = pd.read_pickle(f"{in_file}.pickle")

    matches = []
    for label in tqdm(labels):
        # Literal substring test. The mask is kept local instead of being
        # stored as a column named after the label, which would grow the frame
        # by one column per label and could overwrite "Id"/"sent" if a label
        # happened to have that name.
        hit_mask = literal_pdf["sent"].str.contains(label, regex=False, na=False)
        hits = literal_pdf.loc[hit_mask, ["Id", "sent"]].copy()
        hits["Label"] = label
        matches.append(hits)

    # Single concat instead of re-copying the accumulated frame per label.
    label_pdf = pd.concat(matches) if matches else pd.DataFrame()
    label_pdf.to_pickle(f"{out_file}.pickle")
    return label_pdf

def change_enr_format(id_list, papers, labels, out_file, is_train_data=True):
    """Turn papers into sentence rows, labelled when building training data.

    Args:
        id_list: paper ids to process.
        papers: mapping of paper id -> parsed paper sections.
        labels: label strings to search for (only used for training data).
        out_file: pickle path (without extension) for the labelled training
            sentences; not used for test data.
        is_train_data: when True, sentences are written to
            "train_literal.pickle" and then matched against ``labels``;
            when False, sentences are written to "test_literal.pickle".

    Returns:
        The labelled sentence frame for training data, otherwise the plain
        sentence frame.
    """
    if not is_train_data:
        return devide_sentence(id_list, papers, "test_literal")

    devide_sentence(id_list, papers, "train_literal")
    return is_in_label(labels, "train_literal", out_file)

In [None]:
# ============== Load Data ============== #
# Training labels and the submission template.
train_df = pd.read_csv(TRAIN_DATA_PATH)
sub_df   = pd.read_csv(SUBMISSION_PATH)

## Concatenate Papers
# Unique paper ids in the training labels and in the submission template.
train_id_list = train_df["Id"].unique()
test_id_list = sub_df["Id"].unique()
# Loading the training papers is disabled here.
# NOTE(review): `train_paper` is still referenced later, in the branch taken
# when the cached sentence pickle does not exist.
# train_paper = paper_to_concate_json(train_id_list, PAPER_TRAIN_FOLDER)
test_paper = paper_to_concate_json(test_id_list, PAPER_TEST_FOLDER)

In [None]:
# ============== Get Train Label(cleaned) ============== #
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']

# Every value of the three label columns, normalised with clean_text and
# de-duplicated through the set.
all_labels = {
    clean_text(raw_label)
    for label_row in train_df[label_columns].itertuples(index=False)
    for raw_label in label_row
}

print(f'No. different labels: {len(all_labels)}')

In [None]:
# ============== Split Papers into Sentences ============== #

## Coleridge Initiative
# Train: reuse the cached labelled sentences when the pickle exists,
# otherwise rebuild them from the raw training papers.
if not os.path.exists(PAPER_DEVIDE_SENT_PATH):
    try:
        train_paper
    except NameError:
        # The training papers are not loaded eagerly (that line is commented
        # out in the "Load Data" cell), so load them here instead of failing
        # with a NameError.
        train_paper = paper_to_concate_json(train_id_list, PAPER_TRAIN_FOLDER)
    train_literal_df = change_enr_format(train_id_list, train_paper, all_labels, "train_literal_label")
else:
    train_literal_df = pd.read_pickle(PAPER_DEVIDE_SENT_PATH)

# Test: always split from the raw test papers; no labels are involved.
test_literal_df = change_enr_format(test_id_list, test_paper, None, "test_literal_label", is_train_data=False)

In [None]:
# Build extra training sentences from the Rich Context corpus: for every
# publication, keep the sentences that contain one of its dataset mentions.
with open(f"{RICH_CONTEXT}/data_set_citations.json") as f:
    rich_citations = json.load(f)

# One row per (publication, mention) pair.
mentions_list = [
    [rc["publication_id"], mention]
    for rc in tqdm(rich_citations)
    for mention in rc['mention_list']
]
mentions_df = pd.DataFrame(mentions_list, columns=["pub_id", "mentions"])
mentions_df["cleaned_mention"] = mentions_df["mentions"].apply(lambda x: clean_text(x, is_lower=False))

# Split each publication's text file into cleaned (case-preserving) sentences.
sent_list = []
for ids in tqdm(mentions_df["pub_id"].unique()):
    path = f"{RICH_CONTEXT}/files/text/{ids}.txt"
    with open(path, "r") as f:
        full_text = f.read()

    for sentence in full_text.replace("\n", " ").split(". "):
        sent_list.append([ids, totally_clean_text(sentence, is_lower=False)])
sent_df = pd.DataFrame(sent_list, columns=["Id", "sent"])

matched_frames = []
for ids in tqdm(mentions_df["pub_id"].unique()):
    mention_list = mentions_df[mentions_df["pub_id"] == ids]["cleaned_mention"].tolist()
    sentense_list = sent_df[sent_df["Id"] == ids].reset_index(drop=True)
    for mention in mention_list:
        # A mention that cleans down to "" would match every sentence and
        # produce rows with an empty label, which cannot be tagged.
        if not mention:
            continue
        tmp = sentense_list[sentense_list["sent"].str.contains(mention, regex=False)].reset_index(drop=True)
        tmp["Label"] = mention
        matched_frames.append(tmp[["Id", "sent", "Label"]])

# Concatenate once (pd.concat inside the loop re-copies all earlier rows), and
# keep the expected columns even when nothing matched.
if matched_frames:
    add_train_df = pd.concat(matched_frames)
else:
    add_train_df = pd.DataFrame(columns=["Id", "sent", "Label"])

In [None]:
# ============== Checking datasets that cannot be retrieved ============== #

# 19661
train_df["cleaned_label"] = train_df["cleaned_label"].str.rstrip()
# Distinct (Id, Label) pairs that were found in some sentence.
found_pairs = train_literal_df[["Id", "Label"]].drop_duplicates()
tmp = train_df.merge(
    found_pairs,
    how="left",
    left_on=["Id", "cleaned_label"],
    right_on=["Id", "Label"],
)

# 6
# Rows without a match: the cleaned label was not found in any sentence of the paper.
tmp.loc[tmp["Label"].isnull()]

Paper Id: cf2aaa14-bd90-4e69-aca4-c2855747b0e5 <br>
Paper Id: 44339037-7785-4d4d-b334-b47fd81b8b9e
* The dataset name straddles a sentence boundary (it is split at the period)
* Ex: National Science Foundation. Survey of Earned Doctorates

Paper Id: 45dd2256-74f2-4f72-beb2-a8b770baf233
* The original dataset name itself contains a period
* Ex: NSF. Survey of Earned Doctorates

Paper Id: cb21f8af-8296-4970-ad64-24821f2eeb61 <br>
Paper Id: a69f443d-6318-40ef-aa0e-b08c2ec338f8 <br>
Paper Id: 29b4a5a2-1304-4a22-8a14-c2aebae5503c
* The dataset name is followed by a reference number and a period (possible cause, not confirmed)
* Ex: (SARS-CoV) [2] . Genome sequences

In [None]:
# Set the column names of the extra training frame to the ones the tagging step selects.
add_train_df.columns = ["Id", "sent", "Label"]

In [None]:
# ============== Add NER Label (Train) ============== #
# train_literal_df = pd.concat([train_literal_df[["Id", "sent", "Label"]], add_train_df])
# Build {paper id: [{'tokens': [...], 'tags': [...]}, ...]} with B/I/O tags
# marking every whole-token occurrence of the label inside the sentence.
train_rows_dict = {}
for ids, sents, label in tqdm(add_train_df[["Id", "sent", "Label"]].itertuples(index=False)):
    sent_list = sents.split()
    label_sent_list = label.split()
    label_len = len(label_sent_list)

    dummy_tags = ["O"] * len(sent_list)
    pos = 0
    # Compare a window of label_len tokens at every position. A failed
    # partial match only advances by one token, so a label that starts inside
    # a failed prefix (label "A B" in "A A B") is still found. An empty label
    # (label_len == 0) leaves the sentence fully tagged "O".
    while label_len and pos + label_len <= len(sent_list):
        if sent_list[pos:pos + label_len] == label_sent_list:
            dummy_tags[pos:pos + label_len] = ["B"] + ["I"] * (label_len - 1)
            pos += label_len
        else:
            pos += 1

    train_rows_dict.setdefault(ids, []).append({'tokens' : sent_list, 'tags' : dummy_tags})

In [None]:
# Ids that have at least one tagged training sentence.
label_in_ids = list(train_rows_dict.keys())
# set(train_id_list) - set(label_in_ids)

In [None]:
# ============== Separate Train Valid ============== #
# Split by paper id (not by sentence): 80% of the ids go to train, the rest to validation.
train_id_count = len(label_in_ids)
# replace=False is required: np.random.choice samples WITH replacement by
# default, which draws duplicate ids and leaves fewer than 80% distinct ids
# in the training split.
sel_train_id = np.random.choice(label_in_ids, int(train_id_count * 0.8), replace=False)
# Set of native Python values for O(1) membership tests.
sel_train_id_set = set(sel_train_id.tolist())

train_rows = []
valid_rows = []
for ids in tqdm(label_in_ids):
    if ids in sel_train_id_set:
        train_rows += train_rows_dict[ids]
    else:
        valid_rows += train_rows_dict[ids]

In [None]:
# ============== Add NER Label (Test)============== #
# test_rows = []
# for sents in tqdm(test_literal_df["sent"]):
#     sent_list = sents.split()
#     dummy_tags = ["O"] * len(sent_list)
            
#     test_rows.append({'tokens' : sent_list, 'tags' : dummy_tags})

In [None]:
# ============== Add NER Label (Train All)============== #
# train_literal_all_df = pd.read_pickle("../input/coleride/train_literal.pickle")
# train_literal_all_df = train_literal_all_df.sample(frac=0.1)
# train_all_rows = []
# for sents in tqdm(train_literal_all_df["sent"]):
#     sent_list = sents.split()
#     dummy_tags = ["O"] * len(sent_list)
            
#     train_all_rows.append({'tokens' : sent_list, 'tags' : dummy_tags})

In [None]:
# Display the first training example.
train_rows[0]

# Modeling

In [None]:
# ============== Set Constants (Model) ============== #

# NOTE(review): MAX_LENGTH and OVERLAP are not referenced anywhere else in the
# visible notebook — presumably left over from a windowed-tokenisation variant.
MAX_LENGTH = 256
OVERLAP = 20 

# Number of test rows written to the input file per prediction run.
PREDICT_BATCH = 64000 

# Model directory, and the folders / file names of the JSON-lines NER inputs.
PRETRAINED_PATH = '.'
TEST_INPUT_SAVE_PATH = './test'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_INPUT_SAVE_PATH = './train'
TRAIN_PATH = 'train_ner.json'
VAL_PATH = 'valid_ner.json'


# Folder and file name the predictions are read back from.
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

In [None]:
# ============== Set Environ ============== #
# Export the paths as environment variables; the shell commands that launch
# the NER script reference them as "$NAME".
os.environ.update({
    "MODEL_PATH": f"{PRETRAINED_PATH}",
    "TRAIN_FILE": f"{TRAIN_INPUT_SAVE_PATH}/{TRAIN_PATH}",
    "VALIDATION_FILE": f"{TRAIN_INPUT_SAVE_PATH}/{VAL_PATH}",
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": f"{PREDICTION_SAVE_PATH}",
})

In [None]:
# ============== Create Directory ============== #

# Make the necessary directories (test input, train input, predictions,
# model); directories that already exist are left untouched.
for required_dir in (
    TEST_INPUT_SAVE_PATH,
    TRAIN_INPUT_SAVE_PATH,
    PREDICTION_SAVE_PATH,
    PRETRAINED_PATH,
):
    os.makedirs(required_dir, exist_ok=True)

In [None]:
# ============== Train ============== #
# Launches kaggle_run_ner.py through an IPython shell escape to fine-tune
# `bert-base-cased` (5 epochs, batch size 16, seed 123) with --do_train and
# --do_eval. The quoted "$NAME" placeholders carry the same names as the
# environment variables exported in the "Set Environ" cell. The output goes to
# "$MODEL_PATH" and --overwrite_output_dir is passed.
def bert_train():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path bert-base-cased \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --num_train_epochs 5 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --save_steps 15000 \
    --output_dir "$MODEL_PATH" \
    --report_to 'none' \
    --seed 123 \
    --do_train \
    --do_eval \
    --overwrite_output_dir

In [None]:
# ============== Pred ============== #
# Launches kaggle_run_ner.py through an IPython shell escape with --do_predict,
# loading the model from "$MODEL_PATH", reading "$TEST_FILE" and writing to
# "$OUTPUT_DIR". The train and validation files are passed as well.
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
# ============== Execute Train ============== #

# for batch_begin in range(0, len(train_rows), PREDICT_BATCH):
# Write the train and validation rows as JSON lines (one object per line),
# train file first.
for file_name, split_rows in ((TRAIN_PATH, train_rows), (VAL_PATH, valid_rows)):
    with open(f'{TRAIN_INPUT_SAVE_PATH}/{file_name}', 'w') as f:
        for row in split_rows:
            f.write(json.dumps(row))
            f.write('\n')

# if IS_TRAIN:
bert_train()

# Prediction

In [None]:
# ============== Execute Predict ============== #

bert_outputs = []

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # do predict
    bert_predict()
    
    # read predictions
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]
    
    # remove output dir
    !rm -r "$OUTPUT_DIR"

In [None]:
# ============== Get NER ============== #
def get_ner(literal_df, rows, bert_outputs):
    """Turn per-token B/I/O predictions into (Id, Label) phrase pairs.

    Args:
        literal_df: frame with an "Id" column, aligned row by row with ``rows``.
        rows: sequence of dicts holding the sentence words under "tokens".
        bert_outputs: sequence of predicted tag lists, one per sentence.

    Returns:
        DataFrame with the columns "Id" and "Label", duplicates removed.
        A phrase starts at a 'B' tag, is extended by the following 'I' tags
        and ends at any other tag, at the next 'B', or at the end of the
        sentence. An 'I' without an open phrase is ignored.
    """
    found = []

    def flush(paper_id, phrase):
        # Record the open phrase (if any) and hand back the reset value.
        if phrase:
            found.append([paper_id, phrase])
        return ''

    for paper_id, row, tags in zip(literal_df["Id"], rows, bert_outputs):
        phrase = ''
        for token, tag in zip(row["tokens"], tags):
            if tag == 'B':
                # close the previous phrase, then open a new one
                flush(paper_id, phrase)
                phrase = token
            elif tag == 'I' and phrase:
                phrase = phrase + ' ' + token
            else:
                phrase = flush(paper_id, phrase)
        # a phrase may run up to the end of the sentence
        flush(paper_id, phrase)

    return pd.DataFrame(found, columns=["Id", "Label"]).drop_duplicates()
# Extract the predicted phrases for the test sentences.
# `test_rows` and `bert_outputs` must already exist from earlier cells.
pred_subs = get_ner(test_literal_df, test_rows, bert_outputs)
# pred_subs = get_ner(train_literal_all_df, train_all_rows, bert_outputs)

In [None]:
# Persist the extracted (Id, Label) pairs to CSV.
pred_subs.to_csv("ner_list.csv", index=False)

# Submission

In [None]:
# ============== Submission (Handle Unknown Test Dataset) ============== #

# One "|"-joined prediction string per paper id.
labels_per_id = pred_subs.groupby("Id")["Label"].apply(list)
preds = labels_per_id.apply(lambda x: "|".join(x)).to_frame()
preds.columns = ["PredictionString"]
submission = pd.read_csv(SUBMISSION_PATH)

# Keep the order of the sample submission; ids without any predicted phrase
# get an empty prediction string.
id_list = submission["Id"].tolist()
pred_list = [
    preds.loc[ids]["PredictionString"] if ids in preds.index else ""
    for ids in id_list
]

submission = pd.DataFrame({"Id": id_list, "PredictionString": pred_list})

In [None]:
# Write the submission file and display its first rows.
submission.to_csv("submission.csv", index=False)
submission.head()