This notebook gives a simple combination of literal matching and Named Entity Recognition using BERT (base model from huggingface).

The BERT model itself was trained in a separate kernel: "PyTorch BERT for Named Entity Recognition".

In [None]:
# Upper bound on the number of rows kept from train.csv (applied as df[:MAX_SAMPLE]).
MAX_SAMPLE = None # set a small number for experimentation, set None for production.

# Install packages

In [None]:
# Install the NER dependencies from local files in the Kaggle input directory.
# `datasets` is resolved with --no-index, i.e. without contacting a package index;
# the other three packages are installed directly from their wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Seed Python's and NumPy's global random number generators so runs are repeatable.
random.seed(123)
np.random.seed(456)

# Load data

In [None]:
# Read the training labels; MAX_SAMPLE (None = keep everything) caps the number of rows.
full_df_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
df = pd.read_csv(full_df_path)
df = df[:MAX_SAMPLE]

# Load the JSON file of every publication referenced in df, keyed by publication Id.
paper_df_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in df['Id'].unique():
    # JSON text is UTF-8 by specification; state it explicitly instead of relying on
    # the platform's default text encoding.
    with open(f'{paper_df_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [None]:
# Split the input train data into a train and a validation set, based on publication Id.
# Every publication that cites "Our World in Data COVID-19 dataset" is held out entirely
# for validation; 1% of the remaining publication Ids is added at random.
COVID_papers = df[df['dataset_title'] == "Our World in Data COVID-19 dataset"]
COVID_papers_IDS = set(COVID_papers['Id'])
df_train_set = df[~df['Id'].isin(COVID_papers_IDS)]
df_in_val_set = df[df['Id'].isin(COVID_papers_IDS)]

train, validate = train_test_split(df_train_set['Id'].unique(), test_size=0.01, random_state=42)
# print("Train Shape : ", train.shape)

# Add the held-out Ids to the validation set. .unique() drops any repeated Id in the
# column, so `validate` holds each publication once and its shape counts publications.
validate = np.append(df_in_val_set["Id"].unique(), validate)
print("Validate Shape : ", validate.shape)


In [None]:
# Ground-truth rows (titles and labels) of the validation publications.
# .copy() yields an independent frame, so the columns that later cells add to it
# are not written to a slice of df (which triggers pandas' SettingWithCopyWarning).
# train_ground_truth = df[df['Id'].isin(train)]
validate_ground_truth = df[df['Id'].isin(validate)].copy()

# Parsed JSON of every validation publication, keyed by publication Id.
# train_papers = {your_key: papers[your_key] for your_key in train}
validate_papers = {paper_id: papers[paper_id] for paper_id in validate}

#print(train_ground_truth.shape)
print(validate_ground_truth.shape)
#print(len(train_papers))
print(len(validate_papers))


# Literal matching

### Create a knowledge bank

### Matching on test data

In [None]:
def clean_text(txt):
    """Normalise ``txt`` to lower-case alphanumeric words separated by single spaces.

    The argument is converted with ``str()``, lower-cased, every run of characters
    outside ``A-Za-z0-9`` is replaced by one space, and surrounding whitespace is
    stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Apply clean_text to ``txt`` and then squeeze any run of spaces into one space."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
#Known labels for literal matching

# all_labels = set()

# for label_1, label_2, label_3 in train_ground_truth[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
#     all_labels.add(str(label_1).lower())
#     all_labels.add(str(label_2).lower())
#     all_labels.add(str(label_3).lower())
    
# print(f'No. different labels: {len(all_labels)}')

In [None]:
#For literal matching

# literal_preds = []

# for paper_id in sample_submission['Id']:
#     paper = train_papers[paper_id]
#     text_1 = '. '.join(section['text'] for section in paper).lower()
#     text_2 = totally_clean_text(text_1)
    
#     labels = set()
#     for label in all_labels:
#         if label in text_1 or label in text_2:
#             labels.add(clean_text(label))
    
#     literal_preds.append('|'.join(labels))


# Bert prediction

### Paths and Hyperparameters

In [None]:
# Sentence windowing used by shorten_sentences.
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

# Number of sentences written to the prediction input file per run of the NER script.
PREDICT_BATCH = 64000 

# Inputs of the NER script: model directory, file the test sentences are written to,
# and the train/validation files (both currently point to the same JSON file).
PRETRAINED_PATH = '../input/bert-label-frequency-equal/output'
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_PATH = '../input/bert-label-frequency-equal/train_ner.json'
VAL_PATH = '../input/bert-label-frequency-equal/train_ner.json'

# Where the NER script's predictions are read back from.
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

### Transform data to NER format

Group the rows by publication; the training labels should have the same form as the expected output.

In [None]:
# train = train_ground_truth.groupby('Id').agg({
#     'pub_title': 'first',
#     'dataset_title': '|'.join,
#     'dataset_label': '|'.join,
#     'cleaned_label': '|'.join
# }).reset_index()

# print(f'No. grouped training rows: {len(train)}')

In [None]:
def clean_training_text(txt):
    """
    Case-preserving variant of clean_text.

    Converts ``txt`` with ``str()``, replaces every run of characters outside
    ``A-Za-z0-9`` by a single space and strips surrounding whitespace.
    """
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    spaced = non_alnum_run.sub(' ', str(txt))
    return spaced.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """Split over-long sentences into overlapping windows of words.

    Args:
        sentences: iterable of sentence strings.
        max_length: maximum number of words per returned sentence; defaults to the
            module-level MAX_LENGTH.
        overlap: number of words shared by two consecutive windows; defaults to the
            module-level OVERLAP.

    Returns:
        A list of sentences. Sentences with at most ``max_length`` words are passed
        through unchanged; longer ones are replaced by windows of ``max_length``
        words that start every ``max_length - overlap`` words.

    Raises:
        ValueError: if ``overlap`` is not smaller than ``max_length``. The window
            start would then never advance, and long sentences would be lost or
            ``range`` would fail with an unrelated message.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    stride = max_length - overlap
    if stride <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            # NOTE: the last window may lie entirely inside the previous one; this
            # matches the original windowing and is kept on purpose.
            short_sentences.extend(
                ' '.join(words[start:start + max_length])
                for start in range(0, len(words), stride)
            )
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    """Return all start indices at which ``small_list`` occurs contiguously in ``big_list``.

    Overlapping occurrences are all reported. An empty ``small_list`` has no
    meaningful position and yields an empty result (previously it matched at every
    index, including ``len(big_list)``, which is out of range for the caller).
    """
    width = len(small_list)
    if width == 0:
        return []

    return [
        start
        for start in range(len(big_list) - width + 1)
        if big_list[start:start + width] == small_list
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """Tag the words of ``sentence`` with B/I/O markers for the given dataset labels.

    Args:
        sentence: cleaned sentence string.
        labels: iterable of cleaned label strings, or None for "no labels".

    Returns:
        A tuple ``(is_positive, sentence_words, tags)``. ``is_positive`` is True when
        at least one label occurs in the sentence on word boundaries;
        ``sentence_words`` is ``sentence.split()``; ``tags`` has one of 'B', 'I', 'O'
        per word.

    Labels are processed longest first, and an occurrence that overlaps words
    already tagged is skipped, so a short label (e.g. an acronym) can no longer
    break up a longer label that contains it. Labels without any word are ignored.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    usable_labels = [] if labels is None else [label for label in labels if label.split()]
    usable_labels.sort(key=len, reverse=True)

    # re.escape keeps a label from being interpreted as a regular expression.
    is_positive = any(re.search(f'\\b{re.escape(label)}\\b', sentence)
                      for label in usable_labels)
    if not is_positive: # negative sample
        return False, sentence_words, nes

    for label in usable_labels:
        label_words = label.split()
        width = len(label_words)
        for pos in range(len(sentence_words) - width + 1):
            if sentence_words[pos:pos + width] != label_words:
                continue
            if any(tag != 'O' for tag in nes[pos:pos + width]):
                continue # overlaps an entity that a longer (earlier) label already claimed
            nes[pos:pos + width] = ['B'] + ['I'] * (width - 1)

    return True, sentence_words, nes

In [None]:
test_rows = [] # test data in NER format
paper_length = [] # store the number of sentences each paper has

# NOTE: this iterates over the rows of validate_ground_truth, so a publication Id
# that appears in several rows is processed once per row. paper_length therefore
# has one entry per ground-truth row, which is what the later cell that assigns
# the 'pred' column to validate_ground_truth relies on.
#for paper_id in sample_submission['Id']:
for paper_id in validate_ground_truth['Id']:
    # load paper
    # paper = train_papers[paper_id]
    paper = validate_papers[paper_id]

    # extract sentences: split every section on '.', then clean each piece
    sentences_all = [clean_training_text(sentence)
                     for section in paper
                     for sentence in section['text'].split('.')]

    sentences_short = shorten_sentences(sentences_all)
    # keep only sentences longer than 10 characters
    sentences = [sentence for sentence in sentences_short if len(sentence) > 10]

    # all (cleaned) dataset labels known for this publication
    labels = validate_ground_truth[validate_ground_truth['Id']==paper_id]['dataset_label']
    labels = [clean_training_text(label) for label in labels]

    # collect all sentences in NER row format
    for sentence in sentences:
        is_positive, sentence_words, tags = tag_sentence(sentence, labels)
        test_rows.append({'tokens' : sentence_words, 'tags' : tags})

    # track which sentence belongs to which data point
    paper_length.append(len(sentences))
print(f'total number of sentences: {len(test_rows)}')

### Do predict and collect results

In [None]:
# Export the paths as environment variables so the shell commands further down
# can refer to them as "$MODEL_PATH", "$TEST_FILE", and so on.
os.environ.update({
    "MODEL_PATH": f"{PRETRAINED_PATH}",
    "TRAIN_FILE": f"{TRAIN_PATH}",
    "VALIDATION_FILE": f"{VAL_PATH}",
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": f"{PREDICTION_SAVE_PATH}",
})

In [None]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# create the directory that the NER input file is written to
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$TRAIN_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
bert_outputs = []

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # remove output dir
    !rm -r "$OUTPUT_DIR"
    
    # do predict
    bert_predict()
    
    # read predictions
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

### Restore Dataset labels from predictions

In [None]:
# Keep only the token list of every NER row; the rows themselves are dropped afterwards.
test_sentences = []
for row in test_rows:
    test_sentences.append(row['tokens'])

del test_rows

In [None]:
bert_dataset_labels = [] # store all dataset labels for each publication

# test_sentences and bert_outputs are flat lists; paper_length gives the number of
# consecutive entries belonging to each entry of bert_dataset_labels. A running
# offset is used instead of deleting the consumed head of both lists, so the inputs
# stay intact and re-running this cell gives the same result.
offset = 0
for length in paper_length:
    labels = set()
    paper_sentences = test_sentences[offset:offset + length]
    paper_preds = bert_outputs[offset:offset + length]
    offset += length

    for sentence, pred in zip(paper_sentences, paper_preds):
        curr_phrase = ''
        for word, tag in zip(sentence, pred):
            if tag == 'B': # start a new phrase (closing the running one, if any)
                if curr_phrase:
                    labels.add(curr_phrase)
                curr_phrase = word
            elif tag == 'I' and curr_phrase: # continue the phrase
                curr_phrase += ' ' + word
            elif curr_phrase: # any other tag ends the running phrase
                labels.add(curr_phrase)
                curr_phrase = ''
        # the phrase may run up to the end of the sentence
        if curr_phrase:
            labels.add(curr_phrase)

    # record dataset labels for this publication
    bert_dataset_labels.append(labels)

### Filter based on Jaccard score and clean

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the space-separated tokens of two strings.

    Both strings are split on single spaces and compared as sets:
    ``|A & B| / |A | B|``. The union is taken over the token sets rather than the
    token lists, so a word that is repeated inside one string no longer lowers the
    score. ``str.split(" ")`` always yields at least one token, so the union is
    never empty.
    """
    tokens_1 = set(s1.split(" "))
    tokens_2 = set(s2.split(" "))
    intersection = len(tokens_1 & tokens_2)
    union = len(tokens_1 | tokens_2)
    return float(intersection) / union

# For each entry, clean the predicted labels and keep a label only if it is not
# too similar (Jaccard >= 0.75) to a label already kept; shorter labels go first.
filtered_bert_labels = []

for labels in bert_dataset_labels:
    filtered = []

    for raw_label in sorted(labels, key=len):
        label = clean_text(raw_label)
        near_duplicate = any(jaccard_similarity(label, got_label) >= 0.75
                             for got_label in filtered)
        if not near_duplicate:
            filtered.append(label)

    filtered_bert_labels.append('|'.join(filtered))

# Aggregate final predictions and write submission file

In [None]:
# The BERT predictions are used as they are; the literal-matching alternative
# (prefer a literal match when there is one) is currently disabled.
final_predictions = list(filtered_bert_labels)

In [None]:
#if you want to submit to the competition

# sample_submission['PredictionString'] = final_predictions
# sample_submission.head()

In [None]:
# sample_submission.to_csv(f'submission.csv', index=False)

# Validation

In [None]:
# Attach the prediction string of every row. assign() builds a new frame instead
# of writing a column into what may be a slice of df (SettingWithCopyWarning).
validate_ground_truth = validate_ground_truth.assign(pred=filtered_bert_labels)

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of the lower-cased, whitespace-separated word sets.

    Returns 0.0 when neither string contains a word, instead of raising
    ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        return 0.0
    return float(len(c)) / union_size

In [None]:
def get_precision_recall(tp, fp, fn):
    """Return ``(precision, recall)`` computed from TP/FP/FN counts.

    precision = tp / (tp + fp), recall = tp / (tp + fn). A ratio whose denominator
    is zero is reported as 0.0 instead of raising ZeroDivisionError.
    """
    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted else 0.0
    recall = tp / actual if actual else 0.0
    return precision, recall

def fbeta_score(precision, recall, beta):
    """Return the F-beta score ``(1 + beta^2) * P * R / (beta^2 * P + R)``.

    Returns 0.0 when the denominator is zero (for instance precision and recall
    both 0) instead of raising ZeroDivisionError.
    """
    denominator = (beta*beta*precision) + recall
    if denominator == 0:
        return 0.0
    fbeta = (1+(beta*beta))*((precision*recall)/denominator)
    return fbeta

In [None]:
def coleridge_initiative_jaccard(ground_truth, prediction, verbose=True):
    """Classify '|'-separated predictions against '|'-separated ground truths.

    Args:
        ground_truth: ground-truth labels joined with '|'.
        prediction: predicted labels joined with '|'; an empty string means no
            prediction.
        verbose: when True, print both label lists.

    Returns:
        ``(js_scores, outcomes)``: a list of scores and a space-joined string of
        "TP"/"FP"/"FN" markers in the same order. Each prediction (in sorted
        order) contributes its best Jaccard score over all ground truths and is a
        TP when that score is at least 0.5, otherwise an FP. Each ground truth
        that has no prediction with a Jaccard score above 0 then contributes an
        FN with score 0.
    """
    gts = ground_truth.split('|')
    pds = sorted(prediction.split('|')) if len(prediction) > 0 else []
    if verbose:
        print("Ground truth : " , gts)
        print("Prediction : ", pds)

    js_scores = []
    outcomes = []

    # True Positives (TP) and False Positives (FP): one entry per prediction
    for predicted in pds:
        best = max(jaccard(predicted, gt) for gt in gts)
        js_scores.append(best)
        outcomes.append("TP" if best >= 0.5 else "FP")

    # False Negatives (FN): ground truths that share no word with any prediction
    for gt in gts:
        has_overlap = any(jaccard(gt, predicted) > 0 for predicted in pds)
        if not has_overlap:
            js_scores.append(0)
            outcomes.append("FN")

    return js_scores, " ".join(outcomes)

In [None]:
# Score every row: 'evaluation' holds the (js_scores, outcome string) pair returned by
# coleridge_initiative_jaccard; the two parts are then split into their own columns.
validate_ground_truth['evaluation'] = validate_ground_truth.apply(
    lambda row: coleridge_initiative_jaccard(row['cleaned_label'], row['pred'], verbose=False),
    axis=1,
)
validate_ground_truth['js_scores'] = validate_ground_truth['evaluation'].map(lambda result: result[0])
validate_ground_truth['pred_type'] = validate_ground_truth['evaluation'].map(lambda result: result[1])


#coleridge_initiative_jaccard(validate_ground_truth['cleaned_label'], validate_ground_truth['pred'] , verbose=True)

In [None]:
def get_count_tp_fp_fn(prediction, verbose=True):
    """Count the "TP", "FP" and "FN" markers in a space-separated outcome string.

    Returns the counts as the list ``[tp, fp, fn]``; any other token is ignored.
    With ``verbose`` the split token list is printed first.
    """
    preds = prediction.split(" ")
    if verbose:
        print(preds)
    return [preds.count(marker) for marker in ("TP", "FP", "FN")]

def make_col_tp_fp_fn(df, col):
    """Spread the ``[tp, fp, fn]`` triples stored in ``df[col]`` over the columns TP, FP and FN.

    The columns are added to ``df`` itself, which is also returned.
    """
    for position, name in enumerate(('TP', 'FP', 'FN')):
        df[name] = df[col].apply(lambda counts, position=position: counts[position])
    return df

In [None]:
# Turn the outcome strings into TP/FP/FN counts per row and total them.
validate_ground_truth['tp_fp_fn'] = validate_ground_truth['pred_type'].apply(
    lambda outcome: get_count_tp_fp_fn(outcome, verbose=False)
)
validate_ground_truth = make_col_tp_fp_fn(validate_ground_truth, 'tp_fp_fn')
tp, fp, fn = (sum(validate_ground_truth[name]) for name in ('TP', 'FP', 'FN'))

print("True Positives (TP) : ", tp)
print("False Positives (FP) : ", fp)
print("False Negatives (FN) : ", fn)

# Aggregate metrics over all rows.
precision, recall = get_precision_recall(tp, fp, fn)
print("Precision : ", precision)
print("Recall : ", recall)

fbeta = fbeta_score(precision, recall, 0.5)
print("FBeta Score : ", fbeta)

In [None]:
# Boolean masks over the rows, combined below into four groups.
has_tp, no_tp = validate_ground_truth['TP'] > 0, validate_ground_truth['TP'] == 0
has_fp, no_fp = validate_ground_truth['FP'] > 0, validate_ground_truth['FP'] == 0
has_fn, no_fn = validate_ground_truth['FN'] > 0, validate_ground_truth['FN'] == 0

# Missed entirely: at least one FN and neither a TP nor an FP.
all_false_negatives = validate_ground_truth[has_fn & no_tp & no_fp]

# Fully correct: at least one TP, no FP and no FN.
all_full_true_positive = validate_ground_truth[has_tp & no_fp & no_fn]

# Only wrong predictions: no TP and at least one FP (FN is not considered).
all_only_wrong_prediction = validate_ground_truth[no_tp & has_fp]

# At least one correct and at least one wrong prediction (FN is not considered here;
# the original note states it is always 0 in this group).
all_good_and_wrong_pred = validate_ground_truth[has_tp & has_fp]

In [None]:
# Rows per dataset label, overall and within each of the four groups.
all_labels = (
    validate_ground_truth['dataset_label']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='all_labels_count')
)
false_negatives = (
    all_false_negatives["dataset_label"]
    .value_counts()
    .rename_axis('label')
    .reset_index(name='false_negatives')
)
full_true_positive = (
    all_full_true_positive["dataset_label"]
    .value_counts()
    .rename_axis('label')
    .reset_index(name='full_true_positive')
)
only_wrong_prediction = (
    all_only_wrong_prediction["dataset_label"]
    .value_counts()
    .rename_axis('label')
    .reset_index(name='full_wrong')
)
good_and_wrong_pred = (
    all_good_and_wrong_pred["dataset_label"]
    .value_counts()
    .rename_axis('label')
    .reset_index(name='good_and_wrong')
)

# Left-join the group counts onto the overall counts; labels absent from a group get NaN.
x1 = all_labels.merge(false_negatives, on='label', how='left')
x2 = x1.merge(full_true_positive, on='label', how='left')
x3 = x2.merge(only_wrong_prediction, on='label', how='left')
x4 = x3.merge(good_and_wrong_pred, on='label', how='left')

# Displayed only: x4 itself keeps its NaN values.
x4.fillna(0)

In [None]:
# Show data frames in full: no limit on displayed rows or columns.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Datasets that occur only once in the validation set

single_datasets = x4[(x4['all_labels_count'] ==1)].fillna(0)
label_total = sum(single_datasets["all_labels_count"])
for heading, column in (
    ("False_negative % = ", "false_negatives"),
    ("Full_true_positive % = ", "full_true_positive"),
    ("Full_wrong % = ", "full_wrong"),
    ("Good_and_wrong % = ", "good_and_wrong"),
):
    # share of this group's rows among all rows of the selected labels
    print(heading, sum(single_datasets[column]) / label_total)

In [None]:
# Datasets that occur most often in the validation set (more than 30 rows)

common_datasets = x4[(x4['all_labels_count'] > 30)].fillna(0)
label_total = sum(common_datasets["all_labels_count"])
for heading, column in (
    ("False_negative % = ", "false_negatives"),
    ("Full_true_positive % = ", "full_true_positive"),
    ("Full_wrong % = ", "full_wrong"),
    ("Good_and_wrong % = ", "good_and_wrong"),
):
    # share of this group's rows among all rows of the selected labels
    print(heading, sum(common_datasets[column]) / label_total)

In [None]:
# Hand-picked dataset labels used to break the results down by abbreviation use:
# labels_mostly_abbrev    - labels that consist mostly of an abbreviation
# labels_containing_abrev - labels that contain an abbreviation (name spelled "abrev";
#                           it is referenced under that name by later cells)
labels_mostly_abbrev = ["ADNI", "SLOSH model", "NOAA Tide Gauge", "IBTrACS", "NOAA C-CAP"]
labels_containing_abrev = ["ADNI", "Alzheimer's Disease Neuroimaging Initiative (ADNI)", "Baltimore Longitudinal Study of Aging (BLSA)",
                          "SLOSH model", "NOAA Tide Gauge", "SARS-CoV-2 genome sequences", "IBTrACS", "genome sequence of SARS-CoV-2", 
                          "North American Breeding Bird Survey (BBS)", "genome sequences of SARS-CoV-2", "COVID-19 Open Research Dataset",
                          "USDA Census of Agriculture", "SARS-CoV-2 genome sequence", "COVID-19 Open Research Dataset (CORD-19)", 
                          "NOAA Optimum Interpolation Sea Surface Temperature", "NSF Survey of Earned Doctorates", "NCES Common Core of Data",
                          "NOAA C-CAP", "NOAA tide station", "NASS Census of Agriculture", "NOAA World Ocean Database", "ANSS Comprehensive Catalog",
                          "COVID-19 Image Data Collection", "NSF Survey of Graduate Students and Postdoctorates in Science and Engineering",
                           "genome sequence of 2019-nCoV", "NSF Survey of Industrial Research and Development", "SARS-CoV-2 full genome sequence",
                          "genome sequences of 2019-nCoV", "ARMS Farm Financial and Crop Production Practices", "NSF Survey of Science and Engineering Research Facilities",
                          "2019-nCoV complete genome sequences", "NOAA National Water Level Observation Network", "SARS-CoV-2 full genome sequences",
                          "genome sequences of COVID-19", "COVID-19 genome sequences", "COVID-19 Open Research Data", "ANSS Comprehensive Earthquake Catalog",
                          "Our World in Data COVID-19", "genome sequence of COVID-19", "NOAA water level station", "COVID-19 Death data",
                          "The National Institute on Aging Genetics of Alzheimer's Disease Data Storage Site (NIAGADS)", "NOAA Sea, Lake, and Overland Surges from Hurricanes"]


In [None]:
# Overall score

x4_nona = x4.fillna(0)
label_total = sum(x4_nona["all_labels_count"])
for heading, column in (
    ("False_negative % = ", "false_negatives"),
    ("Full_true_positive % = ", "full_true_positive"),
    ("Full_wrong % = ", "full_wrong"),
    ("Good_and_wrong % = ", "good_and_wrong"),
):
    # share of this group's rows among all rows
    print(heading, sum(x4_nona[column]) / label_total)

In [None]:
# Datasets whose label is mostly an abbreviation

results_labels_mostly_abbrev = x4[x4['label'].isin(labels_mostly_abbrev)].fillna(0)

label_total = sum(results_labels_mostly_abbrev["all_labels_count"])
for heading, column in (
    ("False_negative % = ", "false_negatives"),
    ("Full_true_positive % = ", "full_true_positive"),
    ("Full_wrong % = ", "full_wrong"),
    ("Good_and_wrong % = ", "good_and_wrong"),
):
    # share of this group's rows among all rows of the selected labels
    print(heading, sum(results_labels_mostly_abbrev[column]) / label_total)

In [None]:
# Datasets whose label contains an abbreviation

results_labels_contain_abbrev = x4[x4['label'].isin(labels_containing_abrev)].fillna(0)
label_total = sum(results_labels_contain_abbrev["all_labels_count"])
for heading, column in (
    ("False_negative % = ", "false_negatives"),
    ("Full_true_positive % = ", "full_true_positive"),
    ("Full_wrong % = ", "full_wrong"),
    ("Good_and_wrong % = ", "good_and_wrong"),
):
    # share of this group's rows among all rows of the selected labels
    print(heading, sum(results_labels_contain_abbrev[column]) / label_total)

In [None]:
# Datasets whose label contains no abbreviation

results_no_abbrev = x4[~x4['label'].isin(labels_containing_abrev)].fillna(0)
label_total = sum(results_no_abbrev["all_labels_count"])
for heading, column in (
    ("False_negative % = ", "false_negatives"),
    ("Full_true_positive % = ", "full_true_positive"),
    ("Full_wrong % = ", "full_wrong"),
    ("Good_and_wrong % = ", "good_and_wrong"),
):
    # share of this group's rows among all rows of the selected labels
    print(heading, sum(results_no_abbrev[column]) / label_total)