# BERT NER & Literal Matching
## Coleridge challenge
This notebook gives a simple combination of literal matching and Named Entity Recognition using a BERT-like model.
The training phase of the BERT model was done in another kernel: https://www.kaggle.com/isaiahvh/pytorch-bert-for-named-entity-recognition

**MODEL USED: SciBERT**

In [None]:
MAX_SAMPLE = None # Limits the number of rows read from the test-set CSV for experimentation; set to None for production.
CUSTOM_VALIDATION = 1 # 0 == identical train-test set, 1 == partially overlapping train-test set, 2 == disjoint train-test set
# All three options use our own custom test sets of equal size

# Install packages

In [None]:
# Install the required packages from files shipped as Kaggle input datasets
# (`--no-index` tells pip not to look the package up on a package index).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from collections import namedtuple
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed Python's and NumPy's global random number generators with fixed values
random.seed(123)
np.random.seed(456)

# Load data

In [None]:
# # NOTE: We use training data (here only) for literal matching
# # QUESTION: Why do we use both training and test data for `papers`?
# train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
# train = pd.read_csv(train_path)
# train = train[:MAX_SAMPLE]

# paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
# papers = {}
# for paper_id in train['Id'].unique():
#     with open(f'{paper_train_folder}/{paper_id}.json', 'r') as f:
#         paper = json.load(f)
#         papers[paper_id] = paper

In [None]:
papers = {}  # paper Id -> parsed JSON content of that paper

# Every custom split reads its papers from the competition's train folder.
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Pick the CSV that describes the selected custom test split.
if CUSTOM_VALIDATION == 2:
    sample_submission_path = '../input/colridge-custom-dataset-split/data_subsets/Test_set.csv'
elif CUSTOM_VALIDATION == 0:
    sample_submission_path = '../input/colridge-custom-dataset-split/Full_Overlap_Test_set.csv'
elif CUSTOM_VALIDATION == 1:
    sample_submission_path = '../input/colridge-custom-dataset-split/Partial_Overlap_Test_set.csv'
else:
    # Raise explicitly instead of `assert False`: assertions are removed when
    # Python runs with -O, which would surface later as an unrelated NameError.
    raise ValueError(f"CUSTOM_VALIDATION has an invalid value: {CUSTOM_VALIDATION!r}")
sample_submission = pd.read_csv(sample_submission_path)

# NOTE: the limit is applied to CSV rows *before* grouping, so the last paper
# kept may end up with only some of its label rows.
if MAX_SAMPLE is not None:
    sample_submission = sample_submission.head(MAX_SAMPLE)

# Collapse to one row per paper; multiple labels are joined with '|'.
sample_submission = sample_submission.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

# Load the JSON content of every paper that appears in the test split.
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        papers[paper_id] = json.load(f)

# Literal matching

### Create a knowledge bank

### Matching on test data

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` and replace every run of non-alphanumeric characters with one space.

    The input is converted with ``str()`` first, and leading/trailing
    whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

# Like clean_text, with an extra pass that collapses repeated spaces
def totally_clean_text(txt):
    """Return ``clean_text(txt)`` with every run of spaces reduced to a single space."""
    return re.sub(' +', ' ', clean_text(txt))

# Bert prediction

### Paths and Hyperparameters

In [None]:
MAX_LENGTH = 48 # Max number of words for each sentence
OVERLAP = 16 # If a sentence exceeds MAX_LENGTH, it is split into multiple fragments; consecutive fragments share OVERLAP words

# Number of test rows (sentence fragments) written to the input file for each prediction run
PREDICT_BATCH = 64000 

# NOTE: Despite the naming, this doesn't necessarily contain DistilBERT.
# Check the version description of this dataset for the model context.
PRETRAINED_PATH = '../input/distilbertnerfordatasets/output'
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_PATH = '../input/distilbertnerfordatasets/train_ner.json'
# SUGGESTION: Actually separate into train & validation data to limit overfitting
# (VAL_PATH currently points at the same file as TRAIN_PATH)
VAL_PATH = '../input/distilbertnerfordatasets/train_ner.json'

# Directory and file name from which the predictions are read back after each run
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

### Transform data to NER format

Group by publication; training labels should have the same form as the expected output.

In [None]:
def clean_training_text(txt):
    """Replace every run of non-alphanumeric characters in ``txt`` with one space.

    Works like ``clean_text`` except that the original letter case is kept.
    The input is converted with ``str()`` and the result is stripped.
    """
    spaced = re.sub('[^A-Za-z0-9]+', ' ', str(txt))
    return spaced.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """Split sentences that are too long into overlapping word windows.

    Args:
        sentences: Iterable of sentence strings.
        max_length: Maximum number of words per fragment. Defaults to the
            module-level ``MAX_LENGTH``.
        overlap: Number of words shared by consecutive fragments. Defaults to
            the module-level ``OVERLAP``.

    Returns:
        A list of strings. Sentences with at most ``max_length`` words are
        kept unchanged; longer ones are replaced by windows of up to
        ``max_length`` words that start every ``max_length - overlap`` words.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_length``. The
            window would then never advance (a negative stride would silently
            drop every long sentence).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    stride = max_length - overlap
    if stride <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_length ({max_length})"
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            short_sentences.extend(
                ' '.join(words[start:start + max_length])
                for start in range(0, len(words), stride)
            )
        else:
            # Short sentences are passed through verbatim (not re-joined)
            short_sentences.append(sentence)
    return short_sentences

In [None]:
test_rows = []     # Test data in NER format: one {'tokens', 'tags'} dict per sentence fragment
paper_length = []  # Number of sentence fragments contributed by each paper, in order

for paper_id in sample_submission['Id']:
    # Split every section's text on '.' and clean each piece (case is preserved)
    raw_sentences = []
    for section in papers[paper_id]:
        for piece in section['text'].split('.'):
            raw_sentences.append(clean_training_text(piece))

    # Keep fragments longer than 10 characters that mention 'data' or 'study'
    fragments = []
    for fragment in shorten_sentences(raw_sentences):
        if len(fragment) <= 10:
            continue
        lowered = fragment.lower()
        if 'data' in lowered or 'study' in lowered:
            fragments.append(fragment)

    # Every token gets the dummy tag 'O'
    for fragment in fragments:
        tokens = fragment.split()
        test_rows.append({'tokens': tokens, 'tags': ['O'] * len(tokens)})

    # Remember how many fragments belong to this paper
    paper_length.append(len(fragments))

print(f'Loaded {len(test_rows)} sentence fragments')

### Run prediction and collect results

In [None]:
# Publish the paths as environment variables so the shell commands below can read them
os.environ.update({
    "MODEL_PATH": f"{PRETRAINED_PATH}",
    "TRAIN_FILE": f"{TRAIN_PATH}",
    "VALIDATION_FILE": f"{VAL_PATH}",
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": f"{PREDICTION_SAVE_PATH}",
})

In [None]:
# Copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Make the directory that will hold the test input file
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
# QUESTION: What are `train_file` and `validation_file` for if we are only interested in predictions?
# Could we omit them, given the `--do_predict` flag?

def bert_predict():
    """Run the external ``kaggle_run_ner.py`` script with ``--do_predict``.

    The call is an IPython shell command. The ``$MODEL_PATH``, ``$TRAIN_FILE``,
    ``$VALIDATION_FILE``, ``$TEST_FILE`` and ``$OUTPUT_DIR`` placeholders match
    the environment variables set on ``os.environ`` earlier in this notebook.
    NOTE(review): what the script does with each argument is not visible
    here; confirm against ``kaggle_run_ner.py``.
    """
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
# One list of predicted tags per test row, accumulated over all batches.
# Later cells zip this with the test sentences, so the order must match test_rows.
bert_outputs = []

# Predict in batches of PREDICT_BATCH rows
for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # Write test data to input file (one JSON object per line)
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # Remove previous output dir
    !rm -r "$OUTPUT_DIR"
    
    # Perform prediction
    bert_predict()
    
    # Read predictions: one line per row, tags separated by whitespace
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        # [:-1] drops the last element of the split; this presumes the file
        # ends with a newline (otherwise the last prediction is lost) - TODO confirm
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

### Restore Dataset labels from predictions

In [None]:
# Retrieve test sentences
test_sentences = [row['tokens'] for row in test_rows]
del test_rows

In [None]:
# Report whether the model produced any tag other than 'O'
if any(tag != 'O' for sentence in bert_outputs for tag in sentence):
    print("Some predictions were made")
else:
    print("No predictions were made")

In [None]:
def _tagged_phrases(words, tags):
    """Yield each phrase formed by a 'B'-tagged word and the 'I'-tagged words that follow it."""
    phrase = ''
    for word, tag in zip(words, tags):
        if tag == 'B':
            # A new phrase starts; emit the one in progress first
            if phrase:
                yield phrase
            phrase = word
        elif tag == 'I' and phrase:
            # Continue the phrase (an 'I' without a preceding 'B' is ignored)
            phrase += ' ' + word
        elif phrase:
            # Any other tag ends the phrase in progress
            yield phrase
            phrase = ''
    # A phrase may run up to the end of the sentence
    if phrase:
        yield phrase


bert_dataset_labels = []  # One set of dataset labels per publication

for n_fragments in paper_length:
    paper_labels = set()
    paper_sentences = test_sentences[:n_fragments]
    paper_tags = bert_outputs[:n_fragments]
    for words, tags in zip(paper_sentences, paper_tags):
        paper_labels.update(_tagged_phrases(words, tags))

    # Record dataset labels for this publication
    bert_dataset_labels.append(paper_labels)

    # NOTE: the consumed fragments are removed from the front of both lists,
    # so the next publication again starts at index 0.
    del test_sentences[:n_fragments]
    del bert_outputs[:n_fragments]

In [None]:
# Output sample predictions
bert_dataset_labels[:5]
if all(pred == set() for pred in bert_dataset_labels):
    print("WARNING: No predictions were made!")

### Filter based on Jaccard score and clean

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces. The score is the number of
    distinct shared words divided by the number of distinct words overall.

    The union is computed on the *sets* of words. Using the raw list lengths
    would count repeated words several times and deflate the score
    (e.g. "a a" vs "a" would give 0.5 instead of 1.0).

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        A float between 0.0 and 1.0.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    intersection = len(words1 & words2)
    # split(" ") always returns at least one element, so the union is never empty
    union = len(words1) + len(words2) - intersection
    return float(intersection) / union

# Select from bert labels to prevent near-duplicates
final_bert_selection = []      # One '|'-joined string of selected labels per publication
total_duplicate_exclusion = 0  # Number of labels dropped as near-duplicates

for labels in bert_dataset_labels:
    label_selection = []

    # Shortest labels first. Ties are broken alphabetically so that the
    # selection does not depend on the iteration order of the set.
    for new_label in sorted(labels, key=lambda label: (len(label), label)):
        new_label = clean_text(new_label)
        # Keep the label only if it is sufficiently different from every label
        # kept so far (all() is True for an empty selection).
        if all(jaccard_similarity(new_label, included_label) < 0.75 for included_label in label_selection):
            label_selection.append(new_label)
        else:
            total_duplicate_exclusion += 1

    final_bert_selection.append('|'.join(label_selection))

In [None]:
# Output sample bert label selection
print(total_duplicate_exclusion)
print(final_bert_selection[:5])

# Aggregate final predictions and write submission file

In [None]:
# The final predictions are exactly the filtered BERT labels; nothing else is merged in here
final_predictions = final_bert_selection

In [None]:
# Attach one '|'-joined prediction string to each publication row
sample_submission['PredictionString'] = final_predictions
# Printed explicitly: a bare `.head()` in the middle of a cell displays nothing
print(sample_submission.head())
sample_submission.to_csv('submission.csv', index=False)

In [None]:
def f_score(TP, FP, FN, beta = 0.5):
    """Compute the F-beta score from true/false positive and false negative counts.

    Args:
        TP: Number of true positives.
        FP: Number of false positives.
        FN: Number of false negatives.
        beta: Weight of recall relative to precision (0.5 favours precision).

    Returns:
        ``(1 + beta^2) * TP / ((1 + beta^2) * TP + beta^2 * FN + FP)``, or
        ``0.0`` when the denominator is zero (no predictions and no truths),
        which would otherwise raise ``ZeroDivisionError``.
    """
    beta_sq = beta * beta
    num = (1 + beta_sq) * TP
    denom = (1 + beta_sq) * TP + beta_sq * FN + FP
    if denom == 0:
        return 0.0
    return num / denom

In [None]:
# Minimum Jaccard score for a prediction to count as a match for a truth
JACCARD_THRESHOLD = 0.5
Match = namedtuple('Match', 'truth_index pred_index score')


def _greedy_matches(truths, predictions, threshold):
    """Pair truths with predictions greedily by descending Jaccard score.

    Each truth and each prediction is used at most once, and only pairs
    scoring at least ``threshold`` are accepted. Returns the accepted matches.
    """
    candidates = [
        Match(i, j, jaccard_similarity(t, p))
        for i, t in enumerate(truths)
        for j, p in enumerate(predictions)
    ]
    candidates.sort(key=lambda m: m.score, reverse=True)

    used_truths = set()
    used_preds = set()
    accepted = []
    for candidate in candidates:
        # Sorted by descending score: nothing after this can reach the threshold
        if candidate.score < threshold:
            break
        if candidate.truth_index in used_truths or candidate.pred_index in used_preds:
            continue
        used_truths.add(candidate.truth_index)
        used_preds.add(candidate.pred_index)
        accepted.append(candidate)
    return accepted


print(len(final_predictions), len(sample_submission))

pred_count = 0

TP = 0
FP = 0
FN = 0

for predictions, truths in zip(final_predictions, sample_submission['cleaned_label']):
    # ''.split('|') yields [''], so empty entries are dropped; otherwise a
    # paper without any prediction would be charged one false positive.
    predictions = [clean_text(pred) for pred in predictions.split('|')]
    predictions = [pred for pred in predictions if pred]
    truths = truths.split('|')

    # Determine best matches for truths
    matches = _greedy_matches(truths, predictions, JACCARD_THRESHOLD)

    n_true_pos = len(matches)
    n_false_pos = len(predictions) - len(matches)
    n_false_neg = len(truths) - len(matches)

    pred_count += len(predictions)
    TP += n_true_pos
    FP += n_false_pos
    FN += n_false_neg

print(f"TP: {TP}, FP: {FP}, FN: {FN}")
print(f"--> sum = {TP+FP+FN}, total_preds = {pred_count}")

score = f_score(TP, FP, FN, beta = 0.5)
print(f"Achieved F-micro0.5 score of {score}")