# MLiP Group 15
This notebook performs named-entity recognition (NER) of dataset mentions with a fine-tuned SciBERT model, combined with literal string matching against the known dataset labels.

Note: we used a notebook from tungmphung (https://www.kaggle.com/tungmphung/coleridge-matching-bert-ner) as a starting point.

# Settings

In [None]:
# NOTE(review): this name looks like a truncated "LABELVALIDATIONSIZE" and is not referenced
# anywhere in the visible notebook — confirm before renaming or removing it.
ABELVALIDATIONSIZE = 4000
# Folder name (under ../input/) of the dataset holding the fine-tuned model and its NER
# training file; it is used to build PRETRAINED_PATH, TRAIN_PATH and VAL_PATH further down.
SCIBERTDATASETNAME = "uniquevalidation-10neg-1pos"

# True: predict on the rows held out from train.csv and print a validation score.
# False: predict on the papers listed in sample_submission.csv.
validation_version = True


# Install offline packages

In [None]:
# Install offline libraries
# Every package is installed from files inside the `coleridge-packages` input dataset
# (presumably because the kernel has to run without internet access — confirm).
# `--no-index --find-links=...` makes pip look for `datasets` only in the given local folder.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
# the tokenizers wheel is tagged cp37 / manylinux1_x86_64, i.e. it is built for CPython 3.7 on 64-bit Linux
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl


# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed Python's and NumPy's global random number generators so runs are repeatable
random.seed(123)
np.random.seed(456)

# Load data from competition

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
# Read train.csv. With `unique` enabled, all rows belonging to a fixed list of dataset
# titles are taken out of `train` and kept apart as `test`.
unique = True

train = pd.read_csv(train_path)
if unique:
    # dataset titles whose rows are held out of `train`
    list_unique = [
        'Baccalaureate and Beyond',
        'National Assessment of Education Progress',
        'World Ocean Database',
        'Survey of Industrial Research and Development',
        'Survey of Doctorate Recipients',
        "COVID-19 Deaths data",
        "The National Institute on Aging Genetics of Alzheimer's Disease Data Storage Site (NIAGADS)",
        'National Assessment of Education Progress',
        'National Teacher and Principal Survey',
        'Beginning Postsecondary Student',
        'Rural-Urban Continuum Codes',
        'NOAA Tide Gauge',
        'National Education Longitudinal Study',
    ]

    # row masks: held-out rows and their complement
    boolean_series_test = train.dataset_title.isin(list_unique)
    boolean_series_train = ~boolean_series_test
    test = train[boolean_series_test]
    train = train[boolean_series_train]

#display(train)
#display(test)

# Read the JSON file of every paper that still occurs in `train`, keyed by paper id
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in tqdm(train['Id'].unique(), ascii=True, desc="Loading papers"):
    paper_path = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_path, 'r') as f:
        paper = json.load(f)
    papers[paper_id] = paper

In [None]:
# load and display sample submission
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)
display(sample_submission)

# load the test data
if validation_version:
    # Validation: the "test" papers are the rows held out from train.csv, so their JSON
    # files live in the train folder.
    # NOTE: `test` only exists when the data-loading cell ran with `unique = True`.
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
    test_paper_ids = test['Id'].unique()
    # Work on a copy: a PredictionString column is added to sample_submission later on,
    # and `test` itself is only a selection of rows taken from train.csv.
    sample_submission = test.copy()
else:
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
    test_paper_ids = sample_submission['Id']

# Read every test paper exactly once from the folder chosen above, keyed by paper id
papers_test = {}
for paper_id in tqdm(test_paper_ids, ascii=True, desc="Loading papers"):
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers_test[paper_id] = paper

# Literal string matching

### Create a set of all found datasets

In [None]:
all_labels = set() # all dataset names

# every title / label / cleaned label found in the training rows, lowercased
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']
for row_labels in train[label_columns].itertuples(index=False):
    all_labels.update(str(label).lower() for label in row_labels)

print(f'No. different labels: {len(all_labels)}')
if len(all_labels) > 5:
    print("Examples:")
    print(list(all_labels)[:10])

### Matching on test data

In [None]:
def clean_text(txt):
    """Lowercase `txt` and replace every run of characters other than letters and digits by one space.

    `txt` is converted with str() first; leading and trailing spaces are stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Apply clean_text and then collapse any run of spaces into a single space."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
# loop through all papers, lowercase/clean the text and match it with the set of datasets
literal_preds = []

# The cleaned form of a label does not depend on the paper, so compute it once here
# instead of once (or twice) per label for every paper.
cleaned_labels = [(label, clean_text(label)) for label in all_labels]

for paper_id in tqdm(sample_submission['Id'], ascii=True, desc="matching papers"):
    paper = papers_test[paper_id]
    # text_1: lowercased raw text, text_2: the same text reduced to letters, digits and single spaces
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)

    # a label counts as found when its raw or its cleaned form occurs in the paper
    labels = set()
    for label, cleaned_label in cleaned_labels:
        if label in text_1 or label in text_2 or cleaned_label in text_2:
            labels.add(cleaned_label)

    literal_preds.append('|'.join(labels))

print(len(literal_preds))
literal_preds[:10]

# Sci-BERT prediction

### Paths and Hyperparameters

In [None]:
# TODO: look into this preprocessing for prediction
# Sliding-window settings, counted in whitespace-separated words
MAX_LENGTH = 64  # max no. words for each sentence.
OVERLAP = 20  # words shared by consecutive pieces when a sentence longer than MAX_LENGTH is split

# max no. sentences written to the NER input file for one prediction run
PREDICT_BATCH = 64000

# Model folder and NER files inside the fine-tuned model dataset
PRETRAINED_PATH = f'../input/{SCIBERTDATASETNAME}/output'
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_PATH = f'../input/{SCIBERTDATASETNAME}/train_ner.json'
# NOTE(review): identical to TRAIN_PATH — presumably harmless because the script is only
# run with --do_predict; confirm.
VAL_PATH = f'../input/{SCIBERTDATASETNAME}/train_ner.json'

# Output of the kaggle_run_ner.py predictions
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

### Transform data to NER format

Group the training rows by publication so that the labels of each publication are joined with `|`, the same form as the expected output.

In [None]:
# note: train here is the train.csv dataframe
# One row per publication: keep the first title, join all labels of the publication with '|'
join_with_pipe = '|'.join
aggregations = {
    'pub_title': 'first',
    'dataset_title': join_with_pipe,
    'dataset_label': join_with_pipe,
    'cleaned_label': join_with_pipe,
}
train = train.groupby('Id').agg(aggregations).reset_index()

print(f'No. grouped training rows: {len(train)}')

display(train)

In [None]:
# Case-preserving variant of the text cleaning: no lowercasing, which matters because
# a cased SciBERT model is used
def clean_SciBERT_text(txt):
    """Replace every run of characters other than letters and digits by one space and strip the ends."""
    spaced = re.sub('[^A-Za-z0-9]+', ' ', str(txt))
    return spaced.strip()

# for sliding window. Sci-BERT cannot handle too long inputs
def shorten_sentences(sentences, max_length=None, overlap=None):
    """Split over-long sentences into overlapping windows of words.

    Parameters
    ----------
    sentences : iterable of str
        Sentences; words are separated by whitespace.
    max_length : int, optional
        Maximum number of words per returned sentence. Defaults to the global MAX_LENGTH.
    overlap : int, optional
        Number of words shared by two consecutive windows. Defaults to the global OVERLAP.

    Returns
    -------
    list of str
        Sentences of at most `max_length` words are returned unchanged; longer ones are
        replaced by their windows, in order.

    Raises
    ------
    ValueError
        If `overlap` is not smaller than `max_length` (the window would never advance).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue
        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # Stop as soon as a window reaches the last word: every later window would
            # lie completely inside this one and only add duplicated fragments.
            if start + max_length >= len(words):
                break
    return short_sentences

In [None]:
test_rows = [] # test data in NER format
paper_length = [] # store the number of sentences each paper has

# a sentence is only kept when it contains at least one of these substrings (case-insensitive)
keywords = ['data', 'study', 'statistics', 'compilation', 'dossier', 'dataset', 'reports', 'studies', 'measurements', 'file', 'archive', 'set', 'public', 'toy', 'synthetic']

for paper_id in tqdm(sample_submission['Id'], ascii=True, desc="preprocessing papers"): # loop over test papers
    paper = papers_test[paper_id]

    # split every section at '.', clean each piece (case is kept) and window the long ones
    raw_sentences = []
    for section in paper:
        for sentence in section['text'].split('.'):
            raw_sentences.append(clean_SciBERT_text(sentence))
    windowed_sentences = shorten_sentences(raw_sentences)

    # keyword filter
    sentences = []
    for sentence in windowed_sentences:
        lowered = sentence.lower()
        if any(keyword in lowered for keyword in keywords):
            sentences.append(sentence)

    # one NER row per kept sentence
    for sentence in sentences:
        sentence_words = sentence.split()
        # every token gets the placeholder tag 'O' (TODO: figure out why this happens;
        # presumably the NER script expects a tags column even for prediction — confirm)
        dummy_tags = ['O'] * len(sentence_words)
        test_rows.append({'tokens': sentence_words, 'tags': dummy_tags})

    # track which sentence belongs to which data point
    paper_length.append(len(sentences))

print(f'total number of sentences: {len(test_rows)}')

### Do predict and collect results using kaggle_run_ner.py

In [None]:
# Expose the paths as environment variables; the shell command in bert_predict() reads them
os.environ.update({
    "MODEL_PATH": PRETRAINED_PATH,
    "TRAIN_FILE": TRAIN_PATH,
    "VALIDATION_FILE": VAL_PATH,
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": PREDICTION_SAVE_PATH,
})

In [None]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
# NOTE(review): presumably kaggle_run_ner.py loads its metric from ./my_seqeval.py — confirm.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make necessary directories and files
# (only the folder for the NER input file is created here; the file itself is written
# batch by batch in the prediction loop)
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
# Run kaggle_run_ner.py in prediction mode (--do_predict) with a fixed seed of 123.
# Model, data and output locations come from the MODEL_PATH, TRAIN_FILE, VALIDATION_FILE,
# TEST_FILE and OUTPUT_DIR environment variables, which are set through os.environ in an
# earlier cell. The function takes no arguments and returns nothing; the predictions are
# read back from OUTPUT_DIR by the caller.
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
# load the outputs from the prediction file
bert_outputs = [] # these contain the ner labels for each sentence

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # remove output dir
    !rm -r "$OUTPUT_DIR"
    
    # do predict
    bert_predict()
    
    # read predictions
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

### Restore Dataset labels from predictions

In [None]:
# token lists of all test sentences, in the same order as test_rows
test_sentences = []
for row in test_rows:
    test_sentences.append(row['tokens'])

#print(test_sentences)
#print(bert_outputs)

In [None]:
def _extract_phrases(words, tags):
    """Return the set of phrases marked by B/I tags in one sentence.

    A 'B' tag starts a new phrase, an 'I' tag extends the phrase that is currently open
    (an 'I' without an open phrase is ignored), and any other tag closes it. A phrase
    that is still open at the end of the sentence is included as well.
    """
    phrases = set()
    curr_phrase = ''
    for word, tag in zip(words, tags):
        if tag == 'B': # start a new phrase
            if curr_phrase:
                phrases.add(curr_phrase)
            curr_phrase = word
        elif tag == 'I' and curr_phrase: # continue the phrase
            curr_phrase += ' ' + word
        else: # end last phrase (if any)
            if curr_phrase:
                phrases.add(curr_phrase)
                curr_phrase = ''
    # the phrase may be the suffix of the sentence
    if curr_phrase:
        phrases.add(curr_phrase)
    return phrases


bert_dataset_labels = [] # store all dataset labels for each publication

# Walk through the flat sentence/prediction lists with a running offset. Nothing is deleted
# from test_sentences or bert_outputs, so the cell can be re-run and gives the same result.
sentence_offset = 0
for length in paper_length:
    paper_slice = slice(sentence_offset, sentence_offset + length)

    labels = set()
    for sentence, pred in zip(test_sentences[paper_slice], bert_outputs[paper_slice]):
        labels |= _extract_phrases(sentence, pred)

    # record dataset labels for this publication
    bert_dataset_labels.append(labels)
    sentence_offset += length

#bert_dataset_labels

### Filter based on Jaccard score and clean

In [None]:
def jaccard_similarity(s1, s2):
    """Jaccard similarity between the word sets of two strings.

    Both strings are lowercased and split at single spaces. The score is the number of
    distinct words the strings share, divided by the number of distinct words occurring in
    either of them, so a repeated word counts only once.

    Returns a float between 0.0 and 1.0.
    """
    words_1 = set(s1.lower().split(" "))
    words_2 = set(s2.lower().split(" "))
    intersection = len(words_1 & words_2)
    # size of the union of the two sets; never 0 because split(" ") always returns at least one item
    union = len(words_1) + len(words_2) - intersection
    return float(intersection) / union

def _filter_similar_labels(labels, threshold=0.75):
    """Clean the labels of one publication and drop near-duplicates.

    Labels are visited from shortest to longest (length measured before cleaning) and
    cleaned with clean_text. A label is kept only if its Jaccard similarity to every label
    kept so far is below `threshold`. Returns the kept labels as a list.
    """
    filtered = []
    for label in sorted(labels, key=len):
        label = clean_text(label)
        if all(jaccard_similarity(label, got_label) < threshold for got_label in filtered):
            filtered.append(label)
    return filtered


# One entry per row of sample_submission. Both are filled in validation mode too, because
# filtered_bert_labels is what is written to the PredictionString column afterwards.
validatable_labels = [_filter_similar_labels(labels) for labels in bert_dataset_labels]
filtered_bert_labels = ['|'.join(filtered) for filtered in validatable_labels]

if validation_version:
    # in validation mode every row carries exactly one ground-truth label
    truths = [x for x in test['cleaned_label']]
    assert len(truths) == len(validatable_labels), 'predictions and truths are not aligned'

    TP = 0
    FP = 0
    FN = 0
    for predicted, truth in zip(validatable_labels, truths):
        matched = False
        for p in predicted:
            if jaccard_similarity(p, truth) >= 0.5:
                TP += 1
                matched = True
            else:
                FP += 1
        # The truth is missed when no prediction matches it; this includes, but is not
        # limited to, rows without any prediction.
        if not matched:
            FN += 1

    # F0.5 written with integer weights: 5*TP / (5*TP + 4*FP + FN); 0.0 when nothing was counted
    denominator = (5*TP) + (4*FP) + FN
    print("score is:")
    print((5*TP) / denominator if denominator else 0.0)
    
#filtered_bert_labels[:5]

# Aggregate final predictions and write submission file

In [None]:
# Only the SciBERT predictions are submitted here.
# Alternative 1 (disabled): submit the literal string matches only.
#final_predictions = literal_preds
final_predictions = filtered_bert_labels



# Alternative 2 (disabled): use the literal matches of a paper when there are any,
# and fall back to the SciBERT prediction otherwise.
#final_predictions = []
#for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels):
#    if literal_match:
#        final_predictions.append(literal_match)
#    else:
#        final_predictions.append(bert_pred)

In [None]:
# final_predictions holds one '|'-joined string per row, so it must contain exactly
# len(sample_submission) entries.
sample_submission['PredictionString'] = final_predictions
sample_submission.head()

In [None]:
# write the submission file into the working directory, without the index column
sample_submission.to_csv('submission.csv', index=False)