# General information
This notebook is used for the course: Machine Learning in Practice. This specific notebook is used in combination with "Without internet - BERT NER - MLiP 28". As the competition requires a notebook that does not use internet, this notebook was split from the other notebook to make sure we could use internet for the parts that needed internet. 

This notebook is adapted from the notebook by Tung M. Phung: https://www.kaggle.com/tungmphung/coleridge-matching-bert-ner

If something is added or changed by our team it is stated in or above that code cell.



In [None]:
# Upper bound used when slicing the label table after loading it.
MAX_SAMPLE = None # set a small number for experimentation, set None for production.

MAX_LENGTH = 64 # max no. words for each sentence
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it into multiple sentences with overlapping

# Install packages

In [None]:
# Install the required packages from local files under the coleridge-packages input folder
# (the first command explicitly disables the package index with --no-index).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# seed Python's and NumPy's random number generators with fixed values
random.seed(123)
np.random.seed(456)

# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Load data

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Read the label table; slicing with MAX_SAMPLE=None keeps every row.
train = pd.read_csv(train_path)[:MAX_SAMPLE]

# Load the JSON content of every publication that occurs in the label table, keyed by Id.
papers = {}
for paper_id in train['Id'].unique():
    paper_file = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        paper = json.load(f)
    papers[paper_id] = paper

In [None]:
# Knowledge bank for the literal matching later on. It is built here, on the ungrouped
# label table, because the validation set is now created in the same loop that processes
# the train dataset; in the Coleridge:Matching notebook this was likewise done before
# grouping the training labels.

label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']

all_labels = set()
for row in train[label_columns].itertuples(index=False):
    # every column contributes its lowercased string form as a separate entry
    all_labels.update(str(value).lower() for value in row)

print(f'No. different labels: {len(all_labels)}')

In [None]:
# Group by publication: training labels should have the same form as the expected output,
# i.e. one row per publication with all of its labels joined by '|'.
train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

print(f'No. grouped training rows: {len(train)}')

# `papers` was already filled for every Id of `train` when the data was loaded, and grouping
# does not change the set of Ids, so the JSON files are not read a second time here.

# Create training and validation sets
Here we create the training and validation sets based on the original train data. To do this we start with defining the original functions and then adapt the for loop of the train data to assign the last 30% of the papers to the validation set.

In [None]:
def clean_training_text(txt):
    """
    Replace every run of characters other than A-Z, a-z and 0-9 by a single space and
    strip the result. Works like clean_text, except that the casing is kept.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split every sentence with more than `max_length` words into overlapping chunks.

    Args:
        sentences: iterable of sentence strings (words separated by whitespace).
        max_length: maximum number of words per returned sentence; when None the
            module-level MAX_LENGTH is used.
        overlap: number of words shared by two consecutive chunks; when None the
            module-level OVERLAP is used.

    Returns:
        A list in which sentences of at most `max_length` words are kept unchanged and
        longer ones are replaced by chunks of at most `max_length` words, each chunk
        starting `max_length - overlap` words after the previous one.

    Raises:
        ValueError: if `overlap` is not smaller than `max_length`, because the chunk
            start would then never advance.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            short_sentences.extend(
                ' '.join(words[start:start + max_length])
                for start in range(0, len(words), step)
            )
        else:
            short_sentences.append(sentence)
    return short_sentences

def find_sublist(big_list, small_list):
    """Return every start index at which small_list occurs as a contiguous run in big_list."""
    width = len(small_list)
    last_start = len(big_list) - width
    return [
        start
        for start in range(last_start + 1)
        if small_list == big_list[start:start + width]
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag the words of `sentence` with 'B' (first word of a label), 'I' (further words of a
    label) or 'O' (everything else).

    Args:
        sentence: cleaned sentence string.
        labels: iterable of cleaned label strings, or None.

    Returns:
        A tuple (is_positive, tagged_words). is_positive is True when at least one label
        occurs in the sentence at word boundaries; tagged_words is a list of (word, tag)
        pairs, all tagged 'O' for a negative sample.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # Labels without any word are ignored: they would match at every word boundary and
    # find_sublist would report a start position one past the last word for them.
    usable_labels = [label for label in (labels or []) if label.split()]

    # re.escape keeps the label literal even if it is not perfectly cleaned.
    is_positive = any(
        re.search(f'\\b{re.escape(label)}\\b', sentence) for label in usable_labels
    )
    if not is_positive: # negative sample
        return False, list(zip(sentence_words, nes))

    # Apply shorter labels first so that a longer label containing a shorter one is
    # written last and its 'B I I ...' run is not broken by a 'B' in the middle.
    for label_words in sorted((label.split() for label in usable_labels), key=len):
        for pos in find_sublist(sentence_words, label_words):
            nes[pos] = 'B'
            for i in range(pos + 1, pos + len(label_words)):
                nes[i] = 'I'

    return True, list(zip(sentence_words, nes))

def clean_text(txt):
    """Lowercase txt, replace every run of non-alphanumeric characters by one space, strip."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered).strip()

def totally_clean_text(txt):
    """Apply clean_text to txt and then collapse any run of spaces into a single space."""
    cleaned = clean_text(txt)
    return re.sub(' +', ' ', cleaned)

create validation set

In [None]:
# Changed by our team for evaluation purposes

# NOTE: nothing shuffles the papers before this cell; they are used in the order of the
# grouped `train` frame. The first part is taken for training, the second for validation.
train_val_split = 0.7 # 70% as training data
train_val_split = int(train_val_split * len(train)) # number of papers in the training part
test_rows = [] # validation sentences in NER format
literal_preds = [] # literal-matching labels, one '|'-joined string per validation paper
paper_length = [] # number of validation sentences per validation paper

cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []
keywords = ['data', 'study'] # a sentence without labels is only kept if it mentions one of these

pbar = tqdm(total=train_val_split)

for paper_no, (paper_id, dataset_label) in enumerate(
        train[['Id', 'dataset_label']].itertuples(index=False)):
    # the first train_val_split papers form the training set, the rest the validation set
    training_set = paper_no < train_val_split

    # paper
    paper = papers[paper_id]

    # labels
    labels = [clean_training_text(label) for label in dataset_label.split('|')]

    # sentences: duplicates are removed with dict.fromkeys, which keeps the document order;
    # iterating a set of strings instead gives a different order on every interpreter run,
    # which made the seeded shuffle below non-reproducible.
    sentences = list(dict.fromkeys(
        clean_training_text(sentence)
        for section in paper
        for sentence in section['text'].split('.')
    ))

    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

    if training_set:
        for sentence in sentences:
            is_positive, tags = tag_sentence(sentence, labels)
            if is_positive:
                cnt_pos += 1
                ner_data.append(tags)
            elif any(word in sentence.lower() for word in keywords):
                ner_data.append(tags)
                cnt_neg += 1

        # process bar
        pbar.update(1)
        pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

    else: # we're creating the validation set using the same logic as for the test set in the other notebook
        sentences = [sentence for sentence in sentences
                     if any(word in sentence.lower() for word in keywords)]

        for sentence in sentences:
            sentence_words = sentence.split()
            dummy_tags = ['O'] * len(sentence_words)
            test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})

        # this is copied from the Coleridge:Matching notebook and is used for matching on test data
        text_1 = '. '.join(section['text'] for section in paper).lower()
        text_2 = totally_clean_text(text_1)

        matched_labels = set()
        for label in all_labels:
            if label in text_1 or label in text_2:
                matched_labels.add(clean_text(label))
        literal_preds.append('|'.join(matched_labels))

        paper_length.append(len(sentences))
        # until here

pbar.close()

# shuffling
random.shuffle(ner_data)

# write one JSON object per line, the format expected for the NER training file
with open('train_ner_validation_version.json', 'w') as f:
    for row in ner_data:
        words, nes = list(zip(*row))
        row_json = {'tokens' : words, 'tags' : nes}
        json.dump(row_json, f)
        f.write('\n')


In [None]:
# Added by our team for evaluation purposes

# to illustrate (guarded so that a run with a small MAX_SAMPLE and few rows does not crash)
if len(ner_data) > 6:
    print(ner_data[6])
if test_rows:
    print(test_rows[0])
print("\n")

n_train = len(ner_data)
n_val = len(test_rows)
print('n_train:', n_train)
print('n_val:', n_val)

# the percentage is only defined when there is at least one sentence in total
if n_train + n_val > 0:
    per_train = (100/(n_train+n_val))*n_train
    print(f"train/val split: {per_train}/{100-per_train}")
print("\n")

print(literal_preds[:5])

In [None]:
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'

sample_submission = pd.read_csv(sample_submission_path)

# Add the JSON content of every test publication to the `papers` lookup, keyed by Id.
for paper_id in sample_submission['Id']:
    paper_file = f'{paper_test_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        paper = json.load(f)
    papers[paper_id] = paper

# Literal matching

### Create a knowledge bank

In [None]:
all_labels = set()

# `train` has been grouped per publication by now, so a cell can hold several labels joined
# by '|'. Each cell is split again so that the bank contains the individual labels rather
# than joined strings that can never occur in a paper; empty pieces are dropped because an
# empty string is contained in every text.
for row in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    for value in row:
        all_labels.update(part for part in str(value).lower().split('|') if part)

print(f'No. different labels: {len(all_labels)}')

### Matching on test data

In [None]:
literal_preds = []

for paper_id in sample_submission['Id']:
    sections = papers[paper_id]

    # two views of the paper: lowercased raw text and its fully cleaned form
    text_1 = '. '.join(section['text'] for section in sections).lower()
    text_2 = totally_clean_text(text_1)

    # every known label found as a substring of either view, stored in cleaned form
    found = {
        clean_text(label)
        for label in all_labels
        if label in text_1 or label in text_2
    }

    literal_preds.append('|'.join(found))


In [None]:
literal_preds[:5]

# Bert prediction

### Paths and Hyperparameters

In [None]:
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

# number of NER rows written to the input file for one prediction run
PREDICT_BATCH = 64000 

#Changed by our team since we wanted to try different BERT based models.
#BERT:
#PRETRAINED_PATH = '../input/coleridge-bert-models/output'
#SciBERT: 
#PRETRAINED_PATH = 'allenai/scibert_scivocab_cased'
#PRETRAINED_PATH = '../input/scibert/output'
#BioBERT:
# NOTE(review): this is a model identifier rather than a local path - presumably it is
# downloaded, which would need internet access; confirm.
PRETRAINED_PATH = 'dmis-lab/biobert-base-cased-v1.1'

TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
TRAIN_PATH = './train_ner_validation_version.json'
# NOTE(review): the validation file is the same file as the training file - confirm this is intended.
VAL_PATH = './train_ner_validation_version.json'

PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

### Transform data to NER format

Group by publication, training labels should have the same form as expected output.

In [None]:
# NOTE(review): `train` was already grouped by 'Id' in an earlier cell; when that cell has
# run, every group here holds a single row, so this grouping leaves the labels unchanged.
train = train.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

print(f'No. grouped training rows: {len(train)}')

In [None]:
test_rows = [] # test data in NER format
paper_length = [] # store the number of sentences each paper has

for paper_id in sample_submission['Id']:
    # load paper
    paper = papers[paper_id]

    # split every section into cleaned sentences and cut the long ones into short pieces
    candidates = shorten_sentences([
        clean_training_text(raw_sentence)
        for section in paper
        for raw_sentence in section['text'].split('.')
    ])

    # keep sentences longer than 10 chars that mention 'data' or 'study'
    sentences = [
        sentence
        for sentence in candidates
        if len(sentence) > 10
        and any(word in sentence.lower() for word in ['data', 'study'])
    ]

    # collect all sentences as NER rows with placeholder tags
    for sentence in sentences:
        tokens = sentence.split()
        test_rows.append({'tokens' : tokens, 'tags' : ['O'] * len(tokens)})

    # track which sentence belongs to which data point
    paper_length.append(len(sentences))

print(f'total number of sentences: {len(test_rows)}')

### Do predict and collect results

In [None]:
# Expose the configured paths as environment variables for the shell command that runs the
# prediction script.
os.environ.update({
    "MODEL_PATH": f"{PRETRAINED_PATH}",
    "TRAIN_FILE": f"{TRAIN_PATH}",
    "VALIDATION_FILE": f"{VAL_PATH}",
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": f"{PREDICTION_SAVE_PATH}",
})

In [None]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make necessary directories and files
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
def bert_predict():
    """
    Run kaggle_run_ner.py through the shell with --do_predict.

    The model path, the train/validation/test files and the output directory are taken
    from the environment variables MODEL_PATH, TRAIN_FILE, VALIDATION_FILE, TEST_FILE and
    OUTPUT_DIR, which must be set before this function is called.
    """
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
bert_outputs = []

input_file = f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}'
prediction_file = f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}'

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    batch = test_rows[batch_begin:batch_begin+PREDICT_BATCH]

    # write data rows to input file, one JSON object per line
    with open(input_file, 'w') as f:
        for row in batch:
            json.dump(row, f)
            f.write('\n')

    # do predict
    bert_predict()

    # read predictions: one line of whitespace-separated tags per row; the piece after the
    # final newline is discarded
    with open(prediction_file) as f:
        prediction_lines = f.read().split('\n')[:-1]
    bert_outputs.extend(line.split() for line in prediction_lines)

In [None]:
# get test sentences (only the token lists of the NER rows are needed from here on)
test_sentences = [row['tokens'] for row in test_rows]

# the rows themselves are no longer used below, so drop the reference
del test_rows

In [None]:
# Collect, for each publication, the set of phrases that were predicted as dataset labels.
# Sentences and predictions are consumed from the front of their lists, paper by paper.

bert_dataset_labels = []

for length in paper_length:
    labels = set()
    paper_sentences = test_sentences[:length]
    paper_preds = bert_outputs[:length]

    for sentence, pred in zip(paper_sentences, paper_preds):
        curr_phrase = ''
        for word, tag in zip(sentence, pred):
            if tag == 'I' and curr_phrase: # continue the phrase
                curr_phrase += ' ' + word
                continue
            # any other situation ends the running phrase (if any) ...
            if curr_phrase:
                labels.add(curr_phrase)
            # ... and only a 'B' tag starts a new one
            curr_phrase = word if tag == 'B' else ''
        # a phrase that runs until the end of the sentence is still pending here
        if curr_phrase:
            labels.add(curr_phrase)

    # record dataset labels for this publication
    bert_dataset_labels.append(labels)

    # drop the part that has just been processed
    del test_sentences[:length], bert_outputs[:length]


In [None]:
def jaccard_similarity(s1, s2):
    """
    Return the Jaccard similarity of the word sets of two strings.

    Both strings are split on single spaces; the result is the number of distinct words
    they share divided by the number of distinct words that occur in either of them.
    Words repeated within one string count once, so "a a b" and "a b" score 1.0.

    Args:
        s1: first string.
        s2: second string.

    Returns:
        A float between 0.0 and 1.0.
    """
    # split(" ") always yields at least one element (possibly ''), so the union is never empty
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    intersection = len(words_1 & words_2)
    union = len(words_1 | words_2)
    return float(intersection) / union

In [None]:
# Remove near-duplicate predicted labels per publication: going from the shortest to the
# longest phrase, a cleaned label is kept only if its Jaccard similarity to every label
# kept so far is below 0.75.

filtered_bert_labels = []

for labels in bert_dataset_labels:
    kept = []

    for raw_label in sorted(labels, key=len):
        candidate = clean_text(raw_label)
        is_near_duplicate = any(
            jaccard_similarity(candidate, got_label) >= 0.75 for got_label in kept
        )
        if not is_near_duplicate:
            kept.append(candidate)

    filtered_bert_labels.append('|'.join(kept))

In [None]:
# Added by our team for evaluation purposes
# The following code pairs the labels of each paper (the literal-matching results) with
# the labels predicted by BERT for that paper and sorts them into the confusion categories.
# We will not keep track of TN, as this consists of all non-dataset words.

duplicates = [] # per paper: predictions that were the best match of more than one label
tp = [] # label and predicted
fp = [] # predicted but no label match
fn = [] # label but no predicted

for labels, preds in zip(literal_preds, filtered_bert_labels):
    # ''.split('|') yields [''], so empty entries are dropped; otherwise a paper without
    # labels or without predictions would add an empty string to fn or fp.
    labels = [label for label in labels.split('|') if label]
    preds = [pred for pred in preds.split('|') if pred]
    matched_preds = []
    dup = []

    for label in labels:
        # the prediction with the highest similarity of at least 0.5 counts as the match
        best_pred = ('', 0)
        for pred in preds:
            j_sim = jaccard_similarity(label, pred)
            if j_sim >= 0.5 and j_sim > best_pred[1]:
                best_pred = (pred, j_sim)
        if best_pred[0]:
            if best_pred[0] in matched_preds:
                # this prediction was already counted as a true positive for another label
                dup.append(best_pred[0])
            else:
                matched_preds.append(best_pred[0])
                tp.append(best_pred[0])
        else:
            fn.append(label)

    for pred in preds:
        if pred not in matched_preds:
            fp.append(pred)
    if dup:
        duplicates.append(dup)

In [None]:
#Added by our team for evaluation purposes
# report the sizes of the confusion categories collected above
print('true positives:', len(tp))
print('false positives:',len(fp))
print('false_negatives:',len(fn))
print('\n')

# `duplicates` holds one list per paper that had duplicates, so this is a number of papers
print('duplicate true labels:', len(duplicates))
print(duplicates[:5])