# Using BERT for NER for Coleridge Challenge
This notebook shows how to fine-tune a BERT model (from Hugging Face) for our dataset-recognition task.

Note that internet access is needed during the training phase (for downloading the `bert-base-cased` model). Therefore, this notebook is executed separately. Internet can be turned off during prediction.

## Install packages

In [None]:
# Install the required packages from the attached `coleridge-packages` dataset rather than from PyPI.
# `--no-index --find-links=...` makes pip resolve `datasets` from the given local folder only.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
# NOTE(review): this wheel is tagged cp37 / manylinux1_x86_64, so it presumably needs a Python 3.7 Linux kernel - confirm.
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the global RNGs of `random` and NumPy once, up front.
# `random.shuffle` (used later on the NER samples) draws from the `random` module's global generator.
random.seed(123)
np.random.seed(456)

In [None]:
# QUESTION: What does my_seqeval.py do and do we need it?
# NOTE(review): none of the cells shown here import my_seqeval directly; presumably the
# training script launched at the end of the notebook loads it from the working directory - confirm before removing.

# Copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Hyper-parameters

In [None]:
MAX_LENGTH = 48 # Max number of words for each sentence; longer sentences are split into chunks
OVERLAP = 16 # Number of words shared by consecutive chunks when a sentence exceeds MAX_LENGTH (chunk stride is MAX_LENGTH - OVERLAP)
MAX_SAMPLE = None # Keep only the first MAX_SAMPLE label rows for experimentation and speed, set None for production
CUSTOM_SPLIT = False # If True, read labels from the custom Train_set.csv split instead of the competition train.csv

# Load data

In [None]:
# Load labels
# Folder holding one JSON file per training publication
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Pick the label file: the custom split when CUSTOM_SPLIT is set, the competition file otherwise
train_path = (
    '../input/colridge-custom-dataset-split/data_subsets/Train_set.csv'
    if CUSTOM_SPLIT
    else '../input/coleridgeinitiative-show-us-the-data/train.csv'
)

# Read the labels and keep only the first MAX_SAMPLE rows (all rows when MAX_SAMPLE is None)
train = pd.read_csv(train_path)[:MAX_SAMPLE]
print(f'Loaded {len(train)} training labels')

In [None]:
# Aggregate labels per publication
# Each publication ('Id') may have several label rows; collapse them into a single row
# whose label columns are '|'-separated strings.
join_with_pipe = '|'.join
label_aggregations = {
    'pub_title': 'first',
    'dataset_title': join_with_pipe,
    'dataset_label': join_with_pipe,
    'cleaned_label': join_with_pipe,
}
train = train.groupby('Id').agg(label_aggregations).reset_index()

print(f'Number of publications in training labels: {len(train)}')

In [None]:
# Load paper contents
# Maps publication Id -> parsed JSON content of that publication.
papers = {}
for paper_id in tqdm(train['Id'].unique()):
    paper_path = f'{paper_train_folder}/{paper_id}.json'
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale's preferred encoding, which is platform-dependent.
    with open(paper_path, 'r', encoding='utf-8') as f:
        papers[paper_id] = json.load(f)

# Transform data to NER format

In [None]:
def clean_training_text(txt):
    """
    Normalise text for training while preserving case.

    The input is converted with `str()`, every run of characters outside
    A-Z, a-z and 0-9 is collapsed into a single space, and leading/trailing
    whitespace is stripped.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()
 
def shorten_sentences(sentences, max_length=None, overlap=None):
    '''
    Split sentences with overlap if too long

    sentences:  iterable of whitespace-separated sentence strings
    max_length: max number of words per chunk (defaults to the global MAX_LENGTH)
    overlap:    number of words shared by consecutive chunks (defaults to the global OVERLAP)

    Returns a list in which sentences of at most `max_length` words are kept as-is
    and longer ones are replaced by sliding windows of `max_length` words taken
    every `max_length - overlap` words. Windowing stops at the first window that
    reaches the end of the sentence, so no chunk is fully contained in the previous one.

    Raises ValueError if `overlap` is not smaller than `max_length`.
    '''
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        # A non-positive stride would loop forever / produce nothing
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue

        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            if start + max_length >= len(words):
                # This window already covers the tail; later windows would only
                # repeat words that are in this chunk.
                break
    return short_sentences

def find_sublist(big_list, small_list):
    '''
    Find offset(s) of sublist in list

    Returns the list of all start indices `i` such that
    `big_list[i:i+len(small_list)] == small_list` (overlapping matches included).
    An empty `small_list` yields no positions: it would otherwise "match" at every
    offset, including `len(big_list)`, which is not a valid index into `big_list`.
    '''
    width = len(small_list)
    if width == 0:
        return []

    return [
        start
        for start in range(len(big_list) - width + 1)
        if small_list == big_list[start:start + width]
    ]

def tag_sentence(sentence, labels):
    '''
    Create Named EntitieS (NES) for sentences
    REQUIREMENT: both sentence and labels are already cleaned
    NOTE: Uses `O` for non-entities, `B` for beginning of entities and `I` for continuing entities

    sentence: whitespace-separated string
    labels:   list of label strings, or None

    Returns (is_positive, tagged) where `tagged` is a list of (word, tag) pairs and
    `is_positive` is True iff at least one label was tagged in the sentence.

    Labels are matched on whole words. Longer labels are tagged first and a match
    that overlaps an already tagged span is skipped, so a label nested inside a
    longer one (e.g. `B C` inside `A B C`) cannot overwrite an `I` with a `B`.
    Empty labels are ignored.
    '''
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    if labels is None:
        return False, list(zip(sentence_words, nes))

    # Cheap pre-check string: a label can only match on words if its
    # space-joined form occurs in the space-joined sentence.
    normalized_sentence = ' '.join(sentence_words)

    # De-duplicate labels (keeping first-seen order), then put the longest first.
    # `sorted` is stable, so ties keep their original order and the result is deterministic.
    unique_labels = dict.fromkeys(tuple(label.split()) for label in labels)
    labels_by_length = sorted(unique_labels, key=len, reverse=True)

    is_positive = False
    for label_tuple in labels_by_length:
        span = len(label_tuple)
        if span == 0 or ' '.join(label_tuple) not in normalized_sentence:
            continue

        label_words = list(label_tuple)
        for pos in range(len(sentence_words) - span + 1):
            if sentence_words[pos:pos + span] != label_words:
                continue
            if any(tag != 'O' for tag in nes[pos:pos + span]):
                continue # Overlaps an entity that is already tagged
            nes[pos] = 'B'
            nes[pos + 1:pos + span] = ['I'] * (span - 1)
            is_positive = True

    return is_positive, list(zip(sentence_words, nes))

In [None]:
cnt_pos, cnt_neg = 0, 0 # Number of sentences that contain/do not contain labels
ner_data = [] # List of (sentence_words, NES) elements

# Negative sentences are only kept when they mention one of these keywords
negative_keywords = ['data', 'study']

# The context manager closes the progress bar even if a publication fails to process
with tqdm(total=len(train)) as pbar:
    for _id, dataset_label in train[['Id', 'dataset_label']].itertuples(index=False):
        paper = papers[_id]
        labels = [clean_training_text(label) for label in dataset_label.split('|')]
        labels = [label for label in labels if label] # A label that cleans to '' cannot be matched

        # Clean sentences
        # De-duplicate with dict.fromkeys instead of set(): set iteration order for
        # strings changes between kernel sessions (hash randomisation), which would make
        # the seeded shuffle below non-reproducible. dict keys keep first-seen order.
        sentences = list(dict.fromkeys(
            clean_training_text(sentence)
            for section in paper
            for sentence in section['text'].split('.')
        ))
        sentences = shorten_sentences(sentences)
        sentences = [sentence for sentence in sentences if len(sentence) > 10] # Enforce minimum-length sentences
        # NOTE: This selection could technically lead to false negatives

        # Selection of sentences for NER training data
        # Use dataset entities and sentences with related concepts
        # NOTE: Using all sentences would introduce a class imbalance
        for sentence in sentences:
            is_positive, tags = tag_sentence(sentence, labels)
            if is_positive:
                cnt_pos += 1
                ner_data.append(tags)
                continue

            lowered = sentence.lower()
            if any(word in lowered for word in negative_keywords):
                ner_data.append(tags)
                cnt_neg += 1

        pbar.update(1)
        pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

# Shuffle sentence tags
random.shuffle(ner_data)

In [None]:
# Write training data to disk
# One JSON object per line, with parallel 'tokens' and 'tags' arrays for each sentence
with open('train_ner.json', 'w') as out_file:
    for tagged_sentence in ner_data:
        # Transpose [(word, tag), ...] into (words...), (tags...)
        tokens, tags = zip(*tagged_sentence)
        line = json.dumps({'tokens' : tokens, 'tags' : tags})
        out_file.write(line)
        out_file.write('\n')

# Fine-tune a BERT model for NER

In [None]:
# TODO: Inspect command flags and options

# Fine-tune `bert-base-cased` on the train_ner.json file written above, using the
# kaggle_run_ner.py script from the `kaggle-ner-utils` input dataset; outputs go to ./output.
# NOTE(review): --validation_file points at the training file itself, so any metric computed
# on it would not be held-out. Only --do_train is passed, so presumably no evaluation is run -
# confirm against kaggle_run_ner.py.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'bert-base-cased' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 