This notebook shows how to fine-tune a BERT model (from huggingface) for our dataset recognition task.

Note that internet access is needed during the training phase (to download the pretrained `allenai/scibert_scivocab_cased` model). Internet can be turned off during prediction.

## Install packages

In [None]:
# Install the required packages from local files in the `coleridge-packages` input
# dataset. `--no-index --find-links=...` makes pip resolve `datasets` from that local
# folder instead of the package index; the other three are installed directly from
# version-pinned wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
from sklearn.model_selection import train_test_split
from random import sample

import numpy as np
import pandas as pd

from tqdm import tqdm

import seaborn as sns

# Fix the seeds of Python's `random` module (used below for augmentation, sampling and
# shuffling) and of NumPy's global random number generator.
random.seed(123)
np.random.seed(456)

In [None]:
# Copy my_seqeval.py into the working directory, because the input directory is read-only.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Hyper-parameters

In [None]:
MAX_LENGTH = 64 # maximum number of words per sentence
OVERLAP = 20 # number of words shared by consecutive pieces when a sentence longer than MAX_LENGTH is split
data_augmentation = True # when True, a random 5% of positive sentences get their label replaced by a randomly chosen known label

# Number of positive / negative sentences kept when the sample balance is adjusted.
# NOTE(review): the previous note here read "Default training data size for validation1
# settings with 4000 validation size: 38441 positives + 446696 negatives", but
# NEG_SAMPLE_SIZE is 192205 (= 5 * POS_SAMPLE_SIZE), not 446696. Presumably negatives are
# deliberately down-sampled to a 5:1 ratio -- confirm.
POS_SAMPLE_SIZE = 38441
NEG_SAMPLE_SIZE = 192205

VALIDATION_SIZE = 4000 # only referenced by the commented-out train_test_split in the data-loading cell

MAX_SAMPLE = None # set a small number for experimentation, set None for production.

# Load data

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'


# Read the label table; each row pairs a publication Id with one dataset label.
train = pd.read_csv(train_path)
# The hold-out split is currently disabled, so no `test` frame exists. Re-enable the
# line below (reading into a temporary frame first) if a validation split is needed.
#train, test = train_test_split(temp, test_size = VALIDATION_SIZE, random_state=1337)
train = train[:MAX_SAMPLE]  # MAX_SAMPLE=None keeps every row
print(f'No. raw training rows: {len(train)}')

Group by publication, so that the training labels have the same form as the expected output.

In [None]:
# Collapse the label table to one row per publication: keep the first title and
# concatenate every dataset title / label of that publication with '|'.
train = (
    train
    .groupby('Id')
    .agg({
        'pub_title': 'first',
        **{column: '|'.join
           for column in ('dataset_title', 'dataset_label', 'cleaned_label')},
    })
    .reset_index()
)

print(f'No. grouped training rows: {len(train)}')

In [None]:
# Load the JSON body of every training publication, keyed by publication Id.
papers = {}
for paper_id in train['Id'].unique():
    paper_path = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_path, 'r') as paper_file:
        papers[paper_id] = json.load(paper_file)

# Transform data to NER format

In [None]:
def clean_training_text(txt):
    """
    Normalise text for training without changing its case.

    Every run of characters other than ASCII letters and digits is replaced by a
    single space, and leading/trailing whitespace is removed. The input is passed
    through `str()` first, so non-string values are accepted.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping windows of words.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process; words are separated by whitespace.
    max_length : int, optional
        Maximum number of words per returned sentence. Defaults to the notebook
        constant MAX_LENGTH.
    overlap : int, optional
        Number of words shared by two consecutive windows. Defaults to the notebook
        constant OVERLAP.

    Returns
    -------
    list of str
        Sentences with at most `max_length` words are returned unchanged. Longer ones
        are replaced by windows of `max_length` words that start every
        `max_length - overlap` words; windowing stops as soon as a window reaches the
        end of the sentence, so no window is wholly contained in the previous one.

    Raises
    ------
    ValueError
        If `max_length - overlap` is not positive (the windows could never advance).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError(
            f'max_length ({max_length}) must be greater than overlap ({overlap})')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue

        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # This window already covers the tail; a further window would only
            # repeat words that are all present in this one.
            if start + max_length >= len(words):
                break
    return short_sentences

def find_sublist(big_list, small_list):
    """
    Return every start index at which `small_list` occurs contiguously in `big_list`.

    An empty `small_list` yields no positions: treating it as matching everywhere
    would hand callers a start index equal to len(big_list), which is out of range.
    """
    span = len(small_list)
    if span == 0:
        return []
    return [start for start in range(len(big_list) - span + 1)
            if big_list[start:start + span] == small_list]


def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag each word of `sentence` with 'B' / 'I' / 'O' according to `labels`.

    Parameters
    ----------
    sentence : str
        Cleaned sentence; words are separated by whitespace.
    labels : list of str or None
        Cleaned dataset labels to look for. None, or labels that contain no words,
        are ignored.

    Returns
    -------
    (bool, list of (word, tag))
        The flag is True when at least one label occurs in the sentence as a
        contiguous run of whole words. The first word of each occurrence is tagged
        'B', its remaining words 'I', and every other word 'O'.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # Labels with no words can never match and are dropped up front.
    label_word_lists = [label.split() for label in (labels or [])]
    label_word_lists = [label_words for label_words in label_word_lists if label_words]

    # Tag shorter labels first so that a longer label containing a shorter one
    # (e.g. "... Initiative ADNI" and "ADNI") overwrites it. The reverse order would
    # leave a stray 'B' in the middle of the longer entity.
    label_word_lists.sort(key=len)

    is_positive = False
    for label_words in label_word_lists:
        for pos in find_sublist(sentence_words, label_words):
            is_positive = True
            nes[pos] = 'B'
            for i in range(pos + 1, pos + len(label_words)):
                nes[i] = 'I'

    return is_positive, list(zip(sentence_words, nes))

In [None]:
def _swap_last_entity(tags, replacement_words):
    """
    Return `tags` with its last tagged entity replaced by `replacement_words`.

    The last entity is the last word tagged 'B' plus the 'I' words that directly
    follow it. The replacement words are tagged 'B' then 'I'. `tags` is returned
    unchanged when it contains no 'B' or when `replacement_words` is empty.
    """
    entity_start = None
    for idx, (_, tag) in enumerate(tags):
        if tag == 'B':
            entity_start = idx
    if entity_start is None or not replacement_words:
        return tags

    entity_end = entity_start + 1
    while entity_end < len(tags) and tags[entity_end][1] == 'I':
        entity_end += 1

    new_entity = [(word, 'B' if k == 0 else 'I')
                  for k, word in enumerate(replacement_words)]
    return tags[:entity_start] + new_entity + tags[entity_end:]


MAX_SENTENCE_CHARS = 500  # sentences with this many characters or more are dropped
AUGMENTATION_RATE = 0.05  # share of positive sentences whose label is swapped
# Negative sentences are kept only if they contain one of these substrings.
# Other candidates that were considered: 'statistics', 'compilation', 'dossier',
# 'dataset', 'reports', 'studies', 'measurements', 'file', 'archive', 'set',
# 'public', 'toy', 'synthetic'
NEGATIVE_KEYWORDS = ['data', 'study']

cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []
pos_indexes = []
neg_indexes = []

# Every distinct cleaned label in the training set; the augmentation draws from it.
# Sorted so that random.choice() is reproducible: the iteration order of a set of
# strings changes between interpreter runs.
unique_labels = set()
for _, _, dataset_label in train[['Id', 'dataset_label']].itertuples():
    unique_labels.update(clean_training_text(label) for label in dataset_label.split('|'))
alllabels = sorted(label for label in unique_labels if label)

with tqdm(total=len(train)) as pbar:
    for _, paper_id, dataset_label in train[['Id', 'dataset_label']].itertuples():
        # paper
        paper = papers[paper_id]

        # labels
        labels = [clean_training_text(label) for label in dataset_label.split('|')]

        # sentences: de-duplicated while keeping document order, so that the seeded
        # random draws below line up with the same sentences on every run
        sentences = list(dict.fromkeys(
            clean_training_text(sentence)
            for section in paper
            for sentence in section['text'].split('.')
        ))
        sentences = shorten_sentences(sentences) # make sentences short

        # remove sentences with >= MAX_SENTENCE_CHARS characters
        sentences = [sentence for sentence in sentences
                     if len(sentence) < MAX_SENTENCE_CHARS]

        for sentence in sentences:
            is_positive, tags = tag_sentence(sentence, labels)
            if is_positive:
                cnt_pos += 1
                if data_augmentation and (random.random() < AUGMENTATION_RATE):
                    # Swap the sentence's last entity for a randomly chosen known
                    # label, so the model also sees other labels in this context.
                    tags = _swap_last_entity(tags, random.choice(alllabels).split())
                ner_data.append(tags)
                pos_indexes.append(len(ner_data) - 1)
            elif any(word in sentence.lower() for word in NEGATIVE_KEYWORDS):
                ner_data.append(tags)
                cnt_neg += 1
                neg_indexes.append(len(ner_data) - 1)

        # process bar
        pbar.update(1)
        pbar.set_description(f"Training data size before balance: {cnt_pos} positives + {cnt_neg} negatives")


print("posOrigLen:", len(pos_indexes))
print("negOrigLen:", len(neg_indexes))

# adjust sample balance; cap at the available count so that small experimental runs
# (e.g. a small MAX_SAMPLE) do not make sample() raise ValueError
pos_indexes_after_sample = sample(pos_indexes, min(POS_SAMPLE_SIZE, len(pos_indexes)))
neg_indexes_after_sample = sample(neg_indexes, min(NEG_SAMPLE_SIZE, len(neg_indexes)))

pos_to_remove = set(pos_indexes) - set(pos_indexes_after_sample)
neg_to_remove = set(neg_indexes) - set(neg_indexes_after_sample)
to_remove = pos_to_remove.union(neg_to_remove)

print("Before:", len(ner_data))
ner_data = [v for i, v in enumerate(ner_data) if i not in to_remove]
print("After:", len(ner_data))


# shuffling
random.shuffle(ner_data)

In [None]:
#plt.plot(inputCharLengths)
#plt.show()

#largest = np.argmax(inputCharLengths)
#print(inputChars[largest])
#print(largest)

Write the data to a file.

In [None]:
# sorting the list
#list2 = inputCharLengths.copy()
#list2.sort()
  
# printing the last element
#print("Largest length is:", list2[-1])

#plt.plot(list2)
#plt.show()

In [None]:
# Write one JSON object per line: the words of a sentence under 'tokens' and their
# B/I/O tags under 'tags'.
with open('train_ner.json', 'w') as out_file:
    for tagged_sentence in ner_data:
        tokens, tags = zip(*tagged_sentence)
        record = {'tokens' : tokens, 'tags' : tags}
        out_file.write(json.dumps(record))
        out_file.write('\n')

# Fine-tune a BERT model for NER

In [None]:
# Run the NER training script on the file written above, starting from the
# `allenai/scibert_scivocab_cased` checkpoint, for one epoch with batch size 8 per device.
# NOTE(review): --validation_file points at the training file itself and only --do_train
# is passed, so presumably no evaluation is run -- confirm against kaggle_run_ner.py.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'allenai/scibert_scivocab_cased' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 60000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 

After the tuning finishes, we should find our model in './output'.