This notebook shows how to fine-tune a BERT model (from huggingface) for our dataset recognition task.

Note that internet is needed during the training phase (for downloading the bert-base-cased model). Internet can be turned off during prediction.

Changes by Group 12:
- Added 'train.head()' twice
- Added the comment about loading the articles
- Added and updated docstrings
- Changed 'id' to 'paper_id' to prevent shadowing
- Only append every other negative sample, to reduce the amount of negative samples


## Install packages

In [None]:
# Install the required packages from the attached Kaggle dataset instead of
# from PyPI. `--no-index --find-links=<dir>` makes pip resolve `datasets` only
# from the given local directory; the other three packages are installed
# directly from local wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm  # Progressbar
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the global random generators of Python and NumPy so that random
# operations in this notebook (e.g. the final shuffle of the training samples)
# draw from the same random stream on every run.
random.seed(123)
np.random.seed(456)

In [None]:
# Copy my_seqeval.py to the working directory because the input directory is
# non-writable.
# NOTE(review): presumably the training script that is run at the end of this
# notebook loads the module from the working directory - confirm.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Hyper-parameters

In [None]:
# Sentence-splitting settings read by shorten_sentences().
MAX_LENGTH = 64 # maximum number of words per sentence (i.e. per training sample)
OVERLAP = 20 # number of words shared by consecutive chunks when a sentence longer than MAX_LENGTH words is split

MAX_SAMPLE = None # number of rows of train.csv to keep: set a small number for experimentation, None for production (all rows)

# Load data

In [None]:
# Locations of the per-paper JSON files and of the label table.
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'

# Read the label table and keep only the first MAX_SAMPLE rows
# (slicing with None keeps every row).
train = pd.read_csv(train_path)[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')
train.head()


Group the rows by publication, so that the training labels of each paper have the same form as the expected output: all labels of one paper joined with `|`.

In [None]:
# One row per publication ('Id'): keep the first title and concatenate the
# values of every label column with '|'.
train = (
    train
    .groupby('Id')
    .agg({
        'pub_title': 'first',
        **{
            label_column: '|'.join
            for label_column in ('dataset_title', 'dataset_label', 'cleaned_label')
        },
    })
    .reset_index()
)

print(f'No. grouped training rows: {len(train)}')
train.head()

In [None]:
# Add the text of the articles into a dictionary that maps each paper id to
# the parsed content of '<paper_train_folder>/<paper_id>.json'. Further down,
# each paper is iterated as a sequence of sections that have a 'text' entry.
papers = {}
for paper_id in train['Id'].unique():
    # JSON is UTF-8 by specification; stating the encoding explicitly keeps
    # the result independent of the platform's default text encoding.
    with open(f'{paper_train_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        papers[paper_id] = json.load(f)


# Transform data to NER format

In [None]:
def clean_training_text(txt):
    """Normalise text for training while keeping the original letter case.

    The input is converted with ``str()``. Every run of characters that are
    not ASCII letters or digits ('[^A-Za-z0-9]+') is collapsed into a single
    space, and leading/trailing whitespace is stripped from the result.
    """
    non_alphanumeric_run = re.compile('[^A-Za-z0-9]+')
    as_text = str(txt)
    return non_alphanumeric_run.sub(' ', as_text).strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """Split long sentences into overlapping chunks of at most max_length words.

    Args:
        sentences: iterable of strings.
        max_length: maximum number of words per returned sentence. Defaults
            to the module-level MAX_LENGTH.
        overlap: number of words shared by two consecutive chunks of the same
            sentence. Defaults to the module-level OVERLAP.

    Returns:
        A list of strings. Sentences with at most max_length words are kept
        unchanged; longer ones are replaced by their chunks, whose words are
        joined with single spaces.

    Raises:
        ValueError: if overlap is negative or not smaller than max_length
            (the chunk start could then never advance, or words would be
            skipped between chunks).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP
    if not 0 <= overlap < max_length:
        raise ValueError(
            f'overlap ({overlap}) must be >= 0 and smaller than '
            f'max_length ({max_length})'
        )
    step = max_length - overlap

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue
        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # This chunk already reaches the end of the sentence. A further
            # chunk would consist only of words contained in this one, i.e.
            # it would be a redundant duplicate sample.
            if start + max_length >= len(words):
                break
    return short_sentences

def find_sublist(big_list, small_list):
    """Locate every occurrence of small_list as a contiguous run of big_list.

    Returns the list of start indices in increasing order; overlapping
    occurrences are all reported.
    """
    window = len(small_list)
    last_start = len(big_list) - window
    return [
        start
        for start in range(last_start + 1)
        if small_list == big_list[start:start + window]
    ]


In [None]:
# NEW (Emma): only a small subset of the additional labels!
additional_labels_path = '../input/filtered-bigger-govt-dataset/ExtraLabelsCleaned.txt'
add_labels = pd.read_csv(additional_labels_path)
print(f'Number of rows of additional labels: {len(add_labels)}')
add_labels_set = set(add_labels['Label'].dropna())

# Normalise the additional labels the same way as the text they are matched
# against: cleaned with clean_training_text() and lower-cased (the matching
# further down compares against lower-cased sentences only).
# No regex escaping is needed: clean_training_text() already replaces every
# character that is not a letter or a digit, including '-', '(' and ')'.
# The variable keeps its historical name because later cells refer to it.
add_labels_set_escaped = set()
for label in add_labels_set:
    label_cleaned = clean_training_text(label).lower()
    # A label that is empty after cleaning would be "contained" in every text.
    if label_cleaned:
        add_labels_set_escaped.add(label_cleaned)

add_labels.head()


In [None]:
def tag_sentence(sentence, labels):
    """Create the IOB tag sequence of a sentence.

    A label matches where its words occur as consecutive whole words of the
    lower-cased sentence. The first word of a match is tagged 'B', the
    remaining words 'I', and every other word 'O'.

    Requirements: the sentence should be cleaned (words separated by
    whitespace only), because str.split() is used; labels should be cleaned
    and lower-case, because they are compared with the lower-cased words.

    Args:
        sentence: the sentence to tag.
        labels: iterable of label strings, or None.

    Returns:
        A tuple (is_positive, tagged_words): is_positive is True when at
        least one label was found, tagged_words is a list of (word, tag)
        pairs with the words in their original case.
    """
    sentence_words = sentence.split()
    sentence_words_lower = sentence.lower().split()
    nes = ['O'] * len(sentence_words)
    is_positive = False

    # Process labels with fewer words first, so that a longer label overwrites
    # the tags of a shorter label nested inside it. The explicit sort also
    # makes the result independent of the iteration order of a label set.
    ordered_labels = sorted(labels or (), key=lambda label: (len(label.split()), label))
    for label in ordered_labels:
        label_words = label.split()
        if not label_words:
            # An empty label would "match" at every position, including one
            # past the last word.
            continue
        for pos in find_sublist(sentence_words_lower, label_words):
            is_positive = True
            nes[pos] = 'B'
            for i in range(pos + 1, pos + len(label_words)):
                nes[i] = 'I'

    return is_positive, list(zip(sentence_words, nes))

Create IOB tag sequences for all the sentences of the training papers.

In [None]:
# Build the NER training samples: every sentence of every training paper is
# tagged, all positive sentences are kept, and of the negative sentences that
# mention 'data' or 'study' every other one is kept.
cnt_pos, cnt_neg = 0, 0  # number of sentences that contain/not contain labels
cnt_neg_total = 0  # NEW: amount of encountered negative sentences (always bigger or equal to cnt_neg)
ner_data = []

pbar = tqdm(total=len(train), position=0, leave=True)
for row_index, paper_id, dataset_label in train[['Id', 'dataset_label']].itertuples():
    # paper
    paper = papers[paper_id]

    # labels: add the cleaned, lower-cased labels of this paper to the pool.
    # NOTE(review): the pool grows while iterating, so a paper is only matched
    # against the training labels of the papers processed before it (plus its
    # own and the additional labels) - confirm that this is intended.
    for label in dataset_label.split('|'):
        label = clean_training_text(label).lower()  # only have lower cases in this list
        if label:  # an empty label would be "contained" in every text
            add_labels_set_escaped.add(label)

    # sentences: de-duplicate while keeping the order of first appearance.
    # (A set would iterate in an order that can differ between interpreter
    # runs, which would defeat the random seeds set at the top.)
    sentences = list(dict.fromkeys(
        clean_training_text(sentence)
        for section in paper
        for sentence in section['text'].split('.')
    ))
    sentences = shorten_sentences(sentences)  # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10]  # only accept sentences with length > 10 chars

    # NEW: select the labels that occur in this paper. The sentences are
    # joined with a newline, a character that no cleaned label contains, so a
    # label cannot match across a sentence boundary. The text is lower-cased
    # once instead of once per label.
    text_lower = '\n'.join(sentences).lower()
    lower_labels = sorted(
        (label for label in add_labels_set_escaped if label and label in text_lower),
        key=lambda label: (len(label), label),  # deterministic, shortest first
    )

    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, lower_labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']):
            if cnt_neg_total % 2 == 0:  # NEW: only append every other negative sample
                ner_data.append(tags)
                cnt_neg += 1
            cnt_neg_total += 1

    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

pbar.close()

# shuffling
random.shuffle(ner_data)


Write the tokens (sentences) and the IOB tags to a file, with the format:

{

"tokens": \["Source", "USDA", "Economic", "Research", "Service", "using", "Agricultural", "Resource", "Management", "Survey", "2006"\],

"tags": \["O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O"\]

}

Another example:

{

"tokens": \["Currently", "we", "have", "used", "the", "COVID", "19", "Open", "Research", "Dataset", "CORD", "19", "publicly", "made", "available", "by", "Allen", "Institute", "of", "AI", "on", "20th", "March", "2020"\],

"tags": \["O", "O", "O", "O", "O", "B", "I", "I", "I", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"\]

}

Negative sentences only have "O" tags.


In [None]:
# Write one JSON object per line, each holding the tokens of one sentence and
# the IOB tags that belong to them.
with open('train_ner.json', 'w') as f:
    for row in ner_data:
        words, nes = zip(*row)
        row_json = dict(tokens=words, tags=nes)
        f.write(json.dumps(row_json) + '\n')

# Fine-tune a BERT model for NER

In [None]:
# Run the NER training script on the file written above: start from
# 'bert-base-cased', train for one epoch with a batch size of 8 per device and
# store the result in './output'.
# NOTE(review): --validation_file points at the training file itself and only
# --do_train is passed, so presumably no held-out evaluation takes place -
# confirm.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'bert-base-cased' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 

After the tuning finishes, we should find our model in './output'.