This notebook shows how to fine-tune a BERT model (from Hugging Face) for our dataset recognition task.

Note that internet access is needed during the training phase (for downloading the bert-base-cased model and for installing and calling the translator package). Internet can be turned off during prediction.

## Install packages

In [None]:
# Install the pinned packages from the attached `coleridge-packages` input:
# `datasets` is resolved from a local folder (--no-index), the others are local wheel files.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

In [None]:
#install translator. Make sure internet is enabled
!pip install google_trans_new
from google_trans_new import google_translator

#packages to find synonyms
import nltk
from nltk.corpus import wordnet

# Import

In [None]:
#imports
import os
import re
import json
import time
import datetime
import random
import glob
import importlib
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

#set seeds for reproducibility
random.seed(123)
np.random.seed(456)

In [None]:
# Copy my_seqeval.py into the working directory, because the input directory is not writable.
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Hyper-parameters

In [None]:
# --- sentence windowing ---
MAX_LENGTH = 64  # maximum number of words kept in one sentence
OVERLAP = 20     # words shared by consecutive pieces when a longer sentence is split

# --- data size and label augmentation switches ---
MAX_SAMPLE = None        # small number for quick experiments, None to use every row
ADD_SYNONYMS = False     # add synonym-substituted variants of the dataset titles
ADD_TRANSLATIONS = True  # add back-translated variants of the dataset titles
EQUALIZE_COUNTS = True   # rebalance how often each title occurs across papers

# The augmented titles only reach the papers through the equalisation step,
# so neither augmentation may be enabled without it.
assert EQUALIZE_COUNTS or not (ADD_SYNONYMS or ADD_TRANSLATIONS), "for the translations and synonyms to work label counts have to be equalised"

# Load data

In [None]:
# Locations of the label table and of the folder holding one JSON file per paper.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Read the label table and keep only the first MAX_SAMPLE rows (every row when it is None).
df = pd.read_csv(train_path).iloc[:MAX_SAMPLE]
print(f'No. raw training rows: {len(df)}')


In [None]:
# Drop every paper that mentions "Our World in Data COVID-19 dataset".
# NOTE(review): the original comment says these papers go to the validation set;
# this cell only removes them from `df` and keeps their ids in COVID_papers_IDS.
is_covid_row = df['dataset_title'] == "Our World in Data COVID-19 dataset"
COVID_papers = df[is_covid_row]
COVID_papers_IDS = set(COVID_papers['Id'])
keep_row = ~df['Id'].isin(COVID_papers_IDS)
df = df[keep_row]

# Hold out 1% of the remaining paper ids.
remaining_ids = df['Id'].unique()
train_ids, validate_ids = train_test_split(remaining_ids, test_size=0.01, random_state=42)
print(train_ids.shape, validate_ids.shape)

Group by publication; the training labels should have the same form as the expected output.

In [None]:
# Map every remaining paper id to the parsed content of its JSON file.
papers = {}
for paper_id in df['Id'].unique():
    paper_path = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_path, 'r') as f:
        paper = json.load(f)
    papers[paper_id] = paper

In [None]:
# Label rows of the papers assigned to the training split.
# `.copy()` makes `train` an independent frame, so adding a column below does not
# write into a slice of `df` (which triggers pandas' SettingWithCopyWarning).
train = df[df['Id'].isin(train_ids)].copy()
# New_label starts out as the original label text.
train['New_label'] = train['dataset_label']
# Keep the JSON content of the training papers only.
papers = {paper_id: papers[paper_id] for paper_id in train_ids}


In [None]:
def add_translations(labels):
    """Return a back-translated variant of every label.

    Each label is sent through the translator three times (en -> ug -> zh -> en);
    the final English text is collected. One second of sleep follows every label.

    Args:
        labels: iterable of label strings.

    Returns:
        A list with one translated string per input label, in input order.
    """
    # you can choose other languages for different sentences
    hops = (('en', 'ug'), ('ug', 'zh'), ('zh', 'en'))
    translator = google_translator()
    new_labels = []
    for label in labels:
        result = label
        for source_lang, target_lang in hops:
            result = translator.translate(result, lang_src=source_lang, lang_tgt=target_lang)
        new_labels.append(result)
        time.sleep(1)
    return new_labels


In [None]:
def add_synonyms(labels):
    """Create a variant of every label by swapping words for WordNet synonyms.

    Every whitespace-separated word that is not a stop word is replaced by the
    first WordNet lemma name that neither contains the word nor is contained in
    it (case-insensitive). Words without such a lemma stay unchanged.

    The replacement is done at the position of the word itself. A plain
    ``str.replace(word, rep, 1)`` would hit the first *substring* match, which
    can lie inside an earlier (possibly already substituted) word.

    Args:
        labels: iterable of label strings.

    Returns:
        A list with one (possibly modified) string per input label, in input order.
    """
    stop_words = ['in', 'of', 'and']
    new_labels = []
    for label in labels:
        string = label
        cursor = 0  # everything before this offset has already been processed
        for word in label.split():
            # Only whitespace lies between `cursor` and the next token, so this
            # finds the token itself.
            start = string.find(word, cursor)
            cursor = start + len(word)
            if word in stop_words:  # we don't want synonyms of stop words
                continue
            lowered = word.lower()
            synonyms = [
                lemma.name()
                for synset in wordnet.synsets(word)
                for lemma in synset.lemmas()
            ]
            # remove synonyms that are part of the word (or vice versa)
            synonyms = [
                candidate for candidate in synonyms
                if candidate.lower() not in lowered and lowered not in candidate.lower()
            ]
            if synonyms:
                rep = synonyms[0]  # use the first remaining synonym
                string = string[:start] + rep + string[start + len(word):]
                cursor = start + len(rep)
        new_labels.append(string)
    return new_labels

In [None]:
#function that counts the distribution of datasets among papers
def get_init_dist(labels):
    """Count, for every label, the number of distinct papers that carry it as dataset title.

    Reads the module-level ``train`` frame (columns ``Id`` and ``dataset_title``).

    Args:
        labels: iterable of dataset titles; titles absent from ``train`` get a count of 0.

    Returns:
        A list of ``[paper_count, label]`` lists sorted ascending (by count, then label).
        The inner items are lists so that the counts can be updated in place.
    """
    # One grouped pass over the frame instead of one full scan per label.
    papers_per_title = train.groupby('dataset_title')['Id'].nunique().to_dict()
    init_dist = [[int(papers_per_title.get(label, 0)), label] for label in labels]
    init_dist.sort()
    return init_dist

In [None]:
#function that replaces dataset labels in text to straighten the skewed distribution.
#after running this function each dataset title will occur approximately the same number of times across all papers.
def make_occurences_equal(init_dist):
    """Move mentions from over-represented titles to under-represented ones.

    Two pointers walk the sorted distribution from both ends. While the title at
    the low end has fewer papers than the mean and the title at the high end has
    more, one paper of the high title is rewritten: every ``dataset_label`` that
    paper uses for the high title is replaced in the paper text by the low title,
    and ``New_label`` of the matching ``train`` rows is set to the low title.

    Side effects: mutates the module-level ``papers`` (section texts) and
    ``train`` (``New_label`` column), and updates the counts in ``init_dist`` in place.

    Args:
        init_dist: list of ``[paper_count, title]`` lists sorted ascending by count.

    Returns:
        The same ``init_dist`` list with updated counts.
    """
    if not init_dist:  # nothing to balance; also avoids dividing by zero below
        return init_dist

    # Mean number of papers per title: the level every title is moved towards.
    target = sum(entry[0] for entry in init_dist) / len(init_dist)
    low_mentions = 0
    high_mentions = len(init_dist) - 1
    start_index = 0  # next paper of the current high title to rewrite
    while high_mentions > low_mentions:
        low_mention_title = init_dist[low_mentions][1]
        high_mention_title = init_dist[high_mentions][1]
        is_high_title = train['dataset_title'] == high_mention_title
        # Sorted so that the choice of papers does not depend on set iteration
        # order (string hashing differs between interpreter runs).
        Ids = sorted(set(train.loc[is_high_title, 'Id']))
        while init_dist[low_mentions][0] < target and init_dist[high_mentions][0] > target:
            Id = Ids[start_index]
            labels_in_text = train.loc[is_high_title & (train['Id'] == Id), 'dataset_label']
            # Longest label first, so a label that contains another one is replaced as a whole.
            labels_sorted = sorted(labels_in_text, key=len, reverse=True)
            for label in labels_sorted:
                for section in papers[Id]:
                    section['text'] = section['text'].replace(label, low_mention_title) #replace high mention title with low mention title
                train.loc[(train['Id'] == Id) & (train['dataset_label'] == label), 'New_label'] = low_mention_title
            init_dist[low_mentions][0] += 1
            init_dist[high_mentions][0] -= 1
            start_index += 1
        if init_dist[low_mentions][0] >= target:
            low_mentions += 1   # low title is filled up; continue with the next one
        else:
            high_mentions -= 1  # high title has nothing left to give
            start_index = 0
    return init_dist


In [None]:
# all dataset labels
all_labels = list(set(train['dataset_title']))

# Optionally extend the titles with synonym-substituted and back-translated variants.
if ADD_SYNONYMS:
    all_labels = list(set(all_labels) | set(add_synonyms(all_labels)))
if ADD_TRANSLATIONS:
    all_labels = list(set(all_labels) | set(add_translations(all_labels)))

# Count the papers per title, then rebalance the mentions across papers.
if EQUALIZE_COUNTS:
    init_dist = get_init_dist(all_labels)
    init_dist = make_occurences_equal(init_dist)

In [None]:
#group same paper Ids together
pipe_join = '|'.join  # the label fields of one paper are concatenated with '|'
aggregations = {
    'pub_title': 'first',
    'dataset_title': pipe_join,
    'dataset_label': pipe_join,
    'cleaned_label': pipe_join,
    'New_label': pipe_join,
}
train = train.groupby('Id').agg(aggregations).reset_index()

print(f'No. grouped training rows: {len(train)}')

# Transform data to NER format

In [None]:
def clean_training_text(txt):
    """
    Like the default clean_text function, but the original casing is kept:
    every run of characters other than A-Z, a-z, 0-9 becomes one space and the
    result is stripped at both ends. The input is converted with str() first.
    """
    as_text = str(txt)
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return collapsed.strip()

def shorten_sentences(sentences):
    """Split sentences with more than MAX_LENGTH words into overlapping pieces.

    A long sentence is cut into windows of up to MAX_LENGTH words whose start
    positions are MAX_LENGTH - OVERLAP words apart. Sentences within the limit
    are passed through unchanged.

    Returns:
        A list of sentence strings in input order.
    """
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= MAX_LENGTH:
            short_sentences.append(sentence)
            continue
        window_starts = range(0, len(words), MAX_LENGTH - OVERLAP)
        short_sentences.extend(
            ' '.join(words[start:start + MAX_LENGTH]) for start in window_starts
        )
    return short_sentences

def find_sublist(big_list, small_list):
    """Return every start index at which small_list occurs as a contiguous run in big_list."""
    width = len(small_list)
    last_start = len(big_list) - width
    return [
        start
        for start in range(last_start + 1)
        if small_list == big_list[start:start + width]
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """Tag the words of a sentence with B/I/O marks for the given dataset labels.

    Args:
        sentence: cleaned sentence text.
        labels: cleaned label strings, or None. Labels that are empty after
            stripping are ignored.

    Returns:
        ``(is_positive, tagged)``. ``is_positive`` is True when at least one
        label occurs in the sentence as whole words. ``tagged`` is a list of
        ``(word, tag)`` pairs, where tag is 'B' for the first word of a label
        occurrence, 'I' for its remaining words and 'O' otherwise.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # Check for None before sorting (sorted(None) raises), and drop empty labels:
    # an empty label matches at every position, one of them past the last word.
    if labels is None:
        labels = []
    candidates = sorted((label for label in labels if label.strip()), key=len, reverse=True)

    # re.escape keeps the check literal even if a label was not cleaned.
    is_positive = any(re.search(f'\\b{re.escape(label)}\\b', sentence)
                      for label in candidates)
    if is_positive:
        # Longest label first; a span is only tagged while all its words are
        # still 'O', so a shorter label cannot overwrite part of a longer match.
        for label in candidates:
            label_words = label.split()
            span = len(label_words)
            for pos in find_sublist(sentence_words, label_words):
                if any(tag != 'O' for tag in nes[pos:pos + span]):
                    continue
                nes[pos] = 'B'
                for i in range(pos + 1, pos + span):
                    nes[i] = 'I'

    return is_positive, list(zip(sentence_words, nes))

In [None]:
# Counters: sentences containing a label / label-free sentences mentioning
# 'data' or 'study' / other label-free sentences kept by random sampling.
cnt_pos, cnt_neg, cnt_rnd = 0, 0, 0
ner_data = []

for paper_id, dataset_label in zip(train['Id'], train['New_label']):
    # paper
    paper = papers[paper_id]

    # labels
    labels = [clean_training_text(label) for label in dataset_label.split('|')]

    # sentences: dict.fromkeys removes duplicates but keeps first-seen order.
    # A set would be iterated in hash order, which differs between interpreter
    # runs for strings, so the seeded random sampling and shuffle below would
    # not be reproducible.
    sentences = list(dict.fromkeys(
        clean_training_text(sentence)
        for section in paper
        for sentence in section['text'].split('.')
    ))
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            # sentence contains at least one label
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']):
            # label-free sentence containing 'data' or 'study': always kept
            ner_data.append(tags)
            cnt_neg += 1
        else:
            # any other label-free sentence is kept with probability 0.01
            p = random.uniform(0,1)
            if p < 0.01:
                ner_data.append(tags)
                cnt_rnd += 1

print(cnt_pos)
print(cnt_neg)
print(cnt_rnd)

# shuffling
random.shuffle(ner_data)

Write the data to a file.

In [None]:
# One JSON object per line: the words of a sentence and their tags.
with open('train_ner.json', 'w') as f:
    for row in ner_data:
        words, nes = zip(*row)  # split the (word, tag) pairs into two parallel tuples
        f.write(json.dumps({'tokens' : words, 'tags' : nes}))
        f.write('\n')

# Fine-tune a BERT model for NER

In [None]:
# Run the NER training script on the file written above, starting from bert-base-cased,
# for one epoch with seed 123; results go to ./output.
# Only --do_train is passed; the training file is also given as --validation_file.
# NOTE(review): how kaggle_run_ner.py interprets each flag is not visible here - confirm in the script.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'bert-base-cased' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed 123 \
--do_train 

After the tuning finishes, we should find our model in './output'.