This notebook shows how to fine-tune a BERT model (from huggingface) for our dataset recognition task.

Note that internet access is needed during the training phase (to download the bert-base-cased model). Internet access can be turned off during prediction.

In [None]:
# Notebook-wide configuration constants.

# A sentence is kept for NER training only if it has more than LENGTH characters.
LENGTH = 1
# Jaccard threshold for label de-duplication: a label is kept only if its similarity
# to every already-kept label of the same paper is below FL_TH.
FL_TH = 0.75

MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, it is split into chunks whose windows share OVERLAP words

MAX_SAMPLE = None # row limit applied when loading the CSVs; set a small number for experimentation, set None for production.
# Seed passed to set_seed() for Python's `random` and NumPy.
SEED = 42

# Setting

In [None]:
# Install the required packages from local files in the attached `coleridge-packages` input.
# `datasets` is resolved with --no-index against a local directory; the others are local wheels.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import clear_output

clear_output()

In [None]:
# https://huggingface.co/transformers/_modules/transformers/trainer_utils.html
def set_seed(seed: int):
    """
    Helper function for reproducible behavior: seeds Python's ``random`` module and NumPy's
    global random state, then prints the seed that was applied.

    Only these two generators are seeded here; no deep-learning framework is touched by
    this function.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Report the value actually applied (the argument), not the module-level constant.
    print(f'Setted Pipeline SEED = {seed}')


# Seed Python's `random` and NumPy once, up front, with the notebook-wide SEED constant.
set_seed(SEED)

# Internal Training Data

In [None]:
def text_cleaning(text):
    '''
    Normalise a piece of text for label comparison.

    The input is converted to ``str`` and lower-cased, every run of characters other than
    ASCII letters and digits (punctuation, emojis, repeated whitespace, ...) is replaced by
    a single space, and leading/trailing spaces are removed.

    text - Sentence that needs to be cleaned
    Returns the cleaned string (possibly empty).
    '''
    # One substitution is sufficient: replacing each non-alphanumeric run with one space
    # already removes emojis and collapses repeated spaces, so no further passes are needed.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()


def jaccard_similarity(s1, s2):
    """
    Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces and compared as *sets* of words:
    ``|A & B| / |A | B|``.

    Args:
        s1: first string.
        s2: second string.

    Returns:
        float in [0, 1]; 1.0 when both strings contain exactly the same distinct words.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    intersection = len(words1 & words2)
    # Size the union from the sets, not the raw word lists: counting repeated words
    # would inflate the denominator and under-report the similarity.
    union = len(words1) + len(words2) - intersection
    return float(intersection) / union


def find_data_sets_id(publication_id: int) -> list:
    """
    Collect the ``data_set_id`` of every entry in the module-level ``data_set_citations``
    whose ``publication_id`` equals the given one, in their original order.
    """
    return [
        citation['data_set_id']
        for citation in data_set_citations
        if citation['publication_id'] == publication_id
    ]


def find_data_set_citations_mention_list(publication_id: int) -> str:
    """
    Build one '|'-separated string of all mentions recorded for a publication.

    Each matching entry of the module-level ``data_set_citations`` contributes its
    ``mention_list`` joined with '|'; entries whose joined text is empty are skipped.
    Returns '' when the publication has no mentions.
    """
    per_citation = (
        '|'.join(citation['mention_list'])
        for citation in data_set_citations
        if citation['publication_id'] == publication_id
    )
    return '|'.join(joined for joined in per_citation if joined != '')


def find_data_sets_title(data_set_id: int) -> str:
    """
    Return the ``title`` of the first entry in the module-level ``data_sets`` whose
    ``data_set_id`` matches, or ``None`` when no entry matches.
    """
    matching_titles = (
        entry['title'] for entry in data_sets if entry['data_set_id'] == data_set_id
    )
    return next(matching_titles, None)
        

def RichContextDF(publications: list) -> pd.DataFrame:
    """
    Build one row per publication from the parsed ``publications.json`` content.

    Args:
        publications: iterable of dicts, each providing ``publication_id`` and
            ``text_file_name``.

    Returns:
        DataFrame with the columns
        ``publication_id``, ``text_file_name``,
        ``citations_mention_list`` ('|'-joined mentions, '' when there are none) and
        ``data_sets_title`` ('||'-joined titles of the cited data sets).
    """
    columns = ['publication_id', 'text_file_name', 'citations_mention_list', 'data_sets_title']
    rows = []
    for class_ in publications:
        publication_id = class_['publication_id']

        # A cited data set id that has no entry in the data set catalogue yields None;
        # skip it instead of letting str.join fail on a non-string item.
        titles = [
            title
            for title in (find_data_sets_title(data_sets_id)
                          for data_sets_id in find_data_sets_id(publication_id))
            if title is not None
        ]

        rows.append({
            'publication_id': publication_id,                 # to get data_set_citations
            'text_file_name': class_['text_file_name'],       # to get text_file
            'citations_mention_list': find_data_set_citations_mention_list(publication_id),
            'data_sets_title': '||'.join(titles),
        })

    # Passing `columns` keeps the four columns (in this order) even for empty input.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Locations of the competition's training labels (CSV) and of the per-paper JSON files.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Load the raw label rows and optionally truncate them to MAX_SAMPLE rows.
# NOTE(review): the "Train" cell below reads train.csv again and overwrites `train`.
train = pd.read_csv(train_path)
train = train[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')

Group by publication so that the training labels have the same form as the expected output.

## Train

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

train = pd.read_csv(train_path)
print('train size before agg.:', len(train))

# One row per publication: keep the first title and concatenate every label column with '|'
# so the training labels have the same form as the expected output.
train = (
    train[:MAX_SAMPLE]
    .groupby('Id')
    .agg({
        'pub_title': 'first',
        'dataset_title': '|'.join,
        'dataset_label': '|'.join,
        'cleaned_label': '|'.join
    })
    .reset_index()
)
print('train size after agg.:', len(train))

train = train.sort_values(by=['Id'])
train.head()

## Pseudo

In [None]:
pseudo_train_path = '../input/coleridge-pseudolabelsv2-0585/submission.csv'

pseudo_train = pd.read_csv(pseudo_train_path)
print('pseudo_train size before drop_duplicates.:', len(pseudo_train))

# Optional row limit first, then keep a single row per publication Id.
pseudo_train = pseudo_train[:MAX_SAMPLE].drop_duplicates(subset='Id')
print('pseudo_train size after drop_duplicates:', len(pseudo_train), '\n')

# Order by Id and renumber the rows from 0.
pseudo_train = (
    pseudo_train
    .sort_values(by=['Id'])
    .reset_index(drop=True)
)
pseudo_train.head()

## Train + Pseudo

In [None]:
# Append each publication's pseudo labels to its ground-truth labels.
# The lookup is keyed on `Id` rather than on row position: adding the two columns directly
# would pair rows by index, which mislabels papers (or yields NaN) whenever the two frames
# do not contain exactly the same publications in the same order.
pseudo_by_id = pseudo_train.drop_duplicates(subset='Id').set_index('Id')['PredictionString']
pseudo_labels = train['Id'].map(pseudo_by_id).fillna('')  # '' when a paper has no pseudo label
# rstrip removes the dangling separator left behind by an empty pseudo label.
train['dataset_label'] = (train['dataset_label'] + '|' + pseudo_labels).str.rstrip('|')

del pseudo_train

train.head()

In [None]:
# Per paper, drop labels that are near-duplicates (Jaccard similarity >= FL_TH on the cleaned
# text) of a label that was already kept; the first spelling encountered wins.
train_temp = []
for labels in tqdm(train['dataset_label']):
    filtered_labels = []   # kept labels, original spelling
    filtered_cleaned = []  # their text_cleaning() forms, cached so each label is cleaned once
    for label in labels.split('|'):
        cleaned_label = text_cleaning(label)
        if not cleaned_label:
            # Nothing alphanumeric left (e.g. an empty piece between two '|'):
            # such a label can never be matched in a sentence, so it is dropped.
            continue
        # all() over an empty list is True, so the first label is always kept.
        if all(jaccard_similarity(cleaned_label, got_label) < FL_TH for got_label in filtered_cleaned):
            filtered_labels.append(label)
            filtered_cleaned.append(cleaned_label)
    train_temp.append('|'.join(filtered_labels))
train['dataset_label'] = train_temp
train.head()

# External Training Data

In [None]:
# Parse the three JSON files of the external Rich Context data set.
with open('../input/rich-context-competition-train-testtargz/train_test/publications.json', 'r') as publications_file:
    publications = json.loads(publications_file.read())

with open('../input/rich-context-competition-train-testtargz/train_test/data_set_citations.json', 'r') as citations_file:
    data_set_citations = json.loads(citations_file.read())

with open('../input/rich-context-competition-train-testtargz/train_test/data_sets.json', 'r') as data_sets_file:
    data_sets = json.loads(data_sets_file.read())

In [None]:
# Keep only the publications that have at least one recorded mention.
RichContext_train = RichContextDF(publications).loc[
    lambda frame: frame['citations_mention_list'] != ''
]
RichContext_train

# Transform data to NER format

In [None]:
def clean_training_text(txt):
    """
    Case-preserving cleaner: converts the input to ``str``, replaces every run of characters
    other than ASCII letters/digits with one space and trims both ends.
    """
    spaced = re.sub('[^A-Za-z0-9]+', ' ', str(txt))
    return spaced.strip()

def shorten_sentences(sentences):
    """
    Split over-long sentences into overlapping word windows.

    A sentence with at most MAX_LENGTH whitespace-separated words is passed through unchanged.
    A longer one is replaced by windows of up to MAX_LENGTH words whose start positions advance
    by MAX_LENGTH - OVERLAP words. Returns the resulting list of sentences.
    """
    result = []
    for text in sentences:
        tokens = text.split()
        token_count = len(tokens)
        if token_count <= MAX_LENGTH:
            result.append(text)
            continue
        stride = MAX_LENGTH - OVERLAP
        result.extend(
            ' '.join(tokens[start:start + MAX_LENGTH])
            for start in range(0, token_count, stride)
        )
    return result

def find_sublist(big_list, small_list):
    """
    Return every start index at which ``small_list`` occurs as a contiguous slice of
    ``big_list`` (ascending order; empty list when there is no occurrence).
    """
    width = len(small_list)
    last_start = len(big_list) - width
    return [
        start
        for start in range(last_start + 1)
        if small_list == big_list[start:start + width]
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag the words of a sentence with B/I/O labels for the given dataset names.

    Args:
        sentence: cleaned sentence text.
        labels: cleaned dataset labels, or ``None``. Labels without any word
            (empty or whitespace-only) are ignored.

    Returns:
        ``(is_positive, tagged)`` where ``is_positive`` tells whether at least one label occurs
        in the sentence and ``tagged`` is a list of ``(word, tag)`` pairs; the first word of
        each occurrence is tagged 'B', its remaining words 'I', everything else 'O'.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # An empty label would match the bare word-boundary pattern in every sentence and make
    # find_sublist report one position past the last word, so such labels are dropped first.
    usable_labels = [label for label in (labels or []) if label.split()]

    # re.escape keeps the label literal even if it contains a regex metacharacter.
    is_positive = any(re.search(f'\\b{re.escape(label)}\\b', sentence)
                      for label in usable_labels)

    if is_positive: # positive sample
        for label in usable_labels:
            label_words = label.split()

            for pos in find_sublist(sentence_words, label_words):
                nes[pos] = 'B'
                for i in range(pos+1, pos+len(label_words)):
                    nes[i] = 'I'

    # negative sample: every tag stays 'O'
    return is_positive, list(zip(sentence_words, nes))

In [None]:
cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

pbar = tqdm(total = len(RichContext_train))
for paper_id, dataset_labels in RichContext_train[['publication_id', 'citations_mention_list']].itertuples(index=False):
    # labels
    labels = [clean_training_text(label) for label in dataset_labels.split('|')]
    
    # paper: strip only the line terminator, so a last line without a trailing newline
    # does not lose its final character
    with open(f'../input/rich-context-competition-train-testtargz/train_test/files/text/{paper_id}.txt', 'r') as f:
        paper = [line.rstrip('\n') for line in f]
    content = ' '.join(paper)
    
    # sentences: dict.fromkeys removes duplicates while keeping first-seen order, so the
    # data (and the seeded shuffle below) is identical from run to run; a set of strings
    # iterates in an order that changes with Python's per-process hash randomisation
    sentences = list(dict.fromkeys(clean_training_text(sentence) for sentence in content.split('.')))
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > LENGTH] # only accept sentences with more than LENGTH chars
    
    # keep every positive sentence; keep a negative one only if it mentions 'data' or 'study'
    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']): 
            ner_data.append(tags)
            cnt_neg += 1
    
    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")
pbar.close()

# shuffling
random.shuffle(ner_data)

In [None]:
# Load the JSON content of every training paper, keyed by publication Id.
papers = {}
for paper_id in tqdm(train['Id'].unique()):
    with open(f'{paper_train_folder}/{paper_id}.json', 'r') as paper_file:
        papers[paper_id] = json.load(paper_file)

In [None]:
# Extends the `ner_data`, `cnt_pos` and `cnt_neg` built from the external data with the
# sentences of the competition's training papers.
pbar = tqdm(total=len(train))
for paper_id, dataset_label in train[['Id', 'dataset_label']].itertuples(index=False):
    # paper (loop variable is `paper_id`, so the builtin `id` is not shadowed)
    paper = papers[paper_id]
    
    # labels
    labels = [clean_training_text(label) for label in dataset_label.split('|')]
    
    # sentences: dict.fromkeys removes duplicates while keeping first-seen order, so the
    # data (and the seeded shuffle below) is identical from run to run; a set of strings
    # iterates in an order that changes with Python's per-process hash randomisation
    sentences = list(dict.fromkeys(
        clean_training_text(sentence)
        for section in paper
        for sentence in section['text'].split('.')
    ))
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > LENGTH] # only accept sentences with more than LENGTH chars
    
    # keep every positive sentence; keep a negative one only if it mentions 'data' or 'study'
    for sentence in sentences:
        is_positive, tags = tag_sentence(sentence, labels)
        if is_positive:
            cnt_pos += 1
            ner_data.append(tags)
        elif any(word in sentence.lower() for word in ['data', 'study']): 
            ner_data.append(tags)
            cnt_neg += 1
    
    # process bar
    pbar.update(1)
    pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")
pbar.close()

# shuffling
random.shuffle(ner_data)

Write the NER data to a file, one JSON object per line.

In [None]:
# One JSON object per line: the tokens of a sentence and their parallel tags.
with open('train_ner.json', 'w') as out_file:
    for tagged_sentence in tqdm(ner_data):
        tokens, tag_seq = zip(*tagged_sentence)
        out_file.write(json.dumps({'tokens' : tokens, 'tags' : tag_seq}))
        out_file.write('\n')

# Fine-tune a BERT model for NER

In [None]:
# IPython substitutes {SEED} with the notebook's SEED constant before running the command,
# so the training script uses the same seed as the rest of the notebook.
!python ../input/kaggle-ner-utils/kaggle_run_ner.py \
--model_name_or_path 'bert-base-cased' \
--train_file './train_ner.json' \
--validation_file './train_ner.json' \
--num_train_epochs 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--save_steps 15000 \
--output_dir './output' \
--report_to 'none' \
--seed {SEED} \
--do_train

After the tuning finishes, we should find our model in './output'.