This notebook combines literal string matching with Named Entity Recognition (NER) using BERT (base model from Hugging Face).

The BERT model was trained in a separate kernel: PyTorch BERT for Named Entity Recognition.

In [None]:
MAX_SAMPLE = None # set a small number for experimentation, set None for production.

# Install packages

In [None]:
# Install `datasets` from a local package directory only (--no-index disables the package index).
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
# Install the remaining dependencies from wheel files shipped in the coleridge-packages input.
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

random.seed(123)
np.random.seed(456)

# Load data

In [None]:
from sklearn.model_selection import train_test_split

# Read the training annotations; slicing with MAX_SAMPLE=None keeps every row.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)[:MAX_SAMPLE]

# Hold out 20% of the annotation rows so predictions can be checked with our own metrics.
train, train_test = train_test_split(train, test_size=0.2, random_state=1734)

# Load the JSON body of every publication referenced by the training split,
# keyed by publication Id.
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train['Id'].unique():
    paper_file = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        papers[paper_id] = json.load(f)

In [None]:
# sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
# sample_submission = pd.read_csv(sample_submission_path)

# paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
# for paper_id in sample_submission['Id']:
#     with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
#         paper = json.load(f)
#         papers[paper_id] = paper

In [None]:
# Testing on training data: also load the publications of the held-out split
# into the same `papers` dictionary.
for paper_id in train_test['Id'].unique():
    held_out_file = f'{paper_train_folder}/{paper_id}.json'
    with open(held_out_file, 'r') as f:
        papers[paper_id] = json.load(f)

# Literal matching

### Create a knowledge bank

In [None]:
# Knowledge bank: every lower-cased title / label / cleaned label seen in the training split.
label_columns = train[['dataset_title', 'dataset_label', 'cleaned_label']]
all_labels = {
    str(value).lower()
    for label_row in label_columns.itertuples(index=False)
    for value in label_row
}

print(f'No. different labels: {len(all_labels)}')

### Matching on test data

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` (after ``str()``) and turn every run of
    non-alphanumeric characters into a single space, trimming both ends."""
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Run ``clean_text`` and then squeeze any repeated spaces into one."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
# literal_preds = []

# for paper_id in sample_submission['Id']:
#     paper = papers[paper_id]
#     text_1 = '. '.join(section['text'] for section in paper).lower()
#     text_2 = totally_clean_text(text_1)
    
#     labels = set()
#     for label in all_labels:
#         if label in text_1 or label in text_2:
#             labels.add(clean_text(label))
    
#     literal_preds.append('|'.join(labels))


In [None]:
# # This section is for the testing on training data
# literal_preds = []

# for paper_id in train_test['Id'].unique():
#     paper = papers[paper_id]
#     text_1 = '. '.join(section['text'] for section in paper).lower()
#     text_2 = totally_clean_text(text_1)
    
#     labels = set()
#     for label in all_labels:
#         if label in text_1 or label in text_2:
#             labels.add(clean_text(label))
    
#     literal_preds.append('|'.join(labels))

In [None]:
# literal_preds[:5]

# Bert prediction

### Paths and Hyperparameters

In [None]:
# Sentence windowing used by shorten_sentences.
MAX_LENGTH = 64 # max no. of whitespace-separated words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, it is split into several windows that share OVERLAP words

# Number of test rows written to the input file for each prediction run.
PREDICT_BATCH = 64000  

# Exported as MODEL_PATH and passed to the NER script as --model_name_or_path.
PRETRAINED_PATH = '../input/pytorch-bert-for-named-entity-recognition/output'
# Directory and file name of the NER-format input written before each prediction run.
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
# Passed to the NER script as --train_file / --validation_file.
# NOTE(review): VAL_PATH points at the same file as TRAIN_PATH; the script is only run
# with --do_predict, so this is presumably a placeholder -- confirm.
TRAIN_PATH = '../input/pytorch-bert-for-named-entity-recognition/train_ner.json'
VAL_PATH = '../input/pytorch-bert-for-named-entity-recognition/train_ner.json'

# Output directory of the NER script and the predictions file read back from it.
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

### Transform data to NER format

Group by publication so that the training labels have the same form as the expected output: one row per publication, with multiple labels joined by `|`.

In [None]:
# Collapse the annotations to one row per publication: keep the first pub_title
# and join every label-like column with '|'.
aggregations = {
    'pub_title': 'first',
    **{column: '|'.join for column in ('dataset_title', 'dataset_label', 'cleaned_label')},
}
train = train.groupby('Id').agg(aggregations).reset_index()

print(f'No. grouped training rows: {len(train)}')

In [None]:
def clean_training_text(txt):
    """
    Case-preserving counterpart of clean_text: every run of non-alphanumeric
    characters in ``str(txt)`` becomes one space and the ends are trimmed.
    """
    as_text = str(txt)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return spaced.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split sentences that are too long into overlapping word windows.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to process; words are separated by whitespace.
    max_length : int, optional
        Maximum number of words per output sentence. Defaults to MAX_LENGTH.
    overlap : int, optional
        Number of words shared by consecutive windows. Defaults to OVERLAP.

    Returns
    -------
    list of str
        Sentences of at most ``max_length`` words. Sentences that already fit
        are returned unchanged; longer ones are replaced by their windows.

    Raises
    ------
    ValueError
        If ``overlap`` is not smaller than ``max_length`` (the window could
        never advance).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue
        for start in range(0, len(words), step):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # Stop once a window reaches the end of the sentence: any later
            # window would be fully contained in this one and only add
            # redundant, context-poor fragments.
            if start + max_length >= len(words):
                break
    return short_sentences

In [None]:
# test_rows = [] # test data in NER format
# paper_length = [] # store the number of sentences each paper has

# for paper_id in sample_submission['Id']:
#     # load paper
#     paper = papers[paper_id]
    
#     # extract sentences
#     sentences = [clean_training_text(sentence) for section in paper 
#                  for sentence in section['text'].split('.')
#                 ]
#     sentences = shorten_sentences(sentences) # make sentences short
#     sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
#     sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
        
#     # collect all sentences in json
#     for sentence in sentences:
#         sentence_words = sentence.split()
#         dummy_tags = ['O']*len(sentence_words)
#         test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})

#     # track which sentence belongs to which data point
#     paper_length.append(len(sentences))

# print(f'total number of sentences: {len(test_rows)}')

In [None]:
test_rows = []     # NER-format rows: one dict of tokens and dummy tags per sentence
paper_length = []  # number of sentences kept for each paper, in iteration order

for paper_id in tqdm(train_test['Id'].unique()):
    paper = papers[paper_id]

    # Split every section on '.' and strip non-alphanumerics (case preserved).
    raw_sentences = []
    for section in paper:
        for piece in section['text'].split('.'):
            raw_sentences.append(clean_training_text(piece))

    # Keep windows longer than 10 characters that mention 'data' or 'study'.
    sentences = []
    for candidate in shorten_sentences(raw_sentences):
        if len(candidate) <= 10:
            continue
        lowered = candidate.lower()
        if 'data' in lowered or 'study' in lowered:
            sentences.append(candidate)

    # Every token gets the placeholder tag 'O'.
    for kept_sentence in sentences:
        tokens = kept_sentence.split()
        test_rows.append({'tokens': tokens, 'tags': ['O'] * len(tokens)})

    # Remember how many rows belong to this paper so predictions can be re-grouped later.
    paper_length.append(len(sentences))

print(f'total number of sentences: {len(test_rows)}')

### Do predict and collect results

In [None]:
# Export the paths as environment variables so the shell command in bert_predict can read them.
os.environ.update({
    "MODEL_PATH": f"{PRETRAINED_PATH}",
    "TRAIN_FILE": f"{TRAIN_PATH}",
    "VALIDATION_FILE": f"{VAL_PATH}",
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": f"{PREDICTION_SAVE_PATH}",
})

In [None]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make the directory that will hold the NER input file (no error if it already exists)
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
# Run the external NER script with --do_predict.
# Every path argument is expanded by the shell from the MODEL_PATH, TRAIN_FILE,
# VALIDATION_FILE, TEST_FILE and OUTPUT_DIR environment variables, so those must be
# set (see the os.environ assignments) before this function is called.
# Takes no arguments and returns nothing; results are read back from OUTPUT_DIR by the caller.
def bert_predict():
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
# print(len(test_rows))

In [None]:
# Predicted tag sequences, one list of tags per test row, in the same order as test_rows.
bert_outputs = []

# Predict in chunks of PREDICT_BATCH rows; each chunk overwrites the same input file.
for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file, one JSON object per line
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # remove output dir left over from the previous batch
    # NOTE(review): `rm -r` reports an error when the directory does not exist yet
    # (e.g. on the first batch); `rm -rf` would silence that.
    !rm -r "$OUTPUT_DIR"
    
    # do predict
    bert_predict()
    
    # read predictions: one whitespace-separated tag sequence per line.
    # [:-1] drops the last element of the split -- assumes the file ends with a newline.
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

### Restore Dataset labels from predictions

In [None]:
# Keep only the token lists of the test rows, then drop the row dicts themselves.
test_sentences = []
for row in test_rows:
    test_sentences.append(row['tokens'])

del test_rows

In [None]:
bert_dataset_labels = [] # one set of predicted dataset phrases per publication

# test_sentences / bert_outputs hold the rows of all publications back to back;
# paper_length says how many of the leading rows belong to each publication.
for length in paper_length:
    labels = set()
    # Decode every sentence of this publication. (A debugging leftover previously
    # limited this to the first two rows, ignoring most of each paper.)
    for sentence, pred in zip(test_sentences[:length], bert_outputs[:length]):
        curr_phrase = ''
        for word, tag in zip(sentence, pred):
            if tag == 'B': # start a new phrase, closing any phrase still open
                if curr_phrase:
                    labels.add(curr_phrase)
                curr_phrase = word
            elif tag == 'I' and curr_phrase: # continue the phrase
                curr_phrase += ' ' + word
            else: # any other tag (or an 'I' with no open phrase) ends the phrase
                if curr_phrase:
                    labels.add(curr_phrase)
                    curr_phrase = ''
        # a phrase that runs to the end of the sentence is still open here
        if curr_phrase:
            labels.add(curr_phrase)

    # record dataset labels for this publication
    bert_dataset_labels.append(labels)

    # consume this publication's rows so the next one starts at index 0
    del test_sentences[:length], bert_outputs[:length]

In [None]:
bert_dataset_labels[:5]

### Filter based on Jaccard score and clean

In [None]:
def jaccard_similarity(s1, s2):
    """
    Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces and compared as sets, so repeated
    words count once. (The union was previously derived from the list lengths,
    which deflated the score whenever a word occurred more than once.)

    Returns a float in [0, 1]: |intersection| / |union|.
    """
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    # str.split(" ") always yields at least one element, so the union is never empty.
    union = words_1 | words_2
    return float(len(words_1 & words_2)) / len(union)

# For each publication keep a predicted phrase only if it is not a near-duplicate
# (Jaccard score >= 0.75) of a phrase already kept; shorter phrases are considered first.
filtered_bert_labels = []

for labels in bert_dataset_labels:
    kept = []
    for raw_label in sorted(labels, key=len):
        candidate = clean_text(raw_label)
        near_duplicate = any(
            jaccard_similarity(candidate, previous) >= 0.75 for previous in kept
        )
        if not near_duplicate:
            kept.append(candidate)

    filtered_bert_labels.append('|'.join(kept))

In [None]:
filtered_bert_labels[:5]

# Aggregate final predictions and write submission file

In [None]:
final_predictions = filtered_bert_labels
# for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels):
#     if literal_match:
#         final_predictions.append(literal_match)
#     else:
#         final_predictions.append(bert_pred)

In [None]:
# sample_submission['PredictionString'] = final_predictions
# sample_submission.head()

In [None]:
# Normalized (lower-cased, alphanumeric-only) dataset title for every annotation row.
# NOTE(review): `train` was grouped per publication earlier, so for papers with several
# datasets its dataset_title is a '|'-joined string and dataset_title2 merges them into
# one title -- confirm this is intended.
train['dataset_title2'] = [clean_training_text(x.lower()) for x in train['dataset_title']]
train_test['dataset_title2'] = [clean_training_text(x.lower()) for x in train_test['dataset_title']]

complete = pd.concat([train, train_test])

# One row per normalized title ('Id') with the set of all normalized spellings
# ('Labels') found in the dataset_label, dataset_title and cleaned_label columns.
# Grouping on dataset_title2 keeps 'Id' and 'Labels' aligned even when two raw titles
# normalize to the same string (building them from two different unique() calls
# would then produce columns of different length and fail on assignment).
# sort=False keeps the titles in order of first appearance.
label_records = []
for title2, title_rows in complete.groupby('dataset_title2', sort=False):
    variants = {
        clean_training_text(value.lower())
        for column in ('dataset_label', 'dataset_title', 'cleaned_label')
        for value in title_rows[column].unique()
    }
    label_records.append({'Id': title2, 'Labels': variants})

datalabels = pd.DataFrame(label_records, columns=['Id', 'Labels'])

In [None]:
# Publications of the held-out split, in the order the predictions were generated.
uniq = train_test['Id'].unique()

# Not used by the code below; kept so the name remains available in the notebook.
group_id = train_test.groupby(['Id'])

# Materialize (normalized title, set of spellings) pairs once instead of
# re-iterating the DataFrame for every publication.
dataset_variants = list(zip(datalabels['Id'], datalabels['Labels']))

# Collect plain records and build the frame once: DataFrame.append in a loop is
# quadratic and no longer exists in pandas >= 2.0.
prediction_records = []
for u, pred in tqdm(zip(uniq, final_predictions), total=min(len(uniq), len(final_predictions))):
    # '|'-separated prediction string -> set of non-empty predicted phrases
    predictions = set(filter(None, pred.split('|')))

    # A dataset is predicted when any of its known spellings was predicted.
    # Each dataset is listed at most once, even if several spellings match,
    # so it is not counted repeatedly by the evaluation cells.
    label_predictions = [
        dataset_id
        for dataset_id, variants in dataset_variants
        if not predictions.isdisjoint(variants)
    ]

    prediction_records.append({'Id': u, 'Predictions': label_predictions})

t_new = pd.DataFrame(prediction_records, columns=['Id', 'Predictions'])
t_new.head()

In [None]:
# print(train_test[:5])
# print("test")
# print(train[:5])
# train_test['Id'].unique()[:5]
#[len(i) for i in t_new['Predictions'][:5]]

In [None]:
# sample_submission.to_csv(f'submission.csv', index=False)
# t_new.to_csv(f'submission.csv', index=False)
# Different names for datasets

In [None]:
len(train_test)

In [None]:
# Compare the predicted dataset titles with the annotations of the held-out
# split (train_test) and of the grouped training split (train).
correct = correct2 = wrong_assign = wrong_miss_test = wrong_miss_train = 0
empty = 0

# Classify every predicted title of every publication.
for _, prediction_row in tqdm(t_new.iterrows()):
    paper_id = prediction_row['Id']
    predicted_titles = prediction_row['Predictions']

    # Publication without any prediction
    if len(predicted_titles) == 0:
        empty += 1

    # Annotated titles of this publication in each split
    titles_in_test = train_test.loc[train_test['Id'] == paper_id, 'dataset_title2']
    titles_in_train = train.loc[train['Id'] == paper_id, 'dataset_title2']

    for title in predicted_titles:
        if (titles_in_test == title).any():
            # Matches a held-out annotation
            correct += 1
        elif (titles_in_train == title).any():
            # Matches a training annotation
            correct2 += 1
        else:
            # Known label, but not annotated for this publication in either split
            wrong_assign += 1

# Held-out annotations whose title was not predicted for the publication.
for _, annotation in tqdm(train_test.iterrows()):
    same_paper = t_new.loc[t_new['Id'] == annotation['Id'], 'Predictions']
    found = any(annotation['dataset_title2'] in predicted for predicted in same_paper)
    if not found:
        wrong_miss_test += 1

# Training annotations that were missed, counted only for publications that have predictions.
for _, annotation in tqdm(train.iterrows()):
    same_paper = t_new.loc[t_new['Id'] == annotation['Id'], 'Predictions']
    if len(same_paper) == 0:
        continue
    found = any(annotation['dataset_title2'] in predicted for predicted in same_paper)
    if not found:
        wrong_miss_train += 1


print(f"Correct: {correct}, Correct2: {correct2}, Wrongly assigned: {wrong_assign}, Missed test: {wrong_miss_test}, Missed train: {wrong_miss_train}, Empty: {empty} ")

In [None]:
# BOTTOM SET: the least frequent dataset titles that together cover at least
# 20% of the rows of `complete`.
title_counts = complete.groupby('dataset_title').count().reset_index()
titles_sorted = title_counts.sort_values(['Id'], ascending=True)

titles_selected = []
count = 0
for title, n_rows in zip(titles_sorted['dataset_title'], titles_sorted['Id']):
    titles_selected.append(title)
    count += n_rows
    if count >= len(complete) * 0.2:
        break

# Restrict both splits to rows whose title is in the selection.
selected_lookup = set(titles_selected)

msk = np.array([title in selected_lookup for title in train_test['dataset_title']])
check_test = train_test[msk]

msk = np.array([title in selected_lookup for title in train['dataset_title']])
check_train = train[msk]

In [None]:
print(len(check_test))

In [None]:
# Same comparison as for the full split, restricted to the bottom-set rows
# (check_test / check_train).
correct = correct2 = wrong_assign = wrong_miss_test = wrong_miss_train = 0
empty = 0
check_temp = 0  # not read anywhere in this cell

# Classify every predicted title of every publication.
for _, prediction_row in tqdm(t_new.iterrows()):
    paper_id = prediction_row['Id']
    predicted_titles = prediction_row['Predictions']

    # Publication without any prediction
    if len(predicted_titles) == 0:
        empty += 1

    # Bottom-set annotations of this publication in each split
    titles_in_test = check_test.loc[check_test['Id'] == paper_id, 'dataset_title2']
    titles_in_train = check_train.loc[check_train['Id'] == paper_id, 'dataset_title2']

    for title in predicted_titles:
        if (titles_in_test == title).any():
            # Matches a held-out bottom-set annotation
            correct += 1
        elif (titles_in_train == title).any():
            # Matches a training bottom-set annotation
            correct2 += 1
        elif len(titles_in_train) > 0 or len(titles_in_test) > 0:
            # Publication is part of the bottom set, but this title is not annotated for it
            wrong_assign += 1

# Held-out bottom-set annotations whose title was not predicted for the publication.
for _, annotation in tqdm(check_test.iterrows()):
    same_paper = t_new.loc[t_new['Id'] == annotation['Id'], 'Predictions']
    found = any(annotation['dataset_title2'] in predicted for predicted in same_paper)
    if not found:
        wrong_miss_test += 1

# Training bottom-set annotations that were missed, counted only for publications with predictions.
for _, annotation in tqdm(check_train.iterrows()):
    same_paper = t_new.loc[t_new['Id'] == annotation['Id'], 'Predictions']
    if len(same_paper) == 0:
        continue
    found = any(annotation['dataset_title2'] in predicted for predicted in same_paper)
    if not found:
        wrong_miss_train += 1


print(f"Correct: {correct}, Correct2: {correct2}, Wrongly assigned: {wrong_assign}, Missed test: {wrong_miss_test}, Missed train: {wrong_miss_train}, Empty: {empty} ")

In [None]:
# print(len(check_test))
# [print(f"{col}:{len(check_test[col].unique())}") for col in check_test.columns]

# Titles of the bottom set that occur in the held-out split but never in the training split.
train_unique = check_train['dataset_title'].unique()
test_unique = check_test['dataset_title'].unique()

seen_in_train = set(train_unique)
completely_new = [title for title in test_unique if title not in seen_in_train]

print(len(train_unique))
print(len(test_unique))
print(len(completely_new))
print(completely_new)

# Show each held-out annotation with an unseen title next to what was predicted for its publication.
for _, data in tqdm(check_test.iterrows()):
    if data['dataset_title'] not in completely_new:
        continue
    print(data)
    print(t_new[t_new['Id']==data['Id']])