This notebook combines literal matching with Named Entity Recognition (NER) using BERT (the base model from Hugging Face).

The BERT model was trained in a separate kernel: PyTorch BERT for Named Entity Recognition.

In [None]:
# Upper bound on the number of training rows kept; it is applied later as the
# slice `train[:MAX_SAMPLE]`, so None keeps every row.
# Set a small number for experimentation, set None for production.
MAX_SAMPLE = None

# Install packages

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Pin both global random number generators (NumPy's legacy global state and
# the stdlib `random` module) so any randomised step repeats across runs.
np.random.seed(456)
random.seed(123)

# Load data

In [None]:
# Read the training table and keep at most MAX_SAMPLE rows (None keeps all).
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)
train = train[:MAX_SAMPLE]

# Load the JSON body of every publication referenced by the training table,
# keyed by publication Id.
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train['Id'].unique():
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale's preferred encoding, which can mis-decode or fail on non-ASCII text.
    with open(f'{paper_train_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)
        papers[paper_id] = paper

In [None]:
# The sample submission lists the Ids of the publications to predict for.
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

# Add the JSON body of every test publication to the shared `papers` dict.
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
for paper_id in sample_submission['Id']:
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale's preferred encoding, which can mis-decode or fail on non-ASCII text.
    with open(f'{paper_test_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)
        papers[paper_id] = paper

# Data exploration

In [None]:
# Notes: how many labelled rows there are per publication.
from collections import Counter
import matplotlib.pyplot as plt

# The first message used to end right after the number ("... rows with N."),
# leaving out what N counts; it now names the unique publications explicitly.
print(f'The training dataset contains {len(train.Id)} rows with {len(train.Id.unique())} unique publications.')
print(f'So the number of dataset per publication is {round(len(train.Id)/len(train.Id.unique()),2)}.')

# `counts`: number of rows (dataset mentions) for each publication Id.
counts = Counter(train.Id)
counts = list(counts.values())
# `dict1`: how many publications have each row count.
dict1 = Counter(counts)

# Same mapping, ordered by row count so the printout reads in ascending order.
sorted_keys = sorted(dict1)
sorted_dict = {w: dict1[w] for w in sorted_keys}

print(sorted_dict)

In [None]:
# Derive the bin edges from the data instead of hard-coding range(23).
# With fixed edges 0..22, any count above 22 is silently left out, and because
# the last histogram bin is closed on both sides, counts 21 and 22 share a bin.
# Edges 0..max+1 give every integer count its own unit-wide bin.
max_count = max(counts, default=0)
plt.hist(counts, bins=list(range(max_count + 2)), log=True)
plt.title("Distribution of number of datasets per publications")
plt.xlabel("Datasets")
plt.ylabel("Counts")

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity of two strings over their word sets.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` for the resulting token sets.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains any token
        (both empty or whitespace-only) the two token sets are identical, so
        1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both token sets are empty: avoid ZeroDivisionError and treat the
        # strings as identical.
        return 1.0
    return len(a & b) / union_size

In [None]:
def remove_similar_predictions(prediction_string, treshhold):
    """Drop predictions that are near-duplicates of a later prediction.

    Args:
        prediction_string: Predictions joined with ``"|"``.
        treshhold: A prediction is dropped when its ``jaccard`` score against
            any prediction appearing after it is strictly greater than this.

    Returns:
        The surviving predictions, in their original order, joined with
        ``"|"``. Of a group of similar predictions, the last one is kept.
    """
    candidates = prediction_string.split("|")
    survivors = [
        candidate
        for position, candidate in enumerate(candidates)
        # Only look at later entries, stopping at the first similar one.
        if not any(
            jaccard(candidate, later) > treshhold
            for later in candidates[position + 1:]
        )
    ]
    return '|'.join(survivors)

# Literal matching

### Create a knowledge bank

In [None]:
# Knowledge bank: every distinct lower-cased label string seen in the three
# label columns of the training table.
all_labels = set()

for row_labels in train[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
    for raw_label in row_labels:
        # Skip missing cells: str(NaN).lower() is the literal 'nan', which
        # would enter the bank and then substring-match ordinary words such as
        # "finance" in every paper.
        if pd.isna(raw_label):
            continue
        all_labels.add(str(raw_label).lower())

print(f'No. different labels: {len(all_labels)}')

### Matching on test data

In [None]:
def clean_text(txt):
    """Normalise a value to lower-case alphanumeric words separated by single spaces.

    The input is converted with ``str()`` and lower-cased; every run of
    characters other than ASCII letters and digits becomes one space, and
    leading/trailing whitespace is stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Apply ``clean_text`` and then squeeze any run of spaces to a single space."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
# For each test publication, collect every known label that occurs as a
# substring of the paper text (raw lower-cased text or its cleaned form).
literal_preds = []

for paper_id in sample_submission['Id']:
    paper = papers[paper_id]
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)

    labels = set()
    for label in all_labels:
        if label in text_1 or label in text_2:
            cleaned_label = clean_text(label)
            # A label that cleans down to '' would add an empty prediction.
            if cleaned_label:
                labels.add(cleaned_label)

    # Sort before joining: set iteration order for strings changes between
    # interpreter runs (hash randomisation), and the later de-duplication step
    # is order-sensitive, so an unsorted join makes the submission vary.
    literal_preds.append('|'.join(sorted(labels)))


# Aggregate final predictions and write submission file

In [None]:
# Placeholder written for publications with no prediction. Use an explicit NaN
# instead of reading the first PredictionString of the sample submission,
# which only yields NaN when that cell happens to be empty and raises if the
# frame has no row labelled 0.
NaN = np.nan

In [None]:
# One entry per test publication: the literal matches with near-duplicates
# (Jaccard score above 0.9) removed, or the NaN placeholder when nothing matched.
final_predictions = [
    remove_similar_predictions(matched, 0.9) if matched else NaN
    for matched in literal_preds
]

In [None]:
# Attach the predictions to the submission frame and preview the first rows.
sample_submission = sample_submission.assign(PredictionString=final_predictions)
sample_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)