This notebook combines literal matching with Named Entity Recognition (NER) using BERT (base model from Hugging Face). In this version the BERT cells are commented out, so the submission is built from the literal matches only.

The BERT model was trained in a separate kernel: "PyTorch BERT for Named Entity Recognition".

In [None]:
# Row cap applied to train.csv right after it is read (`train[:MAX_SAMPLE]`).
MAX_SAMPLE = None # set a small number for experimentation, set None for production.

# Install packages

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl
!pip install ../input/recordlinkage/jellyfish-0.8.2-cp37-cp37m-manylinux2014_x86_64.whl

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial import distance

random.seed(123)
np.random.seed(456)

import jellyfish as jf

# Load data

In [None]:
# Training annotations; the `Id` column names each publication's JSON file.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
# MAX_SAMPLE=None keeps every row; an integer keeps only the first rows.
train = pd.read_csv(train_path)[:MAX_SAMPLE]

paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
# papers: publication Id -> parsed JSON document (test papers are added to it later).
papers = {}
for paper_id in train['Id'].unique():
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(f'{paper_train_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        papers[paper_id] = json.load(f)
        
#papers

In [None]:
# The sample submission lists the Ids of the publications to predict for.
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
# Add the test publications to the same `papers` lookup that holds the training ones.
for paper_id in sample_submission['Id']:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(f'{paper_test_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        papers[paper_id] = json.load(f)

# Data Exploration

In [None]:
train.head()

In [None]:
train.info()

In [None]:
# Number of distinct values per column (a missing value, if present, counts as one).
# A plain loop is used instead of a list comprehension so the cell does not also
# echo a useless `[None, None, ...]` result.
for col in train.columns:
    print(f"{col}:{len(train[col].unique())}")

### Data Visualisation

In [None]:
from wordcloud import WordCloud, STOPWORDS
from nltk.probability import FreqDist

words = list(train['cleaned_label'].values)
stopwords=['ourselves', 'hers','the','of','and','in', 'between', 'yourself', 'but', 'again','of', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']
# `stopwords` is a plain list (with repeated entries); a frozenset gives the same
# membership answers with constant-time lookups instead of a scan per token.
stopword_set = frozenset(stopwords)

# One list of kept tokens per label. str() mirrors the original handling of
# non-string cells (e.g. a missing value becomes the token 'nan').
split_words = [
    [w for w in str(word).split() if w not in stopword_set]
    for word in words
]

# Flatten into a single token list for the frequency counts below.
allwords = [w for kept in split_words for w in kept]

In [None]:
# (word, count) pairs for the 100 most frequent tokens in cleaned_label.
mostcommon = FreqDist(allwords).most_common(100)

# Size each word by its real count. Passing str(mostcommon) to generate() would
# re-tokenise the list's repr, where every word appears exactly once, so the
# cloud would not reflect the frequencies at all.
# generate_from_frequencies() does not apply the `stopwords` option, so the
# wordcloud STOPWORDS are filtered out here explicitly.
frequencies = {w: count for w, count in mostcommon if w.lower() not in STOPWORDS}
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate_from_frequencies(frequencies)

plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Top 100 Most Common Words in cleaned_label', fontsize=50)
plt.tight_layout(pad=0)
plt.show()

# Bar chart of the 25 most frequent tokens in cleaned_label.
mostcommon_small = FreqDist(allwords).most_common(25)
# Split the (word, count) pairs into parallel sequences for plt.bar.
bar_labels, bar_heights = zip(*mostcommon_small)

plt.figure(figsize=(50,30))
plt.margins(0.02)
plt.bar(bar_labels, bar_heights)
plt.xlabel('Words', fontsize=50)
plt.ylabel('Frequency of Words', fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
# Set the title before tight_layout() so the layout pass accounts for it;
# adding it afterwards with pad=0 leaves no room reserved for the title.
plt.title('Freq of 25 Most Common Words in cleaned_label', fontsize=60)
plt.tight_layout(pad=0)
plt.show()

# Literal matching

### Create a knowledge bank

In [None]:
# Knowledge bank: every known dataset name, lower-cased, from the three label columns.
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']

all_labels = set()
for column in label_columns:
    # dropna(): str(NaN).lower() would register the literal label 'nan', which is a
    # substring of ordinary words ("finance") and would match almost any paper.
    for value in train[column].dropna():
        label = str(value).lower()
        # An empty string is a substring of every text, so never register it.
        if label.strip():
            all_labels.add(label)

print(f'No. different labels: {len(all_labels)}')

### Add extra data

In [None]:
# extDf = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
# extDf = pd.read_csv('../input/bigger-govt-dataset-list/data_set_26897.csv')

# extDf = pd.read_csv('../input/filtered-bigger-govt-dataset/ExtraLabelsCleaned.txt')
# extDf = extDf.rename(columns={'Label': 'title'})
# extDf = extDf.drop(' Hits',axis='columns')

# extDf = pd.read_csv('../input/coleridge-additional-gov-datasets-22000popular/additional_gov_datasets_22000popular.csv')
# extDf = pd.read_csv('../input/coleridge-additional-gov-datasets-22000popular/data_set_800_with8000popular.csv')

# print(len(extDf))

# extDf.head(20)

In [None]:
# added this in version 13
# for l in extDf.title:
#     all_labels.add(l)
    
# all_labels = set(all_labels)
# print(f'No. different labels: {len(all_labels)}')

### Matching on test data

In [None]:
def clean_text(txt):
    """Normalise `txt` to lower-case ASCII alphanumerics separated by single spaces.

    The input is converted with str() and lower-cased, every run of characters
    other than A-Z, a-z, 0-9 is replaced by one space, and leading/trailing
    whitespace is stripped.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

def totally_clean_text(txt):
    """Apply clean_text() to `txt`, then collapse any run of spaces into one space."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
# clean_text(label) does not depend on the paper, so compute it once up front
# instead of once per (paper, matching label).
cleaned_by_label = {label: clean_text(label) for label in all_labels}

# One '|'-joined prediction string per row of sample_submission, in row order.
literal_preds = []

for paper_id in sample_submission['Id']:
    paper = papers[paper_id]
    # Search both the lower-cased raw text and its punctuation-free form.
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = totally_clean_text(text_1)

    labels = {
        cleaned
        for label, cleaned in cleaned_by_label.items()
        if label in text_1 or label in text_2
    }

    # Set iteration order for strings is not stable across runs; sort so the
    # generated submission is reproducible.
    literal_preds.append('|'.join(sorted(labels)))


In [None]:
# def read_json_pub(filename, train_data_path=paper_train_folder, output='text'):
#     json_path = os.path.join(train_data_path, (filename+'.json'))
#     headings = []
#     contents = []
#     combined = []
#     with open(json_path, 'r') as f:
#         json_decode = json.load(f)
#         for data in json_decode:
#             headings.append(data.get('section_title'))
#             contents.append(data.get('text'))
#             combined.append(data.get('section_title'))
#             combined.append(data.get('text'))
    
#     all_headings = ' '.join(headings)
#     all_contents = ' '.join(contents)
#     all_data = '. '.join(combined)
    
#     if output == 'text':
#         return all_contents
#     elif output == 'head':
#         return all_headings
#     else:
#         return all_data

In [None]:
# # https://www.kaggle.com/chienhsianghung/external-datasets-matching-mlm
# # https://www.kaggle.com/mlconsult/isin-big-dataset

# def text_cleaning(text):
#     '''
#     Converts all text to lower case, removes special characters, emojis and multiple spaces
#     text - Sentence that needs to be cleaned
#     '''
#     text = re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
#     text = re.sub(' +', ' ', text)
#     emoji_pattern = re.compile("["
#                                u"\U0001F600-\U0001F64F"  # emoticons
#                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                                "]+", flags=re.UNICODE)
#     text = emoji_pattern.sub(r'', text)
#     return text

# literal_preds = []
# to_append = []

# for index, row in sample_submission.iterrows():
#     to_append = [row['Id'],'']
#     large_string = str(read_json_pub(row['Id'], paper_test_folder))
#     clean_string = text_cleaning(large_string)
    
#     for index, row2 in extDf.iterrows():
#         query_string = str(row2['title'])
#         if query_string in clean_string:
#             if to_append[1] != '' and clean_text(query_string) not in to_append[1]:
#                 to_append[1] = to_append[1] + '|' + clean_text(query_string)
#             if to_append[1] == '':
#                 to_append[1] = clean_text(query_string)
#     literal_preds.append(*to_append[1:])

In [None]:
literal_preds[:5]

# BERT prediction

### Paths and Hyperparameters

In [None]:
# MAX_LENGTH = 64 # max no. words for each sentence.
# OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

# PREDICT_BATCH = 64000 

# PRETRAINED_PATH = '../input/coleridge-bert-models/output'
# TEST_INPUT_SAVE_PATH = './input_data'
# TEST_NER_DATA_FILE = 'test_ner_input.json'
# TRAIN_PATH = '../input/coleridge-bert-models/train_ner.json'
# VAL_PATH = '../input/coleridge-bert-models/train_ner.json'

# PREDICTION_SAVE_PATH = './pred'
# PREDICTION_FILE = 'test_predictions.txt'

### Transform data to NER format

Group by publication, so that the training labels have the same form as the expected output.

In [None]:
# train = train.groupby('Id').agg({
#     'pub_title': 'first',
#     'dataset_title': '|'.join,
#     'dataset_label': '|'.join,
#     'cleaned_label': '|'.join
# }).reset_index()

# print(f'No. grouped training rows: {len(train)}')

In [None]:
# def clean_training_text(txt):
#     """
#     similar to the default clean_text function but without lowercasing.
#     """
#     return re.sub('[^A-Za-z0-9]+', ' ', str(txt)).strip()

# def shorten_sentences(sentences):
#     short_sentences = []
#     for sentence in sentences:
#         words = sentence.split()
#         if len(words) > MAX_LENGTH:
#             for p in range(0, len(words), MAX_LENGTH - OVERLAP):
#                 short_sentences.append(' '.join(words[p:p+MAX_LENGTH]))
#         else:
#             short_sentences.append(sentence)
#     return short_sentences

In [None]:
# test_rows = [] # test data in NER format
# paper_length = [] # store the number of sentences each paper has

# for paper_id in sample_submission['Id']:
#     # load paper
#     paper = papers[paper_id]
    
#     # extract sentences
#     sentences = [clean_training_text(sentence) for section in paper 
#                  for sentence in section['text'].split('.')
#                 ]
#     sentences = shorten_sentences(sentences) # make sentences short
#     sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
#     sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]
        
#     # collect all sentences in json
#     for sentence in sentences:
#         sentence_words = sentence.split()
#         dummy_tags = ['O']*len(sentence_words)
#         test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})
    
#     # track which sentence belongs to which data point
#     paper_length.append(len(sentences))
    
# print(f'total number of sentences: {len(test_rows)}')

### Do predict and collect results

In [None]:
# os.environ["MODEL_PATH"] = f"{PRETRAINED_PATH}"
# os.environ["TRAIN_FILE"] = f"{TRAIN_PATH}"
# os.environ["VALIDATION_FILE"] = f"{VAL_PATH}"
# os.environ["TEST_FILE"] = f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}"
# os.environ["OUTPUT_DIR"] = f"{PREDICTION_SAVE_PATH}"

In [None]:
# # copy my_seqeval.py to the working directory because the input directory is non-writable
# !cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# # make necessary directories and files
# os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
# def bert_predict():
#     !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
#     --model_name_or_path "$MODEL_PATH" \
#     --train_file "$TRAIN_FILE" \
#     --validation_file "$VALIDATION_FILE" \
#     --test_file "$TEST_FILE" \
#     --output_dir "$OUTPUT_DIR" \
#     --report_to 'none' \
#     --seed 123 \
#     --do_predict

In [None]:
# bert_outputs = []

# for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
#     # write data rows to input file
#     with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
#         for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
#             json.dump(row, f)
#             f.write('\n')
    
#     # remove output dir
#     !rm -r "$OUTPUT_DIR"
    
#     # do predict
#     bert_predict()
    
#     # read predictions
#     with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
#         this_preds = f.read().split('\n')[:-1]
#         bert_outputs += [pred.split() for pred in this_preds]

### Restore Dataset labels from predictions

In [None]:
# # get test sentences
# test_sentences = [row['tokens'] for row in test_rows]

# del test_rows

In [None]:
# bert_dataset_labels = [] # store all dataset labels for each publication

# for length in paper_length:
#     labels = set()
#     for sentence, pred in zip(test_sentences[:length], bert_outputs[:length]):
#         curr_phrase = ''
#         for word, tag in zip(sentence, pred):
#             if tag == 'B': # start a new phrase
#                 if curr_phrase:
#                     labels.add(curr_phrase)
#                     curr_phrase = ''
#                 curr_phrase = word
#             elif tag == 'I' and curr_phrase: # continue the phrase
#                 curr_phrase += ' ' + word
#             else: # end last phrase (if any)
#                 if curr_phrase:
#                     labels.add(curr_phrase)
#                     curr_phrase = ''
#         # check if the label is the suffix of the sentence
#         if curr_phrase:
#             labels.add(curr_phrase)
#             curr_phrase = ''
    
#     # record dataset labels for this publication
#     bert_dataset_labels.append(labels)
    
#     del test_sentences[:length], bert_outputs[:length]

In [None]:
# bert_dataset_labels[:5]

### Filter based on Jaccard score and clean

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of two strings over their word sets.

    Both strings are split on single spaces and compared as sets of tokens:
    size of the intersection divided by size of the union.

    Args:
        s1: first string.
        s2: second string.

    Returns:
        A float in [0.0, 1.0]; 1.0 when both token sets are equal.
    """
    # Deduplicate before counting: using the raw list lengths for the union
    # over-counts repeated words and drags the score down
    # (e.g. "data data set" vs "data set" must score 1.0, not 2/3).
    tokens_1 = set(s1.split(" "))
    tokens_2 = set(s2.split(" "))
    # str.split(" ") never returns an empty list, so the union is never empty.
    return len(tokens_1 & tokens_2) / len(tokens_1 | tokens_2)

def _dedupe_similar_labels(labels, threshold=0.75):
    """Clean `labels` and drop those too similar to a label already kept.

    Labels are visited shortest first (ties broken alphabetically so the result
    is reproducible when `labels` is a set). A cleaned label is kept when its
    Jaccard similarity to every previously kept label is below `threshold`.

    Args:
        labels: iterable of raw label strings.
        threshold: similarity at or above which a label counts as a duplicate.

    Returns:
        List of cleaned labels, in the order they were kept.
    """
    kept = []
    for label in sorted(labels, key=lambda text: (len(text), text)):
        label = clean_text(label)
        # all() over an empty `kept` is True, so the first label is always kept.
        # (jf.jaro_winkler_similarity was tried as an alternative metric here.)
        if all(jaccard_similarity(label, got_label) < threshold for got_label in kept):
            kept.append(label)
    return kept


try:
    bert_label_sets = bert_dataset_labels
except NameError:
    # The BERT prediction cells are commented out in this notebook, so no model
    # predictions exist. Fall back to one empty label set per test paper so this
    # cell still runs after Restart & Run All and stays aligned with literal_preds.
    bert_label_sets = [set() for _ in literal_preds]

# One '|'-joined string of de-duplicated BERT labels per test paper.
filtered_bert_labels = ['|'.join(_dedupe_similar_labels(labels)) for labels in bert_label_sets]

In [None]:
filtered_bert_labels[:5]

# Aggregate final predictions and write submission file

In [None]:
# final_predictions = []
# for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels):
#     if literal_match:
#         final_predictions.append(literal_match)
#     else:
#         print("we used BERT")
#         final_predictions.append(bert_pred)
        
# Only the literal matches feed the submission: copy them, one entry per test paper.
final_predictions = list(literal_preds)

In [None]:
# Attach the predictions to the submission frame. This relies on final_predictions
# being in the same order as the rows of sample_submission.
sample_submission['PredictionString'] = final_predictions
# Preview the first rows as a sanity check before writing the file.
sample_submission.head()

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
# (A plain string literal: the original f-string had no placeholders.)
sample_submission.to_csv('submission.csv', index=False)