In [None]:
# Install the dependencies from local files only: `datasets` is resolved with
# --no-index against a local package folder, the rest are explicit wheel paths.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed the stdlib and NumPy global random number generators.
random.seed(123)
np.random.seed(456)

# Used as the end of the row slice applied to train.csv; None keeps every row.
MAX_SAMPLE = None

In [None]:
# Read the training labels and keep the first MAX_SAMPLE rows (None keeps all).
train_csv_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train_df = pd.read_csv(train_csv_path).iloc[:MAX_SAMPLE]

In [None]:
# Parse the JSON file of every publication listed in train_df, keyed by its Id.
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
papers = {}
for paper_id in train_df['Id'].unique():
    paper_path = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_path, 'r') as paper_file:
        papers[paper_id] = json.load(paper_file)

In [None]:
# Sample submission file shipped with the competition data.
sub_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# Paths of every JSON file in the competition's test folder.
test_files = glob.glob("../input/coleridgeinitiative-show-us-the-data/test/*.json")

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and turn each run of non-alphanumeric characters into one space.

    ``txt`` is passed through ``str()`` first, so non-string values are accepted.
    Leading and trailing whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

def totally_clean_text(txt):
    """Return ``clean_text(txt)`` with every run of spaces squeezed to a single space."""
    return re.sub(' +', ' ', clean_text(txt))

In [None]:
# Load every test JSON file into one DataFrame, tagging each row with the
# publication id. The frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration.
test_frames = []

for test_file in test_files:
    text_data = pd.read_json(test_file)
    # The id is the file name without its directory and extension.
    text_data.insert(0, 'id', test_file.split('/')[-1].split('.')[0])
    test_frames.append(text_data)

# pd.concat raises on an empty list; keep the empty frame the loop used to leave.
test_df = pd.concat(test_frames) if test_frames else pd.DataFrame()

In [None]:
# Gather the lower-cased value of each label column of train_df into one set.
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']
all_labels = set()

for row_labels in train_df[label_columns].itertuples(index=False):
    all_labels.update(str(value).lower() for value in row_labels)

print(f'No. different labels: {len(all_labels)}')

In [None]:
# Additional label list, parsed as CSV; its 'Label' column is merged into
# all_labels in the next cell.
bigger_dataset_df = pd.read_csv('../input/filtered-bigger-govt-dataset/ExtraLabelsCleaned.txt')
# Preview the first rows.
bigger_dataset_df.head()

In [None]:
# Merge the extra labels (lower-cased) into the set of known labels.
all_labels.update(str(extra_label).lower() for extra_label in bigger_dataset_df['Label'])

In [None]:
# Label count after merging in the extra label list.
print(f'No. different labels: {len(all_labels)}')

In [None]:
# Normalise every text row with clean_text (which already strips the result).
test_df['cleaned_text'] = test_df['text'].map(clean_text)

In [None]:
# One row per publication id: all of its cleaned text rows joined with spaces.
joined_text = test_df.groupby(['id'])['cleaned_text'].agg(' '.join)
test_group = joined_text.to_frame()

In [None]:
def _match_known_labels(txt):
    """Return every known label that occurs as a substring of ``txt``, joined with '|'."""
    # sorted() makes the output order reproducible: iterating a set of strings
    # can yield a different order from one interpreter run to the next.
    return '|'.join(sorted(label for label in all_labels if label in txt))


# Literal matching: one prediction string per test publication.
result = test_group['cleaned_text'].apply(_match_known_labels).reset_index()
result.columns = ['Id', 'PredictionString']

In [None]:
# Plain-list copy of the literal-match prediction strings, in `result` row order.
literal_preds = list(result['PredictionString'])

In [None]:
# Preview the first five literal-match prediction strings.
literal_preds[:5]

In [None]:
# Window size, in words, used by shorten_sentences when splitting long sentences.
MAX_LENGTH = 64
# Consecutive windows start MAX_LENGTH - OVERLAP words apart, i.e. they share OVERLAP words.
OVERLAP = 20

# Number of test rows written to the NER input file per prediction run.
PREDICT_BATCH = 64000

# Passed to the NER script as --model_name_or_path.
PRETRAINED_PATH = '../input/coleridge-bert-models/output'
# Folder and file name of the NER test input that is rewritten for every batch.
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
# Passed as --train_file / --validation_file; both point at the same file.
TRAIN_PATH = '../input/coleridge-bert-models/train_ner.json'
VAL_PATH = '../input/coleridge-bert-models/train_ner.json'

# Folder (the script's --output_dir) and file name the predictions are read back from.
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

In [None]:
# Collapse train_df to one row per publication Id: keep the first pub_title and
# join the values of each label column with '|'.
train = train_df.groupby('Id').agg({
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join
}).reset_index()

# Report the size of the grouped frame; len(train_df) would be the ungrouped row count.
print(f'No. grouped training rows: {len(train)}')

In [None]:
def clean_training_text(txt):
    """Replace each run of non-alphanumeric characters in ``str(txt)`` with one space.

    Letter case is left untouched; leading and trailing whitespace is stripped.
    """
    as_text = str(txt)
    return re.sub('[^A-Za-z0-9]+', ' ', as_text).strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """Split sentences longer than ``max_length`` words into overlapping windows.

    Args:
        sentences: iterable of strings.
        max_length: maximum number of words per window; defaults to the
            module-level ``MAX_LENGTH``.
        overlap: number of words shared by consecutive windows; defaults to
            the module-level ``OVERLAP``.

    Returns:
        A list of strings. Sentences with at most ``max_length`` words are
        kept unchanged; longer ones are replaced by windows of up to
        ``max_length`` words that start every ``max_length - overlap`` words.

    Raises:
        ValueError: if ``max_length`` is not greater than ``overlap``. A zero
            step would make ``range`` fail and a negative step would silently
            drop every long sentence.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError('max_length must be greater than overlap')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            for start in range(0, len(words), step):
                short_sentences.append(' '.join(words[start:start + max_length]))
        else:
            short_sentences.append(sentence)
    return short_sentences

In [None]:
test_rows = [] # test data in NER format
paper_length = [] # store the number of sentences each paper has

# The ids in `result` come from the test file names, while `papers` was filled
# from the train folder. Read each paper from the test folder so ids that are
# missing from the training set do not raise KeyError; `papers` is the fallback.
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'

for paper_id in result['Id']:

    test_paper_path = f'{paper_test_folder}/{paper_id}.json'
    if os.path.exists(test_paper_path):
        with open(test_paper_path, 'r') as f:
            paper = json.load(f)
    else:
        paper = papers[paper_id]

    # split every section on '.' and clean each piece
    sentences = [clean_training_text(sentence) for section in paper
                 for sentence in section['text'].split('.')
                ]
    sentences = shorten_sentences(sentences) # make sentences short
    sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
    # keep only sentences that mention 'data' or 'study' (case-insensitive substring test)
    sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]

    for sentence in sentences:
        sentence_words = sentence.split()
        dummy_tags = ['O']*len(sentence_words)
        test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})

    paper_length.append(len(sentences))

print(f'total number of sentences: {len(test_rows)}')

In [None]:
# Export the paths that the NER shell command reads as environment variables.
os.environ.update({
    "MODEL_PATH": PRETRAINED_PATH,
    "TRAIN_FILE": TRAIN_PATH,
    "VALIDATION_FILE": VAL_PATH,
    "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
    "OUTPUT_DIR": PREDICTION_SAVE_PATH,
})

In [None]:
# Copy my_seqeval.py into the working directory (presumably so the NER script
# can import it; verify against kaggle_run_ner.py).
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Create the folder that will hold the NER test input file.
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
def bert_predict():
    """Run ``kaggle_run_ner.py`` in prediction mode through a shell command.

    The model, train, validation, test and output paths are passed as
    ``$MODEL_PATH``, ``$TRAIN_FILE``, ``$VALIDATION_FILE``, ``$TEST_FILE`` and
    ``$OUTPUT_DIR``, which an earlier cell stores in ``os.environ``.
    The function itself returns nothing.
    """
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
bert_outputs = []

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    !rm -r "$OUTPUT_DIR"
    
    bert_predict()
    
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

In [None]:
# Keep only the token lists of the NER rows.
test_sentences = [row['tokens'] for row in test_rows]

# The row dicts (tokens + dummy tags) are not used after this point.
del test_rows

In [None]:
def _phrases_from_tags(words, tags):
    """Return the set of phrases spelled out by a 'B' tag and the 'I' tags that follow it."""
    phrases = set()
    current = ''
    for word, tag in zip(words, tags):
        if tag == 'B':
            # A new phrase starts; flush the one in progress first.
            if current:
                phrases.add(current)
            current = word
        elif tag == 'I' and current:
            current = current + ' ' + word
        elif current:
            # Any other tag (or an 'I' with no open phrase) ends the phrase.
            phrases.add(current)
            current = ''
    if current:
        phrases.add(current)
    return phrases


# One set of predicted phrases per paper, in the same order as paper_length.
bert_dataset_labels = []

for n_sentences in paper_length:
    paper_labels = set()
    for words, tags in zip(test_sentences[:n_sentences], bert_outputs[:n_sentences]):
        paper_labels |= _phrases_from_tags(words, tags)

    bert_dataset_labels.append(paper_labels)

    # Drop the consumed sentences so the next paper starts at index 0.
    del test_sentences[:n_sentences]
    del bert_outputs[:n_sentences]

In [None]:
# Preview the phrase sets predicted for the first five papers.
bert_dataset_labels[:5]

In [None]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the word sets of ``s1`` and ``s2``.

    Both strings are split on single spaces and compared as sets, so a word
    repeated inside one string counts once. Computing the union from the list
    lengths would let repeated words inflate it and lower the score.

    Returns:
        ``len(A & B) / len(A | B)`` as a float, where A and B are the word sets.
        ``str.split(" ")`` never returns an empty list, so the union is never 0.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    intersection = len(words1 & words2)
    union = len(words1 | words2)
    return float(intersection) / union

# For every paper, keep a cleaned label only if its Jaccard similarity to each
# label already kept is below 0.75; shorter raw labels are considered first.
filtered_bert_labels = []

for paper_labels in bert_dataset_labels:
    kept = []

    for raw_label in sorted(paper_labels, key=len):
        candidate = clean_text(raw_label)
        is_near_duplicate = any(jaccard_similarity(candidate, seen) >= 0.75 for seen in kept)
        if not is_near_duplicate:
            kept.append(candidate)

    filtered_bert_labels.append('|'.join(kept))

In [None]:
# Preview the filtered prediction strings of the first five papers.
filtered_bert_labels[:5]

In [None]:
# Prefer the literal match; use the BERT prediction only when the literal match is empty.
final_predictions = [
    literal_match or bert_pred
    for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels)
]

In [None]:
# Replace the literal-match column with the combined predictions.
result['PredictionString'] = final_predictions
# Preview the first rows.
result.head()

In [None]:
# Write the predictions without the index column.
submission_path = 'submission.csv'
result.to_csv(submission_path, index=False)