In [None]:
import os
import pandas as pd
import json
import re
import numpy as np
import string
from functools import partial
from tqdm.notebook import tqdm
from collections import defaultdict


from transformers import TFElectraForPreTraining, ElectraTokenizerFast

import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Bidirectional, SpatialDropout1D
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

from tensorflow_addons.text.crf import crf_log_likelihood
from tensorflow_addons.layers.crf import CRF

In [None]:
# Root folder of the competition data; every other input path below is derived from it.
BASE_DIR = '../input/coleridgeinitiative-show-us-the-data'

# Folder in which `get_article` looks up `<Id>.json` for each publication.
test_dir = os.path.join(BASE_DIR, 'test')

# The sample submission supplies the `Id` column that the rest of the notebook iterates over.
sample_submission_path = os.path.join(BASE_DIR, 'sample_submission.csv')
sample_df = pd.read_csv(sample_submission_path)

In [None]:
# Patterns used to strip links from article text.
# Raw strings keep the regex escapes away from Python's own string-escape handling,
# which otherwise flags them as invalid escape sequences; the compiled patterns are
# unchanged. The final character class leaves out ; . , ( ) [ ] so that punctuation
# directly following a link is not swallowed by the match.
url_regex = re.compile(r"https?://[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+[\w!\?/\+\-_~=\*&@#\$%']")
www_regex = re.compile(r"www\.[\w!\?/\+\-_~=;\.,\*&@#\$%\(\)'\[\]]+[\w!\?/\+\-_~=\*&@#\$%']")
def get_article(filename, dir_path=test_dir):
    """Load one publication and return its text with links removed.

    Args:
        filename: Publication id; the file read is ``<filename>.json``.
        dir_path: Directory containing the JSON file. Defaults to ``test_dir``.

    Returns:
        A single string made of one entry per section, joined with spaces,
        with ``http(s)://`` and ``www.`` links stripped.
    """
    # Honour the caller-supplied directory instead of always reading from `test_dir`.
    json_path = os.path.join(dir_path, f'{filename}.json')
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    contents = []
    for section in sections:
        section_title = section['section_title']
        section_text = section['text']
        # Each section contributes whichever of its body or title is longer
        # (the body wins ties).
        if len(section_text) >= len(section_title):
            contents.append(section_text)
        else:
            contents.append(section_title)
    all_contents = ' '.join(contents)

    return www_regex.sub('', url_regex.sub('', all_contents))

def clean_text(txt):
    """Normalise a label: lower-case it and squash non-alphanumerics to single spaces.

    The argument is converted with ``str`` first, and leading/trailing
    whitespace is removed from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity of the whitespace-separated tokens of two strings.

    Tokens are compared case-insensitively. The result is
    ``|A & B| / |A | B|`` as a float in ``[0.0, 1.0]``.

    When neither string contains a token the union is empty; ``0.0`` is
    returned in that case instead of raising ``ZeroDivisionError``.
    """
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 0.0
    return len(a & b) / union_size

In [None]:
# Register `progress_apply` on pandas objects, then load the text of every
# publication listed in the sample submission.
tqdm.pandas()
article_texts = sample_df['Id'].progress_apply(get_article)
sample_df['text'] = article_texts

In [None]:
train_path = os.path.join(BASE_DIR, 'train.csv')
train_df = pd.read_csv(train_path)
# Independent frame with the same content; avoids reading the CSV a second time.
train_tmp = train_df.copy()

# Greedy match: removes everything from the first "(" to the last ")" of a label.
bracket_regex = re.compile(r"\(.+\)")


def _normalize_label(raw_label):
    """Lower-case a label, drop its parenthesised part and trim whitespace.

    Labels that consist of a single word (counted before the bracket removal)
    get one trailing space appended, so the substring test used for matching
    only succeeds when the word is followed by a space in the article text.
    """
    normalized = bracket_regex.sub('', raw_label).lower().strip()
    if len(raw_label.split()) > 1:
        return normalized
    return normalized + ' '


temp_1 = {_normalize_label(x) for x in train_df['dataset_label'].unique()}
temp_2 = {_normalize_label(x) for x in train_df['dataset_title'].unique()}
existing_labels = temp_1 | temp_2
existing_labels

In [None]:
# A candidate label must contain at least one of these substrings (case-sensitive).
# Entries with a leading/trailing space are kept exactly as they were written.
TOPIC_WORDS = {
    'Study', 'Studies', 'Survey', 'Data', 'Progress', 'Consortium', 'Surveillance', 'Assessment', '1972', ' Aging', 'Inventory', 'Atherosclerosis', 'Religious '
}
# A candidate label containing any of these substrings (case-sensitive) is rejected.
# 'Scale' and 'Transcript' are two separate entries: without the comma between them
# Python concatenated the adjacent literals into the single word 'ScaleTranscript'.
STOP_WORDS = {
    'Cooperative', 'Analysis', 'Board', 'Center', 'Climate', 'Report', 'Geodetic', 'Hydrography', 'Initiative', 'Institute', 'Integrated', 'Kindergarten', 
    'Layer', 'Mayo', 'Montreal', 'Panel', 'Questionnaire', 'Adequate', 'Quality', 'Information', 'Harvard ', 'State', 'Scale', 'Transcript',  'Research ', 
    'US ', 'Uniform'
}

# Minimum value a candidate needs in the extract-results mapping to be considered.
DF_THRES = 20

def get_additional_labels(extract_results, existing_labels,
                          topic_words=TOPIC_WORDS, stop_words=STOP_WORDS, df_thres=DF_THRES):
    """Pick candidate labels that look like data set names and are not already known.

    Args:
        extract_results: Mapping of candidate string to a numeric value that is
            compared against ``df_thres``.
        existing_labels: Already-known (cleaned) labels to compare against.
        topic_words: Substrings of which at least one must occur in a candidate.
        stop_words: Substrings of which none may occur in a candidate.
        df_thres: Minimum value a candidate needs in ``extract_results``.

    Returns:
        Set of accepted candidates, lower-cased and stripped.
    """
    accepted = set()
    for candidate, doc_freq in extract_results.items():
        if not doc_freq >= df_thres:
            continue
        if not any(word in candidate for word in topic_words):
            continue
        if any(word in candidate for word in stop_words):
            continue

        # Reject candidates that overlap too much with a label we already have.
        cleaned = clean_text(candidate)
        overlaps_known = any(
            jaccard_similarity(cleaned, known) >= 0.5 for known in existing_labels
        )
        if not overlaps_known:
            accepted.add(candidate.lower().strip())
    return accepted


# Known labels in cleaned form, used as the reference set for the similarity filter.
cleaned_existing_labels = set(map(clean_text, existing_labels))
extract_results_path = '../input/extract-result/extract_results.json'
with open(extract_results_path, 'r') as f:
    extract_results = json.load(f)
# The file is only needed for loading; the filtering runs once it is closed.
addtional_sets = get_additional_labels(extract_results, cleaned_existing_labels)


In [None]:
# Shortest text found between "(" and ")"; the brackets themselves are not captured.
in_bracket_regex = re.compile(r'(?<=\().+?(?=\))')
abb_extract_results_path = '../input/extract-results-with-abbreviation/extract_results_with_abbreviation.json'
with open(abb_extract_results_path, 'r') as f:
    extract_results = json.load(f)

abbreviation_patterns = get_additional_labels(extract_results, cleaned_existing_labels, df_thres=100)
abbreviations = set()
for abbreviation_pattern in abbreviation_patterns:
    bracket_match = in_bracket_regex.search(abbreviation_pattern)
    if bracket_match is None:
        # No bracketed part in this pattern: nothing to extract, and no reason to crash.
        continue
    abbreviation = bracket_match.group(0)
    if len(abbreviation) > 3:
        # Trailing space so the substring test only hits the abbreviation when a space follows it.
        abbreviations.add(f'{abbreviation} ')

addtional_sets |= abbreviations

In [None]:
# Labels with more words come first. Ties are broken alphabetically so the order
# (and the order-dependent substring de-duplication in the matching stage) is the
# same on every run instead of following the set's iteration order.
addtional_labels = sorted(addtional_sets, key=lambda label: (-len(label.split()), label))
addtional_labels

In [None]:
# Tokens per sentence: used as the tokenizer truncation length, the padding length
# and the `max_len` of the model inputs.
MAX_LENGTH = 128
# Batch size passed to `model.predict`.
BATCH_SIZE = 128
# Directory from which both the tokenizer and the ELECTRA encoder are loaded.
ENCODER_DIR = '/kaggle/input/huggingfaceelectra/electra-base-discriminator'

In [None]:
tokenizer = ElectraTokenizerFast.from_pretrained(ENCODER_DIR)

# Tag vocabulary: ids 0-2 belong to the tokenizer's special tokens, ids 3-5 to the BIO tags.
label2id = {
    tag: tag_id
    for tag_id, tag in enumerate((
        tokenizer.pad_token,
        tokenizer.cls_token,
        tokenizer.sep_token,
        'B-DATA',
        'I-DATA',
        'O',
    ))
}

In [None]:
def unpack_data(data):
    """Normalise a Keras step payload to ``(x, y, sample_weight)``.

    A two-element payload gets ``None`` as its sample weight; a three-element
    payload is handed back as it is.

    Raises:
        TypeError: If ``data`` has neither two nor three elements.
    """
    size = len(data)
    if size == 3:
        return data
    if size == 2:
        features, targets = data[0], data[1]
        return features, targets, None
    raise TypeError("Expected data to be a tuple of size 2 or 3.")

class ModelWithCRFLoss(tf.keras.Model):
    """Keras model that wraps ``base_model`` and trains/evaluates it with a CRF loss.

    The wrapped model's output is unpacked into four values in ``compute_loss``;
    the last three (potentials, sequence lengths, chain kernel) are passed to
    ``crf_log_likelihood``. The first value is not used for the loss.
    """

    def __init__(self, base_model):
        """Keep a reference to the model whose outputs are scored."""
        super().__init__()
        self.base_model = base_model

    @tf.function
    def call(self, inputs):
        """Run the wrapped model on ``inputs`` and return its output unchanged."""
        # NOTE(review): `call` takes no `training` argument, so none is forwarded
        # to `base_model` here — confirm that is intended.
        return self.base_model(inputs)

    def compute_loss(self, x, y, sample_weight, training=False):
        """Compute the loss terms for one batch.

        Args:
            x: Model inputs.
            y: Target tag ids handed to ``crf_log_likelihood``.
            sample_weight: Optional weights multiplied into the per-example
                loss; skipped when ``None``.
            training: Passed on when invoking the model.

        Returns:
            Tuple ``(mean CRF loss, sum of self.losses)``.
        """
        y_pred = self(x, training=training)
        _, potentials, sequence_length, chain_kernel = y_pred

        # Negative log-likelihood of the target tags; element 0 of the
        # `crf_log_likelihood` result is the log-likelihood.
        crf_loss = -crf_log_likelihood(potentials, y, sequence_length, chain_kernel)[0]

        if sample_weight is not None:
            crf_loss = crf_loss * sample_weight

        # `self.losses` are the losses registered on the model's layers;
        # they are reported separately from the CRF term.
        return tf.reduce_mean(crf_loss), sum(self.losses)

    @tf.function
    def train_step(self, data):
        """Run one optimisation step and return both loss terms as metrics."""
        x, y, sample_weight = unpack_data(data)

        with tf.GradientTape() as tape:
            crf_loss, internal_losses = self.compute_loss(
                x, y, sample_weight, training=True
            )
            # The optimiser minimises the CRF loss plus the layer-registered losses.
            total_loss = crf_loss + internal_losses

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return {"crf_loss": crf_loss, "internal_losses": internal_losses}

    @tf.function
    def test_step(self, data):
        """Evaluate one batch (``training`` left at its default) and return both loss terms."""
        x, y, sample_weight = unpack_data(data)
        crf_loss, internal_losses = self.compute_loss(x, y, sample_weight)
        return {"crf_loss": crf_loss, "internal_losses": internal_losses}

def build_base_model(transformer, num_cls=1, max_len=512):
    """Assemble the tagging network: transformer -> dropout -> BiLSTM -> Dense -> CRF.

    Args:
        transformer: Callable model taking a dict with ``input_ids`` and
            ``attention_mask`` and returning an object with ``hidden_states``.
        num_cls: Number of tag classes; used as the Dense width and the CRF size.
        max_len: Fixed length of both input tensors.

    Returns:
        A Keras ``Model`` with inputs ``[input_ids, attention_mask]`` whose
        output is the output of the CRF layer.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    input_attention_mask = Input(shape=(max_len,), dtype=tf.int32, name='attention_mask')
    # NOTE(review): index 0 of `hidden_states` is the first entry of the tuple,
    # presumably the embedding output rather than the last encoder layer —
    # confirm this matches how the saved checkpoints were trained before changing it.
    sequence_output = transformer({
        'input_ids': input_ids, 
        'attention_mask': input_attention_mask
    }).hidden_states[0]
    # The attention mask doubles as the sequence mask for the LSTM and the CRF.
    mask = tf.cast(input_attention_mask, tf.bool)
    sequence_output = SpatialDropout1D(0.1)(sequence_output)
    sequence_output = Bidirectional(LSTM(256, return_sequences=True), name='bidirectional_lstm')(sequence_output, mask=mask)
    sequence_output = Dense(num_cls, activation='softmax', name='sequence_output')(sequence_output)
    out = CRF(num_cls, name='crf_output')(sequence_output, mask=mask)
    model = Model(inputs=[input_ids, input_attention_mask], outputs=out)
    return model

In [None]:
def select_sentence(text):
    """Return the distinct fragments of ``text`` that contain at least six words.

    The text is cut at newlines and then at full stops; fragments are kept
    verbatim (surrounding whitespace included) and duplicates collapse
    because the result is a set.
    """
    selected = set()
    for line in text.split('\n'):
        for fragment in line.split('.'):
            if len(fragment.split()) >= 6:
                selected.add(fragment)
    return selected

In [None]:
def decode_prediction(x, y, tokenizer, label2id):
    """Turn predicted BIO tags into the set of decoded entity strings.

    Args:
        x: Per-sentence token id sequences (unpadded).
        y: Per-sentence predicted tag ids, aligned with ``x`` by position.
        tokenizer: Object with a ``decode`` method accepting a list of token ids.
        label2id: Mapping that holds the ids of ``'B-DATA'`` and ``'I-DATA'``.

    Returns:
        Set of decoded spans. A span opens at a ``B-DATA`` tag, grows with the
        ``I-DATA`` tags that follow and closes at any other tag; ``I-DATA``
        without an open span is ignored.
    """
    begin_id = label2id['B-DATA']
    inside_id = label2id['I-DATA']
    spans = set()

    for token_ids, tag_ids in zip(x, y):
        current = []
        # The last token of the sentence is never inspected.
        usable = len(token_ids) - 1
        for position, tag in enumerate(tag_ids[:usable]):
            if tag == begin_id:
                # A new span starts; emit the one that was open, if any.
                if current:
                    spans.add(tokenizer.decode(current))
                current = [token_ids[position]]
                continue
            if not current:
                continue
            if tag == inside_id:
                current.append(token_ids[position])
            else:
                spans.add(tokenizer.decode(current))
                current = []
        if current:
            spans.add(tokenizer.decode(current))
    return spans

## 1st Stage: Text Matching with Additional Labels

In [None]:
test_ids = []

prepared_data = {}
first_stage_predictions = {}
cleaned_addtional_labels = [clean_text(addtional_label) for addtional_label in addtional_labels]
for row in sample_df.itertuples():

    sample_text = row.text
    test_id = row.Id

    # A space is appended so that labels stored with a trailing space can still
    # match when they are the very last word of the article.
    sample_text_lower = f'{sample_text} '.lower()

    # Known labels: plain substring match against the lower-cased article.
    cleaned_labels = {
        clean_text(known_label)
        for known_label in existing_labels
        if known_label in sample_text_lower
    }

    # Additional labels are only added when they are not already contained in
    # a label collected so far, so the iteration order of the list matters.
    for addtional_label, cleaned_addtional_label in zip(addtional_labels, cleaned_addtional_labels):
        if addtional_label in sample_text_lower:
            if all(cleaned_addtional_label not in label for label in cleaned_labels):
                cleaned_labels.add(cleaned_addtional_label)
    # `cleaned_labels` is created fresh for every row, so it can be stored directly.
    first_stage_predictions[test_id] = cleaned_labels

    test_ids.append(test_id)

    # preparing data for 2nd stage prediction
    sentences = list(select_sentence(sample_text))
    if sentences:
        encoded_sentences = tokenizer(
            sentences,
            return_token_type_ids=False,
            max_length=MAX_LENGTH,
            truncation=True
        )
        prepared_data[test_id] = {
            'input_ids': pad_sequences(encoded_sentences['input_ids'], maxlen=MAX_LENGTH, padding='post'),
            'attention_mask': pad_sequences(encoded_sentences['attention_mask'], maxlen=MAX_LENGTH, padding='post'),
            'no_padded_input_ids': encoded_sentences['input_ids']
        }
    else:
        # No fragment is long enough: store empty inputs of the usual width instead
        # of handing an empty batch to the tokenizer.
        # NOTE(review): some tokenizer versions raise on an empty list — confirm.
        prepared_data[test_id] = {
            'input_ids': np.zeros((0, MAX_LENGTH), dtype='int32'),
            'attention_mask': np.zeros((0, MAX_LENGTH), dtype='int32'),
            'no_padded_input_ids': []
        }

## 2nd Stage: Named Entity Recognition

In [None]:
# Number of fold checkpoints to ensemble, and the minimum number of folds that
# must predict a label for it to be kept.
N_FOLDS = 4
CNT_THRES = 2
# test_id -> {decoded label -> number of folds that predicted it}
each_fold_predictions = {}
for fold in range(N_FOLDS):
    model_path = f'../input/coleridge-electra-base-ner4/fold{fold}/electra_base_crf'
    # NOTE(review): every fold builds a fresh encoder and model; if memory becomes
    # a problem, consider releasing the previous fold's model first.
    transformer_layer = TFElectraForPreTraining.from_pretrained(ENCODER_DIR, output_hidden_states=True)
    base_model = build_base_model(transformer_layer, num_cls=len(label2id), max_len=MAX_LENGTH)
    model = ModelWithCRFLoss(base_model)
    model.load_weights(model_path)
    for test_id in test_ids:
        # Make sure every id has an entry, even when nothing is predicted for it.
        if test_id not in each_fold_predictions:
            each_fold_predictions[test_id] = defaultdict(int)
        fold_votes = each_fold_predictions[test_id]

        x_test = prepared_data[test_id]
        if len(x_test['input_ids']) == 0:
            # Nothing to predict on; `model.predict` does not accept an empty input.
            continue

        # Element 0 of the model output holds the predicted tag ids.
        y_pred = model.predict(
            {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
            batch_size=BATCH_SIZE)[0]
        labels = decode_prediction(
            x_test['no_padded_input_ids'], 
            y_pred, 
            tokenizer, 
            label2id
        )

        for label in labels:
            fold_votes[label] += 1

# Keep the labels predicted by at least CNT_THRES folds, in cleaned form.
second_stage_predictions = {
    test_id: {
        clean_text(label) for label, cnt in each_fold_predictions[test_id].items() if cnt >= CNT_THRES
    } for test_id in test_ids
}

## Post Processing

In [None]:
prediction_string_list = []
for test_id in test_ids:
    first = first_stage_predictions[test_id]
    second = set()
    for ner_label in second_stage_predictions[test_id]:
        cleaned_ner_label = clean_text(ner_label)
        if not cleaned_ner_label:
            # A label made only of punctuation cleans to '', which would put an
            # empty entry into the prediction string.
            continue
        # Keep an NER label only when it is not a near-duplicate of a matched label.
        if all(
            jaccard_similarity(cleaned_ner_label, cleaned_matching_label) < 0.5 
            for cleaned_matching_label in first
        ):
            second.add(cleaned_ner_label)
    # Sorted so the submission is identical across runs (set order is not stable).
    prediction_string_list.append('|'.join(sorted(first | second)))

In [None]:
# One row per test id, with its pipe-separated predicted labels.
submission = pd.DataFrame({
    'Id': test_ids,
    'PredictionString': prediction_string_list,
})
submission.to_csv('submission.csv', index=False)
submission.head()