In [None]:
# additional python packages
# All installs below point at local files under the Kaggle input directory;
# `datasets` is installed with --no-index so pip does not query the package index.
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install -q ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install -q ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install -q ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

In [None]:
import os
import re
import json
import simplejson
import time
import datetime
import random
import glob
import importlib

# dataset manipulation
import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# pytorch
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline

from typing import List
import string
from functools import partial

import pickle
from joblib import Parallel, delayed

from collections import defaultdict, Counter
import gc

# tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# language preprocessing
import nltk

from typing import *

# spacy
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

# set seed
sns.set()  # apply seaborn's default plotting theme
# Seed the Python, NumPy and PyTorch random generators (each with its own fixed value).
# NOTE(review): no TensorFlow seed is set in this cell.
random.seed(123)
np.random.seed(456)
torch.manual_seed(42)
# Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print('packages loaded')

In [None]:
# The sample submission is read first so its length can influence the run mode.
sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# CV scoring is off by default and is (re)switched off for submissions with more than 4 rows.
COMPUTE_CV = False
if len(sample_submission) > 4:
    COMPUTE_CV = False

print(
    'this submission notebook will compute CV score but commit notebook will not'
    if COMPUTE_CV
    else 'this submission notebook will only be used to submit result'
)
    

In [None]:
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
train = pd.read_csv(train_path)

# In CV mode the training frame stands in for the submission and papers are read
# from the train folder; otherwise the real sample submission / test folder is used.
if not COMPUTE_CV:
    sample_submission = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
else:
    sample_submission = train
    paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/train'
test_files_path = paper_test_folder

adnl_govt_labels_path = '../input/bigger-govt-dataset-list/data_set_800.csv'

In [None]:
# Number of leading training rows to keep; 0 keeps none.
MAX_SAMPLE = 0
train = train[:MAX_SAMPLE]

paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'

# Parsed JSON of every paper referenced by the truncated training frame, keyed by paper id.
papers = {}
for paper_id in train['Id'].unique():
    paper_file = f'{paper_train_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        papers[paper_id] = json.load(f)

In [None]:
# Add the parsed JSON of every paper listed in the submission frame to `papers`.
for paper_id in tqdm(sample_submission['Id']):
    paper_file = f'{paper_test_folder}/{paper_id}.json'
    with open(paper_file, 'r') as f:
        papers[paper_id] = json.load(f)

# Additional government dataset

In [None]:
def clean_text(txt):
    """Lowercase ``str(txt)``, turn each run of non-alphanumerics into one space, and strip the ends."""
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [None]:
tmp3 = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')

# Candidate labels: cleaned labels that are not blank, followed by every dataset title.
tmp3_ = [
    label
    for label in tmp3['cleaned_label'].unique()
    if len(str(label).split()) > 0
]
tmp3_.extend(tmp3['dataset_title'].unique())

# De-duplicate the raw candidates, then normalise each one with clean_text.
tmp3 = list(map(clean_text, np.unique(tmp3_)))

In [None]:
tmp8 = pd.read_csv('../input/ci-ext-datasets-found-in-train-v2/train_ext_data.csv')

# Each cell is a '|'-separated string of labels; turn it into a list.
tmp8['ext_cleaned_label'] = tmp8['ext_cleaned_label'].apply(lambda cell: cell.split('|'))

# Flatten all per-row lists, de-duplicate, and keep the result as a one-column frame.
all_labels = [
    label
    for row_labels in tmp8['ext_cleaned_label'].values
    for label in row_labels
]
tmp8 = pd.DataFrame(list(np.unique(all_labels)), columns=['title'])
tmp8.head()

In [None]:
# Keep only the external titles that are not already among the training labels.
# A set gives constant-time membership tests instead of scanning the tmp3 list
# once per title; the selected titles and their order are unchanged.
known_titles = set(tmp3)
tmp8_ = [title for title in tmp8['title'].values if title not in known_titles]

print(len(tmp8_))
tmp8_ = pd.DataFrame(tmp8_, columns=['title'])

In [None]:
# Substrings that mark an external title as "not a dataset"; any title containing
# one of them is dropped below.
not_datasets = ['about', 'climatologists', 'control', 'exploration', 'defense', 
                'american community', 'american landscape', 'current population survey',
                'gulf of maine', 'argonne national laboratory s greet', 
                'annual wholesale trade',
                'bird conservation areas', 'bird incidental take', 'new housing', 'business patterns',
                'create', 'federal aid to states', 'freedom of information act', 'fruit and vegetable prices',
                'guidance navigation and control', 'high school and beyond', 'human resource management', 
                'housing unit estimates', 'international data base', 'labor market analysts', 'major land uses',
                'mars exploration program', 'new residential construction', 'oxygen delivery system',
                'pilot boarding areas', 'profiles in science', 'state fact sheets', 'summary of business',
                'tsunamis general', 'virtual grower', # 0.620
                
                'advanced monthly', 
                'advanced telecommunications', 
                'agricultural productivity',
                'annual survey', 
                'breeding bird', 
                'bridged race population estimates', 
                'building permits survey',
                'census of governments', 
                'clinical laboratory', 'coastal energy facilities', 
                'commodity costs and returns',
                'comprehensive environmental', 'county typology codes', 'delta cost project', 
                'endangered species act',
                'energy policy act', 
                'fertilizer', 'geostationary', 'landfire', 'occupational projections', 
                'marine mammal protection act', 
                'meat price', 'medication therapy', 'mexican american', 
                'milk cost',
                'animal health', 'weather', 'national environmental policy', 'national outbreak', 'natural amenities scale',
                'office', 'services file', 'stores', 'right whale', 'shuttle radar', 'solar dynamics',
                'business owners', 'expedition', 'usa'
               ]
for pattern in not_datasets:
    keep_mask = ~tmp8_['title'].str.contains(pattern)
    tmp8_ = tmp8_[keep_mask]

# Titles containing one of these names are collapsed to the bare name (applied in this order).
canonical_titles = [
    'national assessment of educational progress',
    'national postsecondary student aid study',
    'nursing home compare',
    'private school universe survey',
    'program for international student assessment',
    'progress in international reading literacy study',
    'schools and staffing survey',
]
for canonical in canonical_titles:
    variant_mask = tmp8_['title'].str.contains(canonical)
    tmp8_.loc[variant_mask, 'title'] = canonical

tmp8_ = list(tmp8_['title'].unique())
print(len(tmp8_))

In [None]:
# Merge training and external titles, normalise them, and de-duplicate again
# (normalisation can make two distinct raw titles identical).
merged_titles = np.unique(tmp3 + tmp8_)
all_datasets = np.unique([clean_text(title) for title in merged_titles])
print(len(all_datasets))
all_datasets[:5]

In [None]:
def clean_training_text(txt):
    """
    Case-preserving variant of clean_text: each run of non-alphanumeric
    characters in ``str(txt)`` becomes one space and the ends are stripped.
    """
    spaced = re.sub('[^A-Za-z0-9]+', ' ', str(txt))
    return spaced.strip()

def clean_text(txt):
    """Return ``str(txt)`` lowercased, with non-alphanumeric runs replaced by single spaces and ends stripped."""
    as_lower = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', as_lower).strip()

def totally_clean_text(txt):
    """
    Lowercase ``str(txt)``, replace non-alphanumeric runs with single spaces,
    strip the ends, then collapse any repeated spaces.
    """
    normalised = re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()
    return re.sub(' +', ' ', normalised)

def text_cleaning(text):
    '''
    Lowercase the text and reduce it to space-separated alphanumeric tokens.

    Every run of characters outside A-Z, a-z and 0-9 (punctuation, whitespace,
    emojis and any other symbol) is replaced by a single space, and leading and
    trailing spaces are removed.

    text - Sentence that needs to be cleaned (converted with str() first)
    Returns the cleaned string.
    '''
    # After this substitution only [a-z0-9] and single spaces remain, so the
    # former space-collapsing and emoji-stripping passes could never match
    # anything; they were removed (along with a per-call regex compilation).
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()
    
def read_json_pub(filename, train_data_path=paper_train_folder, output='text'):
    """
    Load ``<train_data_path>/<filename>.json`` and flatten its sections into one string.

    filename        - publication id, i.e. the file name without ``.json``
    train_data_path - directory that holds the JSON file
    output          - 'text' returns the section texts joined with spaces,
                      'head' returns the section titles joined with spaces,
                      any other value returns titles and texts interleaved
                      (title, text, title, text, ...) joined with '. '

    A section whose ``section_title`` or ``text`` is missing or null contributes
    an empty string, so joining no longer raises TypeError on such sections.
    """
    json_path = os.path.join(train_data_path, (filename+'.json'))
    with open(json_path, 'r') as f:
        sections = json.load(f)

    headings = [section.get('section_title') or '' for section in sections]
    contents = [section.get('text') or '' for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)

# Literal prediction

In [None]:
# For every paper, collect the known dataset titles that occur verbatim in its
# cleaned text, as one '|'-joined string per paper.
literal_preds = []
for row_index, row in tqdm(sample_submission.iterrows()):
    paper_text = text_cleaning(str(read_json_pub(row['Id'], test_files_path)))
    matched = ''
    for title in all_datasets:
        title = str(title)
        if title not in paper_text:
            continue
        cleaned_title = clean_text(title)
        if matched == '':
            matched = cleaned_title
        elif cleaned_title not in matched:
            # substring test against the joined string: a title that is already
            # contained in what was collected so far is not added again
            matched = matched + '|' + cleaned_title
    literal_preds.append(matched)
literal_preds[:5]

# XLM Roberta prediction

In [None]:
# Auxiliary functions
def jaccard_similarity(s1, s2):
    """
    Token-overlap score of two strings split on single spaces.

    The numerator is the number of distinct shared tokens; the denominator is
    the sum of both token-list lengths (duplicates counted) minus that number.
    """
    tokens1, tokens2 = s1.split(" "), s2.split(" ")
    shared = len(set(tokens1) & set(tokens2))
    return float(shared) / (len(tokens1) + len(tokens2) - shared)

def clean_paper_sentence(s):
    """
    Case-preserving cleaner: replace each run of non-alphanumeric characters in
    ``str(s)`` with a space, strip the ends, then collapse repeated spaces.
    """
    stripped = re.sub('[^A-Za-z0-9]+', ' ', str(s)).strip()
    return re.sub(' +', ' ', stripped)

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping word windows.

    A sentence with more than ``max_length`` whitespace-separated words is
    replaced by windows of at most ``max_length`` words, each starting
    ``max_length - overlap`` words after the previous one. Shorter sentences
    are passed through unchanged.

    sentences  - iterable of strings
    max_length - window size in words; defaults to the module-level MAX_LENGTH
    overlap    - words shared by consecutive windows; defaults to the
                 module-level OVERLAP

    Returns a new list of strings.
    Raises ValueError if ``overlap`` is not smaller than ``max_length`` (a
    non-positive step would otherwise drop long sentences or fail inside range).
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    step = max_length - overlap
    if step <= 0:
        raise ValueError('overlap must be smaller than max_length')

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            for p in range(0, len(words), step):
                short_sentences.append(' '.join(words[p:p+max_length]))
        else:
            short_sentences.append(sentence)
    return short_sentences

In [None]:
# Paths and Hyperparameters
MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

PREDICT_BATCH = 64000 # number of test rows written to the NER input file per prediction run

# One entry per model; the prediction cells index these lists with the model number.
PRETRAINED_PATH = ['../input/coleridge-xlm-roberta-base-epoch-1-training/output']
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
# Train and validation entries point at the same file.
TRAIN_PATH = ['../input/coleridge-xlm-roberta-base-epoch-1-training/train_ner.json']
VAL_PATH = ['../input/coleridge-xlm-roberta-base-epoch-1-training/train_ner.json']

# Directory and file name from which predictions are read back after each run.
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

In [None]:
# One row per publication: keep the first title and '|'-join each label column.
aggregations = {
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join,
}
train = train.groupby('Id').agg(aggregations).reset_index()

print(f'No. grouped training rows: {len(train)}')

In [None]:
test_rows = []     # test data in NER format
paper_length = []  # number of kept sentences per paper, in submission order

for paper_id in sample_submission['Id']:
    paper = papers[paper_id]

    # split every section on '.', clean each piece, then cut long pieces into windows
    sentences = []
    for section in paper:
        for sentence in section['text'].split('.'):
            sentences.append(clean_training_text(sentence))
    sentences = shorten_sentences(sentences)

    # keep pieces longer than 5 characters that mention one of the trigger words
    sentences = [
        sentence for sentence in sentences
        if len(sentence) > 5
        and any(word in sentence.lower() for word in ['data', 'study', 'from'])
    ]

    # one NER row per sentence, with a placeholder 'O' tag for every token
    for sentence in sentences:
        sentence_words = sentence.split()
        test_rows.append({'tokens' : sentence_words, 'tags' : ['O'] * len(sentence_words)})

    # remember how many rows belong to this paper
    paper_length.append(len(sentences))

print(f'total number of sentences: {len(test_rows)}')

In [None]:
def set_os_env(pretrained_path, train_path, val_path):
    """
    Export the paths for one prediction run as environment variables.

    The three arguments become MODEL_PATH, TRAIN_FILE and VALIDATION_FILE;
    TEST_FILE and OUTPUT_DIR are built from the module-level
    TEST_INPUT_SAVE_PATH / TEST_NER_DATA_FILE / PREDICTION_SAVE_PATH constants.
    """
    os.environ.update({
        "MODEL_PATH": f"{pretrained_path}",
        "TRAIN_FILE": f"{train_path}",
        "VALIDATION_FILE": f"{val_path}",
        "TEST_FILE": f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}",
        "OUTPUT_DIR": f"{PREDICTION_SAVE_PATH}",
    })

In [None]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make necessary directories and files
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
def bert_predict():
    # Runs kaggle_run_ner.py in predict-only mode (--do_predict) as a shell command.
    # The "$MODEL_PATH"-style arguments are presumably expanded by the shell from
    # the environment variables written by set_os_env, so that function must
    # have been called first -- TODO confirm.
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

In [None]:
# Per-model list of predicted tag sequences (one list of tags per test row).
final_bert_outputs = []
for i in range(1):
    print(f'Prediction Bert model {i}')
    set_os_env(PRETRAINED_PATH[i], TRAIN_PATH[i], VAL_PATH[i])
    
    bert_outputs = []

    for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
        # write data rows to input file
        with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
            for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
                json.dump(row, f)
                f.write('\n')

        # remove output dir
        !rm -r "$OUTPUT_DIR"

        # do predict
        bert_predict()

        # read predictions
        with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
            # one line per row; the last split element is dropped (text after the final newline)
            this_preds = f.read().split('\n')[:-1]
            bert_outputs += [pred.split() for pred in this_preds]
        # NOTE(review): this unconditional break stops after the first batch, so only
        # the first PREDICT_BATCH rows of test_rows ever get predictions; rows beyond
        # that have no entry in bert_outputs. Confirm this is an intended runtime cap.
        break
    final_bert_outputs.append(bert_outputs)

In [None]:
# get test sentences
# Keep only the token lists; they are paired with the predicted tags in the next cell.
test_sentences = [row['tokens'] for row in test_rows]

# test_rows is no longer needed once the tokens are extracted.
del test_rows

In [None]:
def _phrases_from_tags(words, tags):
    """Return the set of phrases spelled out by B/I tags over one sentence."""
    found = set()
    phrase = ''
    for word, tag in zip(words, tags):
        if tag == 'I' and phrase:
            # continue the open phrase
            phrase = phrase + ' ' + word
            continue
        # any other situation closes the open phrase (if there is one) ...
        if phrase:
            found.add(phrase)
        # ... and only a 'B' tag opens a new one
        phrase = word if tag == 'B' else ''
    # a phrase that runs to the end of the sentence
    if phrase:
        found.add(phrase)
    return found


final_dataset_labels = []

for i in range(1):

    bert_dataset_labels = []  # one set of dataset labels per publication

    for length in paper_length:
        labels = set()
        paired = zip(test_sentences[:length], final_bert_outputs[i][:length])
        for sentence, pred in paired:
            labels |= _phrases_from_tags(sentence, pred)

        # record dataset labels for this publication
        bert_dataset_labels.append(labels)

        # consume this publication's rows so the next one starts at index 0
        del test_sentences[:length]
        del final_bert_outputs[i][:length]
    final_dataset_labels.append(bert_dataset_labels)

final_dataset_labels[0][:5]

# Filter based on Jaccard

In [None]:
# For each model and each publication: visit labels shortest first and keep a
# cleaned label only if its similarity to every label kept so far is below 0.4.
final_xlm_roberta_labels = []
for bert_dataset_labels in final_dataset_labels:
    filtered_bert_labels = []
    for labels in bert_dataset_labels:
        kept = []
        for candidate in sorted(labels, key=len):
            candidate = clean_text(candidate)
            too_similar = any(
                jaccard_similarity(candidate, prior) >= 0.4 for prior in kept
            )
            if not too_similar:
                kept.append(candidate)

        filtered_bert_labels.append('|'.join(kept))
    final_xlm_roberta_labels.append(filtered_bert_labels)
del filtered_bert_labels

print(final_xlm_roberta_labels[0][:5])

# spaCy prediction

In [None]:
def clean_text(text: str) -> str:
    """Lowercase ``str(text)``, replace non-alphanumeric runs with one space, strip the ends."""
    lowered = str(text).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered).strip()


def clean_texts(texts: List[str]) -> List[str]:
    """Apply clean_text to every element and return the results as a list."""
    return list(map(clean_text, texts))

def read_json(index: str, test_train) -> Dict:
    """
    Parse ``{test_train}/{index}.json`` from the competition input directory
    with simplejson and return the decoded content.
    """
    filename = f"../input/coleridgeinitiative-show-us-the-data/{test_train}/{index}.json"
    with open(filename) as handle:
        return simplejson.load(handle)
        
def json2text(index: str, test_train) -> str:
    """
    Read one paper via read_json and return all of its sections as a single
    string: each section is "<section_title> <text>", sections joined by spaces.
    """
    sections = read_json(index, test_train)
    return " ".join(
        row["section_title"] + " " + row["text"]
        for row in sections
    )

def filename_to_index(filename):
    """
    Strip the directory part and the last extension from a '/'-separated path.

    Everything up to and including the final '/' is removed, as is a trailing
    ".<ext>" (a dot followed by one or more non-dot characters).
    """
    # Raw string: the previous plain literal contained the invalid escape "\.",
    # which Python reports as a deprecated/invalid escape sequence. The pattern
    # text itself is unchanged.
    return re.sub(r"^.*/|\.[^.]+$", '', filename)

def glob_to_indices(globpath):
    """Return filename_to_index(path) for every path matched by ``globpath``, as a list."""
    return [filename_to_index(path) for path in glob.glob(globpath)]

# Inspired by: https://www.kaggle.com/hamditarek/merge-multiple-json-files-to-a-dataframe
def dataset_df(test_train="test"):
    """
    Build an (id, text) DataFrame from every JSON file in the ``test_train``
    folder of the competition data, write it to "<test_train>.json.csv" in the
    working directory, and return it.

    Papers are converted to text with json2text, run through joblib with n_jobs=-1.
    """
    pattern = f"../input/coleridgeinitiative-show-us-the-data/{test_train}/*.json"
    indices = glob_to_indices(pattern)
    jobs = (delayed(json2text)(index, test_train) for index in indices)
    texts = Parallel(n_jobs=-1)(jobs)

    records = []
    for index, text in zip(indices, texts):
        records.append({ "id": index, "text": text})
    df = pd.DataFrame(records)
    df.to_csv(f"{test_train}.json.csv", index=False)
    return df

In [None]:
# Rebuild `papers` as {paper id: full text}, where the full text is every
# section's text followed by ' .', concatenated in section order.
# Papers are read from paper_test_folder (instead of a hard-coded test path) so
# that the ids in sample_submission resolve in CV mode too, where
# sample_submission holds training ids.
papers = {}
for paper_id in sample_submission['Id'].values:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        sections = json.load(f)
    # join once instead of growing a string section by section
    papers[paper_id] = ''.join(section['text'] + ' .' for section in sections)

In [None]:
# load spacy classifier model
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load this file if the dataset it comes from is trusted.
with open('../input/coleridge-spacy-classifier/spacy_model.pickle', 'rb') as f:
    nlp = pickle.load(f)

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import sent_tokenize

#### remove >.5 jaccard matches from predictions
def jaccard_similarity(s1, s2):
    """
    Token-overlap score of two strings split on whitespace.

    The numerator is the number of distinct shared tokens; the denominator is
    the sum of both token-list lengths (duplicates counted) minus that number.

    Two inputs without any token (empty or whitespace-only) are treated as
    identical and score 1.0 instead of raising ZeroDivisionError; this matches
    what the earlier single-space-splitting definition in this notebook
    returns for two empty strings.
    """
    l1 = s1.split()
    l2 = s2.split()
    intersection = len(set(l1).intersection(l2))
    union = (len(l1) + len(l2)) - intersection
    if union == 0:
        return 1.0
    return float(intersection) / union

# Rule-based candidate extraction scored by the pickled spaCy classifier `nlp`.
# For each paper the result is a '|'-joined string of cleaned labels in
# spacy_predictions (same order as sample_submission).
start_time = time.time()
# NOTE(review): column_names / submission are not referenced again in the visible notebook.
column_names = ["Id", "PredictionString"]
submission = pd.DataFrame(columns = column_names)

# A candidate is only accepted if its lowercased text contains one of these substrings.
no_delete = ['study', 'dataset', 'model','survey','data','adni','codes', 'genome', 'program','assessment','database','census','initiative','gauge','system','stewardship','surge']

spacy_predictions = []
for index, row in sample_submission.iterrows():
    # to_append[1] accumulates the '|'-joined candidates for this paper
    to_append=[row['Id'],'']
    passage = papers[row['Id']]
    # light normalisation: drop the apostrophe of "'s", turn hyphens and commas into spaces
    passage=passage.replace("'s","s")
    passage=passage.replace("-"," ")
    passage=passage.replace(","," ")
    
    ######## ACRONYMS
    # Find "(ABC)"-style acronyms (two or more capitals in parentheses) and build a
    # definition from the capitalised words just before them.
    for match in re.finditer(r"(\(([A-Z]{2,})\))", passage):
    #for match in re.finditer(r"(\((.*?)\))", data):
        caps=[]
        start_index = match.start()
        # group(1) is the outer group, so abbr includes the parentheses and
        # size counts them too (the look-back window is acronym length + 2 words).
        abbr = match.group(1)
        size = len(abbr)
        words = passage[:start_index].split()[-size:]
        for word in words:
            if word[0].isupper():
                caps.append(word)
        definition = " ".join(caps)
        # skip definitions with 15 or more capital letters
        if sum(1 for c in definition if c.isupper()) < 15:
            words = [word for word in no_delete if word in definition.lower()]
            doc=nlp(definition)
            score=doc.cats['POSITIVE']
            if len(words)>0 and  score > .99:
                # `definition not in to_append[1]` is a substring test on the joined string
                if to_append[1]!='' and definition not in to_append[1]:
                    # NOTE(review): abbr is appended twice on this path (here and on the next line).
                    to_append[1] = to_append[1]+'|'+definition+'|'+abbr
                    to_append[1] = to_append[1]+'|'+abbr
                if to_append[1]=='':
                    to_append[1] = definition
                    to_append[1] = to_append[1]+'|'+abbr
                            
    #### cap word sequence
    # Fallback, used only when the acronym pass found nothing: sequences of two or
    # more capitalised words in the stopword-stripped passage.
    if to_append[1]=='':        
        mylist=re.findall('([A-Z][\w-]*(?:\s+[A-Z][\w-]*)+)', remove_stopwords(passage))
        # de-duplicate while keeping first-seen order
        mylist = list(dict.fromkeys(mylist))
        for match in mylist:
            upper_score=sum(1 for c in match if c.isupper())
            if upper_score < 15:
                words = [word for word in no_delete if word in match.lower()]
                doc=nlp(match)
                score=doc.cats['POSITIVE']
                if len(words)>0 and len(match.split())>=2 and score > .99:
                    if to_append[1]!='' and match not in to_append[1]:
                        to_append[1] = to_append[1]+'|'+match
                    if to_append[1]=='':
                        to_append[1] = match
            
    ###### remove similar jaccard
    # Visit candidates shortest first; keep a cleaned candidate only if its
    # similarity to every candidate kept so far is below .5.
    got_label=to_append[1].split('|')
    filtered=[]
    filtered_labels = ''
    for label in sorted(got_label, key=len):
        label = clean_text(label)
        # the generator's got_label is local to the generator and does not
        # rebind the list being iterated above
        if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < .5 for got_label in filtered):
            filtered.append(label)
            if filtered_labels!='':
                filtered_labels=filtered_labels+'|'+label
            if filtered_labels=='':
                filtered_labels=label
    
    to_append[1] = filtered_labels  
    
    spacy_predictions.append(to_append[1])
    
print("--- %s seconds ---" % (time.time() - start_time))

spacy_predictions[:5]

# Custom transformer model

In [None]:
# Saved weights for the Keras sentence classifier (passed to load_weights) and
# the pickled tokenizer used to encode sentences for it.
model_path = '../input/ci-transformers-model-v2/model/sent_transformer'
tokenizer_path = '../input/ci-transformers-model-v2/tokenizer.pickle'

In [None]:
""" build transformer model"""

# NOTE(review): these values presumably have to match the configuration the
# saved weights were trained with -- confirm before changing any of them.
maxlen = 500  # input length; sequences are padded to this length before prediction
num_classes = 2  # NOTE(review): not referenced in the visible code; the output layer hard-codes 2
vocab_size = 32824  # input_dim of the token embedding

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

class TransformerBlock(layers.Layer):
    """
    Keras layer combining multi-head self-attention and a two-layer feed-forward
    network; each sub-layer is followed by dropout, a residual addition and
    layer normalisation.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """
        embed_dim - attention key_dim and output width of the feed-forward network
        num_heads - number of attention heads
        ff_dim    - hidden width of the feed-forward network
        rate      - dropout rate used by both dropout layers
        """
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to ``inputs``; ``training`` is forwarded to both dropout layers."""
        # self-attention: the same tensor is used as query and value
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        # residual connection around attention, then layer norm
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # residual connection around the feed-forward network, then layer norm
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    """Keras layer that adds a learned position embedding to a learned token embedding."""
    def __init__(self, maxlen, vocab_size, embed_dim):
        """
        maxlen     - number of positions in the position embedding table
        vocab_size - number of entries in the token embedding table
        embed_dim  - output size of both embeddings
        """
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of ``x`` plus the embeddings of positions 0..len-1."""
        # local maxlen: size of the last axis of x (shadows the module-level maxlen)
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

# Assemble the classifier: embedding -> transformer block -> average pooling
# -> dropout -> Dense(20, relu) -> dropout -> 2-way softmax.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model_t = keras.Model(inputs=inputs, outputs=outputs)
# Restore the saved weights from model_path, then compile.
model_t.load_weights(model_path)
model_t.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
# NOTE(review): this callback is created but not passed to anything in the
# visible notebook (the model is only used for predict).
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,
                patience=0, verbose=1, mode='auto', baseline=None, restore_best_weights=True)

In [None]:
from fuzzywuzzy import fuzz

# prepare list of dataset titles to match
train_df = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')
sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# Thin out the known titles: visit them shortest first and keep a cleaned title
# only if its similarity to every title kept so far is below 0.2.
kept_titles = []
for title in sorted(all_datasets, key=len):
    title = clean_text(title)
    overlaps_kept = any(
        jaccard_similarity(title, kept) >= 0.2 for kept in kept_titles
    )
    if not overlaps_kept:
        kept_titles.append(title)

ds_titles = np.array(kept_titles)
ds_titles.shape

In [None]:
# Placeholder, immediately replaced by the unpickled tokenizer below.
tokenizer = ''
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load this file if its source is trusted.
with open(tokenizer_path, "rb") as openfile:
    tokenizer = pickle.load(openfile)
            
test_data_path = '../input/coleridgeinitiative-show-us-the-data/test'
# Rebinds test_sentences to a dict: paper id -> candidate sentences.
test_sentences = {}
candidate_threshold = 0.3  # a sentence is a candidate when the model's class-1 score exceeds this
acceptance_score = 80  # a title is accepted when its fuzz.partial_ratio score exceeds this

def read_json_pub(Id):
    """
    Return the parsed JSON of ``<test_data_path>/<Id>.json``.

    Note: this replaces the three-argument read_json_pub defined earlier in the
    notebook, which returned flattened text rather than the parsed JSON.
    """
    with open(os.path.join(test_data_path, Id+'.json')) as f:
        return json.load(f)

# For each paper: classify its sentences, then fuzzy-match the candidate
# sentences against the known dataset titles.
transformers_preds = []
for index, row in tqdm(sample_submission.iterrows(), total = sample_submission.shape[0]):
    pub_id = row['Id']

    # Load text: every value of every section dict, one per line
    raw_text = read_json_pub(pub_id)
    text = '\n'.join(value for section in raw_text for value in section.values())

    # split into sentences, then lowercase and keep only a-z and spaces
    sentences = [
        re.sub(r"[^a-z ]+","", s.lower())
        for s in nltk.sent_tokenize(re.sub(r'\.?\n', '. ', text))
    ]

    # tokenize and left-pad to the model's input length
    tokens = tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(sentences), maxlen=maxlen, padding='pre',)

    # Predict candidates sentences that may contain DS references
    y_pred = model_t.predict(tokens, batch_size=32)
    is_candidate = y_pred[:,1] > candidate_threshold
    sent_candidates = np.array(sentences)[is_candidate]
    test_sentences[pub_id] = sent_candidates

    # keep the best-scoring title of each candidate sentence if it scores high enough
    ds_candidates = set()
    for sent in sent_candidates:
        scores = [fuzz.partial_ratio(sent.lower(), title) for title in ds_titles]
        best = np.argmax(scores)
        if scores[best] > acceptance_score:
            ds_candidates.add(ds_titles[best])
    transformers_preds.append(' | '.join(ds_candidates))

transformers_preds[:5]

# Aggregate all predictions

In [None]:
# Merge the three model predictions per paper; the literal title match is used
# only when none of the models produced a label.
final_predictions = []
for bert_pred, trans_pred, spacy_pred, literal_match in zip(
    final_xlm_roberta_labels[0],
    transformers_preds,
    spacy_predictions,
    literal_preds
):
    model_labels = []
    for pred_string in (bert_pred, trans_pred, spacy_pred):
        model_labels.extend(piece for piece in pred_string.split('|') if piece != '')

    labels = np.unique(model_labels)
    if len(labels) == 0:
        final_predictions.append(literal_match)
        continue

    # shortest first; keep a cleaned label only if its similarity to every
    # label kept so far is below 0.4
    filtered = []
    for label in tqdm(sorted(labels, key=len)):
        label = clean_text(label)
        if not any(jaccard_similarity(label, got_label) >= 0.4 for got_label in filtered):
            filtered.append(label)
    final_predictions.append('|'.join(filtered))

final_predictions[:5]

In [None]:
# Attach the merged predictions (one string per row, same order as
# sample_submission) and write the two submission columns to submission.csv.
sample_submission['PredictionString'] = final_predictions
sample_submission[['Id', 'PredictionString']].to_csv('submission.csv', index=False)

sample_submission.head()