# Initialization

## Imports

In [None]:
import re
import os
import gc
import glob
import json
import torch
import tokenizers
import numpy as np
import transformers
import pandas as pd
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt

from tokenizers import *
from transformers import *
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset

## Params

In [None]:
# Seed value; presumably meant to be passed to seed_everything (not called in the visible code).
SEED = 2020

# Root of the competition data and its train / test sub-folders.
DATA_PATH = "../input/coleridgeinitiative-show-us-the-data/"
DATA_PATH_TRAIN = DATA_PATH + 'train/'
DATA_PATH_TEST = DATA_PATH + 'test/'

# NOTE(review): not referenced anywhere in the visible code.
CP_PATH = '../input/coleridge-bert-qa/'

# Number of worker processes given to the DataLoader in predict().
NUM_WORKERS = 4

# Vocabulary file per model name, read by create_tokenizer_and_tokens().
VOCABS = {
    "bert-base-uncased": "../input/vocabs/bert-base-uncased-vocab.txt",
    "roberta-base": "../input/vocabs/roberta-base-vocab.json",
}

# BPE merges file per model name (only used on the roberta branch).
MERGES = {
    "roberta-base": "../input/vocabs/roberta-base-merges.txt"
}

# Folder holding the model config json per model name, read by NERTransformer.
MODEL_PATHS = {
    'bert-base-uncased': '../input/bertconfigs/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/',
    'bert-large-uncased-whole-word-masking-finetuned-squad': '../input/bertconfigs/wwm_uncased_L-24_H-1024_A-16/wwm_uncased_L-24_H-1024_A-16/',
    'albert-large-v2': '../input/albert-configs/albert-large-v2/albert-large-v2/',
    'albert-base-v2': '../input/albert-configs/albert-base-v2/albert-base-v2/',
    'distilbert': '../input/albert-configs/distilbert/distilbert/',
    'roberta-base': '../input/robertabaseconf/',
}

# Utils

In [None]:
def load_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Loads the weights of a PyTorch model from a checkpoint file.
    The checkpoint is first deserialized as saved; if that fails because the
    devices it was saved on are not available, it is re-read onto the cpu.
    Args:
        model (torch model): Model to load the weights to.
        filename (str): Name of the checkpoint.
        verbose (int, optional): Whether to display infos. Defaults to 1.
        cp_folder (str, optional): Folder to load from. Defaults to "".
    Returns:
        torch model: Model with loaded weights.
    """
    path = os.path.join(cp_folder, filename)

    if verbose:
        print(f"\n -> Loading weights from {path}\n")

    # Only the deserialization is guarded: a key / shape mismatch raised by
    # load_state_dict(strict=True) must surface instead of being retried.
    try:
        state_dict = torch.load(path)
    except RuntimeError:
        # Typically a checkpoint saved on gpu being opened on a cpu-only machine.
        state_dict = torch.load(path, map_location="cpu")

    model.load_state_dict(state_dict, strict=True)
    return model


def seed_everything(seed):
    """
    Seeds basic parameters for reproducibility of results.
    Seeds the python, numpy and torch (cpu and cuda) random generators, sets
    PYTHONHASHSEED and switches cudnn to its deterministic mode.
    Args:
        seed (int): Number of the seed.
    """
    # Imported locally: the notebook's import cell does not import `random`.
    import random

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    

class Config:
    """
    Lightweight attribute container built from a dictionary
    (used here for the config json saved with an experiment).
    """
    def __init__(self, dic):
        # Every key of the dictionary becomes an attribute of the instance.
        for name in dic:
            setattr(self, name, dic[name])

# Data

## Tokenizer

In [None]:
def create_tokenizer_and_tokens(config):
    """
    Builds the tokenizer matching config.selected_model and looks up the ids
    of its special tokens.
    Args:
        config (Config): Needs the attributes selected_model and lowercase.
    Returns:
        tokenizer: ByteLevelBPETokenizer for roberta models, BertWordPieceTokenizer otherwise.
        dict: Ids of the special tokens, under the keys "cls", "sep" and "pad".
    Raises:
        NotImplementedError: For albert models.
    """
    model_name = config.selected_model
    is_roberta = "roberta" in model_name

    if not is_roberta and "albert" in model_name:
        raise NotImplementedError

    if is_roberta:
        tokenizer = ByteLevelBPETokenizer(
            vocab=VOCABS[model_name],
            merges=MERGES[model_name],
            lowercase=config.lowercase,
            add_prefix_space=True,
        )
        special = (("cls", "<s>"), ("sep", "</s>"), ("pad", "<pad>"))
    else:
        tokenizer = BertWordPieceTokenizer(
            VOCABS[model_name],
            lowercase=config.lowercase,
        )
        special = (("cls", "[CLS]"), ("sep", "[SEP]"), ("pad", "[PAD]"))

    tokens = {key: tokenizer.token_to_id(token) for key, token in special}
    return tokenizer, tokens


## Process sample

In [None]:
def load_text(id_, root=""):
    """
    Loads the json file of an article.
    Args:
        id_ (str): File name without the ".json" extension.
        root (str, optional): Folder containing the file. Defaults to "".
    Returns:
        Parsed json content of the file.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(os.path.join(root, id_ + ".json"), encoding="utf-8") as f:
        text = json.load(f)
    return text


def clean_text(txt):
    """
    Lowercases the input and replaces every run of characters other than
    ascii letters and digits with a single space, then strips the ends.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()


def remove_spaces(txt):
    """
    Replaces newlines and tabs with spaces, collapses every run of
    whitespace into one space and strips the ends.
    """
    for pattern in ('\n', '\t', r'\s+'):
        txt = re.sub(pattern, ' ', txt)
    return txt.strip()


def process_data_inf(
    text,
    tokenizer,
    tokens,
    max_len=512,
    model_name="bert",
):
    """
    Converts a text into fixed-length model inputs for inference.
    The text is whitespace-normalized and prefixed with a space, tokenized,
    truncated to max_len - 2 tokens, wrapped in the cls / sep tokens and
    padded up to max_len.
    Args:
        text (str): Text to process.
        tokenizer: Object whose encode(text) result exposes .ids and .offsets.
        tokens (dict): Special token ids under the keys "cls", "sep" and "pad".
        max_len (int, optional): Length of the returned sequences. Defaults to 512.
        model_name (str, optional): Model name; roberta models get segment id 0,
            the others segment id 1 on non-padded positions. Defaults to "bert".
    Returns:
        dict: "ids", "token_type_ids", "text" (the normalized text) and "offsets".
    """
    text = " " + " ".join(str(text).split())
    encoding = tokenizer.encode(text)

    # Two positions are reserved for the cls and sep tokens.
    budget = max_len - 2
    body_ids, body_offsets = encoding.ids, encoding.offsets
    if len(body_ids) > budget:
        body_ids, body_offsets = body_ids[:budget], body_offsets[:budget]

    # Special and padding positions carry the empty (0, 0) span.
    no_span = (0, 0)
    input_ids = [tokens["cls"]] + body_ids + [tokens["sep"]]
    offsets = [no_span] + body_offsets + [no_span]

    assert len(input_ids) <= max_len
    assert len(input_ids) == len(offsets)

    segment_id = 0 if "roberta" in model_name else 1
    token_type_ids = [segment_id] * len(input_ids)

    n_pad = max_len - len(input_ids)
    if n_pad > 0:
        input_ids.extend([tokens["pad"]] * n_pad)
        token_type_ids.extend([0] * n_pad)
        offsets.extend([no_span] * n_pad)

    return {
        "ids": input_ids,
        "token_type_ids": token_type_ids,
        "text": text,
        "offsets": offsets,
    }


## Dataset

In [None]:
class ArticleDataset(Dataset):
    """
    Torch dataset over the text chunks of a single article.
    The article json is loaded with load_text and its sections are split
    into chunks of sentences that fit the token budget of the model.
    """
    def __init__(
        self,
        id_,
        tokenizer,
        tokens,
        max_len=512,
        model_name="bert",
        root="",
    ):
        """
        Args:
            id_ (str): Id of the article (json file name without extension).
            tokenizer: Tokenizer used to measure and encode the chunks.
            tokens (dict): Special token ids ("cls", "sep", "pad").
            max_len (int, optional): Sequence length fed to the model. Defaults to 512.
            model_name (str, optional): Name of the model. Defaults to "bert".
            root (str, optional): Folder containing the article. Defaults to "".
        """
        self.tokens = tokens
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.model_name = model_name

        self.article = load_text(id_, root=root)
        self.texts = self.article_to_texts()

    def __len__(self):
        return len(self.texts)

    def _pack_sentences(self, sentences, text, texts):
        """
        Greedily appends sentences to the running chunk `text`. When the next
        sentence would not fit, the chunk is pushed to `texts` and a new one is started.
        Returns the chunk still being built.
        """
        for sentence in sentences:
            candidate = text + sentence + ". "
            if len(self.tokenizer.encode(candidate)) < self.max_len - 2:
                text = candidate
            else:
                # Nothing to flush when the very first sentence is already too
                # long: pushing "" would create an empty sample.
                if text:
                    texts.append(text)
                text = sentence + ". "
        return text

    def article_to_texts(self):
        """
        Splits the article into chunks, never mixing sentences of different sections.
        Returns:
            list of str: Chunks of the article.
        """
        texts = []
        for section in self.article:
            sentences = remove_spaces(section["text"]).split(". ")
            text = self._pack_sentences(sentences, "", texts)
            if text:
                texts.append(text)

        return texts

    def article_to_texts_2(self):
        """
        Same as article_to_texts, but a chunk may continue across section boundaries.
        Returns:
            list of str: Chunks of the article.
        """
        texts = []
        text = ""
        for section in self.article:
            sentences = remove_spaces(section["text"]).split(". ")
            text = self._pack_sentences(sentences, text, texts)

        if text:
            texts.append(text)

        return texts

    def __getitem__(self, idx):
        data = process_data_inf(
            self.texts[idx],
            self.tokenizer,
            self.tokens,
            max_len=self.max_len,
            model_name=self.model_name,
        )

        return {
            "ids": torch.tensor(data["ids"], dtype=torch.long),
            "token_type_ids": torch.tensor(data["token_type_ids"], dtype=torch.long),
            "text": data["text"],
            # Stored as float here; get_pred_from_probas casts the offsets back to int.
            "offsets": torch.tensor(data["offsets"], dtype=torch.float),
        }


# Model

In [None]:
# Maps a model name to (model class, pretrained name, config class).
# NERTransformer builds the config with the config class and instantiates the model class from it.
TRANSFORMERS = {
    "roberta-base": (RobertaModel, "roberta-base", RobertaConfig),
    "albert-base-v2": (AlbertModel, "albert-base-v2", AlbertConfig),
    "albert-large-v2": (AlbertModel, "albert-large-v2", AlbertConfig),
    "albert-xlarge-v2": (AlbertModel, "albert-xlarge-v2", AlbertConfig),
    "albert-xxlarge-v2": (AlbertModel, "albert-xxlarge-v2", AlbertConfig),
    "bert-base-uncased": (BertModel, "bert-base-uncased", BertConfig),
    "bert-base-cased": (BertModel, "bert-base-cased", BertConfig),
    "bert-large-uncased-whole-word-masking": (
        BertModel,
        "bert-large-uncased-whole-word-masking",
        BertConfig,
    ),
    "distilbert-base-uncased-distilled-squad": (
        DistilBertModel,
        "distilbert-base-uncased-distilled-squad",
        # DistilBertModel reads DistilBERT-specific config fields, so it has to
        # be paired with DistilBertConfig rather than BertConfig.
        DistilBertConfig,
    ),
}


class NERTransformer(nn.Module):
    """
    Transformer backbone followed by a two-layer 1D convolutional head and a
    linear layer producing one logit per token.
    """
    def __init__(
        self,
        model,
        nb_layers=1,
        nb_ft=None,
        k=5,
        drop_p=0.1,
        multi_sample_dropout=False,
        use_squad_weights=False,
    ):
        """
        Args:
            model (str): Model name, a key of TRANSFORMERS and MODEL_PATHS.
            nb_layers (int, optional): Number of last hidden states concatenated as features. Defaults to 1.
            nb_ft (int or None, optional): Output channels of the conv head. Defaults to the backbone hidden size.
            k (int, optional): Kernel size of the convolutions. Defaults to 5.
            drop_p (float, optional): Dropout probability in the conv head. Defaults to 0.1.
            multi_sample_dropout (bool, optional): Whether to average 5 high-dropout passes in training. Defaults to False.
            use_squad_weights (bool, optional): Unused in this class. Defaults to False.
        """
        super().__init__()
        self.name = model
        self.nb_layers = nb_layers
        self.multi_sample_dropout = multi_sample_dropout

        # Token id treated as padding when building the attention mask.
        self.pad_idx = 1 if "roberta" in self.name else 0

        model_class, _, config_class = TRANSFORMERS[model]

        # The config file is named 'bert_config.json' in some folders and
        # 'config.json' in others. Only a missing / unreadable file triggers
        # the fallback; any other error is reported as is.
        try:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        except OSError:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'config.json')
        config.output_hidden_states = True

        # Built from the config only; trained weights are loaded separately
        # (see load_model_weights).
        self.transformer = model_class(config)

        if "distil" in self.name:
            self.nb_features = self.transformer.transformer.layer[
                -1
            ].ffn.lin2.out_features
        elif "albert" in self.name:
            self.nb_features = (
                self.transformer.encoder.albert_layer_groups[-1]
                .albert_layers[-1]
                .ffn_output.out_features
            )
        else:
            self.nb_features = self.transformer.pooler.dense.out_features

        if nb_ft is None:
            nb_ft = self.nb_features

        self.logits = nn.Sequential(
            # nn.Linear(nb_ft * 2, nb_ft),
            # nn.Tanh(),
            nn.Linear(nb_ft, 1),
        )
        self.cnn = nn.Sequential(
            nn.Conv1d(self.nb_features * self.nb_layers, nb_ft * 2, kernel_size=k, padding=k // 2),
            nn.Tanh(),
            nn.Dropout(drop_p),
            nn.Conv1d(nb_ft * 2, nb_ft, kernel_size=k, padding=k // 2),
            nn.Tanh(),
            nn.Dropout(drop_p),
        )

        self.high_dropout = nn.Dropout(p=0.5)

    def forward(self, tokens, token_type_ids):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens
            token_type_ids {torch tensor} -- Sentence tokens ids

        Returns:
            torch tensor -- Logits, one per token (last dimension of size 1)
        """

        # Padding positions are masked out of the attention.
        if "distil" in self.name:
            hidden_states = self.transformer(
                tokens,
                attention_mask=(tokens != self.pad_idx).long(),
            )[-1]
        else:
            hidden_states = self.transformer(
                tokens,
                attention_mask=(tokens != self.pad_idx).long(),
                token_type_ids=token_type_ids,
            )[-1]

        # Last layers first, then the nb_layers last ones are concatenated on the feature axis.
        hidden_states = hidden_states[::-1]
        features = torch.cat(hidden_states[:self.nb_layers], -1)

        if self.multi_sample_dropout and self.training:
            # Average of the logits over 5 passes with independent dropout masks.
            logits = torch.mean(
                torch.stack(
                    [
                        self.logits(
                            self.cnn(self.high_dropout(features).transpose(1, 2)).transpose(1, 2)
                        )
                        for _ in range(5)
                    ],
                    dim=0,
                ),
                dim=0,
            )
        else:
            # Conv1d works on (batch, channels, length), hence the transposes.
            logits = self.logits(
                self.cnn(features.transpose(1, 2)).transpose(1, 2)
            )

        return logits


# Inference


## Predict

In [None]:
def predict(model, dataset, batch_size=32, activation="sigmoid"):
    """
    Usual predict torch function.
    The batches are sent to the device holding the model's parameters.
    Args:
        model (torch model): Model to predict with, called as model(ids, token_type_ids).
        dataset (torch dataset): Dataset yielding dicts with the keys "ids" and "token_type_ids".
        batch_size (int, optional): Batch size. Defaults to 32.
        activation (str, optional): "sigmoid", "softmax" (over dim 2) or anything else
            to keep the raw outputs. Defaults to "sigmoid".
    Returns:
        numpy array: Predictions of all batches concatenated on the first axis;
            an empty array when the dataset has no sample.
    """
    model.eval()

    # Follow the model instead of assuming it lives on the gpu; models without
    # parameters keep the previous behavior (cuda).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True,
    )

    preds = []
    with torch.no_grad():
        for data in loader:
            # TODO : trimming the padding (trim_tensors) doesn't work with concat
            ids = data["ids"].to(device)
            token_type_ids = data["token_type_ids"].to(device)

            y_pred = model(ids, token_type_ids)

            if activation == "sigmoid":
                y_pred = torch.sigmoid(y_pred)
            elif activation == "softmax":
                y_pred = torch.softmax(y_pred, 2)

            preds.append(y_pred.detach().cpu().numpy())

    # np.concatenate raises on an empty list (article without any chunk).
    if not preds:
        return np.zeros((0,), dtype=np.float32)

    return np.concatenate(preds)

## Predicted strings from probas

In [None]:
def get_predicted_string(text, offset, pred, threshold=0.5):
    """
    Extracts the spans of consecutive tokens predicted above the threshold.
    The first and last positions of pred are skipped. A space is appended
    after a token when there is a gap before the next token's offset.
    Args:
        text (str): Text the offsets refer to.
        offset (sequence of (start, end)): Character span of each token.
        pred (sequence of floats): Probability of each token.
        threshold (float, optional): Tokens strictly above it are kept. Defaults to 0.5.
    Returns:
        list of str: Extracted strings.
        list of lists: Token probabilities of each string.
        list of ints: Start character of each string; same length as the two other lists.
    """
    text_preds, text_probs, starts = [], [], []
    pred_str, proba, start = "", [], None

    for j in range(1, len(pred) - 1):
        if pred[j] > threshold:
            tok_start, tok_end = offset[j][0], offset[j][1]
            pred_str += text[tok_start:tok_end]
            if start is None:
                start = tok_start
            proba.append(pred[j])
            if j + 1 < len(offset) and tok_end < offset[j + 1][0]:
                pred_str += " "

        elif pred_str:
            # Any token not above the threshold (ties included) closes the span.
            text_preds.append(pred_str)
            text_probs.append(proba)
            starts.append(start)
            pred_str, proba, start = "", [], None

    if pred_str:
        # Span still open at the end: its start must be recorded as well,
        # otherwise callers zipping the three lists silently lose it.
        text_preds.append(pred_str)
        text_probs.append(proba)
        starts.append(start)

    return text_preds, text_probs, starts


def get_pred_from_probas(dataset, probas, threshold=0.5, min_confidence=0.9):
    """
    Converts token probabilities into predicted strings for every sample of a dataset.
    Spans cut in the middle of a word lose the partial word; spans with fewer
    than 2 words, fewer than 5 characters or a maximum probability below
    min_confidence are discarded.
    Args:
        dataset (torch dataset): Dataset whose items have the keys "offsets" and "text".
        probas (sequence): Token probabilities, one entry per sample of the dataset.
        threshold (float, optional): Token threshold given to get_predicted_string. Defaults to 0.5.
        min_confidence (float, optional): Minimum value of the highest token probability of a span. Defaults to 0.9.
    Returns:
        list of str: Predicted strings over the whole dataset.
    """
    predicted_strings = []
    for i in range(len(dataset)):
        data = dataset[i]
        pred = probas[i]
        offset = data["offsets"].cpu().numpy().astype(int)
        text = data["text"]

        # The threshold argument is forwarded (it used to be hard-coded to 0.5 here).
        text_preds, text_probs, starts = get_predicted_string(
            text, offset, pred, threshold=threshold
        )

        preds_pp = []
        for text_pred, text_prob, start in zip(text_preds, text_probs, starts):
            new_pred = text_pred

            # Span starting inside a word : drop its first (partial) word
            if start > 0:
                if text[start - 1] != " " and text[start] != " ":
                    new_pred = " ".join(text_pred.split(" ")[1:])

            # Span ending inside a word : drop its last (partial) word
            end = start + len(text_pred)
            if end < len(text):
                if text[end] != " " and text[end - 1] != " ":
                    new_pred = " ".join(new_pred.split(" ")[:-1])

            # Way too short
            if len(new_pred.split(" ")) < 2:
                continue
            elif len(new_pred) < 5:
                continue

            # Low confidence
            if np.max(text_prob) < min_confidence:
                continue

            preds_pp.append(new_pred.strip())

        predicted_strings += preds_pp

    return predicted_strings

## Post-processing

### Overlap

In [None]:
# Geographic vocabularies, normalized with clean_text so that they compare
# equal to cleaned predictions.
nationalities = [clean_text(txt) for txt in pd.read_csv('../input/coleridge-resources/nationalities.csv')['Nationality'].values]
countries = [clean_text(txt) for txt in pd.read_csv('../input/coleridge-resources/countries.csv')['Name'].values]
us_states = [clean_text(txt) for txt in pd.read_csv('../input/coleridge-resources/us_states.csv')['State'].values]

us_states_abv = [clean_text(txt) for txt in pd.read_csv('../input/coleridge-resources/us_states.csv')['Abbreviation'].values]
# Also accept the abbreviation with a space between its first two letters
# (only the first two characters are used) - presumably to match cleaned forms such as "N.Y.".
us_states_abv += [a[0] + ' ' + a[1] for a in us_states_abv]

# Words that may follow a shorter prediction inside a longer one (see post_process_overlap).
SUFFIXES = [
    'data', 'dataset', 'data set', 'data sets', 'datasets', 'database', 'databases', 'catalog', 'catalogs',
    'survey', 'surveys', 'study', 'studies', 'census', 'program', 'programs', 'assessment', 'assessments',
    'registry', 'registries', 'list', 'lists', 'network', 'networks', 'archive', 'archives'
]

# Words that may precede a shorter prediction inside a longer one (see post_process_overlap).
PREFIXES = nationalities + countries + us_states + us_states_abv + [
    'international', 'national', 'european', 'europe', 'american', 'america', 'african', 'africa', 'asian', 'asia', 
    'u s', 'us', 'uk', 'u k']


def post_process_overlap(preds, text):
    """
    Resolves predictions that are contained in another prediction.
    For every pair where preds[i] is a substring of preds[j]:
      - same number of occurrences in the text : i is dropped, j is kept ;
      - j only adds a known suffix and / or a known prefix to i : both are kept ;
      - what follows i in j is shorter than 4 words and ends with a known suffix : i is dropped ;
      - otherwise j is dropped.
    Args:
        preds (list of str): Predictions.
        text (str): Text the occurrences are counted in.
    Returns:
        list of str: Remaining predictions, in their original order.
    """
    dropped = set()
    n_preds = len(preds)

    for i in range(n_preds):
        for j in range(n_preds):
            if i == j or preds[i] not in preds[j]:
                continue

            inner, outer = preds[i], preds[j]

            # inner only ever appears as part of outer
            if text.count(inner) == text.count(outer):
                dropped.add(i)
                continue

            position = outer.find(inner)
            before = outer[:position].strip()
            after = outer[position + len(inner):].strip()

            keep_both = (
                (after in SUFFIXES and not before)
                or (before in PREFIXES and not after)
                or (after in SUFFIXES and before in PREFIXES)
            )
            if keep_both:
                continue

            after_words = after.split(' ')
            if len(after_words) < 4 and after_words[-1] in SUFFIXES:
                dropped.add(i)
            else:
                dropped.add(j)

    return [p for idx, p in enumerate(preds) if idx not in dropped]

### Fuzzywuzzy

In [None]:
# Dataset names of the training set : cleaned titles, cleaned labels and the provided cleaned labels.
df = pd.read_csv(DATA_PATH + 'train.csv')
original_datasets = set(
    list(df['dataset_title'].apply(clean_text)) +
    list(df['dataset_label'].apply(clean_text)) +
    list(df['cleaned_label'])
)

# Cleaned titles of an external list of datasets.
# extra_datasets = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')['title'].values.tolist()
extra_datasets = pd.read_csv('../input/coleridge-resources/datasets_100k.csv')['title'].values.tolist()
extra_datasets = list(dict.fromkeys([clean_text(d) for d in extra_datasets]))
extra_datasets = set(extra_datasets)

# Candidate labels, keeping only the rows whose 'status' and 'status_sure' columns are both 'y'.
candidates = pd.read_csv("../input/coleridge-resources/candidates_done_2.csv", sep=";").dropna()
candidates = candidates[candidates['status'] == 'y']
candidates_datasets = candidates[candidates['status_sure'] == 'y']['label']
candidates_datasets = set([clean_text(d) for d in candidates_datasets])

# Union of the three sources; used as reference names by the fuzzy matching in post_process.
DATASETS  = list(original_datasets.union(extra_datasets).union(candidates_datasets))

print(f'Retrieved {len(DATASETS)} datasets')

In [None]:
from fuzzywuzzy import fuzz

# Caches of the predictions already checked by post_process_fuzz, shared across calls.
# NOTE: the caches ignore the `datasets` and `threshold` a prediction was checked with.
IS_OK = []
IS_NOT_OK = []


def post_process_fuzz(all_preds, datasets, threshold=60):
    """
    Keeps the predictions that are similar enough to a known dataset name.
    Args:
        all_preds (list of lists of str): Predictions, one list per document.
        datasets (list of str): Reference dataset names.
        threshold (int, optional): A prediction is kept when its fuzz.ratio with
            one of the datasets is strictly above this value. Defaults to 60.
    Returns:
        list of lists of str: Filtered predictions, one list per document.
    """
    all_preds_pp = []

    for preds in all_preds:
        preds_pp = []
        for pred in preds:
            # A cache hit only settles this prediction; the remaining ones of
            # the list still have to be processed (hence continue, not break).
            if pred in IS_OK:
                preds_pp.append(pred)
                continue

            if pred in IS_NOT_OK:
                continue

            if any(fuzz.ratio(dataset, pred) > threshold for dataset in datasets):
                preds_pp.append(pred)
                IS_OK.append(pred)
            else:
                IS_NOT_OK.append(pred)

        all_preds_pp.append(preds_pp)
    return all_preds_pp

### Others

In [None]:
# A prediction is kept by post_process_indicators only if its cleaned text
# contains at least one of these words (substring match).
KEYWORDS = [
    'initiative',
    "data",
    "dataset",
    "database",
    "catalog",
    "survey",
    "study",
    "studies",
    "census",
    "program",
    'assessment',
    "registry",
    "registries",
    "list",
    "index",
    'archive',
    'inventory',
    'inventories',
    'network',
    'model',
    'system',
    'report',
]

def post_process_indicators(preds):
    """
    Keeps the predictions whose cleaned text contains at least one of the KEYWORDS.
    Args:
        preds (list of lists of str): Predictions, one list per document.
    Returns:
        list of lists of str: Filtered predictions, one list per document.
    """
    def has_keyword(candidate):
        cleaned = clean_text(candidate)
        return any(keyword in cleaned for keyword in KEYWORDS)

    return [[p for p in pred if has_keyword(p)] for pred in preds]


def post_process_len(preds, min_len=10):
    """
    Removes the predictions that are not strictly longer than min_len characters.
    Args:
        preds (list of lists of str): Predictions, one list per document.
        min_len (int, optional): Length a prediction has to exceed. Defaults to 10.
    Returns:
        list of lists of str: Filtered predictions, one list per document.
    """
    return [
        [candidate for candidate in pred if len(candidate) > min_len]
        for pred in preds
    ]

### Overall

In [None]:
def post_process(preds, text, preds_matching=None, min_len=10):
    """
    Full post-processing of the predictions of one document.
    Predictions are cleaned, filtered on their length and on the presence of
    a keyword, merged with the string-matching predictions, deduplicated,
    filtered by fuzzy matching against DATASETS and finally de-overlapped.
    Args:
        preds (list of str): Model predictions for the document.
        text (str): Cleaned text of the document.
        preds_matching (list of str or None, optional): Predictions obtained by string
            matching, added after the length and keyword filters. Defaults to None.
        min_len (int, optional): Predictions must be strictly longer than this. Defaults to 10.
    Returns:
        str: Remaining predictions joined with '|'.
    """
    preds = [clean_text(pred) for pred in preds]
    # min_len is forwarded (it used to be ignored in favor of the default of post_process_len).
    preds = post_process_len([preds], min_len=min_len)[0]

    preds = post_process_indicators([preds])[0]

    preds = preds + list(preds_matching or [])
    preds = np.unique(preds)

    preds = post_process_fuzz([preds], DATASETS, threshold=90)[0]

    preds = post_process_overlap(preds, text)

    return '|'.join(preds)

## $k$-fold

In [None]:
def k_fold_inference(config, df, weights, root="", min_confidence=0.5, min_len=5):
    """
    Predicts the dataset mentions of every article with an ensemble of models.
    One model is built and loaded per checkpoint; for each article the token
    probabilities of all the models are averaged, converted to strings and
    post-processed.
    Args:
        config (Config): Experiment config (selected_model, nb_layers, nb_ft, conv_kernel,
            drop_p, multi_sample_dropout, max_len, val_bs, activation, lowercase).
        df (pandas dataframe): Dataframe with the article ids in its "Id" column.
        weights (list of str): Paths to the checkpoints to ensemble.
        root (str, optional): Folder containing the article json files. Defaults to "".
        min_confidence (float, optional): Forwarded to get_pred_from_probas. Defaults to 0.5.
        min_len (int, optional): Forwarded to post_process. Defaults to 5.
    Returns:
        list of str: One '|'-separated prediction string per article.
    """
    tokenizer, tokens = create_tokenizer_and_tokens(config)

    models = []
    for w in weights:
        model = NERTransformer(
            config.selected_model,
            nb_layers=config.nb_layers,
            nb_ft=config.nb_ft,
            k=config.conv_kernel,
            drop_p=config.drop_p,
            multi_sample_dropout=config.multi_sample_dropout,
        ).cuda()
        model.zero_grad()
        load_model_weights(model, w)
        models.append(model)

    all_preds = []
    for text_id in tqdm(df["Id"]):

        dataset = ArticleDataset(
            text_id,
            tokenizer,
            tokens,
            max_len=config.max_len,
            model_name=config.selected_model,
            root=root,
        )

        # Cleaned full text of the article, used to count occurrences in post_process.
        text = clean_text(''.join(section['text'] + ' ' for section in dataset.article))

        # Ensemble : average of the probabilities of the models.
        probas = np.mean(
            [
                predict(model, dataset, batch_size=config.val_bs, activation=config.activation)
                for model in models
            ],
            0,
        )

        preds = get_pred_from_probas(dataset, probas, min_confidence=min_confidence)

        # String matching against a list of known datasets is disabled, so no
        # matching prediction is added (the list it needed is no longer loaded).
        preds_matching = []

        preds = post_process(preds, text, preds_matching, min_len=min_len)

        all_preds.append(preds)

    return all_preds

# Main

In [None]:
# Folder of the experiment to run inference with : it holds the checkpoints
# (*.pt) and the config.json read in the next cells. Other experiments are kept commented out.
# EXP_FOLDER = "../input/coleridge-cp/roberta_04-06_12/roberta_04-06_12/"
# EXP_FOLDER = "../input/coleridge-cp/roberta_07-06_0/roberta_07-06_0/"
# EXP_FOLDER = "../input/coleridge-cp/roberta_29-05_0/roberta_29-05_0/"
EXP_FOLDER = "../input/coleridge-cp/roberta_09-06_2/roberta_09-06_2/"
# EXP_FOLDER = "../input/coleridge-cp/roberta_12-06_2/roberta_12-06_2/"

In [None]:
# All the checkpoints of the experiment folder, in sorted order; each one becomes a model of the ensemble.
weights = sorted(glob.glob(EXP_FOLDER + "*.pt"))

# weights = weights[:1]
print(f" -> Found {len(weights)} models")

In [None]:
# Config saved with the experiment; the file is closed as soon as it is parsed.
with open(EXP_FOLDER + "config.json", 'r') as f:
    config = Config(json.load(f))

In [None]:
# Articles to predict. This rebinds `df`, which previously held train.csv.
df = pd.read_csv(DATA_PATH + "sample_submission.csv")

In [None]:
# Passed to k_fold_inference below as min_len and min_confidence.
MIN_LEN = 10
MIN_CONFIDENCE = 0.9

In [None]:
# Run the ensemble on the test articles.
preds = k_fold_inference(
    config,
    df,
    weights,
    root=DATA_PATH_TEST,
    min_len=MIN_LEN,
    min_confidence=MIN_CONFIDENCE,
)

# One '|'-separated prediction string per article, written as the submission file.
df['PredictionString'] = preds
df.to_csv('submission.csv', index=False)
df.head()

### Matching

In [None]:
def read_json_pub(filename, data_path="", output='text'):
    """
    Reads the json file of a publication and joins the text of its sections.
    Args:
        filename (str): File name without the ".json" extension.
        data_path (str, optional): Folder containing the file. Defaults to "".
        output (str, optional): Unused, kept for compatibility. Defaults to 'text'.
    Returns:
        str: Texts of the sections joined with spaces.
    """
    json_path = os.path.join(data_path, (filename + '.json'))
    with open(json_path, 'r', encoding='utf-8') as f:
        sections = json.load(f)

    # Sections without a 'text' entry are skipped : str.join fails on None.
    texts = (section.get('text') for section in sections)
    return ' '.join(t for t in texts if t is not None)

def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special characters, emojis and multiple spaces
    text - Sentence that needs to be cleaned
    '''
    # Every run of characters other than ascii letters and digits (emojis
    # included) becomes one space, so no separate emoji or multiple-space
    # pass is needed afterwards.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

def clean_text(txt):
    """
    Lowercases the input and replaces every run of characters other than
    ascii letters and digits with a single space, then strips the ends.
    """
    lowered = str(txt).lower()
    spaced = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return spaced.strip()

In [None]:
# datasets = pd.read_csv('../input/bigger-govt-dataset-list/data_set_800.csv')['title'].values
# sample_sub = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

# path = '../input/coleridgeinitiative-show-us-the-data/test'

# submission = pd.DataFrame(columns=["Id", "PredictionString"])

# for idx, row in sample_sub.iterrows():
#     to_append = [row['Id'], '']
    
#     text = read_json_pub(row['Id'], path)
#     clean_string = text_cleaning(text)
    
#     for query_string in datasets:
#         if query_string in clean_string:
#             if to_append[1] != '' and clean_text(query_string) not in to_append[1]:
#                 to_append[1] = to_append[1] + '|' + clean_text(query_string)
#             if to_append[1] == '':
#                 to_append[1] = clean_text(query_string)
    
#     # Complete with predictions
#     if len(df['PredictionString'][idx]):
#         for query_string in df['PredictionString'][idx].split('|'):
#             already_found = False

#             matched = to_append[1].split('|')
#             for match in matched:
#                 if match in query_string or query_string in match:

#                     already_found = True

#             if not already_found:
#                 to_append[1] = to_append[1] + '|' + query_string
        
#     # Replace only if empty
# #     if not len(to_append[1]): # if none found, use model pred
# #         to_append[1] = df['PredictionString'][idx]

#     submission.loc[idx] = to_append

# submission.to_csv('submission.csv', index=False)
# submission.head()