# This notebook prototypes masking organizations/proper nouns/dataset names for training a binary classifier

In [1]:
import json

import pandas as pd
import spacy
from spacy import displacy

# Load the spaCy pipeline once; later cells call `nlp(...)` on document text.
nlp = spacy.load("en_core_web_trf")

In [2]:
# Kaggle training-label table; the path is relative to the notebook's working directory.
training_labels = pd.read_csv("../ml_pipeline/data/kaggle/train.csv")

In [3]:
# Sanity check: column names first, then the number of label rows.
print(training_labels.columns, len(training_labels), sep="\n")

Index(['Id', 'pub_title', 'dataset_title', 'dataset_label', 'cleaned_label'], dtype='object')
19661


In [4]:
# Rows where `dataset_title` and `dataset_label` are not the same string.
training_labels.loc[
    training_labels["dataset_title"].ne(training_labels["dataset_label"]),
    ["Id", "dataset_title", "dataset_label"],
]

Unnamed: 0,Id,dataset_title,dataset_label
293,f70051bf-a763-415b-aa66-97ae57f2efc1,NOAA Tide Gauge,NOAA tidal station
294,0d4e13ca-47ec-4827-b814-a39e5b8fede3,NOAA Tide Gauge,NOAA tidal station
295,c5cf06e5-182f-4c33-bf15-e06a0d353efd,NOAA Tide Gauge,NOAA tidal station
296,da25e497-208d-4ed5-9c51-37c69a5524d3,NOAA Tide Gauge,NOAA tidal station
297,50d6879b-1c6b-4434-965e-19a7271e8c49,NOAA Tide Gauge,NOAA tidal station
...,...,...,...
19654,f89dd9fa-07af-4384-aa0c-0d14602c0cea,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID-19 Open Radiology Dat...
19655,922a5f2c-2d1c-46a7-a07a-acaf2222c0c6,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID-19 Open Radiology Dat...
19656,b3498176-8832-4033-aea6-b5ea85ea04c4,RSNA International COVID-19 Open Radiology Dat...,RSNA International COVID Open Radiology Database
19659,fd23e7e0-a5d2-4f98-992d-9209c85153bb,CAS COVID-19 antiviral candidate compounds dat...,CAS COVID-19 antiviral candidate compounds dat...


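Presumably `dataset_label` is the string as it literally appears in the publication text, even where it differs from `dataset_title`; the cells below check this for one document.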

In [6]:
def get_document_by_id(id):
    """Load the parsed JSON of one training publication.

    Args:
        id: Publication identifier; used as the file stem under
            ``../ml_pipeline/data/kaggle/train/``.

    Returns:
        Whatever ``json.load`` produces for that file. Other helpers in this
        notebook iterate it as a sequence of sections with a ``"text"`` key.

    Raises:
        FileNotFoundError: If there is no ``{id}.json`` in the training folder.
    """
    # Be explicit about the encoding: without it, open() falls back to the
    # platform's locale encoding and non-ASCII text can be decoded wrongly
    # (or raise) on machines whose default is not UTF-8.
    with open(f"../ml_pipeline/data/kaggle/train/{id}.json", "r", encoding="utf-8") as f:
        return json.load(f)

def get_document_text_by_id(id):
    """Return the whole publication as one string.

    The ``"text"`` field of every section returned by ``get_document_by_id``
    is concatenated in order, separated by single spaces.
    """
    sections = get_document_by_id(id)
    section_texts = (section["text"] for section in sections)
    return " ".join(section_texts)

def get_document_label_by_id(id):
    """Return the ``dataset_label`` values of every label row whose ``Id`` equals ``id``.

    The result is the ``.values`` array of the selection, so it is empty when
    no row matches and holds several entries when the Id has several rows.
    """
    matches_id = training_labels["Id"] == id
    labels = training_labels.loc[matches_id, "dataset_label"]
    return labels.values

def get_document_and_label_by_id(id):
    """Return ``(text, labels)`` for one publication id.

    ``text`` comes from ``get_document_text_by_id`` and ``labels`` from
    ``get_document_label_by_id``; the text is fetched first.
    """
    text = get_document_text_by_id(id)
    labels = get_document_label_by_id(id)
    return text, labels

In [17]:
# Example publication: the first row shown in the title/label mismatch table above.
# `doc` is the joined text (a str); `lbl` is the array of its dataset labels.
doc, lbl = get_document_and_label_by_id("f70051bf-a763-415b-aa66-97ae57f2efc1")
# Run the spaCy pipeline over the text; later cells iterate `processed_doc.sents`.
processed_doc = nlp(doc)

In [22]:
# Display the label array of the example publication.
lbl

array(['NOAA tidal station'], dtype=object)

In [40]:
def label_in_snippet(snippet:str, processed_label) -> bool:
    """Tell whether a dataset label is mentioned verbatim in ``snippet``.

    Matching is a case-sensitive substring test, tried in this order:

    1. ``"<expansion> (<acronym>)"`` (just the expansion when there is no acronym),
    2. the expansion alone,
    3. the acronym alone.

    Empty components are never used as a search string: ``"" in snippet`` is
    always True, so an empty expansion would otherwise flag every snippet.

    Args:
        snippet: Text to search.
        processed_label: ``(expansion, acronym)`` pair; ``acronym`` may be
            empty when the label has none.

    Returns:
        True as soon as one of the candidates is found, else False.

    Side effects:
        Prints the name of the candidate that matched (``complete_label``,
        ``expansion`` or ``acronym``) for debugging.
    """
    # This is pretty much the goal of the whole project, so it's a bit tricky

    expansion, acronym = processed_label
    if acronym:
        complete_label = expansion + " (" + acronym + ")"
    else:
        complete_label = expansion

    # Most specific candidate first; the tag is what gets printed on a hit.
    candidates = (
        ("complete_label", complete_label),
        ("expansion", expansion),
        ("acronym", acronym),
    )
    for tag, needle in candidates:
        # Skip empty needles, which would trivially match any snippet.
        if needle and needle in snippet:
            print(tag)
            return True

    return False


def split_document_into_snippets(
    document,
    n_sentences_per_snippet:int,
):
    """Split a document into snippets of consecutive sentences.

    Args:
        document: ``(text, label)`` pair, e.g. the tuple returned by
            ``get_document_and_label_by_id``. Only the text is used here.
        n_sentences_per_snippet: Number of sentences per snippet; the last
            snippet may hold fewer.

    Returns:
        List of snippet strings. Each snippet is the stripped text of its
        sentences joined with single spaces (the same form the inline
        snippet-building cell in this notebook produces).

    Raises:
        ValueError: If ``n_sentences_per_snippet`` is smaller than 1.
    """
    if n_sentences_per_snippet < 1:
        raise ValueError("n_sentences_per_snippet must be at least 1")

    text, _label = document

    processed = nlp(text)

    # `Doc.sents` is a generator: materialise it so it can be measured and sliced.
    sentences = [sent.text.strip() for sent in processed.sents]

    return [
        " ".join(sentences[start:start + n_sentences_per_snippet])
        for start in range(0, len(sentences), n_sentences_per_snippet)
    ]

In [20]:

# Group consecutive sentences of the example document into fixed-size snippets.
n_sentences_per_snippet = 3
snippets = []
tmp_snippet = []
for i, s in enumerate(processed_doc.sents):
    # Every n-th sentence starts a new snippet: emit the group collected so far.
    if i % n_sentences_per_snippet == 0:
        if tmp_snippet:
            snippets.append(" ".join(tmp_snippet))
        tmp_snippet = []
    tmp_snippet.append(s.text.strip())

# Flush the trailing group; the loop only emits a group when the next one
# starts, so without this the last sentences of the document are dropped.
if tmp_snippet:
    snippets.append(" ".join(tmp_snippet))

In [45]:
# Print every snippet in which the first label of the example document is found.
print(f"----{lbl[0]}----")
for snippet in snippets:
    # The label is passed as an (expansion, acronym) pair with an empty acronym.
    modified_lbl = (lbl[0], "")
    found = label_in_snippet(snippet, modified_lbl)
    if not found:
        continue
    print(snippet)
    print("")

----NOAA tidal station----
complete_label
The multi-parameter sonde was used to collect synoptic vertical profile of water quality within the screened monitoring well intervals by lowering the sensor to the bottom and stopping at pre-determined intervals until parameters stabilized. Verification of water-column height above a transducer was done by collecting a depth to water less than 1 minute before the scheduled sample collection. The datum used in this report is Port Townsend Bay Mean Lower Low Water (MLLW) tidal level at NOAA tidal station 9444900 (National Oceanic and Atmospheric Administration, 2018).

complete_label
In 2012, the Navy contracted a survey of the monitoring wells (top-of-casing) and two control points, CP1 and CP2, referencing mean sea level (MSL). CP2 survey information states that vertical control was derived from photogrammetric survey control referencing Port Townsend Bay MLLW tidal level at NOAA tidal station 9444900 (Sealaska Environmental, 2012) . The diffe