In [2]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
import json

In [3]:
def load_data(file):
    """Parse the JSON document stored at ``file`` and return the result.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

In [4]:
def save_data(file, data):
    """Serialise ``data`` as JSON with a 4-space indent into ``file``.

    The file is opened for writing as UTF-8, so any existing content is
    replaced.
    """
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [10]:
def create_training_data(file, type):
    """Turn the entries of a JSON file into labelled pattern dicts.

    Loads ``file`` via ``load_data``, iterates over the value stored under its
    ``"datasets"`` key, and returns a list with one
    ``{"label": type, "pattern": item}`` dict per item, in iteration order.

    Note: the ``type`` parameter shadows the builtin of the same name inside
    this function; the name is kept because callers may pass it by keyword.
    """
    entries = load_data(file)["datasets"]
    return [{"label": type, "pattern": entry} for entry in entries]

In [15]:
def generate_rules(patterns, output_dir="difa_ner"):
    """Create a rule-based NER pipeline from ``patterns`` and save it to disk.

    Args:
        patterns: Pattern dicts handed to the entity ruler's ``add_patterns``.
        output_dir: Directory the pipeline is written to. Defaults to
            ``"difa_ner"``, the location that used to be hard-coded here.

    Returns:
        None. The only result is the pipeline serialised under ``output_dir``.
    """
    # Start from a blank English pipeline (this is not the pretrained
    # small English model).
    nlp = spacy.blank("en")
    # Add an entity ruler component and register the patterns with it.
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    nlp.to_disk(output_dir)

In [22]:
def test_model(model, text):
    doc = model(text)
    results = []
    for ent in doc.ents:
        results.append(ent.text)
    return (results)

In [23]:
# Build one "DATASET" pattern per entry in the JSON file, then create the
# rule-based pipeline from those patterns.
patterns = create_training_data(file="data/difa_datasets.json", type="DATASET")
generate_rules(patterns=patterns)

In [24]:
# Load the pipeline from the "difa_ner" directory.
nlp = spacy.load("difa_ner")
ie_data = {}
# Sample sentences to run through the pipeline.
text_data = [
    "This study uses the National Household Food Acquisition and Purchase Survey data set to examine the effect of reporting error on food-related outcomes.",
    "This research uses the Survey of Consumer Finances data set to analyze the relationship between income and wealth.",
    "The data used in this study is from the American Community Survey.",
    "In conjunction with medical claims data, we use data from SafeGraph",
]

In [25]:
# Show the entities the pipeline finds in each sample sentence.
for text in text_data:
    results = test_model(nlp, text)
    # Print the entity list itself; type(results) would only ever print
    # <class 'list'>.
    print(results)

['National Household Food Acquisition and Purchase Survey']
['Survey of Consumer Finances']
['American Community Survey']
['SafeGraph']
