In [2]:
import spacy
import json
import random
import pandas as pd

from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.training.example import Example

In [3]:
def load_data(file):
    """Parse the UTF-8 encoded JSON document stored at ``file``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON content (whatever top-level value the file holds).
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    parsed = json.loads(raw_text)
    return parsed

In [4]:
def save_data(file, data):
    """Write ``data`` to ``file`` as UTF-8 JSON indented by four spaces.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any JSON-serialisable object.

    Raises:
        TypeError, ValueError: If ``data`` cannot be encoded as JSON. In that
            case the destination file is left untouched.
    """
    # Encode first: open(..., "w") truncates the target immediately, so
    # serialising inside the `with` block would wipe (or half-overwrite) an
    # existing file whenever encoding fails part-way through.
    serialised = json.dumps(data, indent=4)
    with open(file, "w", encoding="utf-8") as f:
        f.write(serialised)

In [5]:
def create_training_data(file, type):
    """Turn the names listed in a JSON file into labelled pattern dicts.

    Args:
        file: Path to a JSON file whose top-level "datasets" entry is an
            iterable of names (each must support ``.lower()``).
        type: Entity label stored under "label" in every pattern.

    Returns:
        A list of ``{"label": type, "pattern": <lower-cased name>}`` dicts,
        in the same order as the names appear in the file.
    """
    dataset_names = load_data(file)["datasets"]
    return [
        {"label": type, "pattern": name.lower()}
        for name in dataset_names
    ]

In [6]:
def generate_rules(patterns, output_dir="difa_ner"):
    """Build a rule-based entity pipeline from ``patterns`` and save it to disk.

    Args:
        patterns: Pattern dicts accepted by the entity ruler's
            ``add_patterns`` (e.g. ``{"label": ..., "pattern": ...}``).
        output_dir: Directory the pipeline is written to. Defaults to
            "difa_ner", the location used before this parameter existed.

    Returns:
        None. The pipeline is only persisted to ``output_dir``.
    """
    # Start from a blank English pipeline; no pretrained model is loaded here.
    nlp = spacy.blank("en")
    # The entity ruler is the only component; it tags spans matching the patterns.
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    nlp.to_disk(output_dir)

In [7]:
def test_model(model, text):
    doc = model(text)
    res = []
    entities = []
    for ent in doc.ents:
        entities.append((ent.start_char, ent.end_char, ent.label_))
        res.append(ent.text)
    print("RES:",text,'-->',res)
    if len(entities) > 0:
        results = [text, {"entities": entities}]
        return results

In [8]:
# Convert the curated dataset names into "DATASET" patterns, then build and
# save the rule-based pipeline from them.
patterns = create_training_data(file="data/difa_datasets.json", type="DATASET")
generate_rules(patterns=patterns)


In [9]:
# Load the pipeline stored under "difa_ner".
nlp = spacy.load("difa_ner")
# The sentences to annotate come from the "text" column of the spreadsheet.
dataframe = pd.read_excel('data/Data Text in Papers.xlsx')
text_data = dataframe["text"].tolist()
# Container for the automatically labelled training examples.
TRAIN_DATA = list()
# Hand-written sample sentences kept for quick experiments; uncomment to use
# them instead of the spreadsheet contents.
# text_data = ["This study uses the National Household Food Acquisition and Purchase Survey data set to examine the effect of reporting error on food-related outcomes.",
#              "This research uses the Survey of Consumer Finances data set to analyze the relationship between income and wealth.",
#              "The data used in this study is from the American Community Survey.",
#              "In conjunction with medical claims data, we use data from SafeGraph.",
#              "We have used FoodAPS, ACS and Irish National Farm Survey datasets in this paper."]

In [10]:
# Start from an empty list every time so that re-running this cell does not
# append a second copy of every example.
TRAIN_DATA = []
for text in text_data:
    # Blank spreadsheet cells are read by pandas as NaN (a float), which has
    # no .lower(); skip anything that is not a string.
    if not isinstance(text, str):
        continue
    # Matching is done on lower-cased text, so the stored character offsets
    # refer to the lower-cased sentence that is saved alongside them.
    results = test_model(nlp, text.lower())
    if results is not None:
        TRAIN_DATA.append(results)
print(TRAIN_DATA)
save_data('data/difa_training_data.json',TRAIN_DATA)

RES: the administrative database of the paying agency contains the land use data and limited livestock numbers for all farms receiving direct payments. --> ['administrative database of the paying agency']
RES: the fadn database contains a large accountancy dataset for a low number of farms. --> ['fadn']
RES: the income data is provided by the dairy farms in fadn sample in the larger region (sample). --> ['fadn']
RES: for each farm in the population of the municipality the three most similar farms in the fadn sample in the larger region are selected. --> ['fadn']
RES: kelly (2004) works out a method for re-weighting fadn sample farms to micro-regional census data. --> ['fadn']
RES: the two most important data input for the model are the irish national farm survey (fadn farms) and the irish census of agriculture. --> ['irish national farm survey', 'fadn', 'irish census of agriculture']
RES: the income statistics either cannot be disaggregated to micro-regional level (agricultural account

In [11]:
def train_spacy(data, iterations):
    """Train a blank English pipeline with a single NER component on ``data``.

    Prints the pipeline's component names once, then the iteration number and
    the accumulated losses for every pass.

    Args:
        data: Sequence of ``(text, annotations)`` pairs where ``annotations``
            is a dict whose "entities" entry lists ``(start, end, label)``
            items. The sequence is copied, so the caller's list keeps its
            original order.
        iterations: Number of full passes over the data.

    Returns:
        The trained pipeline object.
    """
    # Shallow copy: random.shuffle reorders in place and must not scramble
    # the list the caller handed in.
    examples = list(data)
    nlp = spacy.blank("en")
    if "ner" not in nlp.pipe_names:
        ner = nlp.add_pipe("ner", last=True)
    else:
        # Keep `ner` bound even if the pipeline already has the component.
        ner = nlp.get_pipe("ner")
    print(nlp.pipe_names)
    # Register every label that occurs in the annotations before training.
    for _, annotations in examples:
        # `or []` tolerates a missing or null "entities" entry.
        for ent in annotations.get("entities") or []:
            ner.add_label(ent[2])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    # select_pipes / initialize are the spaCy v3 names for the deprecated
    # disable_pipes / begin_training.
    with nlp.select_pipes(disable=other_pipes):
        optimizer = nlp.initialize()
        for itn in range(iterations):
            print ("Starting iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            # One update per example (batch size 1) with 20% dropout.
            for text, annotations in examples:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update(
                            [example],
                            drop=0.2,
                            sgd=optimizer,
                            losses=losses
                )
            print (losses)
    return (nlp)

In [12]:
# Reload the saved training examples from disk, train for 30 passes over
# them, and persist the resulting pipeline.
TRAIN_DATA = load_data("data/difa_training_data.json")
nlp = train_spacy(data=TRAIN_DATA, iterations=30)
nlp.to_disk("difa_trained_ner_model")

['ner']
Starting iteration 0
{'ner': 322.6007955793981}
Starting iteration 1
{'ner': 38.31254938405427}
Starting iteration 2
{'ner': 24.19563965574363}
Starting iteration 3
{'ner': 39.036004620678014}
Starting iteration 4
{'ner': 17.30791372912674}
Starting iteration 5
{'ner': 13.82068746366809}
Starting iteration 6
{'ner': 7.78678392241713}
Starting iteration 7
{'ner': 10.528758106542348}
Starting iteration 8
{'ner': 6.378180382224752}
Starting iteration 9
{'ner': 5.092725759954292}
Starting iteration 10
{'ner': 4.197315811023526}
Starting iteration 11
{'ner': 4.528735097782598}
Starting iteration 12
{'ner': 4.614722842773164}
Starting iteration 13
{'ner': 1.994908606616367}
Starting iteration 14
{'ner': 3.07988773333658}
Starting iteration 15
{'ner': 5.9886547414775535}
Starting iteration 16
{'ner': 0.15393110424860992}
Starting iteration 17
{'ner': 7.039495447728945}
Starting iteration 18
{'ner': 2.05385400199866}
Starting iteration 19
{'ner': 6.669104655601258}
Starting iteration 2

In [15]:
# Sentence used for a quick manual check of the trained pipeline.
test_data = "We have used National Household Food Acquisition and Purchase Survey and American Community Survey datasets in this paper."
# Alternative sentences; uncomment one to try it instead.
# test_data = "The two most important data input for the model are the Irish National Farm Survey (FADN farms) and the Irish Census of Agriculture."
# test_data = "At first, we thought of using FoodAPS but then finally we used ACS dataset."
# Load the pipeline saved under "difa_trained_ner_model" and run it on the sentence.
nlp = spacy.load("difa_trained_ner_model")
doc = nlp(test_data)
# Print every recognised span together with its label.
for ent in doc.ents:
    print(ent.text,'-->',ent.label_)

American Community Survey --> DATASET
