# NER inference

This notebook runs inference with the NER model on the list of pages being trialled for the new whole-user-journey approaches.

Before you start, make sure the notebook runtime has a GPU attached: the model is loaded onto a CUDA device.

In [None]:
# Install transformers, datasets and seqeval into the runtime.
# `>/dev/null` discards pip's standard output, so only messages written to stderr are shown.
!pip install transformers datasets seqeval >/dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import torch
import json
import pandas as pd
import spacy
from spacy import displacy
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import os
from google.colab import drive

# Mount Google Drive so that the shared model folder is reachable from this runtime.
drive.mount("/content/gdrive")

# Folder on the shared drive from which model checkpoints are loaded.
MODEL_DIR = os.path.join(
    "/content/gdrive/Shared drives/",
    "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models",
)

Mounted at /content/gdrive


## Load model

Choose a model. These are available here:

https://drive.google.com/drive/folders/1-6n2iyiUpicm2BK4ybycJd2lY424U2BP 

Change the value of the `checkpoint` variable below if you wish to use a different model checkpoint.



In [None]:
# Name of the checkpoint folder to load; use another folder name to load a different model.
checkpoint = "distilbert-base-uncased-selfsupervised-ner-govuk-08-02-2022-govuk"
# Full path of that checkpoint inside the shared models folder.
MODEL_PATH = os.path.join(MODEL_DIR, checkpoint)

In [None]:
# Load the token-classification model and its tokenizer from the checkpoint folder.
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Prefer the GPU, but fall back to the CPU so the notebook still runs (more slowly)
# on a runtime without a CUDA device instead of failing when the model is moved.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
model.to(device)

# NER pipeline. With the "first" aggregation strategy the word pieces of a word are
# grouped together and the word takes the tag of its first piece.
nlp = pipeline("ner",
               model=model,
               tokenizer=tokenizer,
               aggregation_strategy="first",
               device=0 if use_gpu else -1  # pipeline convention: GPU ordinal, or -1 for CPU
               )

## Inference test for demonstration

In [None]:
def show_entities(examples):
  '''
  Visualise the named entities that the NER pipeline finds in some text.

  Every document with at least one displayable entity is rendered with
  displacy's entity view; documents without entities are not rendered.

  Arguments:
    examples: a list of strings, where each string is a document (e.g., sentence).
      A single string is treated as a list containing that one document.
  '''

  # a bare string would otherwise be zipped character by character below
  if isinstance(examples, str):
    examples = [examples]

  # identify named entities using the model
  ner_results = nlp(examples)

  # some pipeline versions return a flat list of entities for a single document;
  # wrap it so that there is always one list of entities per document
  if ner_results and isinstance(ner_results[0], dict):
    ner_results = [ner_results]

  # a blank English pipeline is used only to tokenise the text into a Doc
  blank_en = spacy.blank("en")

  # format the text and named entities to comply with displacy
  for example, results in zip(examples, ner_results):
    doc = blank_en(example)

    # `char_span` returns None when the model's character offsets do not line up
    # with spaCy's token boundaries; those spans are skipped because `doc.ents`
    # cannot hold None
    spans = (doc.char_span(result['start'],
                           result['end'],
                           result['entity_group'])
             for result in results)
    ents = [span for span in spans if span is not None]

    if ents:
      doc.ents = ents
      displacy.render(doc, style="ent", jupyter=True)

In [None]:
# In reality, each item in this list will be the parsed content from a page
examples = [
    "My name is David, I live in London and today is a Monday",
    "Welcome to the jungle, my name is John and I am unable to receive Universal Credit",
    "The DIO awarded contracts worth a total of £150 million to the firms to deliver a range of new buildings for service personnel from 1 and 16 Signal Regiment who have moved to Stafford from Germany.",
    "Statement by Ambassador Karen Pierce, UK Permanent Representative to the UN, at the Security Council briefing on Women, Peace and Security.",
    "PHE are warning pregnant women against using a potentially poisonous product, 'Calabash chalk', as a nutritional supplement or morning sickness ‘antidote’",
    "How the government will make teaching an even higher status profession that attracts even more of the best graduates.",
]

# Render the entities found in each demonstration sentence.
show_entities(examples)

## Main script

In [None]:
def extract_entities(examples, ner_pipeline=None):
    '''
    Extract the named entities found in some documents into a pandas df.

    Arguments:
        - examples: a list of strings, where each string is a document (e.g., sentence).
          A single string is treated as a list containing that one document.
        - ner_pipeline: optional callable that takes the list of documents and returns,
          for each document, a list of entity dictionaries. Defaults to the
          notebook-level `nlp` pipeline.

    Returns:
        A DataFrame with one row per entity, indexed by
        (document position, entity position within that document). The `score`,
        `start` and `end` keys of each entity are left out as they are irrelevant
        in this case. Documents without entities contribute no rows; if nothing is
        found at all, an empty frame with the columns `entity_group` and `word`
        is returned.
    '''
    dropped_keys = ("score", "start", "end")

    # resolved at call time so that the default follows the notebook's current `nlp`
    if ner_pipeline is None:
        ner_pipeline = nlp

    if isinstance(examples, str):
        examples = [examples]

    # identify named entities using the model
    ner_results = ner_pipeline(examples)

    # some pipeline versions return a flat list of entities for a single document;
    # wrap it so that there is always one list of entities per document
    if ner_results and isinstance(ner_results[0], dict):
        ner_results = [ner_results]

    # flatten the list of lists of dictionaries into one record per entity,
    # remembering which document and which position each entity came from
    document_ids, entity_ids, records = [], [], []
    for document_id, entities in enumerate(ner_results):
        for entity_id, entity in enumerate(entities or []):
            document_ids.append(document_id)
            entity_ids.append(entity_id)
            records.append({key: value
                            for key, value in entity.items()
                            if key not in dropped_keys})

    index = pd.MultiIndex.from_arrays([document_ids, entity_ids])

    # with no records pandas cannot infer the columns, so name them explicitly
    if not records:
        return pd.DataFrame(columns=["entity_group", "word"], index=index)

    return pd.DataFrame(records, index=index)

In [None]:
# GOV.UK pages whose text is passed through the NER model.
pages = [
    "https://www.gov.uk/foreign-travel-advice/slovakia",
    "https://www.gov.uk/foreign-travel-advice/ukraine",
    "https://www.gov.uk/foreign-travel-advice/czech-republic",
    "https://www.gov.uk/foreign-travel-advice/poland",
    "https://www.gov.uk/foreign-travel-advice/hungary",
    "https://www.gov.uk/guidance/find-help-and-support-if-you-have-long-covid",
]

In [None]:
def extract_entities_to_csv(pages):
    '''
    Extract all the named entities from the list of pages.

    Despite its name, this function does not write a file: it returns a pandas df
    which the caller can save as a .csv file, giving rows in the format:

    || page || index_of_named_entity || type_of_named_entity || word  ||
    ||  0   ||           0           ||     organisation     ||  nhs  ||
    ||  0   ||           1           ||         date         || today ||

    `page` is the position of the page in `pages` and `index_of_named_entity`
    counts the entities found on that page; both are levels of the row index.

    Arguments:
        - pages: a list of html links for pages

    Returns:
        The df produced by `extract_entities`, indexed by
        (page position, entity position on that page).
    '''

    texts = []          # text of every govspeak block found, in page order
    text_page_ids = []  # position in `pages` of the page each block came from

    for page_id, url in enumerate(pages):
        # scrape the raw html; the response is closed as soon as it has been read
        with urlopen(url) as response:
            html = response.read().decode('utf-8')

        topic_soup = soup(html, "html.parser")  # necessary to convert to this form for extraction of text

        # `data-module="govspeak"` is thought to be the general wrapper for all text; if it
        # turns out not to be, the alternative is to match divs on the classes
        # "gem-c-govspeak govuk-govspeak " / "gem-c-govspeak govuk-govspeak direction-ltr"
        for block in topic_soup.find_all("div", {"data-module": "govspeak"}):
            texts.append(block.get_text())
            text_page_ids.append(page_id)

    # nothing was scraped, so there is nothing to send to the model
    if not texts:
        return pd.DataFrame(columns=["entity_group", "word"])

    # to display the named entities instead, call `show_entities([text])` for each item
    # of `texts`; this produces a long output if `pages` is long

    # this finds and extracts the named entities from the scraped text
    df = extract_entities(texts)

    if df.empty:
        return df

    # `extract_entities` numbers rows by position in `texts`, which differs from the
    # position in `pages` whenever a page has no govspeak block or more than one.
    # Re-index by page, numbering the entities of a page consecutively across its blocks.
    block_ids = df.index.get_level_values(0)
    page_ids = pd.Series([text_page_ids[block_id] for block_id in block_ids])
    entity_ids = page_ids.groupby(page_ids).cumcount()
    df.index = pd.MultiIndex.from_arrays([page_ids.to_numpy(), entity_ids.to_numpy()])

    return df

In [None]:
# Scrape the pages and run the NER model over their text.
df = extract_entities_to_csv(pages)

In [None]:
# Show the extracted entities.
df

Unnamed: 0,Unnamed: 1,entity_group,word
0,0,LOCATION,slovakia
0,1,LOCATION,slovakia
0,2,ORGANIZATION,travel provider.
0,3,LOCATION,uk
0,4,LOCATION,england
...,...,...,...
5,23,ORGANIZATION,colleges
5,24,PERSON,pupils
5,25,PERSON,professional
5,26,ORGANIZATION,school


In [None]:
# Save the entities to `df.csv` in the current working directory;
# `index=True` writes the row index as the leading columns.
df.to_csv('df.csv', index=True)