# NER inference

This notebook runs inference with the NER model on the list of pages being trialled for the new whole-user-journey approaches.

Before you start, make sure the notebook is connected to a GPU runtime.

This version of the notebook aims to handle the no-value exceptions that arise when little or no text can be scraped from some pages.

In [None]:
!pip install transformers datasets seqeval >/dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import torch
import json
import pandas as pd
import spacy
from spacy import displacy
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import os
from google.colab import drive
import time

# Mount Google Drive so that the shared-drive folders are reachable from the runtime
drive.mount("/content/gdrive")

# Folder on the shared drive from which the model checkpoint is loaded
MODEL_DIR = os.path.join(
    "/content/gdrive/Shared drives/",
    "GOV.UK teams/2020-2021/Data labs/content-metadata-2021/Models",
)

Mounted at /content/gdrive


## Load model

Choose a model. These are available here:

https://drive.google.com/drive/folders/1-6n2iyiUpicm2BK4ybycJd2lY424U2BP 

Replace the `checkpoint` variable below if you wish to use a different model checkpoint.



In [None]:
# Name of the checkpoint folder inside MODEL_DIR; change it to load a different model
checkpoint = 'distilbert-base-uncased-selfsupervised-ner-govuk-08-02-2022-govuk'
# Full path handed to `from_pretrained` when the model and tokenizer are loaded
MODEL_PATH = os.path.join(MODEL_DIR, checkpoint)

In [None]:
# Load the token-classification model and its tokenizer from the checkpoint folder
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Use the GPU when one is attached. Without this check the cell fails on
# `model.to(device)` whenever the runtime has no CUDA device; falling back to
# the CPU keeps the notebook runnable (inference is just slower).
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if not use_gpu:
    print("WARNING: no GPU detected - running NER inference on the CPU (slow).")
model.to(device)

# `aggregation_strategy="first"` groups sub-word tokens into whole-word
# entities, using the tag of the first token of each word.
# `device` follows the pipeline convention: GPU ordinal, or -1 for the CPU.
nlp = pipeline("ner",
               model=model,
               tokenizer=tokenizer,
               aggregation_strategy="first",
               device=0 if use_gpu else -1
               )

## Inference test for demonstration

In [None]:
def show_entities(examples):
  '''
  Visualise the named entities that the NER pipeline finds in some text.

  Every document with at least one predicted entity is rendered inline with
  displacy; documents for which the model returns no entities are skipped.

  Arguments:
    examples: a list of strings, where each string is a document (e.g., sentence)
  '''

  # identify named entities using the model
  ner_results = nlp(examples)

  # a blank English pipeline: it is only used to turn each string into a Doc
  s = spacy.blank("en")

  # format the text and named entities to comply with displacy
  for example, results in zip(examples, ner_results):
    if not results:
      continue

    doc = s(example)

    # `char_span` returns None when the model's character offsets do not line
    # up with spaCy's token boundaries. Assigning None to `doc.ents` raises, so
    # such entities are left out instead of aborting the whole visualisation.
    spans = (doc.char_span(result['start'],
                           result['end'],
                           result['entity_group'])
             for result in results)
    doc.ents = [span for span in spans if span is not None]

    displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Demo input only: in a real run every item is the parsed content of one page
examples = [
    "My name is David, I live in London and today is a Monday",
    "Welcome to the jungle, my name is John and I am unable to receive Universal Credit",
    "The DIO awarded contracts worth a total of £150 million to the firms to deliver a range of new buildings for service personnel from 1 and 16 Signal Regiment who have moved to Stafford from Germany.",
    "Statement by Ambassador Karen Pierce, UK Permanent Representative to the UN, at the Security Council briefing on Women, Peace and Security.",
    "PHE are warning pregnant women against using a potentially poisonous product, 'Calabash chalk', as a nutritional supplement or morning sickness ‘antidote’",
    "How the government will make teaching an even higher status profession that attracts even more of the best graduates.",
]

# Render the entities the model finds in each demo sentence
show_entities(examples)

# Main script

In [None]:
def extract_entities(examples):
    '''
    This function extracts the named entities into a pandas df.

    Arguments:
        - examples: a list of strings, where each string is a document (e.g., sentence)

    Returns:
        A df with one row per named entity, holding the entity fields except
        `score`, `start` and `end` (i.e. `entity_group` and `word`). The index
        has two levels: the position of the document in `examples` and the
        position of the entity within that document. Documents without
        entities contribute no rows; if nothing is found at all, an empty df
        with the columns `entity_group` and `word` is returned.
    '''

    # identify named entities using the model
    ner_results = nlp(examples)

    # a single string (rather than a list) makes the pipeline return one flat
    # list of entity dicts; wrap it so it is handled like a one-document batch
    if isinstance(examples, str):
        ner_results = [ner_results]

    # `score`, `start` and `end` are irrelevant in this case
    dropped_keys = ("score", "start", "end")

    # Build the rows and their (document, entity) index explicitly. This avoids
    # relying on `DataFrame.stack()` to drop the None padding of ragged results
    # and avoids the KeyError raised when no entity was found in any document.
    doc_positions = []
    entity_positions = []
    rows = []
    for doc_position, entities in enumerate(ner_results):
        for entity_position, entity in enumerate(entities):
            doc_positions.append(doc_position)
            entity_positions.append(entity_position)
            rows.append({key: value for key, value in entity.items()
                         if key not in dropped_keys})

    index = pd.MultiIndex.from_arrays([doc_positions, entity_positions])

    if not rows:
        return pd.DataFrame(columns=["entity_group", "word"], index=index)

    return pd.DataFrame(rows, index=index)

In [None]:
# A short list of GOV.UK page URLs to run the extraction on
pages = [
    "https://www.gov.uk/foreign-travel-advice/slovakia",
    "https://www.gov.uk/foreign-travel-advice/ukraine",
    "https://www.gov.uk/foreign-travel-advice/czech-republic",
    "https://www.gov.uk/foreign-travel-advice/poland",
    "https://www.gov.uk/foreign-travel-advice/hungary",
    "https://www.gov.uk/guidance/find-help-and-support-if-you-have-long-covid",
]

In [None]:
import csv

# To pull in the list of webpages from a .csv file and convert them to a list.
# The page URL is read from the second column of every row.
# `newline=''` is how the csv module expects files to be opened, so that
# newlines embedded in quoted fields are parsed correctly. Rows without a
# second column (e.g. blank lines) are skipped instead of raising IndexError.
with open('ER_pages_cleaned.csv', 'r', newline='') as f:
    pages = [row[1] for row in csv.reader(f) if len(row) > 1]

In [None]:
# Show the current contents of `pages` as a sanity check
print(pages)



In [None]:
# Drop the first `n` entries of `pages` and show what is left.
# NOTE(review): presumably the leading entries are header / non-URL rows from the CSV - confirm.
# NOTE: `pages` is reassigned from itself, so every re-run of this cell drops another `n` entries.
n = 2
pages = pages[n:]
print(pages)



In [None]:
# pages = ["https://www.gov.uk" + s for s in pages]

In [None]:
# necessary as these pages no longer exist and redirect to NHS
redirected_pages = {
    "https://www.gov.uk/find-covid-19-lateral-flow-test-site",
    "https://www.gov.uk/guidance/coronavirus-covid-19-getting-tested",
    "https://www.gov.uk/register-coronavirus-antibody-test",
}

# Filtering (instead of `list.remove`) keeps this cell safe to re-run and safe
# for page lists that do not contain these URLs: `remove` raises ValueError
# when the URL is absent and only deletes its first occurrence.
pages = [page for page in pages if page not in redirected_pages]

In [None]:
def extract_entities_to_csv(pages):
    '''
    This function scrapes the text of every page in `pages` and extracts all
    the named entities from it.

    Despite its name the function does not write a file itself: it returns a
    df that the caller can save with `df.to_csv(...)`, in the format:

    || page || index_of_named_entity || type_of_named_entity || word  ||
    ||  0   ||           0           ||     organisation     ||  nhs  ||
    ||  0   ||           1           ||         date         || today ||

    NOTE: the `page` index counts positions in `data_extracted_strs_cleaned`
    (the pages kept after filtering), not positions in `pages`.
    TODO: retain a unique id of each page alongside its entities.

    Arguments:
        - pages: a list of html links for pages

    Returns a tuple of:
        - df: the named entities of the pages that were kept
        - data_extracted_strs: the text scraped from each page, in the order of `pages`
        - data_extracted_idx_strs_removed: positions in `pages` of the pages
          whose scraped text is shorter than 20 characters
        - data_extracted_strs_cleaned: the scraped texts of at least 20 characters
    '''

    # pages with less scraped text than this are left out of the NER step
    min_text_length = 20

    # the text scraped from each page, one string per page
    data_extracted_strs = []

    for page_number, url in enumerate(pages):
        print(url, page_number, "/", len(pages))

        # the `with` block closes the connection once the html has been read
        with urlopen(url) as response:
            html = response.read().decode('utf-8')

        # Each page is parsed once, right after it is fetched. Previously every
        # page fetched so far was parsed again on each iteration, which made
        # the run time grow quadratically with the number of pages.
        topic_soup = soup(html, "html.parser")  # necessary to convert to this form for extraction of text

        # the `data-module="govspeak"` divs are believed to be the general
        # wrapper for the text of a page
        b_tags = topic_soup.find_all("div", {"data-module": "govspeak"})

        data_extracted_strs.append(''.join(b.get_text(strip=True) for b in b_tags))

    print(len(data_extracted_strs), len(pages))

    # This removes pages for which little/no text is extracted when scraping
    # This prevents no-value exceptions when creating the df
    data_extracted_strs_cleaned = [text for text in data_extracted_strs
                                   if len(text) >= min_text_length]

    # getting the index of removed pages
    data_extracted_idx_strs_removed = [position
                                       for position, text in enumerate(data_extracted_strs)
                                       if len(text) < min_text_length]

    # this finds and extracts the named entities from the text that was kept
    df = extract_entities(data_extracted_strs_cleaned)

    return df, data_extracted_strs, data_extracted_idx_strs_removed, data_extracted_strs_cleaned

In [None]:
# Scrape and run NER over `pages[:1001]`; the full list has len(pages) = 10,627.
# Earlier runs hit a none type error in pages 51-100 and 101-150.
(
    df,
    data_extracted_strs,
    data_extracted_idx_strs_removed,
    data_extracted_strs_cleaned,
) = extract_entities_to_csv(pages[:1001])

In [None]:
# Display the extracted entities (one row per entity)
df

Unnamed: 0,Unnamed: 1,entity_group,word
0,0,ORGANIZATION,home office
0,1,PERSON,employee
0,2,PERSON,employee
0,3,STATE,immigration status
0,4,STATE,immigration status
...,...,...,...
78,8,PERSON,employer
78,9,PERSON,employees
78,10,ORGANIZATION,employer
78,11,PERSON,driver


In [None]:
# `pages_id` was never defined at notebook level, so displaying it raised a
# NameError. It is rebuilt here as the URLs of the pages that were kept for
# NER, using the same 20-character filter as `extract_entities_to_csv`, so its
# positions match the first level of `df`'s index.
# NOTE(review): assumes the extraction was run on a slice that starts at
# `pages[0]` (as in `pages[:1001]`) - confirm if the slice is changed.
pages_id = [page for page, text in zip(pages, data_extracted_strs) if len(text) >= 20]
pages_id

In [None]:
# Positions (in the list passed to `extract_entities_to_csv`) of the pages
# that were dropped for having fewer than 20 characters of scraped text
data_extracted_idx_strs_removed

[2,
 3,
 14,
 15,
 21,
 26,
 28,
 33,
 37,
 41,
 42,
 50,
 63,
 65,
 67,
 70,
 73,
 78,
 80,
 92,
 95,
 96]

In [None]:
# Save the entities; the (page, entity) index is written too, which is the `to_csv` default
df.to_csv("df_1_100.csv")

In [None]:
# Wrap the positions of the removed pages in a df so that they can be saved.
# The list comes back from `extract_entities_to_csv` under the name
# `data_extracted_idx_strs_removed`; the earlier self-referential
# `pd.DataFrame(indexes_removed)` raised a NameError on a fresh kernel.
indexes_removed = pd.DataFrame(data_extracted_idx_strs_removed)

In [None]:
# Save the positions of the removed pages; the row index is written too, which is the `to_csv` default
indexes_removed.to_csv("indexes_removed_1_100.csv")