# Basic NER of Fatal Encounter Supporting Links


In [6]:
!pip install --quiet transformers newspaper3k pyvis spacy pandas ipywidgets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Extract Article Text

Create a function that uses the newspaper module to parse a newspaper article from the web. Log any errors that are encountered for analysis later.

In [16]:
import logging

from newspaper import Article, ArticleException

logger = logging.basicConfig(filename="fe-links.log", level=logging.INFO)

def get_article(url):
    try:
        logging.info('fetching url: %s', url)
        article = Article(url)
        article.download()
        article.parse()
        return article
    except ArticleException as e:
        logging.error("caught article error: {}".format(e))
        return None

article = get_article('https://www.latimes.com/archives/la-xpm-2002-may-10-me-award10-story.html')
article.title

'City to Appeal Judgment in Police-Custody Death'

## Download All Articles

Use the Fatal Encounters Google Sheet to download all the text for each case. It can take between 1 and 2 seconds to download each article , and there are over 30,000 articles, so this will take about 8 hours. It's helpful to do this step separate from the entity extraction since we know we want to try multiple methods for entity extraction and we wouldn't want to have to rerun the text fetching each time.

In [3]:
import pandas as pd

sheet_id = "1dKmaV_JiWcG8XBoRgP8b4e9Eopkpgt7FL7nyspvzAsE"
sheet_name = "sample_1"
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"
df_fe = pd.read_csv(url)
df_fe.head(5)

Unnamed: 0,Unique ID,Name,Age,Gender,Race,Race with imputations,Imputation probability,URL of image (PLS NO HOTLINKS),Date of injury resulting in death (month/day/year),Location of injury (address),...,URL Temp,Brief description,"Dispositions/Exclusions INTERNAL USE, NOT FOR ANALYSIS",Intended use of force (Developing),Supporting document link,"Foreknowledge of mental illness? INTERNAL USE, NOT FOR ANALYSIS",Unnamed: 32,Unnamed: 33,Unique ID formula,Unique identifier (redundant)
0,31495.0,Ashley McClendon,28.0,Female,African-American/Black,African-American/Black,Not imputed,https://fatalencounters.org/wp-content/uploads...,12/31/2021,South Pearl Street and Tory Road,...,,"Ashley McClendon's boyfriend, 33-year-old Marc...",Criminal,Pursuit,https://www.wsoctv.com/news/1-person-dead-afte...,No,,,,31495.0
1,31496.0,Name withheld by police,,Female,Race unspecified,,,,12/31/2021,1500 21st Street,...,,Police responded to a man causing a disturbanc...,Pending investigation,Deadly force,https://www.wtok.com/2022/01/01/officer-involv...,No,,,,31496.0
2,31497.0,Name withheld by police,,Male,Race unspecified,,,,12/31/2021,1500 21st Street,...,,Police responded to a man causing a disturbanc...,Pending investigation,Deadly force,https://www.wtok.com/2022/01/01/officer-involv...,No,,,,31497.0
3,31491.0,Johnny C. Martin Jr.,36.0,Male,Race unspecified,,,,12/30/2021,Martinez Lane,...,,"Johnny C. Martin, Jr. arrived at a gas station...",Suicide,Suicide,https://gbi.georgia.gov/press-releases/2021-12...,No,,,,31491.0
4,31492.0,Dennis McHugh,44.0,Male,European-American/White,,,,12/30/2021,435 E 4th Street,...,,Deputies responded to a domestic violence call...,Pending investigation,Deadly force,https://kesq.com/news/2021/12/31/officer-invol...,No,,,,31492.0


Download the text of each article and write to the filesystem using the ID. If the file is already present there's no need to redownload. This lets you stop the collecting and restart later if you are running on your laptop.

In [4]:
from pathlib import Path
from tqdm.notebook import tqdm

# ignore any spreadsheet rows without an ID or URL
df_fe = df_fe.dropna(subset=['Unique ID', 'Supporting document link'])

for i, row in tqdm(list(df_fe.iterrows())):
    fe_id = int(row['Unique ID'])
    
    text_file = Path("data") / f"{fe_id}.txt"
    if text_file.is_file():
        continue

    url = row['Supporting document link']
    article = get_article(url)
    if article:
        text_file.open("w").write(article.text)

  0%|          | 0/31496 [00:00<?, ?it/s]

## Spacy

Create a function that will get the entities from some text using spacy's [en_core_web_sm](https://spacy.io/models/en/#en_core_web_sm) model, which we may need to download:

In [None]:
! spacy download en_core_web_sm

In [7]:
import spacy

nlp = spacy.load("en_core_web_sm")

def get_entities(text):
    doc = nlp(text)
    for ent in doc.ents:
        yield {"text": ent.text, "label": ent.label_}

article = get_article('https://www.latimes.com/archives/la-xpm-2002-may-10-me-award10-story.html')
list(get_entities(article.text))

[{'text': '$1-million', 'label': 'MONEY'},
 {'text': 'Army', 'label': 'ORG'},
 {'text': 'Los Angeles', 'label': 'GPE'},
 {'text': 'Thursday', 'label': 'DATE'},
 {'text': 'Anthony Eady', 'label': 'PERSON'},
 {'text': '52', 'label': 'DATE'},
 {'text': 'three days', 'label': 'DATE'},
 {'text': 'Aug. 26, 2000', 'label': 'DATE'},
 {'text': 'Amar Eady', 'label': 'PERSON'},
 {'text': '28', 'label': 'DATE'},
 {'text': '669,100', 'label': 'MONEY'},
 {'text': 'Rashidah Thomas-Eady', 'label': 'PERSON'},
 {'text': '24', 'label': 'DATE'},
 {'text': '350,000', 'label': 'MONEY'},
 {'text': 'Wednesday', 'label': 'DATE'},
 {'text': 'Compton', 'label': 'ORG'},
 {'text': 'Los Angeles County Superior Court', 'label': 'GPE'},
 {'text': 'Marlene Kristovich', 'label': 'PERSON'},
 {'text': 'Jeffrey Wilson', 'label': 'PERSON'},
 {'text': 'Randy McMurray', 'label': 'PERSON'},
 {'text': 'City Atty', 'label': 'PERSON'},
 {'text': 'Richard M. Arias', 'label': 'PERSON'},
 {'text': 'Eady', 'label': 'PERSON'},
 {'tex

Fetch all the entities from each text file and write them as a CSV file along side it.

In [21]:
from glob import glob
from pathlib import Path
from tqdm.notebook import tqdm

for text_file in tqdm(glob('data/*.txt')):
    text_file = Path(text_file)
    fe_id = text_file.stem
    csv_filename = Path("data") / f"{fe_id}-spacy.csv"

    # don't regenerate if we have it already
    if csv_filename.is_file():
        continue

    text = text_file.open().read()
    df = pd.DataFrame(get_entities(text))
    df['fe_id'] = fe_id
    
    if len(df) > 0:
        df.to_csv(csv_filename, index=False)
        logging.info('wrote %s', csv_filename)
    else:
        logging.info('no spacy entities for %s', text_file)

  0%|          | 0/24386 [00:00<?, ?it/s]

Load in all the CSVs into one DataFrame:

In [22]:
ents_spacy = pd.concat([
    pd.read_csv(csv_file) for csv_file in glob('data/*-spacy.csv')
])
ents_spacy

Unnamed: 0,text,label,fe_id
0,Homewood,GPE,30113
1,Tuesday,DATE,30113
2,42-year-old,DATE,30113
3,Homewood,ORG,30113
4,two,CARDINAL,30113
...,...,...,...
36,Harbison,PERSON,18411
37,Park Ridge Hospital,FAC,18411
38,Harbison,PERSON,18411
39,The North Carolina State Highway Patrol,ORG,18411


In [23]:
ents_spacy.value_counts('label')

label
PERSON         280606
DATE           188609
ORG            186063
GPE            154273
CARDINAL       142616
TIME            53631
FAC             31123
ORDINAL         15007
LOC             13805
PRODUCT          9131
NORP             8755
WORK_OF_ART      7563
QUANTITY         7187
MONEY            4295
EVENT            1852
LAW              1628
PERCENT          1213
LANGUAGE          404
Name: count, dtype: int64

What are the top 50 mentioned organizations?

In [28]:
pd.set_option('display.max_rows', 50)

ents_spacy[ents_spacy.label == 'ORG'].value_counts('text')[0:50]

text
MIAMI                                                                                             4051
SWAT                                                                                              2369
Taser                                                                                             1676
State                                                                                             1222
FBI                                                                                               1008
Court                                                                                              971
Ford                                                                                               883
CHP                                                                                                787
HIALEAH                                                                                            784
Honda                                                               

How about people?

In [29]:
ents_spacy[ents_spacy.label == 'PERSON'].value_counts('text')[0:50]

text
Sgt                    1757
Johnson                1713
Williams               1707
Brown                  1411
Jones                  1147
marijuana               999
Davis                   984
Thomas                  901
Taylor                  860
Garcia                  772
Harris                  743
Jackson                 743
Smith                   711
Anderson                696
Martinez                688
Lewis                   636
Miller                  614
Robinson                569
Martin                  539
Clark                   507
Evans                   466
Lee                     461
Gonzalez                448
Baker                   429
Zip Code                401
Canada Newfoundland     401
Perez                   396
Carter                  356
Twitter                 355
Nelson                  351
Wright                  350
Corey                   347
Gilbert                 344
Christopher             341
Cpl                     339
Graham         

Maybe we should remove single names?

In [34]:
ents_spacy[(ents_spacy.label == 'PERSON') & ents_spacy.text.str.match('.* ')].value_counts('text')[0:50]

text
Canada Newfoundland     401
Zip Code                401
BONITA SPRINGS          240
Crime Stoppers          180
William Saunders        120
Justin Roedel           120
Share Copy Link Copy    112
Officer Culpepper        90
Jessie Lee Williams      82
George Floyd             80
Graham v. Connor         78
Weather Alert            74
Circle K                 73
Peggy Jo                 72
Donna Mills              72
Michael Davis            70
Sergeant Graham          69
Pat Camden               67
Crown Victoria           67
Donald Smith             67
David Baker              67
Matthew Casey            66
Robert Johnson           66
William Kennan           65
Officer Smith            65
Daytona Beach            64
Rudy Eugene              64
Richard Williams         64
Bobby Earl Driggers      63
Ian Burlakoff            63
Leslie Vaughn Prater     63
Brian Naab               63
Della Ventura            63
Daniel Jopek             63
Frank Jackson            63
Jermaine McBean

## Rebel Large

Use the [Rebel Large](https://huggingface.co/Babelscape/rebel-large) model to try to extract relations from the text.

> REBEL(Relation Extraction By End-to-end Language generation) is an autoregressive seq2seq model based on BART (Bidirectional and AutoRegressive Transformers) designed for end-to-end relation extraction. Relation extraction involves extracting relation triplets from raw text, an important task in Information extraction for various applications such as knowledge base population, fact-checking, and other downstream tasks.
>
> [Understanding knowledge graphs: A key to effective data governance](https://www.leewayhertz.com/knowledge-graph-in-machine-learning/) by Akash Takyar

The `extract_relations_from_model_output()` function and `KB` class below for extracting the relations from text are copied from Akash's post.

We start by loading the modela dn tokenizer for *rebel-large*.

In [1]:
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

This function will parse the transformed text to get the assertion as a data structure. It was pulled directly from Akash's post.

In [30]:
def extract_relations_from_model_output(text):
    relations = []
    relation, subject, relation, object_ = '', '', '', ''
    text = text.strip()
    current = 'x'
    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
    for token in text_replaced.split():
        if token == "<triplet>":
            current = 't'
            if relation != '':
                relations.append({
                    'subject': subject.strip(),
                    'predicate': relation.strip(),
                    'object': object_.strip()
                })
                relation = ''
            subject = ''
        elif token == "<subj>":
            current = 's'
            if relation != '':
                relations.append({
                    'subject': subject.strip(),
                    'predicate': relation.strip(),
                    'object': object_.strip()
                })
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    if subject != '' and relation != '' and object_ != '':
        relations.append({
            'subject': subject.strip(),
            'predicate': relation.strip(),
            'object': object_.strip()
        })
    return relations

The `parse_text()` function extracts the relations from a chunk of text.

In [31]:
def parse_text(text):
    model_inputs = tokenizer(
        text, 
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    
    logging.info(f"Num tokens: {len(model_inputs['input_ids'][0])}")

    # Generate
    gen_kwargs = {
        "max_length": 216,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": 3
    }
    
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    for sentence_pred in decoded_preds:
        yield from extract_relations_from_model_output(sentence_pred)

print(list(parse_text('Ed went to the market. He saw a protest there.')))

[{'subject': 'protest', 'predicate': 'participant', 'object': 'Ed'}, {'subject': 'protest', 'predicate': 'location', 'object': 'market'}, {'subject': 'Ed went to the market', 'predicate': 'instance of', 'object': 'protest'}]


And now we step through our text files and generate our knowledge graph! It takes

In [29]:
for text_file in tqdm(glob('data/*.txt')):
    text_file = Path(text_file)
    fe_id = text_file.stem

    # don't regenrate output if we already have it
    csv_file = Path('data') / f"{fe_id}-rebel.csv"
    if csv_file.is_file():
        continue

    text = Path(text_file).open().read()
    df = pd.DataFrame(list(parse_text(text)))

    if len(df) > 0:
        logging.info('saving %s assertions to %s', len(df), csv_file)
        df['fe_id'] = fe_id
        df.to_csv(csv_file, index=False)    

  0%|          | 0/24386 [00:00<?, ?it/s]

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/edsummers/Library/Caches/pypoetry/virtualenvs/ksr-notebooks-1rQVZpoN-py3.11/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/9g/t4bgvmr566190l4nl87nrtkh0000gs/T/ipykernel_4534/414531693.py", line 10, in <module>
    df = pd.DataFrame(list(parse_text(text)))
                      ^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/9g/t4bgvmr566190l4nl87nrtkh0000gs/T/ipykernel_4534/723612258.py", line 20, in parse_text
    generated_tokens = model.generate(
                       ^^^^^^^^^^^^^^^
  File "/Users/edsummers/Library/Caches/pypoetry/virtualenvs/ksr-notebooks-1rQVZpoN-py3.11/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/edsummers/Library/Caches/pypoetry/virtualenvs/ksr-notebooks-1rQVZpoN-py3.11/lib/python