# Notebook for Named Entity Recognition

Using spaCy for named entity recognition, we want to create relative frequency tables for the entities by year. At this point, we are only interested in the entities that appear most frequently.

Currently processes the "Fakespeak-ENG modified.xlsx" file (I've renamed my copy to "Fakespeak_ENG_modified.xlsx" to create a more consistent path), but will eventually be run on data from MisInfoText as well.

From the original data file, we use the following columns: ID, combinedLabel, originalTextType, originalBodyText, originalDateYear

We are processing text from the "originalBodyText" column.

In [1]:
import spacy
import pandas as pd
from spacy.tokens.span import Span
from spacy.tokens.doc import Doc
from spacy_entity_linker.EntityElement import EntityElement
from dataset_config import BASE_FAKESPEAK_CONFIG, BASE_MISINFOTEXT_CONFIG
from helpers import get_groups, make_output_path, make_output_path_for_type

## Loading the articles

In [2]:
fakespeak_config = BASE_FAKESPEAK_CONFIG | {
    "usecols": BASE_FAKESPEAK_CONFIG["usecols"] + ["originalHeadline"]
}

misinfotext_config = BASE_MISINFOTEXT_CONFIG

In [3]:
using_dataset = fakespeak_config

In [4]:
dataset_df = pd.read_excel(
    using_dataset["input_path"], 
    sheet_name=using_dataset["sheet_name"], 
    usecols=using_dataset["usecols"]
)

# Removing 2007 and 2008 years because little data in them
dataset_df = dataset_df[~(dataset_df[using_dataset["year_col"]] == 2007) & ~(dataset_df[using_dataset["year_col"]] == 2008)]

dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalHeadline,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",,2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,,2019


## Tagging named entities using spaCy

To make up for the difficulties of consolidating similar named entities, we use spaCy's large web model to ensure higher tagging accuracy in the initial NER step.

Documentation for entityLinker: https://github.com/egerber/spaCy-entity-linker

In [5]:
# load spacy model
nlp = spacy.load("en_core_web_lg")

# add custom entityLinker pipeline
entity_linker = nlp.add_pipe("entityLinker", last=True)

  import pkg_resources


In [6]:
dataset_df["text_doc"] = list(nlp.pipe(dataset_df[using_dataset["text_col"]]))
dataset_df["headline_doc"] = list(nlp.pipe(dataset_df[using_dataset["headline_col"]].fillna("")))
dataset_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalHeadline,originalDateYear,text_doc,headline_doc
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,,2019,"(Mexico, is, paying, for, the, Wall, through, ...",()
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",,2019,"(Chuck, Schumer, :, "", why, should, American, ...",()
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,,2019,"(Billions, of, dollars, are, sent, to, the, St...",()
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,,2019,"(If, 50, Billion, $, $, were, set, aside, to, ...",()
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,,2019,"(Huge@#CD, 9, news, ., \n, @ncsbe, \n , sent, ...",()


In [7]:
# For some reason, any spans of just "President" (or similar)
# get tagged as Zhong Chenle, maybe because he has an alias "President".
# The following code fixes that to point to the correct Wikidata entry
# for the generic term "president".

zhong_chenle_president_aliases = {'PRESIDENT', 'President', 'Presidents'}
zhong_chenle_wikidata_id = 30945670
president_wikidata_id = 30461

def clean_incorrect_president_entity(df: pd.DataFrame):
    zhong_chenle_as_president_filter = (df["Wikidata_id"] == zhong_chenle_wikidata_id) & (df["Span_text"].isin(zhong_chenle_president_aliases))
    df.loc[zhong_chenle_as_president_filter, "Entity"] = "president"
    df.loc[zhong_chenle_as_president_filter, "Wikidata_id"] = president_wikidata_id
    df.loc[zhong_chenle_as_president_filter, "Wikidata_url"] = f"https://www.wikidata.org/wiki/Q{president_wikidata_id}"

In [8]:
# A similar thing is happening where the state of Texas
# is sometimes confused for a musical play named "Texas". 

texas_musical_wikidata_id = 7707415
texas_state_wikidata_id = 1439

def clean_incorrect_texas_entity(df: pd.DataFrame):
    texas_musical_filter = df["Wikidata_id"] == texas_musical_wikidata_id
    df.loc[texas_musical_filter, "Wikidata_id"] = texas_state_wikidata_id
    df.loc[texas_musical_filter, "Wikidata_url"] = f"https://www.wikidata.org/wiki/Q{texas_state_wikidata_id}"

The spacy_entity_linker package doesn't include NER tags like PERSON, ORG, GPE, etc. So to extract them, we have to try to match the linked entities to the original spacy entities, and grab the NER tag from those. This doesn't always work because the entities don't always line up, but it's the best we can do.

In [9]:
def get_entity_tag(row: pd.Series, doc_col: str):
    linked_entity: EntityElement = row["entity"]
    linked_entity_span: Span = linked_entity.get_span()

    doc: Doc = row[doc_col]

    for entity in doc.ents:
        if linked_entity_span.start >= entity.start and linked_entity_span.end <= entity.end:
            return entity.label_

    return None

In [10]:
def get_entity_details_df(df: pd.DataFrame, doc_col: str):
    copied_df = df.copy()
    copied_df["entity"] = copied_df[doc_col].apply(lambda doc: doc._.linkedEntities.entities)

    entity_df = copied_df.explode("entity").dropna()
    entity_df["tag"] = entity_df.apply(get_entity_tag, args=(doc_col,), axis=1)

    entity_details_df = pd.DataFrame(
        data={
            "year": entity_df[using_dataset["year_col"]],
            "type": entity_df[using_dataset["type_col"]],
            "Entity": entity_df["entity"].apply(lambda ent: ent.get_label()),
            "tag": entity_df["tag"],
            "Wikidata_id": entity_df["entity"].apply(lambda ent: ent.get_id()),
            "Wikidata_url": entity_df["entity"].apply(lambda ent: ent.get_url()),
            "Span": entity_df["entity"].apply(lambda ent: ent.get_span()),
            "Span_text": entity_df["entity"].apply(lambda ent: ent.get_span().text)
        }
    )

    clean_incorrect_president_entity(entity_details_df)
    clean_incorrect_texas_entity(entity_details_df)

    # If the entity label is missing, fill it in with the span text.
    # This is rare, but sometimes happens
    entity_details_df["Entity"] = entity_details_df["Entity"].fillna(entity_details_df["Span_text"])

    return entity_details_df

In [11]:
text_entity_details_df = get_entity_details_df(dataset_df, "text_doc")
text_entity_details_df.head()

Unnamed: 0,year,type,Entity,tag,Wikidata_id,Wikidata_url,Span,Span_text
16,2019,News and blog,Joe Biden,PERSON,6279,https://www.wikidata.org/wiki/Q6279,"(Joe, Biden)",Joe Biden
16,2019,News and blog,letter,,133492,https://www.wikidata.org/wiki/Q133492,(message),message
16,2019,News and blog,audience,,211198,https://www.wikidata.org/wiki/Q211198,(public),public
16,2019,News and blog,scandal,,192909,https://www.wikidata.org/wiki/Q192909,(scandal),scandal
16,2019,News and blog,vice president,,42178,https://www.wikidata.org/wiki/Q42178,"(vice, president)",vice president


In [12]:
headline_entity_details_df = get_entity_details_df(dataset_df, "headline_doc")
headline_entity_details_df.head()

Unnamed: 0,year,type,Entity,tag,Wikidata_id,Wikidata_url,Span,Span_text
16,2019,News and blog,Joe Biden,PERSON,6279,https://www.wikidata.org/wiki/Q6279,"(Joe, Biden)",Joe Biden
16,2019,News and blog,agency,,3951828,https://www.wikidata.org/wiki/Q3951828,(Thoughts),Thoughts
19,2019,News and blog,Tom Selleck,PERSON,213706,https://www.wikidata.org/wiki/Q213706,"(Tom, Selleck)",Tom Selleck
19,2019,News and blog,Fuck,WORK_OF_ART,4480654,https://www.wikidata.org/wiki/Q4480654,(F*ck),F*ck
19,2019,News and blog,Donald Trump,PERSON,22686,https://www.wikidata.org/wiki/Q22686,"(Donald, Trump)",Donald Trump


In [13]:
entity_types_to_keep = [
    "EVENT",
    "FAC",
    "GPE",
    "LANGUAGE",
    "LAW",
    "LOC",
    "NORP",
    "ORG",
    "PERSON",
    "PRODUCT",
    "WORK_OF_ART",
]

In [14]:
filtered_text_entity_details_df = text_entity_details_df[text_entity_details_df["tag"].isin(entity_types_to_keep)]
filtered_text_entity_details_df.head()

Unnamed: 0,year,type,Entity,tag,Wikidata_id,Wikidata_url,Span,Span_text
16,2019,News and blog,Joe Biden,PERSON,6279,https://www.wikidata.org/wiki/Q6279,"(Joe, Biden)",Joe Biden
19,2019,News and blog,Tom Selleck,PERSON,213706,https://www.wikidata.org/wiki/Q213706,"(Tom, Selleck)",Tom Selleck
19,2019,News and blog,Donald Trump,PERSON,22686,https://www.wikidata.org/wiki/Q22686,"(Donald, J., Trump)",Donald J. Trump
19,2019,News and blog,United States of America,GPE,30,https://www.wikidata.org/wiki/Q30,"(United, States)",United States
19,2019,News and blog,Breitbart News,ORG,4960434,https://www.wikidata.org/wiki/Q4960434,"(Breitbart, News)",Breitbart News


In [15]:
filtered_headline_entity_details_df = headline_entity_details_df[headline_entity_details_df["tag"].isin(entity_types_to_keep)]
filtered_headline_entity_details_df.head()

Unnamed: 0,year,type,Entity,tag,Wikidata_id,Wikidata_url,Span,Span_text
16,2019,News and blog,Joe Biden,PERSON,6279,https://www.wikidata.org/wiki/Q6279,"(Joe, Biden)",Joe Biden
19,2019,News and blog,Tom Selleck,PERSON,213706,https://www.wikidata.org/wiki/Q213706,"(Tom, Selleck)",Tom Selleck
19,2019,News and blog,Fuck,WORK_OF_ART,4480654,https://www.wikidata.org/wiki/Q4480654,(F*ck),F*ck
19,2019,News and blog,Donald Trump,PERSON,22686,https://www.wikidata.org/wiki/Q22686,"(Donald, Trump)",Donald Trump
21,2019,News and blog,Border,ORG,53736577,https://www.wikidata.org/wiki/Q53736577,(Border),Border


## Group dataframes by year and count named entities

In [16]:
def get_count(df: pd.DataFrame):
  copied_df = df.copy()
  copied_df['Count'] = copied_df.groupby(['Wikidata_id'])['Wikidata_id'].transform('count')
  sorted_df = copied_df.sort_values(by=['Count'], ascending=False)
  unique_df = sorted_df.drop_duplicates(subset=["Wikidata_id"])

  return unique_df

In [17]:
def get_count_dfs_for_years(df: pd.DataFrame):
    years, years_dfs = get_groups(df, "year")

    year_counts_dfs = [get_count(df) for df in years_dfs]

    return years, year_counts_dfs

In [18]:
years_text, years_text_dfs = get_count_dfs_for_years(filtered_text_entity_details_df)
years_text_dfs[0].head()

Unnamed: 0,year,type,Entity,tag,Wikidata_id,Wikidata_url,Span,Span_text,Count
1919,2019,News and blog,United States of America,GPE,30,https://www.wikidata.org/wiki/Q30,"(United, States)",United States,49
2345,2019,News and blog,Democratic Party,NORP,29552,https://www.wikidata.org/wiki/Q29552,(Democrats),Democrats,22
1919,2019,News and blog,Corazon Aquino,PERSON,1480,https://www.wikidata.org/wiki/Q1480,(Cory),Cory,20
40,2019,News and blog,Donald Trump,PERSON,22686,https://www.wikidata.org/wiki/Q22686,(Trump),Trump,17
2347,2019,News and blog,Virginia,ORG,1370,https://www.wikidata.org/wiki/Q1370,(Virginia),Virginia,17


In [19]:
years_headline, years_headline_dfs = get_count_dfs_for_years(filtered_headline_entity_details_df)
years_headline_dfs[0].head()

Unnamed: 0,year,type,Entity,tag,Wikidata_id,Wikidata_url,Span,Span_text,Count
16,2019,News and blog,Joe Biden,PERSON,6279,https://www.wikidata.org/wiki/Q6279,"(Joe, Biden)",Joe Biden,2
2351,2019,News and blog,Alexandria Ocasio-Cortez,PERSON,55223040,https://www.wikidata.org/wiki/Q55223040,"(Ocasio, -, Cortez)",Ocasio-Cortez,2
1936,2019,Press release,Donald Trump,PERSON,22686,https://www.wikidata.org/wiki/Q22686,(Trump),Trump,2
2352,2019,News and blog,Bionic,WORK_OF_ART,286119,https://www.wikidata.org/wiki/Q286119,(Bionic),Bionic,1
2333,2019,News and blog,George VI,PERSON,280856,https://www.wikidata.org/wiki/Q280856,(George),George,1


## Write results to Excel spreadsheet

In [20]:
def save_entity_counts_for_years(years: list[int], dfs: list[pd.DataFrame], output_path: str):
    writer = pd.ExcelWriter(output_path, engine="xlsxwriter")
    
    for year, df in zip(years, dfs):
        df.to_excel(
            writer,
            sheet_name=str(year),
            index=False,
            columns=["Entity", "tag", "Wikidata_id", "Wikidata_url", "Span_text", "Count"]
        )
    
    writer.close()

In [21]:
save_entity_counts_for_years(
    years=years_text, 
    dfs=years_text_dfs, 
    output_path=make_output_path(using_dataset, "named_entities_frequency")
)

save_entity_counts_for_years(
    years=years_headline, 
    dfs=years_headline_dfs, 
    output_path=make_output_path(using_dataset, "named_entities_frequency_headlines")
)

In [22]:
def save_entity_counts_for_types(entity_details_df: pd.DataFrame, suffix = ""):
    types, types_dfs = get_groups(entity_details_df, "type")

    for type, df in zip(types, types_dfs):
        years_text, years_text_dfs = get_count_dfs_for_years(df)

        save_entity_counts_for_years(
            years=years_text, 
            dfs=years_text_dfs, 
            output_path=make_output_path_for_type(using_dataset, type, f"named_entities_frequency{suffix}")
        )

In [23]:
save_entity_counts_for_types(text_entity_details_df)
save_entity_counts_for_types(headline_entity_details_df, "_headlines")