# Notebook for Named Entity Recognition

Using spaCy for named entity recognition, we want to create relative frequency tables for the entities by year. At this point, we are only interested in the entities that appear most frequently.

Currently processes the "Fakespeak-ENG modified.xlsx" file (I've renamed my copy to "Fakespeak_ENG_modified.xlsx" to create a more consistent path), but will eventually be run on data from MisInfoText as well.

From the original data file, we use the following columns: ID, combinedLabel, originalTextType, originalBodyText, originalDateYear

We are processing text from the "originalBodyText" column.

In [None]:
!pip install "spacy~=3.0.6"

In [None]:
!python -m spacy download en_core_web_md

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
!pip install spacy-entity-linker==1.0.3

In [None]:
!python -m spacy_entity_linker "download_knowledge_base"

In [1]:
from typing import Iterable
from itertools import chain
import spacy
from spacy.language import Language
from spacy.tokens.doc import Doc
from spacy.tokens.span import Span
from spacy_entity_linker.EntityElement import EntityElement
from spacy_entity_linker.EntityLinker import EntityLinker
import pandas as pd

In [None]:
# Colab only: mount Google Drive under /content/drive (the output workbook
# path further down in this notebook points into that mount).
from google.colab import drive
drive.mount('/content/drive')

## Loading the articles

In [2]:
# Path to the source workbook, relative to the notebook's working directory.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the session; it is kept because the read_excel cell below reads it by this name.
input = './data/Fakespeak-ENG/Fakespeak-ENG modified.xlsx'

In [4]:
# Read only the columns this notebook uses from the "Working" sheet.
fakespeak_columns = ['ID', 'combinedLabel', 'originalTextType', 'originalBodyText', 'originalDateYear']
fakespeak_df = pd.read_excel(
    input,
    sheet_name="Working",
    usecols=fakespeak_columns,
)

In [5]:
# Show the first rows of the loaded articles.
fakespeak_df.head()

Unnamed: 0,ID,combinedLabel,originalTextType,originalBodyText,originalDateYear
0,Politifact_FALSE_Social media_687276,False,Social media,Mexico is paying for the Wall through the new ...,2019
1,Politifact_FALSE_Social media_25111,False,Social media,"Chuck Schumer: ""why should American citizens b...",2019
2,Politifact_FALSE_Social media_735424,False,Social media,Billions of dollars are sent to the State of C...,2019
3,Politifact_FALSE_Social media_594307,False,Social media,If 50 Billion $$ were set aside to go towards ...,2019
4,Politifact_FALSE_Social media_839325,False,Social media,Huge@#CD 9 news. \n@ncsbe\n sent letter to eve...,2019


## Tagging named entities using spaCy

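To make up for the difficulties of consolidating similar named entities, we want to use spaCy's large web model (`en_core_web_lg`) to ensure higher tagging accuracy in the initial NER step. Note that the cell below currently loads the medium model (`en_core_web_md`); both models are downloaded above, so the large one can be loaded instead.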

Documentation for entityLinker: https://github.com/egerber/spaCy-entity-linker

In [6]:
# load spacy model (the medium English web pipeline)
nlp = spacy.load("en_core_web_md")

# add custom entityLinker pipeline
# last=True places the component at the end of the pipeline; the value returned
# by add_pipe is kept in `entity_linker`.
entity_linker = nlp.add_pipe("entityLinker", last=True)

In [7]:
def get_entities_from_doc(doc: Doc) -> Iterable[EntityElement]:
    """Return the value stored on ``doc`` under the ``linkedEntities`` extension attribute."""
    linked_entities = doc._.linkedEntities
    return linked_entities

def get_entity_data(row: pd.Series):
    """Build one record (dict) per linked entity found in a single article row.

    Args:
        row: A row providing the keys "entities" (iterable of linked entities),
            "originalDateYear" and "ID".

    Returns:
        A list of dicts, one per entity, with the keys Entity, Wikidata_id,
        Wikidata_url, Year, Article_id, Span and Span_text.
    """
    linked_entities: Iterable[EntityElement] = row["entities"]

    records = []
    for linked_entity in linked_entities:
        record = {
            "Entity": linked_entity.get_label(),
            "Wikidata_id": linked_entity.get_id(),
            "Wikidata_url": linked_entity.get_url(),
            # Article-level fields are repeated on every entity record.
            "Year": row["originalDateYear"],
            "Article_id": row["ID"],
            # Keep both the span object and its plain text.
            "Span": linked_entity.get_span(),
            "Span_text": linked_entity.get_span().text,
        }
        records.append(record)
    return records

In [8]:
# Run the spaCy pipeline over every article body and store the parsed Doc per row.
article_docs = list(nlp.pipe(fakespeak_df['originalBodyText']))
fakespeak_df["doc"] = article_docs
fakespeak_df["entities"] = fakespeak_df["doc"].apply(get_entities_from_doc)

# Flatten the per-article record lists into one table with a row per linked entity.
per_article_records = fakespeak_df.apply(get_entity_data, axis=1)
all_entities_data = list(chain.from_iterable(per_article_records))
entities_df = pd.DataFrame(all_entities_data)
entities_df

Unnamed: 0,Entity,Wikidata_id,Wikidata_url,Year,Article_id,Span,Span_text
0,Mexico,96,https://www.wikidata.org/wiki/Q96,2019,Politifact_FALSE_Social media_687276,(Mexico),Mexico
1,The Wall,27964590,https://www.wikidata.org/wiki/Q27964590,2019,Politifact_FALSE_Social media_687276,(Wall),Wall
2,United States–Mexico–Canada Agreement,56839716,https://www.wikidata.org/wiki/Q56839716,2019,Politifact_FALSE_Social media_687276,(USMCA),USMCA
3,The Wall,27964590,https://www.wikidata.org/wiki/Q27964590,2019,Politifact_FALSE_Social media_687276,(Wall),Wall
4,parking lot,6501349,https://www.wikidata.org/wiki/Q6501349,2019,Politifact_FALSE_Social media_687276,(lot),lot
...,...,...,...,...,...,...,...
109926,UPDATE,1076005,https://www.wikidata.org/wiki/Q1076005,2023,Politifact_Pants on Fire_Social media_621529,(UPDATES),UPDATES
109927,INSANE,3153089,https://www.wikidata.org/wiki/Q3153089,2023,Politifact_Pants on Fire_Social media_621529,(INSANE),INSANE
109928,tax,8161,https://www.wikidata.org/wiki/Q8161,2023,Politifact_Pants on Fire_Social media_621529,(TAXES),TAXES
109929,Ontario,1904,https://www.wikidata.org/wiki/Q1904,2023,Politifact_Pants on Fire_Social media_621529,(ON),ON


In [None]:
# Spans consisting only of "President" (or similar) come back from the linker
# tagged as Zhong Chenle, maybe because that entry has an alias "President".
# The rows matching both conditions below are re-pointed at the Wikidata entry
# for the generic term "president".
# NOTE(review): only the three spellings in the alias set are corrected —
# confirm whether other casings (e.g. lowercase) need the same treatment.

president_wikidata_id = 30461
zhong_chenle_wikidata_id = 30945670
zhong_chenle_president_aliases = {'PRESIDENT', 'President', 'Presidents'}

linked_to_zhong_chenle = entities_df["Wikidata_id"] == zhong_chenle_wikidata_id
span_is_president_alias = entities_df["Span_text"].isin(zhong_chenle_president_aliases)
zhong_chenle_as_president_filter = linked_to_zhong_chenle & span_is_president_alias

president_replacements = {
    "Entity": "president",
    "Wikidata_id": president_wikidata_id,
    "Wikidata_url": f"https://www.wikidata.org/wiki/Q{president_wikidata_id}",
}
for column_name, corrected_value in president_replacements.items():
    entities_df.loc[zhong_chenle_as_president_filter, column_name] = corrected_value

In [12]:
# Show the first rows of the entity table.
entities_df.head()

Unnamed: 0,Entity,Wikidata_id,Wikidata_url,Year,Article_id,Span,Span_text
0,Mexico,96,https://www.wikidata.org/wiki/Q96,2019,Politifact_FALSE_Social media_687276,(Mexico),Mexico
1,The Wall,27964590,https://www.wikidata.org/wiki/Q27964590,2019,Politifact_FALSE_Social media_687276,(Wall),Wall
2,United States–Mexico–Canada Agreement,56839716,https://www.wikidata.org/wiki/Q56839716,2019,Politifact_FALSE_Social media_687276,(USMCA),USMCA
3,The Wall,27964590,https://www.wikidata.org/wiki/Q27964590,2019,Politifact_FALSE_Social media_687276,(Wall),Wall
4,parking lot,6501349,https://www.wikidata.org/wiki/Q6501349,2019,Politifact_FALSE_Social media_687276,(lot),lot


## Filter dataframes by year and named entities
Currently, entityLinker catches all entities, not just proper nouns. To get around this, we first create dataframes filtered by year and count the entities in each, then get the POS tags of the entity labels using spaCy. This then allows us to filter the dataframes further, keeping only proper nouns (i.e., excluding common nouns and any other parts of speech).

In [None]:
# Split the entity table into one dataframe per year (2019 through 2024).
entity_year = entities_df['Year']
entities_2019_df = entities_df.loc[entity_year.eq(2019)]
entities_2020_df = entities_df.loc[entity_year.eq(2020)]
entities_2021_df = entities_df.loc[entity_year.eq(2021)]
entities_2022_df = entities_df.loc[entity_year.eq(2022)]
entities_2023_df = entities_df.loc[entity_year.eq(2023)]
entities_2024_df = entities_df.loc[entity_year.eq(2024)]

In [None]:
# helper function for counting entities in each year
def get_count(df: pd.DataFrame):
  """Count how often each entity label occurs and return one row per label.

  Args:
    df: Entity table with at least the columns 'Entity' and 'Wikidata_id'.
      It is not modified.

  Returns:
    A new dataframe with an added 'Count' column (number of non-null
    'Wikidata_id' values per 'Entity' label), sorted by 'Count', 'Entity'
    and 'Wikidata_id' in descending order, containing a single row per
    'Entity' label. The remaining columns hold the values of the first row
    of that label after sorting.
  """
  # assign() builds a new frame, so a filtered slice passed in by the caller is
  # neither mutated nor the trigger of a SettingWithCopyWarning.
  counted_df = df.assign(Count=df.groupby(['Entity'])['Wikidata_id'].transform('count'))
  sorted_df = counted_df.sort_values(by=['Count', 'Entity', 'Wikidata_id'], ascending=False)
  # De-duplicate on the grouping key only. Comparing every column would keep
  # one row per distinct article/span, so an entity would still appear many
  # times and its Count would be summed repeatedly in later totals.
  unique_df = sorted_df.drop_duplicates(subset=['Entity'])

  return unique_df

In [None]:
# For every yearly dataframe: obtain the entity counts, sort by count and keep
# unique values (all done by get_count), then drop rows containing N/A values
# to account for errors in the entityLinker tagging.
(
    entity_counts_2019_df,
    entity_counts_2020_df,
    entity_counts_2021_df,
    entity_counts_2022_df,
    entity_counts_2023_df,
    entity_counts_2024_df,
) = (
    get_count(yearly_entities_df).dropna()
    for yearly_entities_df in (
        entities_2019_df,
        entities_2020_df,
        entities_2021_df,
        entities_2022_df,
        entities_2023_df,
        entities_2024_df,
    )
)

In [None]:
# Show the first rows of the 2019 entity counts.
entity_counts_2019_df.head()

In [None]:
# Separate load of the medium English model, used below to POS-tag the entity
# labels. Unlike `nlp`, no entityLinker component is added to this pipeline.
tagger = spacy.load("en_core_web_md")

In [None]:
# Tag every entity label and record the part of speech of its first token
# in a new 'POS' column on each yearly counts dataframe.
for yearly_counts_df in (
    entity_counts_2019_df,
    entity_counts_2020_df,
    entity_counts_2021_df,
    entity_counts_2022_df,
    entity_counts_2023_df,
    entity_counts_2024_df,
):
    label_docs = tagger.pipe(yearly_counts_df['Entity'].tolist())
    yearly_counts_df['POS'] = [label_doc[0].pos_ for label_doc in label_docs]

In [None]:
# Keep only the rows whose label was tagged as a proper noun ('PROPN').
(
    entity_counts_2019_df,
    entity_counts_2020_df,
    entity_counts_2021_df,
    entity_counts_2022_df,
    entity_counts_2023_df,
    entity_counts_2024_df,
) = (
    yearly_counts_df.loc[yearly_counts_df['POS'].eq('PROPN')]
    for yearly_counts_df in (
        entity_counts_2019_df,
        entity_counts_2020_df,
        entity_counts_2021_df,
        entity_counts_2022_df,
        entity_counts_2023_df,
        entity_counts_2024_df,
    )
)

In [None]:
# Show the first rows of the 2019 counts after the proper-noun filter.
entity_counts_2019_df.head()

In [None]:
# helper function to calculate each entity's relative frequency
def get_prop(df):
  """Add a 'Proportion' column: each row's 'Count' divided by the total 'Count'.

  The value is a fraction of the column total (not a percentage).

  Args:
    df: Dataframe with a numeric 'Count' column. It is not modified.

  Returns:
    A new dataframe equal to ``df`` plus the 'Proportion' column.
  """
  total_count = df['Count'].sum()
  # assign() returns a new frame, so a filtered slice passed in by the caller
  # is not written to (no SettingWithCopyWarning, no hidden mutation).
  return df.assign(Proportion=df['Count'] / total_count)

In [None]:
# Add the relative-frequency column to every yearly counts dataframe.
(
    entity_counts_2019_df,
    entity_counts_2020_df,
    entity_counts_2021_df,
    entity_counts_2022_df,
    entity_counts_2023_df,
    entity_counts_2024_df,
) = map(
    get_prop,
    (
        entity_counts_2019_df,
        entity_counts_2020_df,
        entity_counts_2021_df,
        entity_counts_2022_df,
        entity_counts_2023_df,
        entity_counts_2024_df,
    ),
)

In [None]:
# Show the first rows of the 2019 counts with the Proportion column.
entity_counts_2019_df.head()

## Write results to Excel spreadsheet

In [None]:
!pip install xlsxwriter

In [None]:
# Destination workbook for the per-year frequency tables.
# NOTE(review): this is an absolute path under /content/drive, so it only
# resolves when Google Drive has been mounted there (Colab).
output = '/content/drive/My Drive/fake_news_over_time/named_entities_frequency.xlsx'

In [None]:
# Columns written to every worksheet, in this order.
export_columns = ['Entity', 'Wikidata_id', 'Wikidata_url', 'Count', 'Proportion']

# Worksheet name -> dataframe written to that worksheet.
yearly_entity_counts = {
    "2019": entity_counts_2019_df,
    "2020": entity_counts_2020_df,
    "2021": entity_counts_2021_df,
    "2022": entity_counts_2022_df,
    "2023": entity_counts_2023_df,
    "2024": entity_counts_2024_df,
}

# create excel writer object to initialize new workbook; the context manager
# closes the writer (and writes the output file) on exit, including when one
# of the to_excel calls raises, so the writer is never left open.
with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
    # write dataframes to different worksheets
    for sheet_name, yearly_counts_df in yearly_entity_counts.items():
        yearly_counts_df.to_excel(writer, sheet_name=sheet_name, columns=export_columns, index=False)

In [None]:
# Ad-hoc check: run the `nlp` pipeline on a single sample sentence that
# contains a bare "President" and display the resulting Doc.
tnlp = nlp("John Doe would be disastrous as President!")
tnlp