In [None]:
#NER routines using Spacy and NLTK, with sorted lists and visualization
#Sonia Yaco
#Rutgers University
#2024

In [None]:
#Load the Drive helper and mount
# NOTE(review): google.colab is imported here, so this cell presumably only
# runs inside a Colab runtime — confirm before running the notebook elsewhere.
from google.colab import drive
# Mount Google Drive at /content/drive/; force_remount=True is passed so the
# mount is requested again even when the cell is re-run.
drive.mount('/content/drive/', force_remount=True)

In [None]:
#pips
!pip -q install spacy nltk
!python -m spacy download en_core_web_sm
!pip-q install matplotlib

In [None]:
# Import necessary libraries
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from collections import defaultdict

# Load the small English spaCy pipeline once; it is used for NER and for the
# displaCy visualization further down.
nlp = spacy.load("en_core_web_sm")
# The NER helper functions refer to the pipeline as `nlp_spacy`. Reuse the
# object loaded above instead of loading the same model a second time.
nlp_spacy = nlp

# NLTK data needed by word_tokenize, pos_tag and ne_chunk.
# The first four names are the ones this notebook originally requested.
# NOTE(review): newer NLTK releases (3.9+) appear to look for the renamed
# resources listed after them; pip installs an unpinned nltk, so both sets are
# requested. Confirm against the installed NLTK version.
for _nltk_resource in (
    'maxent_ne_chunker',
    'words',
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker_tab',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

In [None]:
#paths and file locations
# modify as needed
text_path = "data"
# Input text file and the two report files written by the NER cells,
# all located directly under text_path.
input_file = f"{text_path}/griffis_diary_by_day.txt"
output_file = f"{text_path}/output.txt"
output_file2 = f"{text_path}/output2.txt"

In [None]:
# NER report: entities listed in alphabetical order under each label
# Run NER with two different routines (spaCy and NLTK), then combine the results and write them to a text file
#NER with spaCy
def spacy_ner(text):
    """Run the spaCy pipeline on ``text`` and collect its named entities.

    Args:
        text: The raw string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in
        ``doc.ents``, in the order ``doc.ents`` yields them.
    """
    parsed = nlp_spacy(text)
    pairs = []
    for span in parsed.ents:
        pairs.append((span.text, span.label_))
    return pairs

#NER with NLTK
def nltk_ner(text):
    """Return the tokens that NLTK's chunker marks as part of a named entity.

    Args:
        text: The raw string to analyse.

    Returns:
        A list of ``(token, 'NE')`` tuples, one per token inside each
        entity subtree (a multi-word entity contributes one tuple per word).
    """
    tagged = pos_tag(word_tokenize(text))
    # binary=True makes ne_chunk label every entity subtree 'NE'. With the
    # default (binary=False) the subtrees carry category labels such as
    # PERSON or GPE, so the 'NE' filter below would never match and this
    # function would always return an empty list.
    chunked = ne_chunk(tagged, binary=True)
    return [
        (word, 'NE')
        for node in chunked
        # Plain (word, tag) tuples have no .label(); only subtrees do.
        if hasattr(node, 'label') and node.label() == 'NE'
        for word, _tag in node.leaves()
    ]

# Read text from the input file.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open(input_file, 'r', encoding='utf-8') as file:
    text = file.read()

# Perform NER with different libraries
entities_spacy = spacy_ner(text)
entities_nltk = nltk_ner(text)

# Combine entities from all libraries, grouped by label
combined_entities = defaultdict(list)
for entity, label in entities_spacy + entities_nltk:
    combined_entities[label].append(entity)

# Write recognized entities to the output file
with open(output_file, 'w', encoding='utf-8') as file:
    for label, ents in combined_entities.items():
        file.write(f"Label: {label}\nEntities:\n")
        # set() removes duplicates; sorted() gives the alphabetical order this
        # cell promises and makes the report identical from run to run
        # (iterating a bare set yields an arbitrary order).
        for ent in sorted(set(ents)):
            file.write(f"{ent}\n")
        file.write("\n")



In [None]:
# NER report grouped by category (label)
# Run NER with the same spaCy routine as before, but with an NLTK routine that reports every token, and write the results to text files
def spacy_ner(text):
    """Extract named entities from ``text`` with the spaCy pipeline.

    This redefines the ``spacy_ner`` from the earlier cell with the same
    behaviour.

    Args:
        text: The raw string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples built from
        ``doc.ents``.
    """
    entity_spans = nlp_spacy(text).ents
    return [(span.text, span.label_) for span in entity_spans]

# Define a function to perform NER using NLTK
def nltk_ner(text):
    """Tag every token of ``text`` and mark NLTK's named-entity chunks.

    Args:
        text: The raw string to analyse.

    Returns:
        A list of ``(text, label)`` tuples in chunker output order. An entity
        subtree yields one tuple whose text is its words joined by spaces and
        whose label is ``'NE'``. Every other token yields ``(word, pos_tag)``.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    results = []
    for node in chunked:
        if isinstance(node, nltk.Tree):
            # A subtree's children are (word, tag) tuples. Previously node[0]
            # was used, which put the first (word, tag) tuple into the result
            # instead of the entity text and dropped the remaining words.
            entity_text = " ".join(word for word, _tag in node.leaves())
            results.append((entity_text, 'NE'))
        else:
            word, tag = node[0], node[1]
            results.append((word, tag))
    return results

# Function to write entities to a file
def write_entities_to_file(entities, file_name):
    """Write one ``text (label)`` line per entity to ``file_name``.

    Args:
        entities: Iterable of indexable pairs; item 0 is the entity text and
            item 1 is its label.
        file_name: Path of the file to create or overwrite.

    The file is written as UTF-8 so that non-ASCII entity text does not
    depend on, or fail under, the platform's default encoding.
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        f.writelines(f"{entity[0]} ({entity[1]})\n" for entity in entities)

# Read the text file.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default encoding.
with open(input_file, 'r', encoding='utf-8') as file:
    text = file.read()

# Perform NER using SpaCy
spacy_entities = spacy_ner(text)

# Perform NER using NLTK
nltk_entities = nltk_ner(text)

# Write the spaCy entities to the first output file.
# NOTE(review): the earlier cell also writes its combined report to
# output_file, so this call overwrites it — confirm that is intended.
write_entities_to_file(spacy_entities, output_file)

# Combine entities from all libraries, grouped by label
combined_entities = defaultdict(list)
for entity, label in spacy_entities + nltk_entities:
    combined_entities[label].append(entity)

# Write recognized entities to the second output file
with open(output_file2, 'w', encoding='utf-8') as file:
    for label, ents in combined_entities.items():
        file.write(f"Label: {label}\nEntities:\n")
        # set() removes duplicates; sorting makes the list order stable from
        # run to run (iterating a bare set yields an arbitrary order).
        # key=str keeps the sort valid whatever type nltk_ner puts in item 0.
        for ent in sorted(set(ents), key=str):
            file.write(f"{ent}\n")
        file.write("\n")



In [None]:
# NER color coded word visualizations

# Process the text with the spaCy pipeline. `nlp` is already loaded in the
# setup cell, so the model is not loaded again here.
doc = nlp(text)

#first display all NERs color coded in context
displacy.render(doc, style='ent', jupyter=True)

#then display just three filtered labels, with no context
labels_to_show = {'PERSON', 'ORG', 'GPE'}
filtered_entities = [ent for ent in doc.ents if ent.label_ in labels_to_show]

# Build a new Doc that contains only the tokens of the selected entities,
# laid end to end.
filtered_tokens = [token.text for ent in filtered_entities for token in ent]
filtered_doc = Doc(doc.vocab, words=filtered_tokens)

# Re-create each entity as a Span over the new Doc. Entities are contiguous
# there, so each one starts where the previous one ended.
adjusted_entities = []
start_offset = 0
for ent in filtered_entities:
    end_offset = start_offset + len(ent)  # len(ent) is the entity's token count
    adjusted_entities.append(
        Span(filtered_doc, start_offset, end_offset, label=ent.label_)
    )
    start_offset = end_offset

# Attach the entities to the new Doc and render it
filtered_doc.ents = adjusted_entities

displacy.render(filtered_doc, style='ent', jupyter=True)
