In [None]:
#1. use standard spacy and show working NER video
#2. modify #1 to train spacy to support a new entity called fruit and see results
#3. create standard spacy with CFR and see NER working, create #1 just with CFR model
#4. modify #3 train spacy with CFR to support a new entity called fruit
#5. the paper says NER with CFR is better than with non CFR so here you will see that #4 is better than #2

#6. ["text": "thththththththth "]

In [11]:
sample_txt = "Albert Einstein was born on March 14, 1879, in Ulm, in the Kingdom of Württemberg in the German Empire. He made significant contributions to the field of theoretical physics, especially in the development of the theory of relativity. Einstein received the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect. He later moved to the United States, where he continued his scientific work and became a prominent figure in academia."


In [12]:
# Step 1
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_txt)

displacy.render(doc, style='ent')


In [5]:
from spacy.training.example import Example
import random

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("Fruit")

training_data = [
    # ("I love apples and bananas.", {"entities": [(7, 13, "fruit"), (18, 25, "fruit")]}),
    ("I love Albert Einstein.", {"entities": [(7, 13, "fruit"), (18, 25, "fruit")]}),

    ("Oranges are a great source of vitamin C.", {"entities": [(0, 7, "fruit")]}),
]

collected_training_data = []
for text, entity_map_item in training_data:
    converted_data = Example.from_dict(nlp.make_doc(text), entity_map_item)
    collected_training_data.append(converted_data)

nlp.begin_training()
for i in range(100):
    random.shuffle(collected_training_data)
    for data in collected_training_data:
        nlp.update([data], drop=0.5)

nlp.to_disk("custom_ner_model")

# Step 4: Test the Model
custom_ner_modl = spacy.load("custom_ner_model")
doc = custom_ner_modl(sample_txt)

# Specify a solid color for the entity
style = {
    "ents": ["Fruit"],
    "colors": {"Fruit": "green"},
}

# Render the visualization with custom colors
displacy.render(doc, style='ent', options=style)




In [1]:
import spacy
import ast
from spacy.training.example import Example

# load the training dataset
import pandas as pd
training_data_file_path = "/Users/shubham/Desktop/ner.csv"

# read csv into pandas DataFrame
df = pd.read_csv(training_data_file_path)
# Grab any basic information df.head() or: Summary statistics, Access a specific column, etc...

'''
One thing to note here: is that this NER dataset that I found is split into the following columns:
    - Sentence #, Sentence, POS(Word type description, NOUN, etc..), Tag(O-per with IOB-named entity)
But spaCy doesn't understand the data in this format. spaCy understands data in the format of a tuple
    - (string, dict) where dict = {entities : [tuples of (start, end, entity)]}
So in order to train the model with spaCy like the paper suggests we first need to reformat the dataFrame to match
into this form which is what taggedFormat(df) is doing
'''
def taggedFormat(df):
    tagged_spacy_data = []
    for index, row in df.iterrows():
        sentence = row['Sentence']
        ner_tags = row['Tag']

        # right now here ner_tags is of type string, so in order to get the elements lets convert to a list
        ner_tags = ast.literal_eval(ner_tags)

        entities = []
        start = 0
        end = 0
        for tagIndex, word in enumerate(sentence.split()):
            end = start + len(word)

            if tagIndex >= len(ner_tags): 
                break
            if(ner_tags[tagIndex] != 'O'):
                entities.append((start, end, ner_tags[tagIndex]))

            start = end + 1
        
        if len(entities) > 0:
            tagged_spacy_data.append((sentence, {"entities": entities}))

    return tagged_spacy_data

'''
1st Function(taggedFormat(df)): gives you a line of output like this, ignoring the sentence part in the beginning:
    - ({'entities': [(0, 4, 'B-gpe'), (12, 21, 'B-per'), (22, 29, 'I-per'), (30, 41, 'I-per'), (47, 54, 'B-tim'), (60, 68, 'B-gpe'), (100, 104, 'B-gpe'), (158, 165, 'B-gpe')]})

2nd Function(combineTaggedData(tagged_spacy_data)): looking at the above example notice how (12, 41) are all B,I,I-per, they are talking about the same named entity PER, so we need to combine these somehow to turn it into this
    - ({'entities': [(0, 4, 'B-gpe'), (12, 41, 'B-per???'), (47, 54, 'B-tim'), (60, 68, 'B-gpe'), (100, 104, 'B-gpe'), (158, 165, 'B-gpe')]})

3rd Function. But when we make our training data we dont include the tags: 'B-', 'I-', etc.. and we also capitalize the 'per' -> 'PER' to follow a standard so we need to do some processing of this formatted data
and put in this form(using .upper() and [2:] we can do this):
    - {'entities': [(0, 4, 'GPE'), (12, 41, 'PER'), (47, 54, 'TIM'), (60, 68, 'GPE'), (100, 104, 'GPE'), (158, 165, 'GPE')]}

This is one of the main techniques that the research paper proposes in the methodology section and how to extract sentence data to create a proper training data to build our model.
Obviously in the paper they go directly from HTML to sentence parsing my usecase is a bit different since I am using a dataset from kaggle but the technique here is still the same.
'''

def combineTaggedData(tagged_spacy_data):
    to_combine = []

    for sentence, entities_dict in tagged_spacy_data:
        entities_list = entities_dict["entities"]

        i = 0
        j = 1
        currArr = []
        found = False
        while j < len(entities_list):
            start1, end1, entity_type1 = entities_list[i]
            start2, end2, entity_type2 = entities_list[j]

            splitI = entity_type1.split('-')
            splitJ = entity_type2.split('-')

            if splitI[0] == 'B' and splitJ[0] == 'I':
                if j == len(entities_list) - 1:
                    currArr.append((start1, end2, entity_type1))
                    found = True
                j += 1
            else:
                _, endPrev, _ = entities_list[j-1]
                currArr.append((start1, endPrev, entity_type1))
                i = j
                j += 1
        
        if i == len(entities_list) - 1:
            currArr.extend(entities_list[i:])

        to_combine.append((sentence, {"entities" : currArr}))
    
    return to_combine

def convertSpacyFormat(to_combine):
    spacy_formatted_data = []

    for sentence, entities_dict in to_combine:

        entities_list = entities_dict["entities"]
        modified_entities = []

        for start, end, type in entities_list:
            currType = type
            formattedType = currType[2:].upper() # take 'B-Geo' -> 'GEO'
            modified_entities.append((start, end, formattedType))
        
        spacy_formatted_data.append((sentence, {"entities" : modified_entities}))

    return spacy_formatted_data


# Convert the DataFrame to spaCy format but still the tags are in there
spacy_training_data = taggedFormat(df)
# print(spacy_training_data[45][1])

# remove the tags and now we are officially ready to train our model, spacy_formatted_data is a list of training data
combined_spacy_data = combineTaggedData(spacy_training_data)
# print(combined_spacy_data[45])
# print(f"{combined_spacy_data[10][0][12:21]}, {combined_spacy_data[10][0][22:29]}, {combined_spacy_data[10][0][30:41]}")

spacy_formatted_data = convertSpacyFormat(combined_spacy_data)
# print(spacy_training_data[45][1])
# print(combined_spacy_data[45])

# print(spacy_training_data[6][1])
# print(combined_spacy_data[6])

# print(spacy_training_data[13][1])
# print(combined_spacy_data[13])

# print(spacy_training_data[1381][1])
# print(combined_spacy_data[1381])

# for rows in spacy_formatted_data:
#     print(f"{rows}")

# TESTING: make sure I get the right entities, types, etc...
# for i in range(len(spacy_data)):
#   print(spacy_data[i])
# print(type(spacy_formatted_data))
# for i in range(len(spacy_formatted_data)):
#         print(spacy_formatted_data[i])
# print(spacy_training_data[0])
# print(spacy_training_data[0][0][48:54], spacy_training_data[0][0][77:81], spacy_training_data[0][0][111:118])

In [3]:
from spacy.training.example import Example
import random

sample_txt = "Albert Einstein was born on March 14, 1879, in Ulm, in the Kingdom of Württemberg in the German Empire. He made significant contributions to the field of theoretical physics, especially in the development of the theory of relativity. Einstein received the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect. He later moved to the United States, where he continued his scientific work and became a prominent figure in academia."

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# midpoint = len(spacy_formatted_data) // 2
# dataset1 = spacy_formatted_data[:midpoint]
# dataset2 = spacy_formatted_data[midpoint:]

collected_training_data = []
for text, entity_map_item in spacy_formatted_data:
    converted_data = Example.from_dict(nlp.make_doc(text), entity_map_item)
    collected_training_data.append(converted_data)

nlp.begin_training()
for i in range(10):
    random.shuffle(collected_training_data)
    for data in collected_training_data:
        nlp.update([data], drop=0.4)

nlp.to_disk("custom_ner_model")

# Step 4: Test the Model
custom_ner_modl = spacy.load("custom_ner_model")
doc = custom_ner_modl(sample_txt)

# Render the visualization with custom colors
displacy.render(doc, style='ent')

NameError: name 'displacy' is not defined

In [9]:
import spacy
from spacy import displacy

displacy.render(doc, style='ent')



In [130]:
def convert_to_spacy(df):
    spacy_data = []
    # for index, row in df.iterrows():
    #     sentence = row['Sentence']
    #     ner_tags = row['Tag']

    #     # print(ner_tags)
    #     # convert string -> list
    #     ner_tags = ast.literal_eval(ner_tags)

    #     # print(index, sentence)

    sentence = df['Sentence'][47591]
    print(sentence)

    totalWords = sentence.split()
    print(f"Words: {totalWords}")
    print(f"Word Count: {len(totalWords)}")


    ner_tags = df['Tag'][47591]
    ner_tags = ast.literal_eval(ner_tags)
    print(len(ner_tags))

    entities = []
    start = 0
    end = 0
    for tagIndex, word in enumerate(sentence.split()):
        end = start + len(word)
        if tagIndex >= len(ner_tags):
            break
        if(ner_tags[tagIndex] != 'O'):
            print(f"({start}, {end}, {ner_tags[tagIndex]} | {tagIndex}")
            entities.append((start, end, ner_tags[tagIndex]))

        start = end + 1
        

        # entities.append((start, end, ner_tags[tagIndex])) if(ner_tags[tagIndex] != 'O')
        

    return spacy_data

# Convert the DataFrame to spaCy format
spacy_training_data = convert_to_spacy(df)

U.S. weather forecasters say Hurricane Wilma has strengthened to a powerful category 5 storm and a key low-pressure measurement indicates it is the most powerful storm of the year .
Words: ['U.S.', 'weather', 'forecasters', 'say', 'Hurricane', 'Wilma', 'has', 'strengthened', 'to', 'a', 'powerful', 'category', '5', 'storm', 'and', 'a', 'key', 'low-pressure', 'measurement', 'indicates', 'it', 'is', 'the', 'most', 'powerful', 'storm', 'of', 'the', 'year', '.']
Word Count: 30
29
(0, 4, B-geo | 0


In [8]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
from spacy.cli.download import download as spacy_download

# Download the spaCy model if not already downloaded
# try:
#     spacy.load("en_core_web_lg")
# except OSError:
#     spacy_download("en_core_web_lg")


def get_text_from_website(url):
    # Send an HTTP request to the website
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract text content
        text_content = soup.get_text()

        return text_content
    else:
        # If the request was not successful, print an error message
        print(
            f"Error: Unable to retrieve content from {url}. Status code: {response.status_code}"
        )
        return None


def transform_text_to_ner(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    with open("test.html", "w") as fd:
        fd.write(displacy.render(doc, style="ent"))

# Example usage
url = "https://www.cnet.com/home/internet/best-wifi-extender/"
website_text = get_text_from_website(url)

if website_text:
    transform_text_to_ner(website_text)

TypeError: write() argument must be str, not None

In [10]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
from spacy.cli.download import download as spacy_download

# Download the spaCy model if not already downloaded
try:
    spacy.load("en_core_web_lg")
except OSError:
    spacy_download("en_core_web_lg")


def get_text_from_website(url):
    # Send an HTTP request to the website
    response = requests.get(url)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract text content
        text_content = soup.get_text()

        return text_content
    else:
        # If the request was not successful, print an error message
        print(
            f"Error: Unable to retrieve content from {url}. Status code: {response.status_code}"
        )
        return None


def transform_text_to_ner(text):
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)
    displacy.render(doc, style="ent")
    # with open("test.html", "w") as fd:
    #     fd.write(displacy.render(doc, style="ent"))

# Example usage
url = "https://www.cnet.com/home/internet/best-wifi-extender/"
website_text = get_text_from_website(url)

if website_text:
    transform_text_to_ner(website_text)

In [12]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy

def get_text_from_website(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        text_content = soup.get_text()
        return text_content
    else:
        print(f"Error: Unable to retrieve content from {url}. Status code: {response.status_code}")
        return None

def transform_text_to_ner_and_save(text, output_file="test.html"):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Display the entities directly in the notebook
    displacy.render(doc, style="ent", jupyter=True)

# Example usage
url = "https://once.com/?ref=onepagelove"
website_text = get_text_from_website(url)

if website_text:
    transform_text_to_ner_and_save(website_text)


In [8]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import html

def get_text_from_website(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        text_content = soup.get_text()
        return text_content
    else:
        print(f"Error: Unable to retrieve content from {url}. Status code: {response.status_code}")
        return None

def transform_text_to_ner_and_save(text, output_file="test.html"):
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)

    # Highlight entities with CSS classes
    highlighted_text = ""
    for ent in doc.ents:
        highlighted_text += f"<span class='entity-{ent.label_}'>{ent.text}</span>"

    # Create HTML document with highlighted entities
    highlighted_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>NER Highlighted Text</title>
        <style>
            .entity-PERSON {{ background-color: yellow; }}
            .entity-ORG {{ background-color: lightblue; }}
            .entity-LOC {{ background-color: lightgreen; }}
        </style>
    </head>
    <body>
        {highlighted_text}
    </body>
    </html>
    """

    # Save the HTML to a file
    with open(output_file, "w") as fd:
        fd.write(highlighted_html)

    # Return the highlighted HTML
    return highlighted_html

# Example usage
url = "https://www.cnet.com/home/internet/best-wifi-extender/"
website_text = get_text_from_website(url)

if website_text:
    highlighted_html = transform_text_to_ner_and_save(website_text)

    # Replace original HTML with modified HTML
    original_html = requests.get(url).text
    modified_html = highlighted_html
    new_html = original_html.replace(original_html, modified_html)

    # Display the modified HTML
    print(new_html)


    <!DOCTYPE html>
    <html>
    <head>
        <title>NER Highlighted Text</title>
        <style>
            .entity-PERSON { background-color: yellow; }
            .entity-ORG { background-color: lightblue; }
            .entity-LOC { background-color: lightgreen; }
        </style>
    </head>
    <body>
        <span class='entity-DATE'>Monday 2023</span><span class='entity-LAW'>NameDrop</span><span class='entity-ORG'>ToysPortable Power Stations        
            Tech</span><span class='entity-ORG'>Home Home</span><span class='entity-DATE'>2023</span><span class='entity-PERSON'>Ry Crist</span><span class='entity-ORG'>Reviews - Labs Originally</span><span class='entity-GPE'>Troy</span><span class='entity-GPE'>Ohio</span><span class='entity-PERSON'>Ry Crist</span><span class='entity-ORG'>CNET</span><span class='entity-DATE'>2013</span><span class='entity-ORG'>Ry</span><span class='entity-DATE'>10 years</span><span class='entity-ORG'>CNET Home</span><span class='entity-PERSON'

In [6]:
with open("test.html", "w") as fd:
        fd.write(displacy.render(doc, style="ent"))

NameError: name 'doc' is not defined

In [9]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import tempfile
import webbrowser

def get_text_from_website(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        text_content = soup.get_text()
        return text_content
    else:
        print(f"Error: Unable to retrieve content from {url}. Status code: {response.status_code}")
        return None

def transform_text_to_ner_and_save(text, output_file="test.html"):
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(text)

    # Highlight entities with CSS classes
    highlighted_text = ""
    for ent in doc.ents:
        highlighted_text += f"<span class='entity-{ent.label_}'>{ent.text}</span>"

    # Create HTML document with highlighted entities
    highlighted_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <title>NER Highlighted Text</title>
        <style>
            .entity-PERSON {{ background-color: yellow; }}
            .entity-ORG {{ background-color: lightblue; }}
            .entity-LOC {{ background-color: lightgreen; }}
        </style>
    </head>
    <body>
        {highlighted_text}
    </body>
    </html>
    """

    # Save the HTML to a file
    with open(output_file, "w") as fd:
        fd.write(highlighted_html)

    # Return the highlighted HTML
    return highlighted_html

# Example usage
url = "https://www.cnet.com/home/internet/best-wifi-extender/"
website_text = get_text_from_website(url)

if website_text:
    highlighted_html = transform_text_to_ner_and_save(website_text)

    # Replace original HTML with modified HTML
    original_html = requests.get(url).text
    modified_html = highlighted_html
    new_html = original_html.replace(original_html, modified_html)

    # Create a temporary file
    with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f:
        f.write(new_html)
        temporary_file_path = f.name

    # Open the temporary file in a web browser
    webbrowser.open(temporary_file_path)

In [6]:
from spacy.training.example import Example
import random
import spacy
from spacy import displacy

sample_txt = "Albert Einstein was born on March 14, 1879, in Ulm, in the Kingdom of Württemberg in the German Empire. He made significant contributions to the field of theoretical physics, especially in the development of the theory of relativity. Einstein received the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect. He later moved to the United States, where he continued his scientific work and became a prominent figure in academia."

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# midpoint = len(spacy_formatted_data) // 2
# dataset1 = spacy_formatted_data[:midpoint]
# dataset2 = spacy_formatted_data[midpoint:]

collected_training_data = []
for text, entity_map_item in spacy_formatted_data:
    converted_data = Example.from_dict(nlp.make_doc(text), entity_map_item)
    collected_training_data.append(converted_data)

nlp.begin_training()
for i in range(30):
    random.shuffle(collected_training_data)
    for data in collected_training_data:
        nlp.update([data], drop=0.4)

nlp.to_disk("custom_ner_model")

# Step 4: Test the Model
custom_ner_modl = spacy.load("custom_ner_model")
doc = custom_ner_modl(sample_txt)

# Render the visualization with custom colors
displacy.render(doc, style='ent')

In [17]:
import os
print(os.getcwd())





/Users/shubham/ForwardDataLabTask1
