In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from bs4 import BeautifulSoup
import requests
import re

# Load the small English pipeline that ships as the en_core_web_sm package
nlp = en_core_web_sm.load()

# Sample passage; its line breaks and leading indentation are part of the text
sample_text = '''Hello! My name is Aurin and I live in Dhaka. 
          I study "Robotics and Mechatronics 
          Engineering" at University of Dhaka
          which was established
          on 15th May, 2015. Currently, I work as a Teacher and earn about $8000/month'''

# Run the passage through the pipeline
doc = nlp(sample_text)

In [2]:
# Show each token with its IOB tag (Inside, Outside, Beginning) and its entity type
token_tags = [(token, token.ent_iob_, token.ent_type_) for token in doc]
print(token_tags)



[(Hello, 'O', ''), (!, 'O', ''), (My, 'O', ''), (name, 'O', ''), (is, 'O', ''), (Aurin, 'B', 'PERSON'), (and, 'O', ''), (I, 'O', ''), (live, 'O', ''), (in, 'O', ''), (Dhaka, 'B', 'GPE'), (., 'O', ''), (
          , 'O', ''), (I, 'O', ''), (study, 'O', ''), (", 'O', ''), (Robotics, 'B', 'WORK_OF_ART'), (and, 'I', 'WORK_OF_ART'), (Mechatronics, 'I', 'WORK_OF_ART'), (
          , 'I', 'WORK_OF_ART'), (Engineering, 'I', 'WORK_OF_ART'), (", 'O', ''), (at, 'O', ''), (University, 'B', 'ORG'), (of, 'I', 'ORG'), (Dhaka, 'I', 'ORG'), (
          , 'O', ''), (which, 'O', ''), (was, 'O', ''), (established, 'O', ''), (
          , 'O', ''), (on, 'O', ''), (15th, 'B', 'DATE'), (May, 'I', 'DATE'), (,, 'I', 'DATE'), (2015, 'I', 'DATE'), (., 'O', ''), (Currently, 'O', ''), (,, 'O', ''), (I, 'O', ''), (work, 'O', ''), (as, 'O', ''), (a, 'O', ''), (Teacher, 'O', ''), (and, 'O', ''), (earn, 'O', ''), (about, 'B', 'MONEY'), ($, 'I', 'MONEY'), (8000, 'I', 'MONEY'), (/, 'I', 'MONEY'), (month, 'I', 'MONEY')]


In [3]:
# Visualize named entities in the document.
# `doc` is already a processed Doc, so render it directly rather than
# re-running the whole pipeline on its text.
displacy.render(doc, jupyter=True, style='ent')



In [4]:
# Visualize syntactic dependencies in the document.
# `doc` is already a processed Doc, so render it directly rather than
# re-running the whole pipeline on its text.
# 'distance' sets the spacing between words in the rendered dependency diagram.
displacy.render(doc, jupyter=True, style='dep', options={'distance': 120})

# The way a sentence is phrased can have a huge impact on the entities that are recognised

# Two phrasings of the same travel plan, used to compare NER results
sentence1 = (
    "Alice Walker has booked a flight for December 10th, 2023, "
    "to travel from Dhaka to Cox's Bazar and experience the stunning coastal beauty."
)
sentence2 = (
    "In her travel itinerary, Alice Walker is set to journey from Dhaka to Cox's Bazar "
    "on December 10th, 2023, immersing herself in the scenic wonders of the coastline."
)

# Run each sentence through the NLP pipeline
doc1, doc2 = nlp(sentence1), nlp(sentence2)



In [5]:
# Highlight the entities recognised in the first example sentence
displacy.render(doc1, jupyter=True, style="ent")


In [6]:
# Highlight the entities recognised in the second example sentence
displacy.render(doc2, jupyter=True, style="ent")



In [7]:
# Fetch a web page and reduce it to a single string of text
def url_to_string(url, timeout=10):
    """Download ``url`` and return the page text as one string.

    ``<script>``, ``<style>`` and ``<aside>`` elements are removed before the
    text is extracted, and every run of newlines/tabs is replaced by a single
    space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The extracted page text as a ``str``.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is not silently treated as the article.
        requests.exceptions.RequestException: On connection problems or timeout.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable page text
    for tag in soup(["script", 'style', "aside"]):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Download the article text and run it through the spaCy pipeline
ARTICLE_URL = 'https://dailycampus.com/2024/01/11/mens-basketball-huskies-escape-cintas-with-close-80-75-win/'
convert = url_to_string(ARTICLE_URL)
article = nlp(convert)

# Length of the processed document
len(article)



1588

In [8]:
# Display the processed document (the bare expression on the last line is what the cell shows)
article

    Men’s Basketball: Huskies escape Cintas with close 80-75 win | The Daily Campus Spotify   Sign in News News Home Life Life Home Campus Events Movie & TV Reviews Music Reviews Opinion Opinion Home Letters to the Editor + Op-Eds Culture Shock Sports Sports Home Sports Staff Photo Photo Home Video Comics About About Us Our Staff Board of Directors Executive History Print Editions Special Editions Join Us Tip Line   Sign in Welcome!Log into your account your username your password Forgot your password? Password recovery Recover your password your email Search        Trending Now Men’s Basketball: No. 4 Huskies rout Hoyas 80-67 in scrappy Big East showdown  Resignation of Claudine Gay fuels firestorm over plagiarism, politics on campus  An Analysis of UConn’s 10-Year Plan. Part 1: The endowment   Search   29 F Storrs   The Daily CampusThe Daily Campus  Tuesday, January 16, 2024  News News Home Life Life Home Campus Events Movie & TV Reviews Music Reviews Opinion Opinion Home Letters to 

In [9]:
# Tally how many entities of each label were found in the article
labels = [entity.label_ for entity in article.ents]
label_counts = Counter(labels)
label_counts

Counter({'CARDINAL': 51,
         'ORG': 30,
         'GPE': 29,
         'PERSON': 19,
         'DATE': 17,
         'LOC': 8,
         'TIME': 8,
         'NORP': 4,
         'ORDINAL': 3,
         'WORK_OF_ART': 2,
         'EVENT': 2,
         'PERCENT': 1,
         'FAC': 1,
         'PRODUCT': 1})

In [10]:
# Find the three entity strings that occur most often, with their counts
items = [entity.text for entity in article.ents]
item_counts = Counter(items)
item_counts.most_common(3)



[('UConn', 12), ('Huskies', 6), ('80-75', 5)]

In [11]:
# Split the article into sentences and print the one at index 30 (zero-based, i.e. the 31st)
sentences = list(article.sents)
print(sentences[30])

View this post on Instagram            A post shared by UConn Men's Basketball (@uconnmbb)  


In [12]:
# Parse the sentence at index 30 on its own, once, and reuse the result below
sentence_doc = nlp(str(sentences[30]))

# Visualize syntactic dependencies in that sentence
displacy.render(sentence_doc, jupyter=True, style='dep', options={'distance': 70})

# Token, part of speech, lemma and entity type for every token that is
# neither a stop word nor punctuation.
# NOTE(review): this is not the last expression of the cell, so Jupyter does not
# display its value; move it to its own cell to see the result.
[(token, token.pos_, token.lemma_, token.ent_type_)
 for token in sentence_doc
 if not token.is_stop and token.pos_ != 'PUNCT']

# Map each entity found in the sentence at index 25 (parsed on its own) to its label
dict([(str(ent), ent.label_) for ent in nlp(str(sentences[25])).ents])

{'first-half': 'DATE',
 'Karaban': 'ORG',
 'the second half': 'DATE',
 'Tristen Newton': 'ORG'}