In [1]:
import requests
from bs4 import BeautifulSoup, Comment

# 1. Retrieve the following Web Page
url_md = "https://www.visitmaryland.org/"
resp_md = requests.get(url_md)
print("1) HTTP status code for visitmaryland.org:", resp_md.status_code)

1) HTTP status code for visitmaryland.org: 403


In [2]:
# 2. Extract the Main Page Text (ignore script and style)
soup_md = BeautifulSoup(resp_md.text, "html.parser")

# Remove script and style tags
for tag in soup_md(["script", "style"]):
    tag.decompose()

# Get visible text
main_text_md = soup_md.get_text(separator="\n")
# Optionally clean up blank lines
main_text_md = "\n".join(
    line.strip() for line in main_text_md.splitlines() if line.strip()
)

print("\n2) Visible text from visitmaryland.org main page:\n")
print(main_text_md)


2) Visible text from visitmaryland.org main page:

Just a moment...
Enable JavaScript and cookies to continue


In [3]:
#3. Extract Headings from a Wikipedia Page
import requests
from bs4 import BeautifulSoup

url_wiki = "https://en.wikipedia.org/wiki/Natural_language_processing"

headers = {
    "User-Agent": "DATA622-homework-bot/1.0 (gm97457@umbc.edu)"
}

resp_wiki = requests.get(url_wiki, headers=headers)

In [4]:
print("Status:", resp_wiki.status_code)
print("Final URL:", resp_wiki.url)

if resp_wiki.status_code != 200:
    raise SystemExit("Request failed, cannot parse headings.")

Status: 200
Final URL: https://en.wikipedia.org/wiki/Natural_language_processing


In [5]:
soup_wiki = BeautifulSoup(resp_wiki.text, "html.parser")

In [6]:
h1_list = soup_wiki.find_all("h1")
h2_list = soup_wiki.find_all("h2")
h3_list = soup_wiki.find_all("h3")

print("Counts -> h1:", len(h1_list), "h2:", len(h2_list), "h3:", len(h3_list))

Counts -> h1: 1 h2: 9 h3: 12


In [7]:
print("\n3) Headings (h1, h2, h3) from NLP Wikipedia page:\n")
for level in ["h1", "h2", "h3"]:
    for h in soup_wiki.find_all(level):
        heading_text = h.get_text(strip=True)
        if heading_text:
            print(f"{level.upper()}: {heading_text}")


3) Headings (h1, h2, h3) from NLP Wikipedia page:

H1: Natural language processing
H2: Contents
H2: History
H2: Approaches: Symbolic, statistical, neural networks
H2: Common NLP tasks
H2: General tendencies and (possible) future directions
H2: See also
H2: References
H2: Further reading
H2: External links
H3: Symbolic NLP (1950s – early 1990s)
H3: Statistical NLP (1990s–present)
H3: Statistical approach
H3: Neural networks
H3: Text and speech processing
H3: Morphological analysis
H3: Syntactic analysis
H3: Lexical semantics (of individual words in context)
H3: Relational semantics (semantics of individual sentences)
H3: Discourse (semantics beyond individual sentences)
H3: Higher-level NLP applications
H3: Cognition


In [8]:
# 4. Extract Links (href values) from the Wikipedia page
import requests
from bs4 import BeautifulSoup

url_wiki = "https://en.wikipedia.org/wiki/Natural_language_processing"
headers = {
    "User-Agent": "DATA622-homework-bot/1.0 (gm97457@umbc.edu)"
}
resp_wiki = requests.get(url_wiki, headers=headers)

print("Status:", resp_wiki.status_code)
print("Final URL:", resp_wiki.url)


Status: 200
Final URL: https://en.wikipedia.org/wiki/Natural_language_processing


In [9]:
soup_wiki = BeautifulSoup(resp_wiki.text, "html.parser")

In [10]:
links = soup_wiki.find_all("a", href=True)
print("Number of links found:", len(links))

Number of links found: 1042


In [11]:
print("\n4) All link URLs (href) from NLP Wikipedia page:\n")
for a in links:
    print(a["href"])


4) All link URLs (href) from NLP Wikipedia page:

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:SpecialPages
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Natural+language+processing
/w/index.php?title=Special:UserLogin&returnto=Natural+language+processing
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Natural+language+processing
/w/index.php?title=Special:UserLogin&returnto=Natural+language+processing
#
#History
#Symbolic_NLP_(1950s_–_early_1990s)
#Statist

In [12]:
# 5. Extract the first paragraph and save to nlp_intro.txt
import requests
from bs4 import BeautifulSoup

url_wiki = "https://en.wikipedia.org/wiki/Natural_language_processing"
headers = {
    "User-Agent": "DATA622-homework-bot/1.0 (gm97457@umbc.edu)"
}
resp_wiki = requests.get(url_wiki, headers=headers)

print("Status:", resp_wiki.status_code)
print("Final URL:", resp_wiki.url)


Status: 200
Final URL: https://en.wikipedia.org/wiki/Natural_language_processing


In [13]:
soup_wiki = BeautifulSoup(resp_wiki.text, "html.parser")

ps = soup_wiki.find_all("p")
print("Number of <p> tags:", len(ps))

Number of <p> tags: 20


In [14]:
first_p = soup_wiki.find("p")
if first_p:
    first_paragraph_text = first_p.get_text(strip=True)
    with open("nlp_intro.txt", "w", encoding="utf-8") as f:
        f.write(first_paragraph_text)
    print("\n5) First paragraph saved to nlp_intro.txt")
else:
    print("\n5) No paragraph tag (<p>) found on the page.")


5) First paragraph saved to nlp_intro.txt


In [15]:
with open("nlp_intro.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(text)

Natural language processing(NLP) is the processing ofnatural languageinformation by acomputer. NLP is a subfield ofcomputer scienceand is closely associated withartificial intelligence. NLP is also related toinformation retrieval,knowledge representation,computational linguistics, andlinguisticsmore broadly.[1]
