In [1]:
! pip install requests beautifulsoup4



In [2]:
pip show beautifulsoup4

Name: beautifulsoup4
Version: 4.13.5
Summary: Screen-scraping library
Home-page: https://www.crummy.com/software/BeautifulSoup/bs4/
Author: 
Author-email: Leonard Richardson <leonardr@segfault.org>
License: MIT License
Location: /usr/local/lib/python3.12/dist-packages
Requires: soupsieve, typing-extensions
Required-by: gdown, google, libpysal, nbconvert, yfinance


In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Set the URL of the Wikipedia page you want to scrape
url = "https://en.wikipedia.org/wiki/Main_Page"

In [5]:
# Set user-agent to mimic a browser visit
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
}

In [6]:
# Send a GET request to the Wikipedia homepage
response = requests.get(url, headers=headers)

In [7]:
# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all featured articles (specifically the "In the news" or "Did you know" sections)
    in_the_news = soup.find('div', id='mp-upper')  # Main upper section
    if in_the_news:
        # Print titles for "In the news"
        print("In the News:\n")
        news_items = in_the_news.find_all('li')  # List items
        for item in news_items:
            print("-", item.get_text(strip=True))

    # Find the "Did you know" section
    did_you_know = soup.find('div', id='mp-dyk')  # Section for featured articles
    if did_you_know:
        print("\nDid You Know:\n")
        dyk_items = did_you_know.find('ul').find_all('li')  # List items
        for item in dyk_items:
            print("-", item.get_text(strip=True))

else:
    print("Error fetching the webpage:", response.status_code)

In the News:

- Indigenous people of the Everglades region
- Canoe River train crash
- Nuremberg trials
- Archive
- By email
- More featured articles
- About
- ... that theSj√∂lejonetclass(example pictured), Sweden's first indigenous submarines, featured rotating torpedo tubes and disappearing guns?
- ... thatDavid Avraham Voluck, a Chabad Jew and native tribal judge, credits Alaska natives with inspiring him to become more observant in his own faith?
- ... that the music video forBini's "First Luv" was inspired byrococoart?
- ... thatTanguturi Prakasam, a leader of the Indian independence movement, later became a significant dissenting figure within the Indian government?
- ... that the inscription on abracteatein theVindelev Hoardhas been interpreted as the oldest known reference to the Norse godOdin?
- ... that theMiller Housein Indiana opened to the public in 2011, drawing high visitor numbers that were likened to the opening of a new Disney ride?
- ... thatRosa Dubovskyadmonished 

# Scraping Links from the Website.

In [8]:
# Extract and print all hyperlinks
links = soup.find_all('a')            # Find all <a> tags

for link in links:
    href = link.get('href')           # Get the href attribute
    text = link.get_text(strip=True)  # Get the link text
    print(f"Link Text: {text}, URL: {href}")

Link Text: Jump to content, URL: #bodyContent
Link Text: Main page, URL: /wiki/Main_Page
Link Text: Contents, URL: /wiki/Wikipedia:Contents
Link Text: Current events, URL: /wiki/Portal:Current_events
Link Text: Random article, URL: /wiki/Special:Random
Link Text: About Wikipedia, URL: /wiki/Wikipedia:About
Link Text: Contact us, URL: //en.wikipedia.org/wiki/Wikipedia:Contact_us
Link Text: Help, URL: /wiki/Help:Contents
Link Text: Learn to edit, URL: /wiki/Help:Introduction
Link Text: Community portal, URL: /wiki/Wikipedia:Community_portal
Link Text: Recent changes, URL: /wiki/Special:RecentChanges
Link Text: Upload file, URL: /wiki/Wikipedia:File_upload_wizard
Link Text: Special pages, URL: /wiki/Special:SpecialPages
Link Text: , URL: /wiki/Main_Page
Link Text: Search, URL: /wiki/Special:Search
Link Text: Donate, URL: https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
Link Text: Create account, URL: /w/index.php?title=Special:Cre

In [None]:
# find the table and extract its rows
table = soup.find('table', {'id':'customers'})
rows = table.find_all('tr')

for row in rows:
  columns = row.find_all('td')   # get all columns in the rows

  # print the text in each column
  for column in columns:
    print(column.get_text(Strip = True), end='|')
  print()                        # new line after each row

# Scraping Number of Pages

In [10]:
for page in range(1, 6):  # Scrape the first 5 pages
    url = f"{url}{page}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    quotes = soup.find_all('div', class_='quote')
    for quote in quotes:
        text = quote.find('span', class_='text').get_text(strip=True)
        author = quote.find('small', class_='author').get_text(strip=True)
        print(f'Quote: {text}\nAuthor: {author}\n')

# Using Images

In [11]:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Example placeholder, update with an actual image source
images = soup.find_all('img')
for img in images:
    image_url = img['src']
    print("Image URL:", image_url)

# Handling exceptions

In [12]:
try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # Raises an error for bad status codes
except requests.exceptions.RequestException as e:
    print("Error:", e)

Error: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Main_Page1/2/3/4/5/
