Example 1: Basic HTML Parsing

In [2]:
from bs4 import BeautifulSoup

# Sample HTML content
html_content = "<html><head><title>My Web Page</title></head><body><p>Hello, World!</p></body></html>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Print the title tag
print("Title:", soup.title)
print("Title Text:", soup.title.text)

print()

# Print the paragraph text
print("Paragraph:", soup.p)
print("Paragraph Text:", soup.p.text)


Title: <title>My Web Page</title>
Title Text: My Web Page

Paragraph: <p>Hello, World!</p>
Paragraph Text: Hello, World!


Example 2: Finding Elements by Tag

In [3]:
from bs4 import BeautifulSoup

# Sample HTML content
html_content = "<ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Find all list items
items = soup.find_all("li")

print(items)

# Print each item
for item in items:
    print(item.text)


[<li>Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
Item 1
Item 2
Item 3


Example 3: Finding Elements by Class

In [15]:
from bs4 import BeautifulSoup

# Sample HTML content
html_content = "<div class='container'><p class='text'>Hello, World!</p><p class='text'>Hii</p></div>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Find the paragraph with class 'text'
paragraph = soup.find_all("p", class_="text")

# Print the paragraph text
para_text = [para.text for para in paragraph]
print(para_text)


['Hello, World!', 'Hii']


Example 4: Finding Elements by ID

In [4]:
from bs4 import BeautifulSoup

# Sample HTML content
html_content = "<div id='content'><p>Hello, World!</p></div>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Find the div with id 'content'
div = soup.find("div", id="content")

# Print the paragraph text
print(div.p.text)


Hello, World!


Example 5: Navigating the HTML Tree

In [18]:
from bs4 import BeautifulSoup

# Sample HTML content
html_content = "<div><p>Paragraph 1</p><p>Paragraph 2</p></div>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Navigate the HTML tree
div = soup.div
for p in div.find_all("p"):
    print(p.text)


Paragraph 1
Paragraph 2


Example 6: Extracting Attributes

In [20]:
from bs4 import BeautifulSoup

# Sample HTML content
html_content = "<a href='https://example.com'>Visit Example</a>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Extract the 'href' attribute
link = soup.a
print("Link Text:", link.text)
print("Link URL:", link["href"])


Link Text: Visit Example
Link URL: https://example.com




```
Example 7: Web Scraping a Real Website
```



In [29]:
import requests
from bs4 import BeautifulSoup

# Make an HTTP GET request
url = "https://example.com"
response = requests.get(url)

# Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Extract and print the page title
title = soup.title
print("Page Title:", title.text)


Page Title: Example Domain


Example 8: Scraping Tables


In [42]:
from bs4 import BeautifulSoup

# Sample HTML content with a table
html_content = "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>25</td></tr><tr><td>Bob</td><td>30</td></tr></table>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Find the table
table = soup.find("table")

# Extract and print table data
for row in table.find_all("tr")[1:]:
    columns = row.find_all("td")
    name = columns[0].text
    age = columns[1].text
    print(f"Name: {name}, Age: {age}")


Name: Alice, Age: 25
Name: Bob, Age: 30


Example 9: Scraping Images

In [9]:
from bs4 import BeautifulSoup

# Sample HTML content with an image
html_content = "<img src='https://example.com/image.jpg' alt='Sample Image'>"

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Find the image
image = soup.img

# Extract and print the image source and alt text
src = image["src"]
alt = image["alt"]
print("Image Source:", src)
print("Image Alt Text:", alt)


Image Source: https://example.com/image.jpg
Image Alt Text: Sample Image


Example 10: Web Scraping Pagination


In [43]:
import requests
from bs4 import BeautifulSoup
import json
import re

def get_blog_url(soup):
    div_ = soup.find_all('div', attrs={'class': "FL PR20"})
    url_list = []
    for title in div_:
        href = (title.find('a')['href'])
        url_list.append("https://www.moneycontrol.com/" + href)
    return url_list

def get_blog_content(url):
    request = requests.get(url)
    soup = BeautifulSoup(request.text, 'html.parser')
    all_script = soup.find_all('script', attrs={'type':'application/ld+json'})
    raw_article_str = all_script[2].get_text().replace("\r\n", " ")
    parts = re.split(r"""("[^"|'[^']*')""", raw_article_str)
    parts[::2] = map(lambda s: "".join(s.split()), parts[::2])
    article_str = "".join(parts)
    article_str = article_str[1:]
    article_str = article_str[:-1]
    article_dict = json.loads(article_str)
    all_tags = soup.findAll('div', attrs={'class':'tags_first_line'})
    lst_all_tags = []
    for i in all_tags:
        lst_all_tags.append(i.get_text())
    tags = lst_all_tags[0].replace("Tags: ", "")
    tags = tags.replace("\n", "")
    tags = tags.split("#")
    tags = tags[1:]
    tags = ", ".join([str(elem).strip() for elem in tags])
    article_dict['tags'] = tags
    return article_dict

sc_id=["RI", "AE01", "API", "BAF", "HCL02"]
# Web scraping multiple pages with pagination
for page in sc_id:  # Assuming there are 3 pages
    url = f"https://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    url_list = get_blog_url(soup)
    print(url_list)
    
    # print("---------------------")
    # print(f"For Company: {page}")
    # print("---------------------")
    # for url in url_list:
    #     article_dict = get_blog_content(url)
    #     print(page)
    #     print(article_dict['datePublished'])
    #     print(article_dict['author'])
    #     print(article_dict['headline'])

['https://www.moneycontrol.com//news/buzzing-stocks/staymegacaps-as-2024-ushersuncertainties-kotak-note_17332181.html', 'https://www.moneycontrol.com//news/buzzing-stocks/reliance-shares-open-2-higherrobust-q2-earningsÂ\xa0_17288081.html', 'https://www.moneycontrol.com//news/result-analysis/reliance-q1-results-newbies-coverweaknessthe-legacy-business_17125141.html', 'https://www.moneycontrol.com//news/results/reliance-consolidated-june-2023-net-sales-at-rs-20755900-crore-down-536-y-o-y_17124801.html', 'https://www.moneycontrol.com//news/results/relianceq1-a-look-at-profits-capex-booming-businesseschallenges_17124391.html', 'https://www.moneycontrol.com//news/results/reliance-standalone-june-2023-net-sales-at-rs-11713600-crore-down-206-y-o-y_17124311.html', 'https://www.moneycontrol.com//news/results/reliancetrack-to-create-jio-financial-services-ambani-sees-boost-to-inclusion_17124041.html', 'https://www.moneycontrol.com//news/results/reliance-jio-q1-net-profit-rises-12-yoy-to-rs-4863-