# Scrape and parse text from any website and show the below:¶
1. Extract Text From HTML With String Methods

In [5]:
#import library
from urllib.request import urlopen

#getting data
url = "http://books.toscrape.com/"
page = urlopen(url)

#extract the HTML from the page
html_bytes = page.read()
html = html_bytes.decode("utf-8")

#getting required data
title_index = html.find("<title>")
start_index = title_index + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
print(title)


    All products | Books to Scrape - Sandbox



In [6]:
title_index = html.find("<title>")
title_index

361

In [8]:
title = html[start_index:end_index]
title

'\n    All products | Books to Scrape - Sandbox\n'

# Extract Text From HTML With Regular Expressions

In [15]:
import re
from urllib.request import urlopen

url = "http://books.toscrape.com/"
page = urlopen(url)
html = page.read().decode("utf-8")

pattern = "<div.*?>.*?</div.*?>"
match_results = re.search(pattern, html)
title = match_results.group()
title = re.sub("<.*?>", "", title) # Remove HTML tags

print(title)



# Use an HTML Parser for Web Scraping in Python

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = "http://books.toscrape.com/"
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text())
img = soup.find_all("img")
print(img)

# Save the scrapped text to a text file

In [62]:
import requests
from bs4 import BeautifulSoup

def save_text_file(book_data, file_name):
    # open the specified file for writing
    with open(file_name, 'w',encoding="utf-8") as file:
        # loop over each book data and write it to the file
        for book in book_data:
            file.write('Name: ' + book['name'] + '\n')
            file.write('Price: ' + book['price'] + '\n')
            file.write('Description: ' + book['description'] + '\n')
            file.write('Rating: ' + book['rating'] + '\n')
            file.write('-------------------\n')

    print('Data saved to ' + file_name)

# send a GET request to the website
response = requests.get('http://books.toscrape.com/')

# parse the HTML content of the page using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# find all the articles that contain book information
articles = soup.find_all('article', class_='product_pod')

# initialize an empty list to store the book data
book_data = []

# loop over each article and extract the book information
for article in articles:
    # extract the product name
    name = article.h3.a.attrs['title']

    # extract the product price
    price_elem = article.select('.price_color')
    price = price_elem[0].get_text() if price_elem else 'N/A'

    # extract the product description
    response_desc = requests.get('http://books.toscrape.com/' + article.h3.a.attrs['href'])
    soup_desc = BeautifulSoup(response_desc.content, 'html.parser')
    desc_elem = soup_desc.select_one('#product_description + p')
    description = desc_elem.get_text() if desc_elem else 'N/A'

    # extract the product rating
    rating_elem = article.select('p')
    rating = rating_elem[0].attrs['class'][1] if rating_elem else 'N/A'

    # create a dictionary to store the book information
    book = {
        'name': name,
        'price': price,
        'description': description,
        'rating': rating
    }

    # add the book dictionary to the book data list
    book_data.append(book)

    # print the book information
    print('Name:', name)
    print('Price:', price)
    print('Description:', description)
    print('Rating:', rating)
    print('-------------------')

# save the book data to a text file
save_text_file(book_data, 'book_data.txt')


Name: A Light in the Attic
Price: £51.77
Description: It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone

Name: The Dirty Little Secrets of Getting Your Dream Job
Price: £33.34
Description: Drawing on his extensive experience evaluating applicants for his marketing agency, and featuring stories based on real-life situations, sample cover letters, resumes, and straightforward advice, Don Raskin’s The Dirty Little Secrets of Getting Your Dream Job offers all the necessary tools for navigating the tough job market and securing your dream job.Don Raskin owns and Drawing on his extensive experience evaluating applicants for his marketing agency, and featuring stories based on real-life situations, sample cover letters, resumes, and straightforward advice, Don Raskin’s The Dirty Little Secrets of Getting Your Dream Job offers all the necessary tools for navigating the tough job market and securing your dream job.Don Raskin owns and operates MME, an advertising and marketing agency in New York City. During his twenty-five years at the agency he has interviewed hundreds of new college graduates fo

Name: Starving Hearts (Triangular Trade Trilogy, #1)
Price: £13.99
Description: Since her assault, Miss Annette Chetwynd has been plagued by nightmares and worries about an arranged marriage. But she yearns to find her anonymous rescuer. Unfortunately, her health and intellect prevent it. Both repel suitors and cause Annette to doubt God's existence, at least until He answers her prayers in an unusual way ... Mr. Peter Adsley is joining the clergy, an Since her assault, Miss Annette Chetwynd has been plagued by nightmares and worries about an arranged marriage. But she yearns to find her anonymous rescuer. Unfortunately, her health and intellect prevent it. Both repel suitors and cause Annette to doubt God's existence, at least until He answers her prayers in an unusual way ... Mr. Peter Adsley is joining the clergy, and he desires a godly wife by his side. After his failed attempt to obtain one, he engages in a clandestine meeting with the bewitching young woman who keeps crossing his

Name: Libertarianism for Beginners
Price: £51.33
Description: Libertarianism isn't about winning elections; it is first and foremost a political philosophy--a description of how, in the opinion of libertarians, free people ought to treat one another, at least when they use the law, which they regard as potentially dangerous. If libertarians are correct, the law should intrude into people's lives as little as possible, rarely telling Libertarianism isn't about winning elections; it is first and foremost a political philosophy--a description of how, in the opinion of libertarians, free people ought to treat one another, at least when they use the law, which they regard as potentially dangerous. If libertarians are correct, the law should intrude into people's lives as little as possible, rarely telling them what to do or how to live.A political and economic philosophy as old as John Locke and John Stuart Mill, but as alive and timely as Rand Paul, the Tea Party, and the novels of Ayn Ran

In [66]:
import csv

def save_csv_file(book_data, file_name):
    # open the specified file for writing
    with open(file_name, 'w', newline='', encoding='utf-8') as file:
        # create a CSV writer object
        writer = csv.writer(file)

        # write the header row
        writer.writerow(['Name', 'Price', 'Description', 'Rating'])

        # loop over each book data and write it to the file
        for book in book_data:
            writer.writerow([book['name'], book['price'], book['description'], book['rating']])

    print('Data saved to ' + file_name)

# save the book data to a CSV file
save_csv_file(book_data, 'book_data.csv')


Data saved to book_data.csv
