In [1]:
#Import necessary modules
import requests
from bs4 import BeautifulSoup
import pandas as pd

print("All modules imported successfully!")

All modules imported successfully!


In [2]:
#Saving the URL of the website to scrape
url = "https://books.toscrape.com/"

#Sending a GET request to the website
response = requests.get(url)

#Checking if the request was successful
if response.status_code == 200:
    print("Successfully accessed the website!")
else:
    print(f"Failed to access the website. Status code: {response.status_code}")

Successfully accessed the website!


In [3]:
soup = BeautifulSoup(response.text, "html.parser")

In [4]:
soup.title

<title>
    All products | Books to Scrape - Sandbox
</title>

In [5]:
#Process to get all the books on a page
books = soup.find_all("article", class_= "product_pod")
print(type(books))
print(len(books))
books[0]

<class 'bs4.element.ResultSet'>
20


<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [6]:
#Looping through all the pages to extract all books
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

all_books = []

for page in range(1, 51):
    url = base_url.format(page)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    books = soup.find_all("article", class_="product_pod")
    all_books.extend(books)

print(f"Total number of books: {len(all_books)}")

Total number of books: 1000


In [7]:
#Extracting the info from each book
titles = []
prices = []
ratings = []
availability = []

for book in all_books:
    title = book.h3.a["title"]
    titles.append(title)

    price = book.find("p", class_="price_color").text
    prices.append(price)

    rating = book.find("p", class_="star-rating")["class"][1]
    ratings.append(rating)

    avail = book.find("p", class_="instock availability").text.strip()
    availability.append(avail)

In [8]:
#Creating a DataFrame
df = pd.DataFrame({
    "Title": titles,
    "Price": prices,
    "Rating": ratings,
    "Availability": availability
})

df.head()

Unnamed: 0,Title,Price,Rating,Availability
0,A Light in the Attic,Â£51.77,Three,In stock
1,Tipping the Velvet,Â£53.74,One,In stock
2,Soumission,Â£50.10,One,In stock
3,Sharp Objects,Â£47.82,Four,In stock
4,Sapiens: A Brief History of Humankind,Â£54.23,Five,In stock


In [9]:
df.shape

(1000, 4)

In [10]:
#Cleaning the columns
df["Price"] = df["Price"].str.split('£').str[1].astype("Float64")

rating_map = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}
df["Rating"] = df["Rating"].str.strip().replace(rating_map)
df["Rating"] = df["Rating"].astype("Int64")

df["Availability"] = df["Availability"].map({"In stock": 1, "Out of stock": 0})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         1000 non-null   object 
 1   Price         1000 non-null   Float64
 2   Rating        1000 non-null   Int64  
 3   Availability  1000 non-null   int64  
dtypes: Float64(1), Int64(1), int64(1), object(1)
memory usage: 33.3+ KB


  df["Rating"] = df["Rating"].str.strip().replace(rating_map)
