In [1]:
# Mount Google Drive at /content/drive; the project path used further down
# in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Dependencies

In [26]:
# importing libraries
import json
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Collecting HTML from the site

In [27]:
# URL of the page the data will be scraped from
base_url = "https://www.wafilife.com/cat/books/subject/"

# Send a GET request to the URL. The timeout stops the cell from hanging
# forever if the server never answers.
response = requests.get(base_url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as if it
# were the real listing.
response.raise_for_status()


## Parsing Data From HTML

In [28]:
# Parse the raw bytes of the response into a BeautifulSoup tree,
# using the 'html.parser' backend.
soup = BeautifulSoup(markup=response.content, features='html.parser')

## Saving the Soup Content as a **checkpoint**

In [29]:
def save_to_json_file(file_path, html_content=None):
    """Write the page HTML to ``file_path`` as a JSON checkpoint.

    The file holds a single object, ``{"html_content": "<html>..."}``; this is
    the key ``scrape_book_info`` looks up when it reloads the checkpoint.

    Args:
        file_path: Full path of the JSON file to create or overwrite.
        html_content: The HTML to store. Anything ``str()`` can render is
            accepted (a plain string, a BeautifulSoup object, ...). When
            omitted, the notebook-level ``soup`` variable is saved.

    Raises:
        NameError: If ``html_content`` is omitted and ``soup`` has not been
            defined yet.
    """
    if html_content is None:
        # Single-argument call: checkpoint the soup built earlier in the notebook.
        html_content = soup

    # Open for writing (the previous version opened the file read-only and
    # dumped after the handle was already closed) and serialise inside the
    # ``with`` block so the handle is still open.
    with open(file_path, "w", encoding="utf-8") as json_file:
        json.dump({"html_content": str(html_content)}, json_file)


## Retrieve the soup data from JSON

In [37]:
import json

def read_json_file(file_path):
    """Load and return the parsed contents of the JSON file at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # encoding, which differs between machines.
    with open(file_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)



## Retrieve Book Information

Declare Necessary Functions

In [33]:
def extract_book_info(category_link, timeout=30):
    """Scrape one category page and return its book data as parallel lists.

    Args:
        category_link: URL of the category page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A 4-tuple ``(page_title_list, book_names_list, authors_list,
        numeric_prices)``:

        * ``page_title_list`` - the page heading text, repeated once per book
          name found (``""`` is used when the page has no such heading).
        * ``book_names_list`` - stripped text of every product-title ``<h3>``.
        * ``authors_list`` - stripped text of every ``wd_product_categories``
          ``<div>``.
        * ``numeric_prices`` - the price strings captured by the price regex.
    """
    # Send a GET request to the category URL. Without a timeout a stalled
    # connection would block the whole scrape indefinitely.
    response = requests.get(category_link, timeout=timeout)
    # Parse the HTML content of the category page
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the page title, book names, authors, and price tags on the page
    page_title = soup.find_all("h1", class_="heading-title page-title")
    book_names = soup.find_all("h3", class_="heading-title product-title")
    authors = soup.find_all("div", class_="wd_product_categories")
    price_tags = soup.find_all('span', class_="price")

    # A page without the expected <h1> used to crash with IndexError on
    # page_title[0]; fall back to an empty title instead.
    title_text = page_title[0].text.strip() if page_title else ""
    # Repeat the title so it lines up one-to-one with the book names
    page_title_list = [title_text] * len(book_names)

    # Extract the book names and authors, stripping leading/trailing whitespace
    book_names_list = [name.text.strip() for name in book_names]
    authors_list = [author.text.strip() for author in authors]
    # Remove any <del>...</del> markup (presumably the pre-discount price)
    # so it cannot be picked up by the price regex below.
    prices_list = [re.sub(r'<del>(.*?)</del>', '', str(price)) for price in price_tags]

    numeric_prices = []
    # Extract the numeric part of the price from each price tag
    for price in prices_list:
        numeric_prices.extend(
            re.findall(r'<span class="woocommerce-Price-amount amount">([\d.-]+)৳', price)
        )

    # NOTE(review): names, authors and prices are collected independently, so
    # the lists only line up row-by-row when every book on the page has
    # exactly one author block and one matching price - confirm for new pages.
    return page_title_list, book_names_list, authors_list, numeric_prices



In [34]:
def scrape_book_info(path):
    """Build a DataFrame of books from every category linked in the checkpoint.

    Reads ``path + "/soup.json"``, takes the HTML stored under its
    ``"html_content"`` key, collects the category links found in it and calls
    ``extract_book_info`` on each one.

    Args:
        path: Directory that contains ``soup.json``.

    Returns:
        A pandas DataFrame with the columns "Book Name", "Author", "Category"
        and "Price", one row per scraped book.

    Raises:
        ValueError: If the checkpoint HTML contains no
            ``<ul class="archive-product-subcategories">`` element, or if the
            collected columns end up with different lengths.
    """
    # Read the JSON checkpoint and pull out the stored HTML
    data = read_json_file(path + "/soup.json")
    html_content = data["html_content"]

    # Create a BeautifulSoup object from the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the list(s) holding the category links
    category_links = soup.find_all('ul', class_='archive-product-subcategories')
    if not category_links:
        # Previously this surfaced as an opaque IndexError on category_links[0].
        raise ValueError(
            "No <ul class='archive-product-subcategories'> found in " + path + "/soup.json"
        )

    # Collect the href of every 'author_name' anchor inside the first list
    links = category_links[0].find_all('a', class_='author_name')
    book_href_links = [link['href'] for link in links]

    # Accumulators for the information extracted from all categories
    page_title_list = []
    book_names_list = []
    authors_list = []
    numeric_prices = []

    # Iterate over each category link and extract book information
    for category_link in book_href_links:
        page_title, book_names, authors, prices = extract_book_info(category_link)

        page_title_list.extend(page_title)
        book_names_list.extend(book_names)
        authors_list.extend(authors)
        numeric_prices.extend(prices)

    columns = {
        "Book Name": book_names_list,
        "Author": authors_list,
        "Category": page_title_list,
        "Price": numeric_prices,
    }

    # pandas rejects columns of unequal length with a generic message; report
    # the actual counts so a misaligned scrape is easy to diagnose.
    lengths = {name: len(values) for name, values in columns.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError("Scraped columns have different lengths: " + str(lengths))

    # Create and return the DataFrame
    return pd.DataFrame(columns)


Run the Scraper and Save the Results to Excel

In [38]:
# Project directory on the mounted Drive: scrape_book_info reads soup.json
# from it and the Excel export below is written into it.

path = "/content/drive/MyDrive/Complete EDA Projects/Wafi Life Book Price"

# Scrape the book information and get the DataFrame.
# NOTE(review): this requires <path>/soup.json to exist already; no cell visible
# in this notebook calls save_to_json_file to create it - confirm before a fresh run.
book_info_df = scrape_book_info(path)
# Save the DataFrame to Excel (index=False leaves the row index out of the sheet)
book_info_df.to_excel(path + "/wafilife_book_data.xlsx", index=False)

In [40]:
# Display the scraped table; the rendered output below elides the middle rows.
book_info_df

Unnamed: 0,Book Name,Author,Category,Price
0,ইসলামি জীবনদর্শনে আখলাক ও রুহানিয়াত,ডাঃ ইসরার আহমাদ রাহিমাহুল্লাহ,"আদব, আখলাক",150
1,ওহে সুন্নাহর অনুসারীগণ!,ড. আবু বকর মুহাম্মাদ যাকারিয়া,"আদব, আখলাক",100
2,রাহমানের বান্দাদের গুণাবলী,আ. ন. ম. রশীদ আহমাদ,"আদব, আখলাক",72
3,মুসলিম শিষ্টাচার,মুফতি মনিরুজ্জামান,"আদব, আখলাক",490
4,দুনিয়ায় অবাধ্য সন্তানের পরিণতি,মুরাদুল ইসলাম লক্ষ্মীপুরী,"আদব, আখলাক",126
...,...,...,...,...
2152,হিফয করতে হলে,শাইখ আব্দুল কাইয়্যূম আস-সুহাইবানী,হিফয বিষয়ক বই,109
2153,সহজ হাফেজী কুরআন (আর্ট পেপার),,হিফয বিষয়ক বই,550
2154,হিফযুল কুরআন ছাত্রসহায়িকা,,হিফয বিষয়ক বই,87
2155,হিফযুল কুরআন শিক্ষক সহায়িকা,শায়েখ হাফেজ ক্বারী আব্দুল হক,হিফয বিষয়ক বই,189
