In [39]:
!pip install bs4




In [40]:
!pip install requests




In [41]:
from bs4 import BeautifulSoup
import requests
import pandas as pd


## I have decided to go with the BestBuy Website and scrap the data as it has lots for categories of product to choose
## About BestBuy
Best Buy is a multinational consumer electronics retailer headquartered in the United States, known for selling a wide range of electronics, appliances, and entertainment products both in physical stores and online.

## Extracting for a Single Product on the Page

In [42]:
# passing the URL
URL = "https://www.bestbuy.com/site/searchpage.jsp?st=tables&_dyncharset=UTF-8&_dynSessConf=&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys"

In [43]:
#Setting up headers
HEADERS = ({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'})

In [44]:
# Sending an HTTP GET request to the specified URL
webpage = requests.get(URL, headers=HEADERS)

In [45]:
# Return type of webpage
type(webpage.content)

bytes

In [46]:
# Using the BeautifulSoup library to parse the raw HTML content of a web page
soup = BeautifulSoup(webpage.content, "html.parser")

In [47]:
# finds and stores all HTML <h4> elements with the attribute class set to 'sku-title' from the parsed web page content
links = soup.findAll("h4",attrs={'class':'sku-title'})

In [48]:
# extracts the href attribute value of the first anchor (<a>) element found within the links[0] element
link = links[0].find('a').get('href')

In [49]:
link

'/site/true-seating-ergo-electric-height-adjustable-standing-desk-white/6444085.p?skuId=6444085'

In [50]:
product_list = "https://www.bestbuy.com/" + link

In [51]:
product_list

'https://www.bestbuy.com//site/true-seating-ergo-electric-height-adjustable-standing-desk-white/6444085.p?skuId=6444085'

In [52]:
new_webpage = requests.get(product_list, headers=HEADERS)

In [53]:
new_webpage

<Response [200]>

In [54]:
new_soup = BeautifulSoup(new_webpage.content, "html.parser")

In [55]:
#searching heading or title of the product on the web page
new_soup.find("h1", attrs={"class":'heading-5 v-fw-regular'}).text.strip()

'True Seating - Ergo Electric Height Adjustable Standing Desk - White'

In [56]:
#searching Price of the product on the web page
new_soup.find("div", attrs={"class":'priceView-hero-price priceView-customer-price'}).find('span').text

'$369.99'

In [57]:
#searching rating and review of the product on the web page
new_soup.find("div", attrs={"class":'c-ratings-reviews flex c-ratings-reviews-small align-items-center gap-50 ugc-ratings-reviews flex-wrap small-gaps text-center'}).find('p').text

'User rating, 4.2 out of 5 stars with 5 reviews.'

##For Multiple Products on a page


In [59]:
import numpy as np

In [60]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("h1", attrs={"class":'heading-5 v-fw-regular'})

        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string



In [61]:
# Function to extract Product Price
def get_price(soup):

    try:
        price = soup.find("div", attrs={"class":'priceView-hero-price priceView-customer-price'}).find('span').text

    except AttributeError:
            price = ""

    return price



In [62]:
# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("div", attrs={"class":'c-ratings-reviews flex c-ratings-reviews-small align-items-center gap-50 ugc-ratings-reviews flex-wrap small-gaps text-center'}).find('p').text

    except AttributeError:
        rating = ""

    return rating

In [63]:
if __name__ == '__main__':

    # adding the user agent to identify the client, ensure compatibility, and reduce the risk of being blocked
    HEADERS = ({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 'Accept-Language': 'en-US,en;q=0.5'})

    # The URL of bestbuy webpage
    URL = "https://www.bestbuy.com/site/searchpage.jsp?st=tables&_dyncharset=UTF-8&_dynSessConf=&id=pcat17071&type=page&sc=Global&cp=1&nrp=&sp=&qp=&list=n&af=true&iht=y&usc=All+Categories&ks=960&keys=keys"

    # sending HTTP Request to the bestbuy URL
    webpage = requests.get(URL, headers=HEADERS)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.findAll("h4",attrs={'class':'sku-title'})

    # Store the links
    links_list = []

    # Loop for extracting links from Tag Objects
    for link in links:
            links_list.append(link.find('a').get('href'))

    d = {"title":[], "price":[], "rating":[]}

    # Loop for extracting product details from each link
    for link in links_list:
        new_webpage = requests.get("https://www.bestbuy.com/" + link, headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to display all necessary product information
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))


    bestbuy_df = pd.DataFrame.from_dict(d)
    bestbuy_df['title'].replace('', np.nan, inplace=True)
    bestbuy_df = bestbuy_df.dropna(subset=['title'])
    #Writing it out as a csv
    bestbuy_df.to_csv("bestbuy_data.csv", header=True, index=False)


In [64]:
bestbuy_df


Unnamed: 0,title,price,rating
0,True Seating - Ergo Electric Height Adjustable...,$369.99,"User rating, 4.2 out of 5 stars with 5 reviews."
1,Walker Edison - Industrial Modern End / Side T...,$46.99,"User rating, 4.7 out of 5 stars with 27 reviews."
2,Sauder - Cottage Road Storage Coffee Table - B...,$270.99,"User rating, 5 out of 5 stars with 3 reviews."
3,Aluratek - Adjustable Ergonomic Laptop Cooling...,$49.99,"User rating, 4.6 out of 5 stars with 639 reviews."
4,"X Rocker - Ocelot Gaming Desk - Black, Red, Blue",$139.99,"User rating, 4.8 out of 5 stars with 17 reviews."
5,WorkSmart - Resin Table - Gray,$65.99,"User rating, 4.6 out of 5 stars with 12 reviews."
6,SD Gaming - Overlord Curved Table - Black,$158.99,"User rating, 4.8 out of 5 stars with 35 reviews."
7,Walker Edison - Huntsman Wood Dining Table - B...,$460.99,Be the first to write a review
8,Walker Edison - Round Rustic Coffee Table - Sl...,$96.99,"User rating, 4.8 out of 5 stars with 6 reviews."
9,"Walker Edison - 72"" Rectangular Solid Pine Woo...",$419.99,"User rating, 4.3 out of 5 stars with 4 reviews."
