# BeautifulSoup Concepts

In [47]:
# import BeautifulSoup (from the beautifulsoup4 package) and urlopen from Python's urllib.request module
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as u_req

In [48]:
# we're going to web-scrape the newegg graphics card page
my_url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards"

# open the connection inside a `with` block so it is always closed,
# even if reading the response raises an exception part-way through
with u_req(my_url) as web_client:
    # read the raw html (bytes) of the web page
    page_html = web_client.read()

In [49]:
# build the BeautifulSoup parse tree from the downloaded html,
# selecting Python's built-in "html.parser" as the parser
page_soup = soup(page_html, features="html.parser")

In [50]:
# look up the first h1 tag (the page header) in the parsed document
page_soup.find("h1")

<h1 class="page-title-text">Video Cards &amp; Video Devices</h1>

In [51]:
# find all div elements with the class "item-container", returned as a list-like result;
# find_all is the current method name (findAll is a deprecated BeautifulSoup 3 alias)
containers = page_soup.find_all("div", class_="item-container")

# return the number of elements found
len(containers)

12

In [52]:
# take the html of the first graphics card on the page
container = containers[0]

# locate the div with class "item-info" inside that card
item_info_container = container.find("div", class_="item-info")

# the brand name is stored in the "title" attribute of the img nested
# under the first div > a of the "item-info" div
brand_img = item_info_container.div.a.img
brand = brand_img["title"]

# locate the anchor (a tag) with class "item-title"
title_container = container.find("a", class_="item-title")

# the anchor's text is the name of the graphics card
product_name = title_container.get_text()

# print the brand and product name of the first graphics card, one per line
print(brand, product_name, sep="\n")

ASUS
ASUS ROG STRIX GeForce RTX 2080 Overclocked 8G GDDR6 HDMI DP 1.4 USB Type-C (ROG-STRIX-RTX2080-O8G-GAMING)


### Iterating Through Every Item and Writing It Into a CSV

In [58]:
# use the standard-library csv module so that commas, quotes and newlines
# inside a field are quoted correctly instead of being rewritten by hand
import csv

# path of the csv file to create (open() in "w" mode does not create the
# "scraped_outputs" directory, so it must already exist)
csv_path = "scraped_outputs/graphics_cards.csv"

# the headers (columns) of the csv file; no spaces after the commas, so the
# column names are exactly "brand", "product_name" and "shipping"
headers = ["brand", "product_name", "shipping"]

# the `with` block closes (and therefore saves) the file even if scraping one
# of the containers raises; newline="" is what the csv module expects, and an
# explicit encoding avoids depending on the platform default
with open(csv_path, "w", newline="", encoding="utf-8") as file:
    # keep "\n" as the row separator, as in plain csv format
    writer = csv.writer(file, lineterminator="\n")
    writer.writerow(headers)

    # iterate through each container
    for container in containers:
        # grab the brand name of the graphics card
        item_info_container = container.find("div", {"class": "item-info"})
        brand = item_info_container.div.a.img["title"]

        # grab the product name of the graphics card
        title_container = container.find("a", {"class": "item-title"})
        product_name = title_container.text

        # grab the shipping information of the graphics card
        shipping_container = container.find("li", {"class": "price-ship"})
        shipping = shipping_container.text.strip()

        # write the information into the csv file; fields containing commas
        # are quoted by the writer, so the scraped text is stored unmodified
        writer.writerow([brand, product_name, shipping])