# Web Scraping Tutorial 

> In this tutorial, we will scrape product information from the e-commerce website newegg.com.

# Download Anaconda

### Import the necessary libraries

> The libraries must be installed before they can be imported for the first time. The `requests` package imported below is installed the same way (`pip install requests`).

<pre><code>$ pip install beautifulsoup4

</code></pre>

> Link to beautiful soup documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/


In [34]:
from bs4 import BeautifulSoup
import requests

"""
Python basics
Variables
Print statements
Data Types String, num
loops


import packages for ready code
"""

'\nPython basics\nVariables\nPrint statements\nData Types String, num\nloops\n\n\nimport packages for ready code\n'

### Decide on a webpage for scraping

In [28]:
# Page to scrape: a Newegg product listing for the search term "mobile".
# Alternative listing URLs, kept for reference:
#   https://www.newegg.com/Product/ProductList.aspx?Submit=StoreIM&Depa=1&Category=38
#   https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20cards
#   https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=mobile&N=-1&isNodeId=1
#   https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=mobile&N=-1&isNodeId=22
url = "https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=mobile&ignorear=0&N=-1&isNodeId=1"

# Dictionary carrying the User-Agent string for the request.
# A fuller browser-style User-Agent that can be used instead:
#   'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36'
headers = {
    'User-Agent': 'Chrome/54.0.2840.71',
}

> A User-Agent header is sent with the request because many servers can identify and block requests that come from a script. Remember to be considerate of the server.

### Make a request to the webpage using its URL and convert response to a Beautiful Soup object

In [35]:
# Fetch the page. `headers` has to be passed by keyword: the second positional
# parameter of requests.get() is `params`, so a dict passed positionally is
# encoded into the URL's query string instead of being sent as HTTP headers.
response = requests.get(url, headers=headers)

# print(response.content)

# Parse the raw response body into a navigable BeautifulSoup tree.
# NOTE(review): "html" lets Beautiful Soup pick whichever installed HTML
# parser it prefers; name one explicitly (e.g. "html.parser") if the result
# must be identical across machines.
soup = BeautifulSoup(response.content, "html")
# print(soup.prettify())

> Before going ahead, let's see what the response and the soup look like.

### Try Out basic commands, refer to the documentation for more

In [36]:
# Basic traversal of the parsed page

# The <title> element: the whole tag, then its tag name, then its text.
title_tag = soup.title
print("Title Tag:", title_tag)
print("Tag Name:", title_tag.name)
print("Title String:", title_tag.string)

# Navigating upwards from a tag:
# print(soup.title.parent.name)

# Attribute-style access returns the first tag with that name.
print(soup.h1)

# First <p> tag only:
# print(soup.p)

# Every <p> tag:
# print(soup.find_all('p'))

# Looping over the matches to print the text of each paragraph:
# for paragraph in soup.find_all('p'):
#     print(str(paragraph.text))

# Looping over every link to print its target address:
# for url in soup.find_all('a'):
#     print(url.get('href'))

Title Tag: <title>mobile - Newegg.com</title>
Tag Name: title
Title String: mobile - Newegg.com
<h1 class="page-title-text">"mobile"</h1>


### Get hold of a product

> We'll first extract information from a single product, then run a loop to automate it for all products.

In [37]:
# Take the first product card on the page, then the title link inside it.
var = soup.find("div", class_="item-container")
# print(var)
tag = var.find("a", class_="item-title")
print(tag.get_text())

Moto G5s Plus (Special Edition) Unlocked Smartphone Dual Camera (5.5" Lunar Gray, 32GB Storage 3GB RAM) US Warranty


In [38]:
# Collect every product card on the page. find_all() is the current name of
# this method; findAll() is a deprecated alias kept only for old code.
containers = soup.find_all("div", {"class": "item-container"})

In [39]:
# Report how many product cards were collected.
total_records = len(containers)
print("Total Records:", total_records)

# Tip: print(containers[0]) and paste the HTML into a text editor
# (e.g. Sublime Text) to analyze the structure of a single product card.

Total Records: 36


### Find out what aspects of the product we are interested in!

In [40]:
# """
# What all information do we need?
# Information that repeats everywhere
# Brand of the product
# Names
# Price?
# Shipping
# """

In [41]:
# Work with the first product card while exploring.
container = containers[0]

# Brand of the product.
# Useful intermediate steps to print while drilling down into the card:
#   print(container)
#   print(container.a)
#   print(container.div)
#   print(container.div.a)
#   print(container.div.div.a)
brand_image = container.div.div.a.img

# Tag attributes are read with dictionary-style indexing; the image's
# "title" attribute holds the brand name.
print(brand_image["title"])

Motorola


In [42]:
# Title of the product: all <a class="item-title"> links inside the card.
# find_all() replaces the deprecated findAll() alias.
title_container = container.find_all("a", {"class": "item-title"})

# print(title_container)
# The first match carries the product name as its text.
print(title_container[0].text)

Moto G5s Plus (Special Edition) Unlocked Smartphone Dual Camera (5.5" Lunar Gray, 32GB Storage 3GB RAM) US Warranty


In [43]:
# Shipping price: all <li class="price-ship"> items inside the card.
# find_all() replaces the deprecated findAll() alias.
shipping_container = container.find_all("li", {"class": "price-ship"})

# print(shipping_container[0].text)
# print(shipping_container[0].text.strip())
# strip() removes the whitespace surrounding the text of the first match.
shipping = shipping_container[0].text.strip()
print(shipping)

Free Shipping


In [44]:
# Price.
# Inside the card's first <div>, take the first "item-action" block and, within
# it, the first <li class="price-current">. The lookup is done once and reused
# for both parts instead of repeating the whole chain for each of them.
price_current = (
    container.div
    .find_all("div", {"class": "item-action"})[0]
    .find_all("li", {"class": "price-current"})[0]
)

# <strong> holds the part printed below; <sup> holds the remainder
# (presumably dollars and cents, going by the _d / _c names - confirm
# against the page markup).
price_container_d = price_current.strong.text
price_container_c = price_current.sup.text

print(price_container_d)

274


### Let's put all this into a loop

In [45]:
# Print brand, product name and shipping text for every product card.
for container in containers:
    # Brand: the "title" attribute of the image nested in the card's first link.
    brand = container.div.div.a.img["title"]

    # Product name: text of the first <a class="item-title"> in the card.
    # find_all() replaces the deprecated findAll() alias.
    title_container = container.find_all("a", {"class": "item-title"})
    product_name = title_container[0].text

    # Shipping: stripped text of the first <li class="price-ship"> in the card.
    shipping_container = container.find_all("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()

    print("Brand:" + brand)
    print("Product Name:" + product_name)
    print("Shipping:" + shipping)

    # Optional price output, left disabled. It is guarded because the lookup
    # raises when a card has no matching price elements.
    # try:
    #     price_current = container.div.find_all("div", {"class":"item-action"})[0].find_all("li", {"class":"price-current"})[0]
    #     print("Price:" + price_current.strong.text + price_current.sup.text)
    # except (AttributeError, IndexError):
    #     print("Price: None")

    # Blank line between products.
    print()
    

Brand:Motorola
Product Name:Moto G5s Plus (Special Edition) Unlocked Smartphone Dual Camera (5.5" Lunar Gray, 32GB Storage 3GB RAM) US Warranty
Shipping:Free Shipping

Brand:Motorola
Product Name:Motorola Moto G6 4G LTE Unlocked Cell Phone US Version (5.7" Black, 32GB 3GB RAM)
Shipping:Free Shipping

Brand:Motorola
Product Name:Motorola Nexus 6 XT1100 64GB Unlocked GSM 4G LTE Phone w/ 13MP Camera - Midnight Blue
Shipping:Free Shipping

Brand:Honor
Product Name:Honor View 10 4G LTE Unlocked Cell Phone (5.99" Blue ,128GB 6GB RAM)
Shipping:Free Shipping

Brand:Honor
Product Name:Honor 7X Unlocked Smartphone with Dual Camera (5.93" Black, 32GB Storage 3GB RAM) US Warranty
Shipping:Free Shipping

Brand:SAMSUNG
Product Name:Samsung Galaxy S9 G9600 64GB Single SIM Unlocked GSM 4G LTE Phone w/ 12 MP Camera - Midnight Black (International Version)
Shipping:Free Shipping

Brand:Honor
Product Name:Honor 7X Unlocked Smartphone with Dual Camera (5.93" Blue, 32GB Storage 3GB RAM) US Warranty
Shippin

### Exporting to a CSV file

In [46]:
# Export brand, product name and shipping text of every product to a CSV file.
import csv

filename = "all_products.csv"

# `with` closes the file even if one of the lookups below raises.
# newline="" is what the csv module asks for so it can write its own line
# endings; utf-8 keeps non-ASCII product names from failing on write.
with open(filename, "w", newline="", encoding="utf-8") as f:
    # csv.writer quotes any field containing a comma, a double quote or a
    # newline, so product names (which contain both commas and inch marks)
    # are written unchanged instead of having their commas replaced by "|".
    writer = csv.writer(f)

    # Header row. It is written directly rather than stored in a variable
    # named `headers`, which would overwrite the request headers dictionary.
    writer.writerow(["Brand", "Product Name", "Shipping"])

    for container in containers:
        # Brand: the "title" attribute of the image in the card's first link.
        brand = container.div.div.a.img["title"]

        # Product name: text of the first <a class="item-title"> in the card.
        title_container = container.find_all("a", {"class": "item-title"})
        product_name = title_container[0].text

        # Shipping: stripped text of the first <li class="price-ship">.
        shipping_container = container.find_all("li", {"class": "price-ship"})
        shipping = shipping_container[0].text.strip()

        writer.writerow([brand, product_name, shipping])