# Web Scraping using BeautifulSoup

In [22]:
#import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [23]:
# Define URL
url = 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'
# Ask hosting server to fetch url, 200 for success.
# A timeout is passed because requests otherwise waits indefinitely when the
# server never answers, which would hang the notebook.
requests.get(url, timeout=10)

<Response [200]>

In [24]:
# Fetch the page and keep the response object for the parsing cells below.
# The timeout stops the cell from hanging if the server never answers.
pages = requests.get(url, timeout=10)
# Stop here on a 4xx/5xx reply instead of silently parsing an error page.
pages.raise_for_status()
# Response body decoded to text (the raw HTML)
pages.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n\t\t\t<!-- Anti-flicker snippet (recommended)  -->\n<style>.async-hide {\n\t\topacity: 0 !important\n\t} </style>\n<script>(function (a, s, y, n, c, h, i, d, e) {\n\t\ts.className += \' \' + y;\n\t\th.start = 1 * new Date;\n\t\th.end = i = function () {\n\t\t\ts.className = s.className.replace(RegExp(\' ?\' + y), \'\')\n\t\t};\n\t\t(a[n] = a[n] || []).hide = h;\n\t\tsetTimeout(function () {\n\t\t\ti();\n\t\t\th.end = null\n\t\t}, c);\n\t\th.timeout = c;\n\t})(window, document.documentElement, \'async-hide\', \'dataLayer\', 4000,\n\t\t{\'GTM-NVFPDWB\': true});</script>\n\t\n\t<!-- Google Tag Manager -->\n<script>(function (w, d, s, l, i) {\n\t\tw[l] = w[l] || [];\n\t\tw[l].push({\n\t\t\t\'gtm.start\':\n\t\t\t\tnew Date().getTime(), event: \'gtm.js\'\n\t\t});\n\t\tvar f = d.getElementsByTagName(s)[0],\n\t\t\tj = d.createElement(s), dl = l != \'dataLayer\' ? \'&l=\' + l : \'\';\n\t\tj.async = true;\n\t\tj.src =\n\t\t\t\'https://www.googletagma

In [6]:
#create an instance of the BeautifulSoup class
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns about it), so the same notebook
# could build a different tree on another machine.
soup = bs(pages.content, 'html.parser')
#Use prettify for making the Code readable
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <!-- Anti-flicker snippet (recommended)  -->
  <style>
   .async-hide {
		opacity: 0 !important
	}
  </style>
  <script>
   (function (a, s, y, n, c, h, i, d, e) {
		s.className += ' ' + y;
		h.start = 1 * new Date;
		h.end = i = function () {
			s.className = s.className.replace(RegExp(' ?' + y), '')
		};
		(a[n] = a[n] || []).hide = h;
		setTimeout(function () {
			i();
			h.end = null
		}, c);
		h.timeout = c;
	})(window, document.documentElement, 'async-hide', 'dataLayer', 4000,
		{'GTM-NVFPDWB': true});
  </script>
  <!-- Google Tag Manager -->
  <script>
   (function (w, d, s, l, i) {
		w[l] = w[l] || [];
		w[l].push({
			'gtm.start':
				new Date().getTime(), event: 'gtm.js'
		});
		var f = d.getElementsByTagName(s)[0],
			j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
		j.async = true;
		j.src =
			'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
		f.parentNode.insertBefore(j, f);
	})(window, document, 'script',

### Finding values for some tags

In [7]:
# Every <h1> tag in the document, in document order
soup.find_all(name="h1")

[<h1>Test Sites</h1>, <h1 class="page-header">Computers / Laptops</h1>]

In [8]:
# Only the first <h1> tag in the document
soup.find("h1")

<h1>Test Sites</h1>

In [9]:
# Text of the first <p> inside the first <header>
print(soup.find("header").find("p").string)

# The same tag, displayed with its markup
soup.find("header").find("p")

Web Scraper


<p>Web Scraper</p>

### Getting a tag's value with `.text`

In [21]:
# Look up the first <h4> tag whose class attribute is 'pull-right price'
price = soup.find('h4', attrs={'class': 'pull-right price'})
print(price)

# Text content of that tag, without the surrounding markup
price.get_text()

<h4 class="pull-right price">$295.99</h4>


'$295.99'

In [11]:
# Matches 10 through 19 of the <h4 class="pull-right price"> tags
soup.find_all('h4', attrs={'class': 'pull-right price'})[10:20]

[<h4 class="pull-right price">$391.48</h4>,
 <h4 class="pull-right price">$393.88</h4>,
 <h4 class="pull-right price">$399.00</h4>,
 <h4 class="pull-right price">$399.99</h4>,
 <h4 class="pull-right price">$404.23</h4>,
 <h4 class="pull-right price">$408.98</h4>,
 <h4 class="pull-right price">$409.63</h4>,
 <h4 class="pull-right price">$410.46</h4>,
 <h4 class="pull-right price">$410.66</h4>,
 <h4 class="pull-right price">$416.99</h4>]

### Use the browser's Inspect tool to find the tags for the required columns, then extract their values

In [12]:
# Collect the tags behind each output column. The tag names and CSS classes
# below were read off the page with the browser's inspector.

# Product names: <a class="title">
name = soup.find_all('a', attrs={'class': 'title'})
# Prices: <h4 class="pull-right price">
price = soup.find_all('h4', attrs={'class': 'pull-right price'})
# Review counts: <p class="pull-right">
reviews = soup.find_all('p', attrs={'class': 'pull-right'})
# Descriptions: <p class="description">
description = soup.find_all('p', attrs={'class': 'description'})

In [14]:
# Turn each find_all result into a plain list of strings (the text of each tag).
# Comprehensions are used instead of loops that assigned to `name` / `price`:
# those loops replaced the tag collections with a single string, so running
# this cell a second time failed. The inputs are now left untouched and the
# cell can be re-run safely.
# NOTE(review): the product names in the table output end in "..." — the link
# text itself looks truncated by the site; the full name may be available in
# another attribute of the <a> tag. Confirm against the page source.
product_name_list = [tag.text for tag in name]
price_list = [tag.text for tag in price]
review_list = [tag.text for tag in reviews]
description_list = [tag.text for tag in description]

In [15]:
# Create dataframe with pandas (pd is imported with the other libraries in the
# import cell at the top of the notebook, so every dependency is visible in one place).
# One column per scraped field; pandas requires the four lists to have the
# same length and raises ValueError otherwise.
table = pd.DataFrame({'Product Name':product_name_list,
 'Price': price_list,
 'Reviews':review_list,
 'Description':description_list})

In [16]:
# Preview the first ten rows of the scraped table
table.head(n=10)

Unnamed: 0,Product Name,Price,Reviews,Description
0,Asus VivoBook X4...,$295.99,14 reviews,"Asus VivoBook X441NA-GA190 Chocolate Black, 14..."
1,Prestigio SmartB...,$299.00,8 reviews,"Prestigio SmartBook 133S Dark Grey, 13.3"" FHD ..."
2,Prestigio SmartB...,$299.00,12 reviews,"Prestigio SmartBook 133S Gold, 13.3"" FHD IPS, ..."
3,Aspire E1-510,$306.99,2 reviews,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux"
4,Lenovo V110-15IA...,$321.94,5 reviews,"Lenovo V110-15IAP, 15.6"" HD, Celeron N3350 1.1..."
5,Lenovo V110-15IA...,$356.49,6 reviews,Asus VivoBook 15 X540NA-GQ008T Chocolate Black...
6,Hewlett Packard...,$364.46,12 reviews,"Hewlett Packard 250 G6 Dark Ash Silver, 15.6"" ..."
7,Acer Aspire 3 A3...,$372.70,2 reviews,"Acer Aspire 3 A315-31 Black, 15.6"" HD, Celeron..."
8,Acer Aspire A315...,$379.94,0 reviews,"Acer Aspire A315-31-C33J Black 15.6"", HD, Cele..."
9,Acer Aspire ES1-...,$379.95,9 reviews,"Acer Aspire ES1-572 Black, 15.6"" HD, Core i3-6..."


### The data is now in tabular form; convert it and save it in a suitable format

In [35]:
# Write the table to a UTF-8 CSV file in the working directory, leaving out
# the DataFrame index column.
table.to_csv(
    path_or_buf='Scrapped data.csv',
    index=False,
    encoding='utf-8',
)
