# Scrape Websites


## Find a Page


Open a web page with the browser and inspect it.

With the browser's inspector open, hover the cursor over the text and watch the shaded box that highlights the element surrounding the main text.

From the result, note which HTML tags enclose the main text; it is usually nested a few levels deep.

In [235]:
## Import Libraries

import regex as re

from urllib.parse import unquote
import urllib3
from bs4 import BeautifulSoup
import time

import warnings
# Silence all warnings from here on: the 'ignore' action is installed with no
# category or module restriction.
warnings.filterwarnings('ignore')

### Define the content to retrieve (webpage's URL)

In [236]:
# URL of the page to scrape.
quote_page = "https://www.jbhifi.com.au/collections/tvs/tvs-all-brands"

### Retrieve the page
- Requires an Internet connection

In [237]:
# Fetch the page over HTTP.
# The timeout keeps this cell from hanging indefinitely on an unresponsive server.
http = urllib3.PoolManager(timeout=urllib3.Timeout(connect=5.0, read=30.0))

r = http.request('GET', quote_page)
if r.status != 200:
    # Fail fast: otherwise `page` would be undefined (or left over from an
    # earlier run of this cell) and the parsing cells below would misbehave.
    raise RuntimeError('Some problem occurred. Request Status: %s' % r.status)

page = r.data  # raw response body
print('Type of the variable \'page\':', page.__class__.__name__)
print('Page Retrieved. Request Status: %d, Page Size: %d' % (r.status, len(page)))

Type of the variable 'page': bytes
Page Retrieved. Request Status: 200, Page Size: 571225


### Convert the stream of bytes into a BeautifulSoup representation

In [238]:
# Parse the downloaded bytes into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=page, features='html.parser')
print("Type of the variable 'soup':", type(soup).__name__)

Type of the variable 'soup': BeautifulSoup


### Check the content
- The HTML source
- Includes all tags and scripts
- Can be long!

In [239]:
# Preview only the beginning of the prettified HTML: the page is several
# hundred kilobytes, so printing (nearly) all of it floods the notebook output.
PREVIEW_CHARS = 1000
print(soup.prettify()[:PREVIEW_CHARS])

<!DOCTYPE html>
<!--[if IE 9]> <html class="ie9 no-js" lang="en"> <![endif]-->
<!--[if (gt IE 9)|!(IE)]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, maximum-scale=1.0, minimum-scale=1.0, initial-scale=1.0" name="viewport"/>
  <meta content="#000000" name="theme-color"/>
  <meta content="x7BL0ulPtmQdRoPpz4GP3wlYvggxk8gyttZk_uvgPdA" name="google-site-verification">
   <link href="https://www.jbhifi.com.au/collections/tvs/tvs-all-brands" rel="canonical"/>
   <link href="//cdn.shopify.com/s/files/1/0024/9803/5810/files/favicon_7cbca4eb-4222-41b7-a8f8-9dc52b7c4a7c_32x32.png?v=1594879882" rel="shortcut icon" type="image/png"/>
   <meta content="noydir" name="robots">
    <meta content="noodp" name="robots">
     <script crossorigin="anonymous" src="https://cdn.optimizely.com/js/18347140293.js">
     </script>
     <title>
      TVs all brands | JB Hi-Fi
  

### Check the HTML's Title

In [254]:
# Show the <title> element itself, then only its text content.
title_tag = soup.title
print('Title tag :%s:' % title_tag)
print('Title text:%s:' % title_tag.string)

Title tag :<title>TVs all brands | JB Hi-Fi</title>:
Title text:TVs all brands | JB Hi-Fi:


### Find the main content
- Check if it is possible to use only the relevant data

In [255]:
# Name of the element expected to wrap the page's main content.
article_tag = 'main'

# find() returns the first match directly instead of building the full list
# of matches just to index element 0.
article = soup.find(article_tag)
if article is None:
    # Explicit error rather than an opaque IndexError from `find_all(...)[0]`.
    raise ValueError('No <%s> element found in the page' % article_tag)
print('Type of the variable \'article\':', article.__class__.__name__)

Type of the variable 'article': Tag


In [256]:
# Display the text of the main element with the HTML tags stripped
# (whitespace is left exactly as it appears in the document).
article.get_text()

'\n\n\nTVs all brands\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n^Discounts apply to most recent previous ticketed/advertised price. As we negotiate on price, products are likely to have sold below ticketed/advertised price in stores prior to the discount offer. Prices may differ at airport & express stores.\n\n\n'

### Get some of the text
- Plain text without HTML tags

In [243]:
# Collapse each run of consecutive newlines into a single newline,
# then show the first 500 characters of the result.
compact_text = re.sub(r'\n\n+', '\n', article.text)
print(compact_text[:500])


TVs all brands
^Discounts apply to most recent previous ticketed/advertised price. As we negotiate on price, products are likely to have sold below ticketed/advertised price in stores prior to the discount offer. Prices may differ at airport & express stores.



### Find the links in the text

In [244]:
# Print every anchor (<a>) element found inside the main content.
anchors = article.find_all('a')
for anchor in anchors:
    print(anchor)

In [245]:
# Tag name of the elements whose links are collected.
link_tag = 'a'

# Gather the `href` attribute of each matching element; entries may be None,
# which the next step checks for explicitly.
tag_list = [element.get('href') for element in article.find_all(link_tag)]

print("Size of 'tag_list':", len(tag_list))
tag_list

Size of 'tag_list': 0


[]

In [246]:
# Keep only the links that begin with '/wiki/', storing them without that prefix.
# NOTE(review): the '/wiki/' prefix looks carried over from a wiki-scraping
# example — confirm it is the right test for the page being scraped here.
wiki_prefix = '/wiki/'
wiki_tag_list = [
    href[len(wiki_prefix):]
    for href in tag_list
    if href is not None and href.startswith(wiki_prefix)
]

print("Size of 'wiki_tag_list':", len(wiki_tag_list))
wiki_tag_list

Size of 'wiki_tag_list': 0


[]

In [247]:
# NOTE(review): `filter` is only assigned in the following cell, so on a fresh
# kernel this expression shows Python's built-in `filter` rather than the pattern.
filter

'(Season_|Category:|File:|Help:|Portal:|action=|Special:|Talk:)'

In [248]:
# Substrings that mark a link as undesired.
unwanted_markers = [
    'Season_',
    'Category:',
    'File:',
    'Help:',
    'Portal:',
    'action=',
    'Special:',
    'Talk:',
]

# Join the markers into a single alternation pattern.
# (The name `filter` shadows Python's built-in of the same name.)
filter = '(%s)' % '|'.join(unwanted_markers)

# Keep only the links in which the pattern is not found anywhere.
filtered_tag_list = [
    link for link in wiki_tag_list
    if re.search(filter, link) is None
]

print("Size of 'filtered_tag_list':", len(filtered_tag_list))
filtered_tag_list

Size of 'filtered_tag_list': 0


[]

In [249]:
# Remove duplicates while keeping first-seen order. Dict keys are unique and
# insertion-ordered, whereas list(set(...)) yields an arbitrary order that can
# differ between kernel sessions, making the displayed output unreproducible.
unique_tag_list = list(dict.fromkeys(filtered_tag_list))
print('Size of \'unique_tag_list\':', len(unique_tag_list))
unique_tag_list

Size of 'unique_tag_list': 0


[]

In [250]:
# Decode percent-escaped sequences in every link.
unquoted_tag_list = list(map(unquote, unique_tag_list))
print("Size of 'unquoted_tag_list':", len(unquoted_tag_list))
unquoted_tag_list

Size of 'unquoted_tag_list': 0


[]

In [251]:
# Convert underscores to spaces. The pattern is a single literal character,
# so a plain str.replace does the job without a regular expression.
spaced_tag_list = [tag.replace('_', ' ') for tag in unquoted_tag_list]

# Label the count with the variable actually being measured
# (the message previously said 'tag_list').
print('Size of \'spaced_tag_list\':', len(spaced_tag_list))
spaced_tag_list

Size of 'tag_list': 0


[]

In [252]:
# Sort the list in ascending order; the existing list object is reordered in place.
list.sort(spaced_tag_list)
print("Size of 'spaced_tag_list':", len(spaced_tag_list))
spaced_tag_list

Size of 'spaced_tag_list': 0


[]

### Create a filter for unwanted types of articles

In [253]:
# Drop every entry whose text begins with "The".
no_episodes_tag_list = [
    entry for entry in spaced_tag_list
    if not entry.startswith('The')
]

print("Size of 'no_episodes_tag_list':", len(no_episodes_tag_list))
no_episodes_tag_list

Size of 'no_episodes_tag_list': 0


[]



---



---



> > > > > > > > > © 2021 Institute of Data


---



---



