# Web Scraping Tests

In [54]:
import httpx
from selectolax.parser import HTMLParser
from rich import print
import re
from urllib.parse import urljoin

In [5]:
# Scraper configuration shared by every request in this notebook.

# Entry point: first page of the locations listing.
ROOT_URL = "https://www.cleanaway.com.au/contact-us/our-locations/"

# Browser-like identity sent with each request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
    "Gecko/20100101 Firefox/117.0"
)
HEADERS = {
    "User-Agent": USER_AGENT,
    "accept-language": "en-US",
}

# Passed as `timeout=` to every httpx.get call below.
TIMEOUT = 100

In [7]:
# Download the first page of the locations listing with the shared request
# settings, then report the HTTP status so a blocked request is obvious.
response = httpx.get(
    ROOT_URL,
    timeout=TIMEOUT,
    headers=HEADERS,
)
print(f"The status code returned by the website is: {response.status_code}")

In [10]:
# Turn the listing page markup into a selectolax tree that can be queried with
# CSS selectors, and show what kind of object we got back.
listing_markup = response.text
parsed_html = HTMLParser(listing_markup)
print(f"The object returned is: {parsed_html}")
print(f"The type of the content is: {type(parsed_html)}")

In [14]:
# Anchors that are direct children of `div.white-box`; each one is treated as a
# link to a single service page in the cells below.
service_links = parsed_html.css("div.white-box > a")
service_count = len(service_links)
print(f"Number of services listed on the first page: {service_count}")

In [26]:
# Show the href of every service link found on the listing page.
# (`service` and `service_url` stay bound to the last link after the loop.)
for service in service_links:
    service_url = service.attrs["href"]
    print(
        f"[bold green]{service_url}[/]"
    )

## Fetch Service Info

In [32]:
# Work with a single service (the second link on the page) while developing
# the detail-page extraction.
service = service_links[1]
service_url = service.attrs["href"]

print(f"[green bold]{service_url}[/]")

In [33]:
# Download the detail page of the selected service, reusing the shared request
# settings, and print the HTTP status code.
service_response = httpx.get(
    service_url,
    timeout=TIMEOUT,
    headers=HEADERS,
)
print(service_response.status_code)

In [34]:
# Parse the service detail page; the bare name on the last line displays the
# parser's repr as the cell output.
service_markup = service_response.text
svc_html = HTMLParser(service_markup)
svc_html

<HTMLParser chars=279374>

In [41]:
# The service name is the text of the <h1> inside the location box.
name_node = svc_html.css_first("div.location-box h1")
service_name = name_node.text(strip=True)
service_name

'Albany Transfer Station & MRF'

In [42]:
# The address is the text of the link inside the first info block of the
# location box.
address_link = svc_html.css_first("div.location-box div.info-block:first-of-type p a")
address = address_link.text(strip=True)
address

'2-16 Cuming Road & 37 Maxwell Street, Albany, WA, 6330'

In [43]:
# Same anchor as the address text: its href is the maps link for the location.
maps_link = svc_html.css_first("div.location-box div.info-block:first-of-type p a")
address_url = maps_link.attrs["href"]
address_url

'http://maps.google.com/maps/place/?q=-35.01078332713777,117.86331638131479'

In [49]:
# Read the services description from the last info block of the location box.
# The block only counts as a services list when its title mentions "services";
# anything else falls back to "Miscellaneous".
last_block_title = svc_html.css_first(
    "div.location-box div.info-block:last-of-type div.info-block__title"
)
last_block_desc = svc_html.css_first(
    "div.location-box div.info-block:last-of-type div.info-block__desc p"
)

# css_first returns None when the selector matches nothing, so check both
# nodes before calling .text() on them; a page without the expected block
# then gets the fallback instead of raising AttributeError.
if (
    last_block_title is not None
    and last_block_desc is not None
    and "services" in last_block_title.text(strip=True).lower()
):
    services = last_block_desc.text(strip=True)
else:
    services = "Miscellaneous"

print(services)

In [52]:
# The maps link carries the coordinates as "?q=<latitude>,<longitude>".
# Anchor on the `q` parameter and capture only signed decimal numbers, so any
# query parameter that follows the pair cannot leak into the longitude (the
# previous greedy `(.*)` groups swallowed everything up to the end of the URL).
lat_long_pattern = re.compile(r"[?&]q=(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)")
lat_long_matches = lat_long_pattern.search(address_url)

if lat_long_matches is None:
    # The link has no coordinate pair: record that explicitly rather than
    # failing with AttributeError on `None.group(...)`.
    latitude = longitude = None
else:
    # Both values are kept as strings, exactly as they appear in the URL.
    latitude, longitude = lat_long_matches.groups()

print(latitude, longitude)

In [55]:
# Locate the "next page" link of the listing. css_first returns None when the
# selector matches nothing (e.g. no further page), so guard before reading
# the href instead of raising AttributeError.
next_page_anchor = parsed_html.css_first("li.location-pagination__next a")

if next_page_anchor is None:
    next_page_url = None
    next_page_url_link = None
else:
    next_page_url = next_page_anchor.attrs["href"]
    # Resolve the href against the listing URL so a relative link also works.
    next_page_url_link = urljoin(ROOT_URL, next_page_url)

next_page_url_link

'https://www.cleanaway.com.au/contact-us/our-locations/?pg=2'

In [1]:
from fake_useragent import UserAgent

In [32]:
# Sample a handful of random user agent strings from fake_useragent
# (restricted with platforms="pc") and confirm each one is a plain string.
user_agent = UserAgent(platforms="pc")

for _ in range(5):
    ua = user_agent.random

    print(ua)
    print(type(ua))

Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15
<class 'str'>
Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0 
<class 'str'>
Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15
<class 'str'>
Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0 
<class 'str'>
Mozilla/5.0 (X11; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0 
<class 'str'>


In [None]:
# Sample user agent string (Edge on Windows), kept for reference:
# Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0