# Web Scraping Tests

In [1]:
import httpx
from selectolax.parser import HTMLParser

In [2]:
# setting up constants
ROOT_URL = "https://gopher1.extrkt.com/"
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0"
HEADERS = {"User-Agent": USER_AGENT, "accept-language": "en-US"}
TIMEOUT = 100

In [3]:
# get website response
response = httpx.get(ROOT_URL, headers=HEADERS, timeout=TIMEOUT)
print(response.status_code)

200


In [4]:
parsed_html = HTMLParser(response.text)
parsed_html

<HTMLParser chars=71960>

In [5]:
type(parsed_html)

selectolax.parser.HTMLParser

In [6]:
all_products = parsed_html.css("ul.products li")
print(f"Product on the first page: {len(all_products)}")

Product on the first page: 16


In [7]:
# fetch URLS

for product in all_products:
    product_url = product.css_first("a").attrs["href"]
    print(product_url)

https://gopher1.extrkt.com/?product=abominable-hoodie
https://gopher1.extrkt.com/?product=adrienne-trek-jacket
https://gopher1.extrkt.com/?product=aeon-capri
https://gopher1.extrkt.com/?product=aero-daily-fitness-tee
https://gopher1.extrkt.com/?product=aether-gym-pant
https://gopher1.extrkt.com/?product=affirm-water-bottle
https://gopher1.extrkt.com/?product=aim-analog-watch
https://gopher1.extrkt.com/?product=ajax-full-zip-sweatshirt
https://gopher1.extrkt.com/?product=ana-running-short
https://gopher1.extrkt.com/?product=angel-light-running-short
https://gopher1.extrkt.com/?product=antonia-racer-tank
https://gopher1.extrkt.com/?product=apollo-running-short
https://gopher1.extrkt.com/?product=arcadio-gym-short
https://gopher1.extrkt.com/?product=argus-all-weather-tank
https://gopher1.extrkt.com/?product=ariel-roll-sleeve-sweatshirt
https://gopher1.extrkt.com/?product=artemis-running-short


### Fetch Sample Product Info

In [8]:
rand_prod = all_products[0]

rand_prod_url = rand_prod.css_first("a").attrs["href"]
print(rand_prod_url)

https://gopher1.extrkt.com/?product=abominable-hoodie


In [9]:
# get response from product url
prod_response = httpx.get(rand_prod_url, headers=HEADERS, timeout=TIMEOUT)
print(prod_response.status_code)

200


In [10]:
prod_html = HTMLParser(prod_response.text)
prod_html


<HTMLParser chars=115587>

In [11]:
title = prod_html.css_first("h1.product_title").text(strip=True)
print(title)

Abominable Hoodie


In [12]:
price = prod_html.css_first("p.price").text(strip=True)
print(price)

£69.00


In [13]:
stock = prod_html.css_first("p.stock")

if stock is not None:
    in_stock = stock.text()

In [14]:
sku = prod_html.css_first("span.sku").text(strip=True)
print(sku)

MH09


In [15]:
category = prod_html.css_first("a[rel='tag']").text(strip=True)
print(category)

Hoodies & Sweatshirts


In [16]:
description = "\n".join([element.text(strip=True) for element in prod_html.css("div#tab-description p")])
print(description)


It took CoolTech™ weather apparel know-how and lots of wind-resistant fabric to get the Abominable Hoodie just right. It’s aggressively warm when it needs to be, while maintaining your comfort in milder climes.
• Blue heather hoodie.• Relaxed fit.• Moisture-wicking.• Machine wash/dry.


In [17]:
more_info = prod_html.css("div#tab-additional_information tr")

if more_info is not None:
    details = {
        info.css_first("th").text(): info.css_first("td p").text() for info in more_info
    }

print(details)

{'Size': 'XS, S, M, L, XL', 'Color': 'Blue, Green, Red'}


In [18]:
product_image_link = prod_html.css_first("div.woocommerce-product-gallery__wrapper a").attrs["href"]
print(product_image_link)

https://gopher1.extrkt.com/wp-content/uploads/2023/11/mh09-blue_main.jpg


**View sample image**

<img src="https://gopher1.extrkt.com/wp-content/uploads/2023/11/mh09-blue_main.jpg" alt="img" height="200px">


### Get Current Timestamp

In [19]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
timestamp

'2024-02-03 17:49:51'

### Handle Pagination

In [20]:
# source page html is "parsed_html"

next_page_url = parsed_html.css_first("ul.page-numbers li a.next").attrs["href"]
print(next_page_url)

https://gopher1.extrkt.com/?paged=2


In [21]:
# Crawling through pages to get all page links, recursively

def get_all_page_urls(page_url: str, counter: int=1):
    site_response = httpx.get(page_url, headers=HEADERS, timeout=TIMEOUT)
    content_html = HTMLParser(site_response.text)
    print(page_url)
    print(f"Page-{counter} up 👍")
    counter += 1
    next_page = content_html.css_first("ul.page-numbers li a.next")
    if next_page is not None:
        next_page_url = next_page.attrs["href"]
        get_all_page_urls(next_page_url, counter)

In [22]:
get_all_page_urls(ROOT_URL)

https://gopher1.extrkt.com/
Page-1 up 👍
https://gopher1.extrkt.com/?paged=2
Page-2 up 👍
https://gopher1.extrkt.com/?paged=3
Page-3 up 👍
https://gopher1.extrkt.com/?paged=4
Page-4 up 👍
https://gopher1.extrkt.com/?paged=5
Page-5 up 👍
https://gopher1.extrkt.com/?paged=6
Page-6 up 👍
https://gopher1.extrkt.com/?paged=7
Page-7 up 👍
https://gopher1.extrkt.com/?paged=8
Page-8 up 👍
https://gopher1.extrkt.com/?paged=9
Page-9 up 👍
https://gopher1.extrkt.com/?paged=10
Page-10 up 👍
https://gopher1.extrkt.com/?paged=11
Page-11 up 👍
https://gopher1.extrkt.com/?paged=12
Page-12 up 👍


### Crawling with Wait and Header Rotate

In [23]:
from fake_useragent import UserAgent

user_agent = UserAgent()

for _ in range(5):
    random_user_agent = user_agent.random
    print(random_user_agent)

Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/117.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36


In [24]:
RANDOM_HEADERS = {"User-Agent": user_agent.random, "accept-language": "en-US"}
TIMEOUT = 100

In [25]:
import time
import random

start_time = time.time()
for _ in range(5):
    sleep_sec = random.randint(1, 5)
    time.sleep(sleep_sec)
    print(f"Slept for {sleep_sec} 💤")
end_time = time.time()

print(f"Process time taken: {end_time - start_time:.2f} seconds")

Slept for 4 💤


KeyboardInterrupt: 

In [None]:
def get_all_page_urls(page_url: str, counter: int=1):
    random_headers = {"User-Agent": user_agent.random, "accept-language": "en-US"}
    sleep_sec = random.randint(1, 5)
    site_response = httpx.get(page_url, headers=random_headers, timeout=TIMEOUT)
    print(f"Random Header is: {random_headers}")
    content_html = HTMLParser(site_response.text)
    print(page_url)
    print(f"Page-{counter} up 👍")
    counter += 1
    next_page = content_html.css_first("ul.page-numbers li a.next")
    time.sleep(sleep_sec)
    print(f"Slept for {sleep_sec} seconds 💤")
    print("========> ..... <========\n")
    if next_page is not None:
        next_page_url = next_page.attrs["href"]
        get_all_page_urls(next_page_url, counter)

In [None]:
start_time = time.time()
get_all_page_urls(ROOT_URL)
end_time = time.time()
print(f"Crawl time taken: {end_time - start_time:.2f} seconds")

Random Header is: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.41', 'accept-language': 'en-US'}
https://gopher1.extrkt.com/
Page-1 up 👍
Slept for 3 seconds 💤

Random Header is: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0', 'accept-language': 'en-US'}
https://gopher1.extrkt.com/?paged=2
Page-2 up 👍
Slept for 2 seconds 💤

Random Header is: {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.76', 'accept-language': 'en-US'}
https://gopher1.extrkt.com/?paged=3
Page-3 up 👍
Slept for 4 seconds 💤

Random Header is: {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/117.0', 'accept-language': 'en-US'}
https://gopher1.extrkt.com/?paged=4
Page-4 up 👍
Slept for 1 seconds 

In [28]:
from csv import DictReader

In [31]:
with open("../data/data.csv", "r", encoding="utf-8") as f:
    csv_reader = DictReader(f)
    for row in csv_reader:
        print(row)

In [None]:
open("../data/data.csv", "w", encoding="utf-8").truncate()


0

In [None]:
type(parsed_html)

NameError: name 'parsed_html' is not defined

In [None]:
# read csv file

In [33]:
with open("../data/external/products_link.csv", "r", encoding="utf-8") as f:
    csv_reader = DictReader(f)
    print(csv_reader)
    print(type(csv_reader))

<csv.DictReader object at 0x000001516B6E3730>
<class 'csv.DictReader'>


In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
from src.utils.basic_utils import read_csv

In [4]:
data = read_csv("data/external/products_link.csv")
data

[2024-02-03 10:38:45 PM]:ProjectLogger INFO:basic_utils    111 - CSV file: data\external\products_link.csv loaded successfully


[{'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=abominable-hoodie'},
 {'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=adrienne-trek-jacket'},
 {'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=aeon-capri'},
 {'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=aero-daily-fitness-tee'},
 {'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=aether-gym-pant'},
 {'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=affirm-water-bottle'},
 {'page_number': '1',
  'page_url': 'https://gopher1.extrkt.com/',
  'product_url': 'https://gopher1.extrkt.com/?product=aim-analog-watch'},
 {'pag

In [6]:
for row in data:
    print(row['product_url'])

https://gopher1.extrkt.com/?product=abominable-hoodie
https://gopher1.extrkt.com/?product=adrienne-trek-jacket
https://gopher1.extrkt.com/?product=aeon-capri
https://gopher1.extrkt.com/?product=aero-daily-fitness-tee
https://gopher1.extrkt.com/?product=aether-gym-pant
https://gopher1.extrkt.com/?product=affirm-water-bottle
https://gopher1.extrkt.com/?product=aim-analog-watch
https://gopher1.extrkt.com/?product=ajax-full-zip-sweatshirt
https://gopher1.extrkt.com/?product=ana-running-short
https://gopher1.extrkt.com/?product=angel-light-running-short
https://gopher1.extrkt.com/?product=antonia-racer-tank
https://gopher1.extrkt.com/?product=apollo-running-short
https://gopher1.extrkt.com/?product=arcadio-gym-short
https://gopher1.extrkt.com/?product=argus-all-weather-tank
https://gopher1.extrkt.com/?product=ariel-roll-sleeve-sweatshirt
https://gopher1.extrkt.com/?product=artemis-running-short
https://gopher1.extrkt.com/?product=atlas-fitness-tank
https://gopher1.extrkt.com/?product=atomic