In [1]:
# imports
import asyncio
import json
import re
from pathlib import Path
from urllib.parse import urlparse, urlunparse, urljoin

import httpx
from bs4 import BeautifulSoup

In [2]:
# getResponse for single url

async def getResponse(target_url: str, client: httpx.AsyncClient) -> str | None:
    """Fetch one URL and return the response body as text.

    Args:
        target_url: URL requested with GET.
        client: Shared ``httpx.AsyncClient`` used to send the request. It is
            not closed here; the caller owns its lifetime.

    Returns:
        The response text, or ``None`` when the request fails or
        ``raise_for_status()`` rejects the response status. Failures are
        printed and swallowed, never raised to the caller.
    """
    try:
        res = await client.get(target_url)

        # Convert an unsuccessful status code into httpx.HTTPStatusError.
        res.raise_for_status()
        return res.text
    except httpx.HTTPStatusError as e:
        print(f"HTTP Error: {e.response.status_code} for {e.request.url}")
        return None
    except Exception as e:
        # Deliberately broad: one failed page must not abort the whole batch.
        # The exception type and URL are logged because some exceptions
        # stringify to an empty message, which made the old log line blank.
        print(f"Request Error: {type(e).__name__} while requesting {target_url}: {e}")
        return None

    # not applicable for jupyter notebook
    # finally:
    #     await client.aclose() 

In [3]:
# defining client

# Site root; relative product links are resolved against it.
base_url = 'https://www.shoecarnival.com'

# Browser-like User-Agent header handed to the HTTP client.
custom_headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/139.0.0.0 Safari/537.36"
    )
}

# client = httpx.AsyncClient(http2=True, headers=custom_headers)

In [4]:
# regex function to find price
def convertPrice(price_str: str)-> float | None:
    match = re.search(r"[0-9]+.[0-9]{2}", price_str)

    if match:
        price = match.group()
        return float(price)
    else:
        return None

In [5]:
# shoe class
class Shoe:
    """One scraped product: brand, title, price and product-page URL."""

    def __init__(self, url: str, brand: str = '', title: str = '', price: float = 0.0):
        self.brand = brand
        self.title = title
        self.price = price
        self.url = url

    def getURL(self) -> str:
        """Return ``self.url`` reduced to scheme, host and path.

        Path parameters, the query string and the fragment are dropped.
        """
        # urlparse() yields a 6-field tuple; only the first three are kept.
        scheme, netloc, path, *_ = urlparse(self.url)
        return urlunparse((scheme, netloc, path, '', '', ''))

    def __repr__(self):
        # The URL is intentionally not part of the repr.
        return f"Shoe(brand={self.brand}, title={self.title}, price={self.price})"

**CSS Selectors**

- info tag: ```div.css-1x7me35 > div[data-element="Box"] div.chakra-card__body > a.chakra-link```
- brand: ```div.chakra-stack div:first-child > h6```
- title: ```div.chakra-stack p[id^="product-tile"]```
- price: ```div.chakra-stack > div.mobile__price > h6:first-child```
- url: ```info_tag['href']```

In [6]:
# dataParser (generator)
def dataParser(html: str):
    """Yield a ``Shoe`` for every product card found in a listing page.

    Args:
        html: Raw HTML of one listing page.

    Yields:
        Shoe: One object per card link that carries a non-empty ``href``.
        Brand and title fall back to ``""`` and price to ``0.0`` when their
        tag is missing; price is ``None`` when the price tag exists but
        ``convertPrice`` finds no amount in its text.
    """
    soup = BeautifulSoup(html, "lxml")

    cards_info_tag_list = soup.select(
        'div.css-1x7me35 > div[data-element="Box"] div.chakra-card__body > a.chakra-link'
    )

    for card_info_tag in cards_info_tag_list:
        brand_tag = card_info_tag.select_one("div.chakra-stack > div > h6")
        brand_name = brand_tag.get_text() if brand_tag else ""

        title_tag = card_info_tag.select_one('div.chakra-stack p[id^="product-tile"]')
        # get_text() instead of .string: .string is None when the tag has more
        # than one child, and otherwise returns a NavigableString that keeps a
        # reference to the whole parse tree for as long as the Shoe lives.
        title = title_tag.get_text() if title_tag else ""

        price_tag = card_info_tag.select_one(
            "div.chakra-stack > div.mobile__price > h6"
        )
        price = convertPrice(price_tag.get_text(strip=True)) if price_tag else 0.0

        relative_url = str(card_info_tag.get("href", ""))
        if not relative_url:
            # A card without a link cannot be identified; skip it.
            continue

        # Card links are resolved against the site root.
        url = urljoin(base_url, relative_url)

        # price may be None here (see docstring), hence the ignore.
        yield Shoe(url, brand=brand_name, title=title, price=price) # type: ignore



In [7]:
# event hook for success response
async def logResponse(response: httpx.Response):
    """Print status code and URL of a response whose status is 2xx.

    Responses with any other status produce no output here.
    """
    code = response.status_code
    if code < 200 or code >= 300:
        return

    print(f"{response.status_code}✅   :::::   {response.url}")

In [8]:
# Accumulator that runSpider() appends Shoe objects to. Re-run this cell
# before re-running the spider, otherwise new results pile onto the old ones.
all_shoes = []

In [9]:
# create async task
async def runSpider(first_page: int = 1, last_page: int = 42) -> None:
    """Download the women's listing pages concurrently and collect their shoes.

    Every parsed ``Shoe`` is appended to the module-level ``all_shoes`` list;
    the list is not cleared first, so repeated calls accumulate results.

    Args:
        first_page: First listing page to request (inclusive).
        last_page: Last listing page to request (inclusive).
    """
    urls = [
        f"{base_url}/womens?page={page_num}"
        for page_num in range(first_page, last_page + 1)
    ]
    limits = httpx.Limits(max_connections=100)

    async with httpx.AsyncClient(
        http2=True,
        headers=custom_headers,
        timeout=30.0,
        limits=limits,
        event_hooks={'response': [logResponse]},
    ) as client:
        # getResponse() never raises for a failed page; it returns None instead.
        responses: list[str | None] = await asyncio.gather(
            *(getResponse(url, client) for url in urls)
        )

    # Parsing needs no network, so it runs after the client has been closed.
    for html in responses:
        if html is None:
            continue
        all_shoes.extend(dataParser(html))



In [10]:
# Top-level await: valid in a notebook cell, where IPython runs the coroutine
# on its event loop (a plain script would need asyncio.run(runSpider())).
await runSpider()

200✅   :::::   https://www.shoecarnival.com/womens?page=3
200✅   :::::   https://www.shoecarnival.com/womens?page=20
200✅   :::::   https://www.shoecarnival.com/womens?page=42
200✅   :::::   https://www.shoecarnival.com/womens?page=40
200✅   :::::   https://www.shoecarnival.com/womens?page=25
200✅   :::::   https://www.shoecarnival.com/womens?page=30
200✅   :::::   https://www.shoecarnival.com/womens?page=35
200✅   :::::   https://www.shoecarnival.com/womens?page=1
200✅   :::::   https://www.shoecarnival.com/womens?page=14
200✅   :::::   https://www.shoecarnival.com/womens?page=10
200✅   :::::   https://www.shoecarnival.com/womens?page=11
200✅   :::::   https://www.shoecarnival.com/womens?page=23
200✅   :::::   https://www.shoecarnival.com/womens?page=32
200✅   :::::   https://www.shoecarnival.com/womens?page=39
200✅   :::::   https://www.shoecarnival.com/womens?page=19
200✅   :::::   https://www.shoecarnival.com/womens?page=21
200✅   :::::   https://www.shoecarnival.com/womens?page=18

In [11]:
# Sanity check: total number of Shoe objects collected so far.
print(len(all_shoes))

1000


In [12]:
# Spot-check the brand of the first collected record.
all_shoes[0].brand

'Nike'

In [13]:
# JSON-serialisable records, one dict per shoe; populated by the next cell.
shoe_dict_list = []

In [14]:
# Rebuild the list from scratch instead of appending, so re-running this cell
# never duplicates records. The stored URL comes from Shoe.getURL(), i.e. it
# has no path parameters, query string or fragment.
shoe_dict_list = [
    {
        'brand': shoe.brand,
        'title': shoe.title,
        'price': shoe.price,
        'url': shoe.getURL(),
    }
    for shoe in all_shoes
]

In [15]:
# Write the records to ./output/women_shoes.json.
output_path = Path('./output/women_shoes.json')

# open(..., mode='w') does not create missing folders, so make sure the
# target directory exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, mode='w', encoding='utf-8') as json_file:
    # Serialise straight into the file instead of building the string first.
    json.dump(shoe_dict_list, json_file)

# Reported only after the file has been closed.
print('Extracted!')

Extracted!
