# Scraping gempundit.com
We obtain our data by scraping the website gempundit.com. The store offers a variety of gemstones and for each product there are multiple images from different angles.


In [15]:
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup as bs
import lxml
import requests
import grequests
import shutil
import os
import io
from PIL import Image
import json
import concurrent.futures as cf
from tqdm import tqdm
from random import randint, random

In [16]:
# Overview page that lists the gemstone categories of the shop.
GEMSTONES_CATEGORY_URL = 'https://www.gempundit.com/gemstones'

# Browser-like User-Agent that is sent along with the scraping requests.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

# Shared session, currently disabled:
#session = grequests.Session()
#session.headers.update(HEADERS)

# Thread pool used by the scraping helpers (four workers).
executor = cf.ThreadPoolExecutor(max_workers=4)


First, we extract the links to all gemstone types from the website.

In [33]:
def get_all_gem_links_website(ALL_GEMS_URL):
    """
    Collects the category link of every gemstone listed on an overview page.

    Args:
        ALL_GEMS_URL (str): URL of the page that lists the gemstone categories.

    Returns:
        dict: Maps the ``title`` attribute of each gemstone anchor to its ``href``.

    Raises:
        ConnectionError: If the request produced no response at all.
        requests.HTTPError: If the server answered with an error status.
        ValueError: If the page contains no ``div`` with class ``container``.
    """
    request = grequests.get(ALL_GEMS_URL, headers=HEADERS)
    # grequests.map() returns None in place of a response for a failed request.
    response = grequests.map([request])[0]
    if response is None:
        raise ConnectionError(f'no response received from {ALL_GEMS_URL}')
    response.raise_for_status()  # Check for HTTP errors

    soup = bs(response.text, 'html.parser')
    print('succesfully loaded page')
    gem_table = soup.find('div', {'class': 'container'})
    if gem_table is None:
        raise ValueError(f'no gem table found on {ALL_GEMS_URL}')
    print('succesfully found gem table, now collecting links')

    gem_links = {}
    for gem in gem_table.find_all('a', {'data-category': 'gemstones'}):
        link = gem.get('href')
        title = gem.get('title')
        # An anchor without title or href cannot be stored as name -> link.
        if not title or not link:
            continue
        print(title)
        gem_links[title] = link
    print('succesfully collected all gem links')
    return gem_links
def scrape_product_links(base_url, file_path=None):
    """
    Scrapes product links from a page and returns them as a dictionary.

    Args:
        base_url (str): URL of the page. Only requested when ``file_path`` is None.
        file_path (str, optional): Path to a locally saved copy of the page.
            When given, the file is parsed and no request is sent.

    Returns:
        dict: A dictionary containing product names as keys and hrefs as values.
            Empty when the page has no ``ul.stonesCategoryProduct`` element.

    Raises:
        ConnectionError: If the request produced no response at all.
        requests.HTTPError: If the server answered with an error status.
    """
    if file_path is None:
        request = grequests.get(base_url, headers=HEADERS)
        # grequests.map() returns None in place of a response for a failed request.
        response = grequests.map([request])[0]
        if response is None:
            raise ConnectionError(f'no response received from {base_url}')
        response.raise_for_status()  # Check for HTTP errors

        soup = bs(response.content, 'html.parser')
    else:
        # Read bytes so BeautifulSoup detects the encoding of the document
        # instead of depending on the locale's default text encoding.
        with open(file_path, 'rb') as file:
            soup = bs(file, 'html.parser')

    result = {}  # Initialize an empty dictionary for results

    # Find the container with all the product blocks
    product_container = soup.find('ul', class_='stonesCategoryProduct')

    if product_container:
        # Iterate over each product block
        for product_block in product_container.find_all('a'):
            name_tag = product_block.find('span', class_='productNameH2')
            product_href = product_block.get('href')
            # Skip anchors inside the list that are not product blocks.
            if name_tag is None or not product_href:
                continue

            # Add product name and href to the result dictionary
            result[name_tag.text.strip()] = product_href

    return result



gem_links = scrape_product_links(GEMSTONES_CATEGORY_URL, '../src/gempundit_local.html')

Now we collect the links to all of the product pages for each gemstone category.

In [34]:
def get_all_gem_pages(gem_links):
    """
    Builds the list of paginated category URLs for every gemstone.

    Args:
        gem_links (dict): Gemstone names as keys and category URLs as values.

    Returns:
        dict: Gemstone names as keys and lists of ``<url>/page/<n>`` links as
            values, with n running from 1 to the detected number of pages.
            A gemstone whose page count cannot be determined gets one page.
    """
    print('getting all gem pages')
    gem_page_links = {}
    for gem, gem_url in gem_links.items():
        print('\n', gem)
        number_of_pages = 1
        # Request a page far past the end and read the pager's "current" entry.
        # NOTE(review): presumably the site falls back to the last page, which
        # makes that entry the page count - confirm if the counts look wrong.
        request = grequests.get(gem_url + '/page/1000', headers=HEADERS)
        response = grequests.map([request])[0]
        # A failed request (None) or a page without pager keeps the default of 1.
        if response is not None:
            soup = bs(response.content, 'html.parser')
            current_page = soup.find('li', {'class': 'current'})
            if current_page is not None:
                try:
                    number_of_pages = int(current_page.text)
                except ValueError:
                    number_of_pages = 1
        print('')
        print(gem, number_of_pages)
        gem_page_links[gem] = [
            gem_url + '/page/' + str(page_no)
            for page_no in range(1, number_of_pages + 1)
        ]
    print('-'*50)
    print('succesfully collected links')
    return gem_page_links
all_pages = get_all_gem_pages(gem_links)

getting all gem pages

 Alexandrite

Alexandrite 16

 Alexandrite Cats Eye

Alexandrite Cats Eye 2

 Almandine Garnet (Pyrope)

Almandine Garnet (Pyrope) 10

 Amber

Amber 2

 Amethyst

Amethyst 45

 Amethyst Cabochon

Amethyst Cabochon 1

 Ametrine

Ametrine 12

 Ammolite

Ammolite 2

 Andalusite

Andalusite 1

 Andesine

Andesine 1

 Andesine Labradorite

Andesine Labradorite 1

 Angel Skin Coral

Angel Skin Coral 2

 Apatite

Apatite 5

 Apatite Cat's Eye

Apatite Cat's Eye 9

 Aquamarine

Aquamarine 54

 Australian Opal

Australian Opal 71

 Aventurine

Aventurine 9

 Azurite

Azurite 1

 Basra Pearls

Basra Pearls 2

 Beryl

Beryl 8

 Bicolor Sapphire

Bicolor Sapphire 1

 Bixbite

Bixbite 1

 Black Onyx

Black Onyx 28

 Black Opal

Black Opal 28

 Black Pearls

Black Pearls 3

 Black Spinel

Black Spinel 6

 Black Tourmaline

Black Tourmaline 13

 Blizzard Stone

Blizzard Stone 1

 Bloodstone

Bloodstone 10

 Blue Fluorite

Blue Fluorite 1

 Blue Moonstone

Blue Moonstone 1

 Blu

  with loop.timer(seconds, ref=ref) as t:


For later use we will save the dictionary with all the gemstone page links as a JSON file.

In [36]:
def save_gem_page_links(gem_page_links):
    """
    Stores the gemstone page links as JSON.

    Args:
        gem_page_links: JSON-serialisable object that is written to
            ``../dat/page_links/gem_page_links.json``.
    """
    target = "../dat/page_links/gem_page_links.json"
    with open(target, mode="w") as handle:
        json.dump(gem_page_links, fp=handle)
save_gem_page_links(all_pages)

#### Reloading gemstone page links

In [3]:
def load_gem_page_links():
    """
    Loads the gemstone page links from the JSON file.

    Returns:
        The object decoded from ``../dat/page_links/gem_page_links.json``.
    """
    source = "../dat/page_links/gem_page_links.json"
    with open(source, mode="r") as infile:
        return json.load(infile)
all_pages = load_gem_page_links()

#

After obtaining all the links to the gemstone pages, we will extract the links to each product.

Because of the access and speed limits, we will use multiple threads to collect the links.

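The website is rate-limited, so after each page request we sleep for a random time of up to 2 seconds, occasionally followed by an extra pause of 1 to 5 seconds, and we wait another 10 seconds after finishing each gemstone.
If we don't do this, the website will block our requests.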

The collected product links will be saved in the directory `dat/product_links`.

In [4]:
import requests
def get_individual_gem_links(data):
    """
    Collects the product links of one gemstone and stores them as a CSV file.

    Args:
        data (sequence): ``[gem, links]`` with the gemstone name and the list
            of its category page URLs.

    Returns:
        int or None: 1 after the CSV file was written, None when the file
            already existed and the gemstone was skipped.
    """
    gem = data[0]
    links = data[1]

    # Build the path once so that the skip check and the write agree: a '/'
    # in the gem name is replaced because it would act as a path separator.
    csv_path = os.path.join('../dat/product_links/', gem.replace('/', '_') + '.csv')
    if os.path.exists(csv_path):
        print('skipping', gem)
        return None

    gem_product_links = []
    for page in links:
        html = requests.get(page, headers=HEADERS).text
        soup = bs(html, 'html.parser')
        product_anchors = soup.find_all('a', {'class': 'product-image dataimage'})
        gem_product_links.extend(anchor.get('href') for anchor in product_anchors)
        # sleep for a random time to avoid being blocked
        sleep(random()*2)
        if random() < 0.1:
            sleep(randint(1, 5))
    pd.DataFrame(gem_product_links, columns=[gem]).to_csv(csv_path, index=False)
    # Longer pause before the worker thread picks up the next gemstone.
    sleep(10)
    return 1
def get_gem_product_links(gem_page_links):
    """
    Collects the product links of all gemstones using the shared thread pool.

    Args:
        gem_page_links (dict): Gemstone names as keys and lists of category
            page links as values.
    """
    pending = []
    for gem, page_links in gem_page_links.items():
        pending.append(executor.submit(get_individual_gem_links, [gem, page_links]))

    with tqdm(total=len(pending)) as progress:
        for finished in cf.as_completed(pending):
            # result() re-raises an exception that occurred in the worker.
            finished.result()
            progress.update()
    print('succesfully collected all product links')

print('starting to collect all product links')
get_gem_product_links(all_pages)
print('succesfully collected all product links')

starting to collect all product links
skipping Almandine Garnet (Pyrope)
skipping Alexandrite
skipping Alexandrite Cats Eye
skipping Amber
skipping Amethyst
skipping Ammolite
skipping Amethyst Cabochon
skipping Ametrine
skipping Andalusite
skipping Angel Skin Coral
skipping Apatite Cat's Eye
skipping Andesine
skipping Andesine Labradorite
skipping Apatite
skipping Aventurine
skipping Aquamarine
skipping Azurite
skipping Basra Pearls
skipping Bicolor Sapphire
skipping Bixbite
skipping Australian Opal
skipping Black Onyx
skipping Black Pearls
skipping Black Spinel
skipping Black Opal
skipping Beryl
skipping Blizzard Stone
skipping Black Tourmaline
skipping Bloodstone
skipping Blue Moonstone
skipping Blue Spinel
skipping Blue Topaz
skipping Blue Zircon
skipping Blue Opal
skipping Blue Sapphire (Neelam)
skipping Blue Fluorite
skipping Brown Moonstone
skipping Brown Zircon
skipping Brazilian Emerald
skipping Burmese Ruby
skipping Boulder Opal
skipping Burmese Sapphire
skipping Cats Eye Moon

100%|██████████| 222/222 [00:59<00:00,  3.75it/s]

succesfully collected all product links
succesfully collected all product links



  with loop.timer(seconds, ref=ref) as t:


For each product page, we will scrape the image links contained within the page.
This is again done using multiple threads to speed up the process.

The collected image links will be saved as a CSV file in the `dat/image_links/` directory.


In [5]:
def get_img_links(data):
    """
    Collects the image links found on the product pages of one gemstone.

    The pages are downloaded with grequests, parsed on the shared thread pool
    and the resulting links are written to ``../dat/image_links/<gem>.csv``
    (a '/' in the gem name is replaced by '_').

    Args:
        data (sequence): ``[gem, links]`` with the gemstone name and the list
            of its product page URLs.
    """
    gem = data[0]
    links = data[1]

    def extract_img_links(response):
        # Thumbnails whose URL contains 'certi' or 'hand' are left out.
        soup = bs(response.content, 'html.parser')
        img_links = []
        for div in soup.find_all("div", class_="item product_thumb_forHeight"):
            for img in div.find_all('img'):
                src = img.get('src')
                if src and 'certi' not in src and 'hand' not in src:
                    # Drop the query string of the image URL.
                    img_links.append(src.split('?')[0])
        return img_links

    # Build asynchronous requests using grequests
    pending_requests = [grequests.get(link, headers=HEADERS) for link in links]

    # Send requests asynchronously
    responses = grequests.imap(pending_requests, size=16)  # Adjust concurrency with 'size'

    gem_img_links = []
    futures = []

    with tqdm(total=len(links)) as pbar:
        for response in responses:
            future = executor.submit(extract_img_links, response)
            futures.append(future)
            pbar.update()

    print('waiting for all threads to complete')

    for future in cf.as_completed(futures):
        gem_img_links.extend(future.result())
    # Save results
    csv_path = os.path.join('../dat/image_links/', gem.replace('/', '_') + '.csv')
    pd.DataFrame(gem_img_links, columns=[gem]).to_csv(csv_path)

def get_all_gem_image_links():
    """
    Collects the image links for every CSV file in ``../dat/product_links/``.

    A file is skipped when a file of the same name already exists in
    ``../dat/image_links/``.
    """
    for file in os.listdir('../dat/product_links/'):
        if os.path.exists('../dat/image_links/' + file):
            # The gem name is only known after reading the CSV, so the
            # skipped gemstone is reported by its file name.
            print('skipping', os.path.splitext(file)[0])
            continue
        # Read without index_col and take the last column: this works for
        # files written with an index column and for files written with
        # index=False (as get_individual_gem_links does).
        data = pd.read_csv('../dat/product_links/' + file)
        gem = data.columns[-1]
        links = data[gem].tolist()

        print('getting image links for', gem)
        get_img_links([gem, links])
        print('done')

In [13]:
def get_img_links(data):
    """
    Collects the image links found on the product pages of one gemstone.

    The pages are downloaded and parsed on the shared thread pool and the
    resulting links are written to ``../dat/image_links/<gem>.csv`` (a '/'
    in the gem name is replaced by '_', matching the product link files).

    Args:
        data (sequence): ``[gem, links]`` with the gemstone name and the list
            of its product page URLs.
    """
    gem, links = data

    def extract_img_links(response):
        # Pages that did not load successfully contribute no links.
        if response.status_code != 200:
            return []
        soup = bs(response.content, 'lxml')  # Use 'lxml' for faster parsing
        img_links = []
        for div in soup.find_all("div", class_="item product_thumb_forHeight"):
            for img in div.find_all('img'):
                src = img.get('src')
                # Thumbnails whose URL contains 'certi' or 'hand' are left out.
                if src and 'certi' not in src and 'hand' not in src:
                    img_links.append(src.split('?')[0])
        return img_links

    responses = []
    futures = [executor.submit(requests.get, link, headers=HEADERS) for link in links]
    with tqdm(total=len(links)) as pbar:
        for future in cf.as_completed(futures):
            try:
                responses.append(future.result())
            except requests.RequestException as error:
                # One unreachable page must not discard the links that were
                # already collected for this gemstone.
                print('request failed:', error)
            pbar.update()

    gem_img_links = []

    future_to_response = {executor.submit(extract_img_links, response): response for response in responses}

    with tqdm(total=len(responses)) as pbar:
        for future in cf.as_completed(future_to_response):
            gem_img_links.extend(future.result())
            pbar.update()

    # Save results
    csv_path = os.path.join('../dat/image_links/', gem.replace('/', '_') + '.csv')
    pd.DataFrame(gem_img_links, columns=[gem]).to_csv(csv_path)

def read_gem_product_links():
    """
    Reads all product link files from ``../dat/product_links``.

    Returns:
        dict: Gemstone names (the CSV column header) as keys and the list of
            product links as values.
    """
    gem_product_links = {}
    for file in os.listdir('../dat/product_links'):
        # Ignore entries of the directory that are not CSV files.
        if not file.endswith('.csv'):
            continue
        # Read without index_col and take the last column: this works for
        # files written with an index column and for files written with
        # index=False (as get_individual_gem_links does).
        data = pd.read_csv(f'../dat/product_links/{file}')
        gem = data.columns[-1]
        gem_product_links[gem] = data[gem].tolist()
    return gem_product_links

def get_all_gem_image_links():
    """
    Collects the image links for every CSV file in ``../dat/product_links/``.

    A file is skipped when a file of the same name already exists in
    ``../dat/image_links/`` or when it contains no product links.
    """
    for file in os.listdir('../dat/product_links/'):
        # splitext keeps dots that belong to the gem name itself.
        gem_name = os.path.splitext(file)[0]
        if os.path.exists(f'../dat/image_links/{file}'):
            print('skipping', gem_name)
            continue
        # Read without index_col and take the last column: this works for
        # files written with an index column and for files written with
        # index=False (as get_individual_gem_links does).
        data = pd.read_csv(f'../dat/product_links/{file}')
        if data.empty:
            print('empty, skipping', gem_name)
            continue
        gem = data.columns[-1]
        links = data[gem].tolist()

        print('getting image links for', gem)
        get_img_links([gem, links])
        print('done')


In [17]:
# Collect the image links for every product link file in dat/product_links.
get_all_gem_image_links()

skipping Alexandrite Cats Eye
skipping Alexandrite
skipping Almandine Garnet (Pyrope)
skipping Amber
empty, skipping Amethyst Cabochon
skipping Amethyst
skipping Ametrine
skipping Ammolite
empty, skipping Andalusite
empty, skipping Andesine Labradorite
empty, skipping Andesine
skipping Angel Skin Coral
skipping Apatite Cat's Eye
skipping Apatite
skipping Aquamarine
skipping Australian Opal
skipping Aventurine
empty, skipping Azurite
skipping Basra Pearls
skipping Beryl
empty, skipping Bicolor Sapphire
empty, skipping Bixbite
skipping Black Onyx
getting image links for Black Opal


  0%|          | 0/659 [00:00<?, ?it/s]

 30%|███       | 199/659 [12:39<47:51,  6.24s/it]  

### Downloading images

Now we download all the collected images to the `dat/images` directory.

In [12]:
def save_image(gem, response):
    """
    Stores one downloaded image as a JPEG file in the gemstone's directory.

    Args:
        gem (str): Gemstone name, used as the sub-directory of ``img_path``.
        response: Response whose ``url`` provides the file name and whose
            ``content`` holds the image bytes.
    """
    # File name: second path segment after the last '/p' of the URL,
    # without the query string.
    url_tail = response.url.split('/p')[-1]
    filename = url_tail.split('/')[1].split('?')[0]

    picture = Image.open(io.BytesIO(response.content))
    # Convert to RGB if necessary
    picture = picture if picture.mode == 'RGB' else picture.convert('RGB')

    target = os.path.join(img_path, gem, filename + '.jpg')
    picture.save(target, format='JPEG')

# Asynchronous image downloads
def process_gem(gem, links):
    """
    Downloads all images of one gemstone and saves them via ``save_image``.

    Args:
        gem (str): Gemstone name that is passed on to ``save_image``.
        links (list): Image URLs to download.
    """
    pending = (grequests.get(url, stream=True, headers=HEADERS) for url in links)

    with tqdm(total=len(links)) as progress:
        # Adjust pool size
        for reply in grequests.imap(pending, size=8):
            if not reply:  # Skip failures
                progress.update()
                continue
            save_image(gem, reply)
            progress.update()

def process_file(file):
    """
    Downloads the images listed in one image link CSV file.

    Args:
        file (str): Path of a CSV file whose first column is the index and
            whose second column (named after the gemstone) holds image links.

    The gemstone is skipped when its image directory already contains files.
    """
    data = pd.read_csv(file, index_col=0)
    gem = data.columns[0]
    links = data[gem].tolist()

    target_dir = img_path + gem
    # The directory is created before downloading, so an empty directory means
    # that no image was saved yet; only a non-empty one counts as done.
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        return
    os.makedirs(target_dir, exist_ok=True)
    process_gem(gem, links)

# Directory with the image link CSV files and target directory for the images.
link_path = '../dat/image_links'
img_path = '../dat/images/'

executor = cf.ThreadPoolExecutor(max_workers=4)

# One download task per image link file.
csv_paths = [os.path.join(link_path, name) for name in os.listdir(link_path)]
futures = [executor.submit(process_file, csv_path) for csv_path in csv_paths]

with tqdm(total=len(futures)) as pbar:
    for future in cf.as_completed(futures):
        # result() re-raises an exception that occurred in the worker.
        future.result()
        pbar.update()

  0%|          | 0/58 [00:00<?, ?it/s]

100%|██████████| 58/58 [00:00<00:00, 253.95it/s]
  with loop.timer(seconds, ref=ref) as t:


In [10]:
# Report the number of images per gemstone and remove directories that are
# empty, so that those gemstones are not treated as already downloaded.
for gem_dir in os.listdir(img_path):
    gem_dir_path = os.path.join(img_path, gem_dir)
    # Stray files inside img_path cannot be listed like a directory.
    if not os.path.isdir(gem_dir_path):
        continue
    image_count = len(os.listdir(gem_dir_path))
    print(gem_dir, image_count)
    if image_count == 0:
        shutil.rmtree(gem_dir_path)
        print('deleted', gem_dir)

Alexandrite 1064
Alexandrite Cats Eye 0
deleted Alexandrite Cats Eye
Almandine Garnet (Pyrope) 0
deleted Almandine Garnet (Pyrope)
Amber 112
Amethyst 2004
Ametrine 633
Ammolite 0
deleted Ammolite
Angel Skin Coral 0
deleted Angel Skin Coral
Apatite 0
deleted Apatite
Apatite Cat's Eye 0
deleted Apatite Cat's Eye
Aquamarine 2648
Australian Opal 0
deleted Australian Opal
Aventurine 0
deleted Aventurine
Basra Pearls 0
deleted Basra Pearls
Beryl 0
deleted Beryl
Black Onyx 0
deleted Black Onyx
Blue Sapphire (Neelam) 5874
Blue Topaz 1375
Blue Zircon 449
Burmese Ruby 1158
Cats Eye 1826
Citrine (Sunela) 1620
Colombian Emerald 1179
Cornflower Blue Sapphire 40
Emerald (Panna) 8516
Fire Opal 390
Garnet 1637
Hessonite (Gomed) 1849
Iolite (Neeli) 416
Kashmir Blue Sapphire 122
Kyanite 262
Lapis Lazuli 144
Moldavite 1150
Moonstone 372
Navratna 1
No Oil Emerald 5
Opal 3397
Padparadscha Sapphire 16
Panjshir Emerald 17
Paraiba Tourmaline 17
Pearl (Moti) 144
Peridot 17
Pigeon Blood Ruby 14
Pink Sapphire 18

  with loop.timer(seconds, ref=ref) as t:
