# Scraping gempundit.com
We obtain our data by scraping the website gempundit.com. The store offers a variety of gemstones and for each product there are multiple images from different angles.


In [1]:
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup as bs
import lxml
#import requests
import grequests
import shutil
import os
import io
from PIL import Image
import json
import concurrent.futures as cf
from tqdm import tqdm
from random import randint, random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  with loop.timer(seconds, ref=ref) as t:


In [2]:
# Entry page of the gemstone catalogue and the browser-like headers sent with each request.
GEMSTONES_CATEGORY_URL = 'https://www.gempundit.com/gemstones'
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

# Shared HTTP session (sends HEADERS by default) and the thread pool used by the scraping cells.
session = grequests.Session()
session.headers.update(HEADERS)
executor = cf.ThreadPoolExecutor(max_workers=12)


First, we extract the links to all of the gemstone types listed on the website.

In [5]:
def get_all_gem_links_website(ALL_GEMS_URL):
    """Collect the category link of every gemstone type on the overview page.

    Args:
        ALL_GEMS_URL: URL of the page that lists all gemstone categories.

    Returns:
        dict mapping each anchor's ``title`` attribute to its ``href``.

    Raises:
        RuntimeError: If the page cannot be fetched or the container
            ``div`` holding the category links is not found.
    """
    # grequests.get() only builds an unsent AsyncRequest, which has no `.text`;
    # grequests.map() sends it and returns the Response (None on failure).
    response = grequests.map([grequests.get(ALL_GEMS_URL, headers=HEADERS)])[0]
    if response is None:
        raise RuntimeError(f'could not load {ALL_GEMS_URL}')
    soup = bs(response.text, 'html.parser')
    print('succesfully loaded page')

    gem_table = soup.find('div', {'class': 'container'})
    if gem_table is None:
        raise RuntimeError(f'no container div found on {ALL_GEMS_URL}')
    print('succesfully found gem table, now collecting links')

    gem_links = {}
    for gem in gem_table.find_all('a', {'data-category': 'gemstones'}):
        title = gem.get('title')
        print(title)
        # add the link to the dictionary
        gem_links[title] = gem.get('href')
    print('succesfully collected all gem links')
    return gem_links


gem_links = get_all_gem_links_website(GEMSTONES_CATEGORY_URL)

succesfully loaded page
succesfully found gem table, now collecting links
Blue Sapphire (Neelam)
Cats Eye
Emerald (Panna)
Hessonite (Gomed)
Pearl (Moti)
Ruby (Manik)
Red Coral (Moonga)
White Sapphire
Yellow Sapphire - Pukhraj
Amethyst
Citrine (Sunela)
Fire Opal
Garnet
Iolite (Neeli)
Navratna
Opal
Peridot
Pitambari Neelam
Turquoise
White Coral
Yellow Topaz
Zircon
Alexandrite
Burmese Ruby
Colombian Emerald
Cornflower Blue Sapphire
Kashmir Blue Sapphire
No Oil Emerald
Panjshir Emerald
Padparadscha Sapphire
Paraiba Tourmaline
Pigeon Blood Ruby
Pink Sapphire
Royal Blue Sapphire
Tanzanite
Amber
Ametrine
Aquamarine
Blue Topaz
Kyanite
Lapis Lazuli
Moldavite
Moonstone
Star Ruby
Tourmaline
Spinel
Blue Zircon
succesfully collected all gem links


Now we collect the links to all of the product pages for each gemstone category.

In [None]:
def get_all_gem_pages(gem_links):
    """Build the list of listing-page URLs for every gemstone category.

    Args:
        gem_links: dict mapping gemstone name to its category URL.

    Returns:
        dict mapping gemstone name to the list of URLs
        ``<category>/page/1`` ... ``<category>/page/<number_of_pages>``.
    """
    print('getting all gem pages')
    gem_page_links = {}
    for gem, base_url in gem_links.items():
        print('\n', gem)
        # Page 1000 is requested and the text of the "current" pagination
        # entry is read as the page count (presumably the site then shows
        # its last page - verify against the live site).
        # grequests.get() alone is an unsent request; map() sends it and
        # returns the Response, or None when the request failed.
        response = grequests.map(
            [grequests.get(base_url + '/page/1000', headers=HEADERS)]
        )[0]
        try:
            current = bs(response.text, 'html.parser').find('li', {'class': 'current'})
            number_of_pages = int(current.text)
        except (AttributeError, ValueError):
            # Failed request, no pagination element, or non-numeric text:
            # fall back to a single page.
            number_of_pages = 1
        print('')
        print(gem, number_of_pages)
        gem_page_links[gem] = [
            base_url + '/page/' + str(page_no)
            for page_no in range(1, number_of_pages + 1)
        ]
    print('-'*50)
    print('succesfully collected links')
    return gem_page_links
all_pages = get_all_gem_pages(gem_links)

For later use we will save the dictionary with all the gemstone page links as a JSON file.

In [7]:
def save_gem_page_links(gem_page_links):
    """Write the page-link mapping to ``../dat/page_links/gem_page_links.json``.

    Args:
        gem_page_links: JSON-serialisable object to store (here the dict of
            gemstone name -> list of listing-page URLs).
    """
    path = "../dat/page_links/gem_page_links.json"
    # Create the target folder on first use; open() would otherwise raise
    # FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as outfile:
        json.dump(gem_page_links, outfile)
# Store the collected page links on disk so they can be reloaded later.
save_gem_page_links(all_pages)

#### Reloading gemstone page links

In [3]:
def load_gem_page_links():
    """Read back the page-link mapping stored as JSON.

    Returns:
        The object decoded from ``../dat/page_links/gem_page_links.json``.
    """
    with open("../dat/page_links/gem_page_links.json", "r") as json_file:
        return json.load(json_file)
# Restore the page links from the JSON file instead of crawling the category pages again.
all_pages = load_gem_page_links()

#

After obtaining all the links to the gemstone pages, we will extract the links to each product.

Because of the access and speed limits, we will use multiple threads to collect the links.

The website is rate-limited, so after each page of links is collected we sleep for a short random interval (up to a few seconds).
If we don't do this, the website will block our requests.

The collected product links will be saved in the directory `dat/product_links`.

In [4]:
def get_individual_gem_links(data):
    """Collect the product links of one gemstone and store them as a CSV file.

    Args:
        data: Sequence ``[gem, links]`` - the gemstone name and the list of
            listing-page URLs to scan.

    Returns:
        ``1`` after ``../dat/product_links/<gem>.csv`` has been written,
        ``None`` when that file already existed and the gemstone was skipped.

    Raises:
        RuntimeError: If a listing page could not be fetched. No CSV is
            written in that case, so a later run retries the gemstone.
    """
    gem = data[0]
    links = data[1]
    csv_path = '../dat/product_links/' + gem + '.csv'

    # Resume support: a gemstone whose CSV already exists is not scraped again.
    if os.path.exists(csv_path):
        print('skipping', gem)
        return

    gem_product_links = []
    for page in links:
        # grequests.get() alone is an unsent request without `.text`;
        # map() sends it and returns the Response (None on failure).
        response = grequests.map([grequests.get(page, headers=HEADERS)])[0]
        if response is None:
            raise RuntimeError(f'could not load {page}')
        soup = bs(response.text, 'html.parser')
        anchors = soup.find_all('a', {'class': 'product-image dataimage'})
        gem_product_links.extend(anchor.get('href') for anchor in anchors)
        # sleep for a random time to avoid being blocked
        sleep(random()*2)
        if random() < 0.1:
            sleep(randint(1, 5))
    pd.DataFrame(gem_product_links, columns=[gem]).to_csv(csv_path)
    # Extra pause before this worker takes the next gemstone
    # (presumably further rate-limit relief).
    sleep(10)
    return 1
def get_gem_product_links(gem_page_links):
    """Scrape the product links of all gemstones on the shared thread pool.

    Args:
        gem_page_links: dict mapping gemstone name to its list of
            listing-page URLs.

    One ``get_individual_gem_links`` task is submitted per gemstone and a
    progress bar advances as tasks finish.
    """
    futures = [
        executor.submit(get_individual_gem_links, [gem, page_links])
        for gem, page_links in gem_page_links.items()
    ]
    with tqdm(total=len(futures)) as pbar:
        for future in cf.as_completed(futures):
            # result() re-raises an exception raised inside the worker, so
            # failures are not silently dropped.
            future.result()
            pbar.update()
    print('succesfully collected all product links')


print('starting to collect all product links')
# The success message is printed by get_gem_product_links itself.
get_gem_product_links(all_pages)

starting to collect all product links
skipping Cats Eye
skipping Hessonite (Gomed)
skipping Pearl (Moti)
skipping Red Coral (Moonga)
skipping White Sapphire
skipping Yellow Sapphire - Pukhraj
skipping Amethyst
skipping Fire Opal
skipping Garnet
skipping Citrine (Sunela)
skipping Navratna
skipping Peridot
skipping Iolite (Neeli)
skipping Turquoise
skipping White Coral
skipping Pitambari Neelam
skipping Zircon
skipping Yellow Topaz
skipping Colombian Emerald
skipping Cornflower Blue Sapphire
skipping Burmese Ruby
skipping Alexandrite
skipping Kashmir Blue Sapphire
skipping Padparadscha Sapphire
skipping No Oil Emerald
skipping Pigeon Blood Ruby
skipping Panjshir Emerald
skipping Paraiba Tourmaline
skipping Pink Sapphire
skipping Royal Blue Sapphire
skipping Aquamarine
skipping Blue Topaz
skipping Kyanite
skipping Lapis Lazuli
skipping Ametrine
skipping Tanzanite
skipping Amber
skipping Moldavite
skipping Moonstone
skipping Star Ruby
skipping Spinel
skipping Tourmaline
skipping Blue Zirco

100%|██████████| 47/47 [16:38<00:00, 21.25s/it]

succesfully collected all product links
succesfully collected all product links





For each product page, we will scrape the image links contained within the page.
This is again done using multiple threads to speed up the process.

The collected image links will be saved as a CSV file in the `dat/image_links/` directory.


In [4]:
def get_img_links(data):
    """Scrape the image URLs of one gemstone's product pages into a CSV file.

    Args:
        data: Sequence ``[gem, links]`` - the gemstone name and the list of
            product-page URLs.

    The pages are fetched concurrently through grequests, parsed on the
    shared thread pool, and the collected URLs are written to
    ``../dat/image_links/<gem>.csv``.
    """
    gem, links = data[0], data[1]

    def extract_img_links(response):
        """Return the thumbnail image URLs (query string removed) of one page."""
        soup = bs(response.content, 'html.parser')
        img_links = []
        for div in soup.find_all("div", class_="item product_thumb_forHeight"):
            for img in div.find_all('img'):
                # URLs containing 'certi' or 'hand' are left out.
                if 'certi' in img['src'] or 'hand' in img['src']:
                    continue
                img_links.append(img['src'].split('?')[0])
        return img_links

    # Unsent requests bound to the shared session; imap sends up to 16 at a time.
    pending = [grequests.get(link, headers=HEADERS, session=session) for link in links]
    responses = grequests.imap(pending, size=16)

    futures = []
    with tqdm(total=len(links)) as pbar:
        for response in responses:
            # Parsing runs on the thread pool; the bar tracks received responses.
            futures.append(executor.submit(extract_img_links, response))
            pbar.update()

    print('waiting for all threads to complete')

    gem_img_links = []
    for future in cf.as_completed(futures):
        gem_img_links.extend(future.result())

    # Save results
    pd.DataFrame(gem_img_links, columns=[gem]).to_csv(f'../dat/image_links/{gem}.csv')

# def read_gem_product_links():
#     gem_product_links = {}
#     for file in os.listdir('../dat/product_links'):
#         data = pd.read_csv('../dat/product_links/' + file, index_col=0)
#         gem = data.columns[0]
#         gem_product_links[gem] = data[gem].tolist()
#     return gem_product_links

def get_all_gem_image_links():
    """Scrape image links for every gemstone that has no image-link CSV yet.

    Iterates over the CSV files in ``../dat/product_links/``. A file with a
    same-named counterpart in ``../dat/image_links/`` is skipped; otherwise
    its product links are read and passed to ``get_img_links``.
    """
    for file in os.listdir('../dat/product_links/'):
        if os.path.exists('../dat/image_links/' + file):
            # `gem` is only assigned after the CSV is read below, so the
            # name shown here is taken from the file name instead.
            print('skipping', os.path.splitext(file)[0])
            continue
        data = pd.read_csv('../dat/product_links/' + file, index_col=0)
        gem = data.columns[0]
        links = data[gem].tolist()

        print('getting image links for', gem)
        get_img_links([gem, links])
        print('done')

In [3]:
# from bs4 import BeautifulSoup as bs
# import grequests
# import pandas as pd
# import os
# from tqdm import tqdm
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import requests

def get_img_links(data):
    """Scrape the image URLs of one gemstone's product pages into a CSV file.

    Args:
        data: Pair ``(gem, links)`` - the gemstone name and the list of
            product-page URLs.

    The pages are fetched concurrently through grequests, parsed on the
    shared thread pool, and the collected URLs are written to
    ``../dat/image_links/<gem>.csv``.
    """
    gem, links = data

    def extract_img_links(response):
        """Return the thumbnail image URLs (query string removed) of one page."""
        if response.status_code != 200:
            return []  # Return an empty list in case of HTTP errors
        soup = bs(response.content, 'lxml')  # Use 'lxml' for faster parsing
        img_links = []
        for div in soup.find_all("div", class_="item product_thumb_forHeight"):
            for img in div.find_all('img'):
                src = img.get('src')
                # An <img> without `src` used to raise KeyError and abort the
                # whole gemstone; such tags are skipped now. URLs containing
                # 'certi' or 'hand' are left out as before.
                if not src or 'certi' in src or 'hand' in src:
                    continue
                img_links.append(src.split('?')[0])
        return img_links

    # Use a session for connection pooling
    pending = (grequests.get(link, headers=HEADERS, session=session) for link in links)

    # Send requests asynchronously and adjust concurrency with 'size'
    responses = grequests.imap(pending, size=8)

    # Submitting drains `responses`, i.e. waits for all downloads to finish.
    futures = [executor.submit(extract_img_links, response) for response in responses]

    gem_img_links = []
    # Failed requests are not yielded by imap, so the bar can end below its total.
    with tqdm(total=len(links)) as pbar:
        for future in cf.as_completed(futures):
            gem_img_links.extend(future.result())
            pbar.update()

    # Save results (create the output folder on first use)
    os.makedirs('../dat/image_links', exist_ok=True)
    pd.DataFrame(gem_img_links, columns=[gem]).to_csv(f'../dat/image_links/{gem}.csv')

def read_gem_product_links():
    """Load every product-link CSV from ``../dat/product_links``.

    Returns:
        dict mapping the first column name of each CSV (the gemstone name)
        to the values of that column as a list.
    """
    product_dir = '../dat/product_links'
    links_by_gem = {}
    for csv_name in os.listdir(product_dir):
        frame = pd.read_csv(f'{product_dir}/{csv_name}', index_col=0)
        gem_name = frame.columns[0]
        links_by_gem[gem_name] = frame[gem_name].tolist()
    return links_by_gem

def get_all_gem_image_links():
    """Scrape image links for every gemstone that has no image-link CSV yet.

    Each CSV in ``../dat/product_links/`` with a same-named file in
    ``../dat/image_links/`` is skipped; the remaining ones are read and
    their product links handed to ``get_img_links``.
    """
    for file in os.listdir('../dat/product_links/'):
        already_scraped = os.path.exists(f'../dat/image_links/{file}')
        if already_scraped:
            # Show the part of the file name before the first dot as gem name.
            print('skipping', file.partition('.')[0])
        else:
            frame = pd.read_csv(f'../dat/product_links/{file}', index_col=0)
            gem = frame.columns[0]
            product_links = frame[gem].tolist()

            print('getting image links for', gem)
            get_img_links([gem, product_links])
            print('done')


In [4]:
# Collect image links for every gemstone whose image-link CSV does not exist yet.
get_all_gem_image_links()

skipping Alexandrite
skipping Amber
skipping Amethyst
skipping Ametrine
skipping Aquamarine
skipping Blue Sapphire (Neelam)
skipping Blue Topaz
skipping Blue Zircon
skipping Burmese Ruby
skipping Cats Eye
skipping Citrine (Sunela)
skipping Colombian Emerald
skipping Cornflower Blue Sapphire
skipping Emerald (Panna)
skipping Fire Opal
skipping Garnet
skipping Hessonite (Gomed)
skipping Iolite (Neeli)
skipping Kashmir Blue Sapphire
skipping Kyanite
skipping Lapis Lazuli
skipping Moldavite
skipping Moonstone
skipping Navratna
skipping No Oil Emerald
skipping Opal
skipping Padparadscha Sapphire
skipping Panjshir Emerald
skipping Paraiba Tourmaline
skipping Pearl (Moti)
skipping Peridot
skipping Pigeon Blood Ruby
skipping Pink Sapphire
skipping Pitambari Neelam
skipping Red Coral (Moonga)
skipping Royal Blue Sapphire
skipping Ruby (Manik)
skipping Spinel
skipping Star Ruby
skipping Tanzanite
skipping Tourmaline
getting image links for Turquoise


Traceback (most recent call last):
  File "c:\Users\david\miniconda3\envs\gemstone\lib\site-packages\gevent\_ffi\loop.py", line 270, in python_check_callback
    def python_check_callback(self, watcher_ptr): # pylint:disable=unused-argument
KeyboardInterrupt
2024-02-22T10:55:28Z


KeyboardInterrupt: 

  with loop.timer(seconds, ref=ref) as t:


### Downloading images

Now we download all the collected images to the `dat/images` directory.

In [None]:
def save_image(gem, response):
    """Decode a downloaded image and store it as JPEG in the gemstone's folder.

    Args:
        gem: Gemstone name; used as the sub-folder of ``img_path``.
        response: HTTP response whose ``url`` and ``content`` are read.

    The file name is taken from the URL: the second '/'-separated piece of
    the text after the last '/p', with any query string removed, plus '.jpg'.
    """
    after_marker = response.url.split('/p')[-1]
    url_piece = after_marker.split('/')[1]
    filename = url_piece.split('?')[0]

    img = Image.open(io.BytesIO(response.content))
    # Convert to RGB if necessary
    img = img if img.mode == 'RGB' else img.convert('RGB')
    target = os.path.join(img_path, gem, filename + '.jpg')
    img.save(target, format='JPEG')

# Asynchronous image downloads
def process_gem(gem, links):
    """Download all images of one gemstone and save each one via ``save_image``.

    Args:
        gem: Gemstone name, forwarded to ``save_image``.
        links: Iterable of image URLs.
    """
    pending = (
        grequests.get(link, stream=True, headers=HEADERS, session=session)
        for link in links
    )
    # Adjust pool size
    for response in grequests.imap(pending, size=16):
        # Skip failures (falsy responses)
        if not response:
            continue
        save_image(gem, response)

def process_file(file):
    """Download every image listed in one image-link CSV file.

    Args:
        file: Path of a CSV whose first data column is named after the
            gemstone and holds the image URLs.

    Creates the gemstone's folder below ``img_path`` if needed and hands the
    URLs to ``process_gem``.
    """
    data = pd.read_csv(file, index_col=0)
    gem = data.columns[0]
    links = data[gem].tolist()
    # Build the folder path with os.path.join, the same way save_image builds
    # the file path, so it does not depend on a trailing slash in img_path.
    os.makedirs(os.path.join(img_path, gem), exist_ok=True)
    process_gem(gem, links)

# Folder with one image-link CSV per gemstone, and the root folder for the downloaded images.
link_path = '../dat/image_links'
img_path = '../dat/images/'

# Rebinds the module-level `executor` to a new pool with 8 workers.
executor = cf.ThreadPoolExecutor(max_workers=8)

# One task per CSV file in link_path; each task runs process_file on that file.
futures = [executor.submit(process_file, os.path.join(link_path, file)) for file in os.listdir(link_path)]
with tqdm(total=len(futures)) as pbar:
    for future in cf.as_completed(futures):
        future.result()  # re-raises any exception raised inside the worker
        pbar.update()