In [1]:
import requests
import time
import tqdm
import csv
import pandas as pd
from bs4 import BeautifulSoup, Comment

In [204]:
# apollo
apollo_url = "https://www.apollodiagnostics.in/health-checkup-packages/bangalore"

def get_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    return soup

In [206]:
def extract_price(element):
    digits_and_dot = [char for char in element if char.isdigit() or char == '.']
    if any(char.isdigit() or char == '.' for char in digits_and_dot):
        return float(''.join(digits_and_dot))
    else:
        return None


def apollo(soup, csv_filename='apollo.csv'):
    def extract_details(details_url):
        response1 = requests.get(details_url)
        soup1 = BeautifulSoup(response1.content, "html.parser")
        try:
            sample_type_elem = soup1.find('span', class_='test-type-icon').find_parent('td')
            sample_type = sample_type_elem.find_next('td').text.strip() if sample_type_elem else None
            
            general_faq_elem = soup1.find('div', class_='general-faq-index')
            general_faq = general_faq_elem.text.strip() if general_faq_elem else None

            return {
                "sample_type": sample_type,
                "general_faq": general_faq
            }
        except Exception as e:
            print(f"Error extracting details: {e}")
            return {
                "sample_type": None,
                "general_faq": None
            }
    
    # create folder 
    if not os.path.exists('datasets'):
        os.makedirs('datasets')

    csv_path = os.path.join('datasets', csv_filename)

    try:
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ["product_name", "data_id", "itemid", "pre_test_info", "report_delivery",
                          "home_collection_available", "price", "sample_type", "general_faq"]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for product_div in tqdm.tqdm(soup.find_all('div', class_='tests-category-product-item'), desc="Processing URLs "):
                try:
                    product_name = product_div.find('h4', class_='tests-category-product-name').find('a').text.strip()
                    data_id = product_div.find('h4', class_='tests-category-product-name')['data-id']
                    itemid = product_div.find('button', class_='addtocart')['itemid']
                    pre_test_info = product_div.find('div', class_='tests-category-product-info').find('p').text.strip()
                    report_delivery = product_div.find('div', class_='tests-category-product-report').find('p').text.strip()
                    home_collection_available = product_div.find('p', class_='tests-category-product-home-available').text.strip()
                    price_element = product_div.find('p', class_='tests-category-product-price').text.strip()
                    price = extract_price(price_element)
                    time.sleep(1)
                    details_url = f'{apollo_url}/{data_id}'
                    details = extract_details(details_url)

                    writer.writerow({
                        "product_name": product_name,
                        "data_id": data_id,
                        "itemid": itemid,
                        "pre_test_info": pre_test_info,
                        "report_delivery": report_delivery,
                        "home_collection_available": home_collection_available,
                        "price": price,
                        "sample_type": details["sample_type"],
                        "general_faq": details["general_faq"]
                    })
                except Exception as e:
                    print(f"Error processing product: {e}")

    except Exception as e:
        print(f"Error creating CSV file: {e}")

    df = pd.read_csv(csv_filename)
    return df

In [197]:
result_df = apollo(get_content(apollo_url))
result_df.head()

Processing URLs :  50%|████████████████                | 215/427 [12:12<22:26,  6.35s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  60%|███████████████████             | 255/427 [14:29<09:28,  3.30s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  60%|███████████████████▏            | 256/427 [14:33<09:22,  3.29s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  60%|███████████████████▎            | 257/427 [14:36<09:14,  3.26s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  60%|███████████████████▎            | 258/427 [14:40<09:47,  3.47s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  64%|████████████████████▌           | 275/427 [15:37<08:33,  3.38s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  65%|████████████████████▋           | 276/427 [15:40<08:17,  3.29s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs :  65%|████████████████████▊           | 277/427 [15:43<08:08,  3.25s/it]

Error extracting details: 'NoneType' object has no attribute 'find_parent'


Processing URLs : 100%|████████████████████████████████| 427/427 [24:55<00:00,  3.50s/it]


Unnamed: 0,product_name,data_id,itemid,pre_test_info,report_delivery,home_collection_available,price,sample_type,general_faq
0,Free T3 & Free T4,free-t3-free-t4,2291,Any special preparation for the test is not re...,Speak to our customer care,Home Collection Available,700.0,Blood,FREE T4 (FT4)\n FREE T4 (FT4)\n\nFREE T3 (FT3)...
1,Antithyroid Antibodies (anti Tpo And Anti Thyr...,antithyroid-antibodies-anti-tpo-and-anti-thyro...,1238,No special preparation is required for the test.,Speak to our customer care,Home Collection Available,2750.0,Blood,ANTI THYROGLOBULIN ANTIBODY\n ANTI THYROGLOBUL...
2,Glucose - Serum/plasma - Fasting And 2 Hours A...,glucose-serumplasma-fasting-and-2-hours-after-...,1009,"First sample to be provided, with overnight fa...",Speak to our customer care,Home Collection Available,140.0,Blood,"GLUCOSE, FASTING\n GLUCOSE\n FASTING\n\nGLUCOS..."
3,Glucose Fasting & Pp,glucose-fasting-pp,1020,"First sample to be provided, with overnight fa...",Speak to our customer care,Home Collection Available,160.0,Blood,"GLUCOSE, FASTING\n GLUCOSE\n FASTING\n\nGLUCOS..."
4,"Fsh,lh,prolactin, T3,t4,tsh",fshlhprolactin-t3t4tsh,1025,No special preparation is required for the test.,Speak to our customer care,Home Collection Available,1950.0,Blood,FOLLICLE STIMULATING HORMONE (FSH)\n FOLLICLE ...
