In [22]:
import json
import time
import random
import re
import traceback
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException, StaleElementReferenceException,
    ElementClickInterceptedException, WebDriverException
)

In [None]:
# Helper functions

def safe_text(parent, xpath):
    try:
        return parent.find_element(By.XPATH, xpath).text.strip()
    except:
        return ""

def get_image_and_link(card):
    """Extract image URL and restaurant page link from a card"""
    image_url = ""
    link_url = ""
    
    # Try to get the <a> link
    try:
        a_tag = card.find_element(By.XPATH, ".//a[contains(@class,'sc-hqGPoI')]")
        link_url = a_tag.get_attribute("href")
    except:
        pass

    # Try to get the <img> element inside card
    try:
        img = card.find_element(By.XPATH, ".//img")
        possible_attrs = ["src", "data-src", "data-original", "data-srcset", "srcset"]

        for attr in possible_attrs:
            val = img.get_attribute(attr)
            if val and "http" in val:
                if " " in val:
                    val = val.split(" ")[0]  # handle srcset
                image_url = val
                break
    except:
        pass

    return image_url, link_url

def scroll_full_page(driver, pause_time=1.5):
    """Scrolls to bottom of page to load all restaurants"""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

def get_links(element):
    links = []

    if element.tag_name.lower() == "a":
        href = element.get_attribute("href")
        if href:
            links.append(href)

    nested_links = element.find_elements(By.XPATH, ".//a[@href]")
    for link in nested_links:
        links.append(link.get_attribute("href"))

    return list(set(links))   # remove duplicates

In [24]:
driver = webdriver.Chrome()
driver.maximize_window()
wait = WebDriverWait(driver, 12)

print("Opening Zomato…")
driver.get("https://www.zomato.com/")


Opening Zomato…


In [25]:
try:
    btn = wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//*[contains(text(),'Accept')]")
    ))
    btn.click()
    print("Cookies accepted.")
except:
    print("No cookie popup.")


No cookie popup.


In [26]:
driver.execute_script("window.scrollTo(0, 500);")
time.sleep(1.5)

link = wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, 
    "a[href='https://www.zomato.com/restaurants']"))
)
driver.execute_script("arguments[0].click()", link)

driver.switch_to.window(driver.window_handles[-1])
time.sleep(2)
print(f"{driver.title} page opened.")


 page opened.


In [27]:
try:
        filters = wait.until(EC.element_to_be_clickable((By.XPATH, "//div[contains(text(),'Filters')]")))
        filters.click()

        more_filters = wait.until(EC.element_to_be_clickable((By.XPATH, "//p[text()='More filters']/parent::div")))
        more_filters.click()

        search_box = wait.until(EC.element_to_be_clickable((By.XPATH, "//section[@label='Search here']//input")))
        search_box.send_keys("pure veg")

        pureveg = wait.until(EC.element_to_be_clickable((By.XPATH, "//label[contains(text(),'Pure veg')]")))
        pureveg.click()

        apply_btn = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[.//span[contains(text(),'Apply')]]")))
        apply_btn.click()
        time.sleep(4)
    
except Exception as e:
        print("Filter error:", e)

In [28]:


print("Scrolling full page…")
scroll_full_page(driver)


Scrolling full page…


In [29]:
cards = driver.find_elements(
    By.XPATH, "//a[contains(@class,'sc-hPeUyl') and contains(@class,'cKQNlu')]"
)

print(f"Found {len(cards)} restaurant cards.")


Found 9 restaurant cards.


In [30]:
restaurants = []

for idx, card in enumerate(cards, start=1):

    if idx > 15:
        break
    
    driver.execute_script("arguments[0].scrollIntoView(true);", card)
    time.sleep(0.1)

    name = safe_text(card, ".//h4")
    cuisine = safe_text(card, ".//p[contains(@class,'fSxdnq')]")
    price = safe_text(card, ".//p[contains(@class,'KXcjT')]")
    location = safe_text(card, ".//div[contains(@class,'min-basic-info-left')]/p")
    page_link = get_links(card)

    restaurants.append({
        "name": name,
        "cuisine": cuisine,
        "price_for_two": price,
        "location": location,
        # "image_url": img_url,
        "page_link": page_link
    })

    if idx % 25 == 0:
        print(f"{idx} restaurants processed…")

print("Extraction completed.")


Extraction completed.


In [31]:
import json
from datetime import datetime

# timestamp generate
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

filename = f"final_restaurants_info_{timestamp}.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(restaurants, f, ensure_ascii=False, indent=4)

print(f"Exported JSON with {len(restaurants)} restaurants to {filename}")


Exported JSON with 9 restaurants to final_restaurants_info_2025-11-23_15-01-52.json


In [32]:
count = 0
for rest in restaurants:
    # if count > 3:
    #     break
    print('-------------------------')
    print(rest['page_link'][0])
    print('-------------------------')
    count += 1


-------------------------
https://www.zomato.com/ahmedabad/9834-the-fruit-truck-2-thaltej/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/bowl-story-gurukul/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/ray-cafe-panjrapol-sanstha/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/ristretto-behind-the-rods-navrangpura/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/rungg-premium-dining-panjrapol-sanstha/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/pizzaiiolo-the-wood-fired-pizza-thaltej/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/union-ambli/info
-------------------------
-------------------------
https://www.zomato.com/ahmedabad/the-shaka-cafe-prahlad-nagar/info
-------------------------
-------------------------
https://www.zomato.

In [33]:
for rest in restaurants:
    print(rest)
    print('--'*20)

{'name': '9834 The Fruit Truck', 'cuisine': 'Juices, Healthy Food, Shake, Desserts, Beverages', 'price_for_two': '₹600 for two', 'location': 'Thaltej, Ahmedabad', 'page_link': ['https://www.zomato.com/ahmedabad/9834-the-fruit-truck-2-thaltej/info']}
----------------------------------------
{'name': 'Bowl Story', 'cuisine': 'Italian, Mexican, North Indian, Continental', 'price_for_two': '₹600 for two', 'location': 'Gurukul, Ahmedabad', 'page_link': ['https://www.zomato.com/ahmedabad/bowl-story-gurukul/info']}
----------------------------------------
{'name': 'Ray Cafe', 'cuisine': 'Cafe, Coffee, Mexican, Pasta, Pizza, Desserts, Beverages', 'price_for_two': '₹1,500 for two', 'location': 'Panjrapol Sanstha, Ahmedabad', 'page_link': ['https://www.zomato.com/ahmedabad/ray-cafe-panjrapol-sanstha/info']}
----------------------------------------
{'name': 'Ristretto - Behind The Rods', 'cuisine': 'Continental, Chinese, North Indian, Lebanese, Italian, Fast Food, Desserts, Beverages', 'price_for

In [34]:
# Close the first driver
driver.quit()

In [None]:
def get_restaurant_name(driver, wait):
    try:
        name_el = wait.until(EC.visibility_of_element_located((By.XPATH, "//h1")))
        return name_el.text.strip()
    except:
        return None



In [36]:
def get_ratings(driver, wait):
    ratings = {}
    # Dining
    try:
        din_score_el = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[text()='Dining Ratings']/preceding::div[contains(@class,'cILgox')][1]")
            )
        )
        din_score = float(din_score_el.text.strip())
        din_count_el = din_score_el.find_element(By.XPATH, "following::div[contains(@class,'kEgyiI')][1]")
        din_count = int(din_count_el.text.strip().replace(",", ""))
        ratings["dining"] = {"score": din_score, "count": din_count}
    except:
        pass

    # Delivery
    try:
        del_score_el = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[text()='Delivery Ratings']/preceding::div[contains(@class,'cILgox')][1]")
            )
        )
        del_score = float(del_score_el.text.strip())
        del_count_el = del_score_el.find_element(By.XPATH, "following::div[contains(@class,'kEgyiI')][1]")
        del_count = int(del_count_el.text.strip().replace(",", ""))
        ratings["delivery"] = {"score": del_score, "count": del_count}
    except:
        pass

    return ratings




In [37]:
def get_cuisines(driver, wait):
    try:
        cuisine_els = wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, "//section//div[contains(@class,'fXdtVd')]//a")
        ))
        cuisines = []
        for c in cuisine_els:
            text = c.text.strip()
            if text and text not in cuisines:
                cuisines.append(text)
        return cuisines
    except:
        return []



In [38]:
def get_address(driver):
    try:
        addr_el = driver.find_element(By.XPATH, "//section//div[contains(@class,'ckqoPM')]")
        return addr_el.text.strip()
    except:
        return None

def get_timing(driver):
    try:
        timing_el = driver.find_element(
            By.XPATH, "//section//span[contains(text(),'am') or contains(text(),'pm') or contains(text(),'–')]"
        )
        return timing_el.text.strip()
    except:
        return None

def get_cost_for_two(driver):
    try:
        cost_el = driver.find_element(
            By.XPATH, "//section//div[contains(@class,'ePRRqr') and contains(text(),'₹')]"
        )
        return cost_el.text.strip()
    except:
        return None

def get_phone(driver):
    try:
        phone_el = driver.find_element(
            By.XPATH, "//a[starts-with(@href,'tel:') and contains(@class,'leEVAg')]"
        )
        return phone_el.text.strip()
    except:
        return None


In [39]:
def scrap_full_restaurant(driver, link, wait_time=12):
    wait = WebDriverWait(driver, wait_time)

    def safe_click(xpath):
        try:
            btn = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
            btn.click()
            time.sleep(2)
            return True
        except Exception:
            return False

    def load_all_scroll():
        pause = 1.2
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def get_all_images(selector):
        elements = driver.find_elements(By.CSS_SELECTOR, selector)
        urls = []
        for el in elements:
            try:
                src = (el.get_attribute("src") or
                       el.get_attribute("data-src") or
                       el.get_attribute("srcset"))
                if src:
                    urls.append(src)
            except:
                pass
        return list(set(urls))

    def clean_filename(name):
        return re.sub(r'[^a-zA-Z0-9_-]+', '_', name.strip())

    driver.get(link)
    time.sleep(3)
    title = driver.title.split('|')[0].strip()
    safe_title = clean_filename(title)

    dishes = []
    if safe_click("//*[text()='Order Online']"):
        time.sleep(3)
        cards = driver.find_elements(By.XPATH, "//*[contains(@class,'sc-iLWYPX')]")
        for card in cards:
            try:
                dish = {
                    "name": card.find_element(By.TAG_NAME, "h4").text,
                    "price": card.find_element(By.TAG_NAME, "span").text,
                    "detail": card.find_element(By.CLASS_NAME, "sc-isojaI").text
                }
                dishes.append(dish)
            except:
                pass

    photos = []
    if safe_click("//*[text()='Photos']"):
        load_all_scroll()
        photos = get_all_images("img.sc-s1isp7-5")

    menu_photos = []
    if safe_click("//*[text()='Menu']"):
        load_all_scroll()
        menu_photos = get_all_images("img.sc-s1isp7-5")

    data = {
        "restaurant_name": title,
        "url": link,
        "dishes": dishes,
        "photos": photos,
        "menu_photos": menu_photos
    }

    return data



In [40]:
def scrap_restaurant_full(driver, rest_id, url, timeout=15):
    wait = WebDriverWait(driver, timeout)
    restaurant = {
        "id": rest_id,
        "name": None,
        "ratings": {},
        "cuisines": [],
        "address": None,
        "timing": None,
        "cost_for_two": None,
        "phone": None
    }

    driver.get(url)
    time.sleep(2)

    restaurant["name"] = get_restaurant_name(driver, wait)
    if restaurant["name"]:
        print('name is added successfully')
    else:
        print(f'something problem')
        
    restaurant["ratings"] = get_ratings(driver, wait)
    if restaurant["ratings"]:
        print('ratings is added successfully')
    else:
        print(f'something problem')
        
    restaurant["cuisines"] = get_cuisines(driver, wait)
    if restaurant["cuisines"]:
        print('cuisines is added successfully')
    else:
        print(f'something problem')
        
    restaurant["address"] = get_address(driver)
    if restaurant["address"]:
        print('address is added successfully')
    else:
        print(f'something problem')
    
    restaurant["timing"] = get_timing(driver)
    if restaurant["timing"]:
        print('timing is added successfully')
    else:
        print(f'something problem')
        
    restaurant["cost_for_two"] = get_cost_for_two(driver)
    if restaurant["cost_for_two"]:
        print('cost_for_two is added successfully')
    else:
        print(f'something problem')
    
    restaurant["phone"] = get_phone(driver)
    if restaurant["phone"]:
        print('restaurant["phone"] is added successfully')
    else:
        print(f'something problem')
        
    restaurant_details = scrap_full_restaurant(driver, url)
    if restaurant_details:
        print('restaurant_details is added successfully')
    else:
        print('something problem')
    
    restaurant["dishes"] = restaurant_details.get("dishes", [])
    restaurant["photos"] = restaurant_details.get("photos", [])
    restaurant["menu_images"] = restaurant_details.get("menu_photos", [])

    return restaurant


In [41]:
# # loop thorugh the restaurant

# Function to split list into chunks
def chunk_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

chunk_size = 100
chunks = list(chunk_list(restaurants, chunk_size))

total_restaurants = len(restaurants)
scraped_count = 0

for part_idx, chunk in enumerate(chunks, 1):
    start_idx = scraped_count + 1
    end_idx = scraped_count + len(chunk)
    
    # Generate timestamp for filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"scraped_restaurants_from_{start_idx}_to_{end_idx}_{timestamp}.json"
    
    print(f"\nScraping part {part_idx} ({start_idx} to {end_idx}) ...")
    
    # Start fresh driver
    driver = webdriver.Chrome()
    
    part_results = []
    for idx, r in enumerate(chunk, start_idx):
        print(f"Scraping restaurant {idx} of {total_restaurants}: {r['name']}")
        try:
            data = scrap_restaurant_full(driver, idx, r["page_link"][0])
            part_results.append(data)
        except Exception as e:
            print(f"Error scraping {r['name']}: {e}")
            continue
    
    driver.quit()  # Close driver for this chunk
    
    # Save this chunk immediately
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(part_results, f, ensure_ascii=False, indent=2)
    
    print(f"Saved {len(part_results)} records to {output_file}")
    
    scraped_count += len(chunk)
    
    # Optional pause
    time.sleep(5)

print("Scraping complete!")



Scraping part 1 (1 to 9) ...
Scraping restaurant 1 of 9: 9834 The Fruit Truck
name is added successfully
ratings is added successfully
cuisines is added successfully
address is added successfully
timing is added successfully
cost_for_two is added successfully
restaurant["phone"] is added successfully
restaurant_details is added successfully
Scraping restaurant 2 of 9: Bowl Story
name is added successfully
ratings is added successfully
cuisines is added successfully
address is added successfully
timing is added successfully
cost_for_two is added successfully
restaurant["phone"] is added successfully
restaurant_details is added successfully
Scraping restaurant 3 of 9: Ray Cafe
name is added successfully
ratings is added successfully
cuisines is added successfully
address is added successfully
timing is added successfully
cost_for_two is added successfully
restaurant["phone"] is added successfully
restaurant_details is added successfully
Scraping restaurant 4 of 9: Ristretto - Behind The

In [42]:
# ----------------------------------------------------Testing--------------------------------------------------