# Scraping Chope

The Chope website has a restaurant directory that will be useful while building a map of restaurants in Singapore. Due to how the website was structured, Selenium was needed to scrape the information.

NOTE: After the URL opens, you will manually need to click the 'Detailed View' button located on the top right. If you do not click this, the code will return an error. This click can be automated in the future.

![image.png](attachment:image.png)

There are some restaurants that have not been scraped as they're missing some attributes. Most of the information has been scraped successfully, however.

In [1]:
# imports
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import requests

In [2]:
# Initialize the Chrome WebDriver (make sure the chromedriver executable is in your PATH)
# NOTE: this code block will take ~20 mins to run.

driver = webdriver.Chrome()

# Open the URL
url = 'https://www.chope.co/singapore-restaurants/list_of_restaurants?source=chope.com.sg&lang=en_US'
driver.get(url) 

# Wait for the page to load (you may need to adjust the wait time)
wait = WebDriverWait(driver, 5)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-info col-xs-12 col-sm-7 cf']")))

# Define a function to scroll to the bottom of the page
def scroll_to_bottom():
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Define a function to click the "Load More" button using JavaScript
def click_load_more():
    try:
        load_more_button = driver.find_element(By.XPATH, '//*[@id="load_more"]')
        if load_more_button.is_displayed() and load_more_button.is_enabled():
            driver.execute_script("arguments[0].click();", load_more_button)
            return True
    except Exception as e:
        pass  # Handle the case when the button is not found
    return False

# Collect data
data = []  # Create a list to store data as dictionaries
scraped_names = set()  # Maintain a set of scraped restaurant names to avoid duplicates

while True:
    try:
        scroll_to_bottom()  # Scroll to the bottom of the page
        if not click_load_more():
            break  # No new elements found, stop scraping

        # Wait for new elements to load using JavaScript (adjust the wait time if needed)
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='r-info col-xs-12 col-sm-7 cf']")))
        time.sleep(2)  # Sleep for additional stability

        # Extract and add new data to the list (adjust the selector as needed)
        new_elements = driver.find_elements(By.XPATH, "//div[@class='r-info col-xs-12 col-sm-7 cf']")
        new_names = driver.find_elements(By.XPATH, "//h3[@class='r-name col-xs-12 col-sm-push-5 col-sm-7 ']/a")

        for element, name_element in zip(new_elements, new_names):
            restaurant_name = name_element.text

            # Check if the restaurant name has already been scraped to avoid duplicates
            if restaurant_name not in scraped_names:
                scraped_names.add(restaurant_name)

                try:
                    # Extract Cuisine, Location, and Price
                    cuisine_element = element.find_element(By.XPATH, ".//li/span[text()='Cuisine:']/following-sibling::p")
                    location_element = element.find_element(By.XPATH, ".//li/span[text()='Location:']/following-sibling::p")
                    price_element = element.find_element(By.XPATH, ".//li/span[text()='Price:']/following-sibling::p")

                    # Get the text values
                    cuisine = cuisine_element.text
                    location = location_element.text
                    price = price_element.text

                    # Create a dictionary with the extracted data
                    entry = {
                        'Name': restaurant_name,
                        'Cuisine': cuisine,
                        'Location': location,
                        'Price': price
                    }
                    data.append(entry)

                except Exception as e:
                    print(f"Error scraping attributes for '{restaurant_name}': {str(e)}")

    except Exception as e:
        print(f"Error in main loop: {str(e)}")
        break

# Create a pandas DataFrame from the collected data
df = pd.DataFrame(data)

# Close the WebDriver when done
driver.quit()

Error scraping attributes for 'Arc-en-ciel Patisserie': Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=117.0.5938.134)
Stacktrace:
	GetHandleVerifier [0x00007FF65CAF7D12+55474]
	(No symbol) [0x00007FF65CA677C2]
	(No symbol) [0x00007FF65C91E0EB]
	(No symbol) [0x00007FF65C8FE528]
	(No symbol) [0x00007FF65C983B77]
	(No symbol) [0x00007FF65C9975BF]
	(No symbol) [0x00007FF65C97EF33]
	(No symbol) [0x00007FF65C953D41]
	(No symbol) [0x00007FF65C954F84]
	GetHandleVerifier [0x00007FF65CE5B762+3609346]
	GetHandleVerifier [0x00007FF65CEB1A80+3962400]
	GetHandleVerifier [0x00007FF65CEA9F0F+3930799]
	GetHandleVerifier [0x00007FF65CB93CA6+694342]
	(No symbol) [0x00007FF65CA72218]
	(No symbol) [0x00007FF65CA6E484]
	(No symbol) [0x00007FF65CA6E5B2]
	(No symbol) [0x00007FF65CA5EE13]
	BaseThreadInitThunk [0x00007FFAF794257D+29]
	RtlUserThreadStart [0x00007FFAF8BEAA68+40]

Error in main loop: Message: no such window: target window alrea

In [4]:
print(df.shape) # checking the dimensions
print(df.dtypes) # checking the datatypes

(1771, 4)
Name        object
Cuisine     object
Location    object
Price       object
dtype: object


# Cleaning the dataset

The scraped data required some cleaning before it could be processed by Tableau. Here are the steps taken:

1. A column needed to be made which classified the restaurant into Vegan or meat based on the Cuisine column. If the cuisine column contains Vegan, Vegan Friendly, or Vegetarian, the restautant is assumed to be a Vegan restaurant.

2. Multiple locations are stored in a single datapoint (for e.g. Jalan Besar, Lavender). Every location needs a separate row entry as it should count as a different restaurant. (1 Tyrwhitt Bistro Bar for instance will have 2 entries).

3. Based on the locations, we need the Latitude and Longitude data. Nominatim was used for this purpose.

NOTES:

1. The Latitude and Longitude is accurate, but the postal code column contains some missing values. However, since we only need Latitude and Longitude for Tableau, this is fine.

2. The code to get the location data takes ~30 mins to run (likely due to rate limits imposed by Nominatim and the large size of the dataset).

In [5]:
# Create a new column 'Vegan/Meat' based on 'Cuisine' column
def classify_cuisine(cuisine):
    if 'Vegan' in cuisine:
        return 'Vegan'
    elif 'Vegetarian' in cuisine:
        return 'Vegan'
    elif 'Vegetarian Friendly' in cuisine:
        return 'Vegan'
    else:
        return 'Meat'

df['Vegan/Meat'] = df['Cuisine'].apply(classify_cuisine)

# Print the updated DataFrame
df.head()

Unnamed: 0,Name,Cuisine,Location,Price,Vegan/Meat
0,1 Tyrwhitt Bistro Bar,"American, Bar, Italian, Vegetarian Friendly","Jalan Besar, Lavender",$$,Vegan
1,1-Altitude Coast,"Bar, Southeast Asian",Sentosa,$$$,Meat
2,1-Arden,Bar,Raffles Place,$$$$,Meat
3,10 SCOTTS,"Bar, Japanese, Seafood, British, High Tea",Orchard,$$$,Meat
4,123 Zo Viet Bbq n Hotpot,"Vietnamese, Steamboat, BBQ",Paya Lebar,$$$,Meat


In [7]:
#Splitting the locations
# Create an empty DataFrame to store the result
result_df = pd.DataFrame(columns=df.columns)

# Iterate through the rows of the original DataFrame
for _, row in df.iterrows():
    locations = row['Location'].split(', ')
    # Check if there are multiple locations
    if len(locations) > 1:
        # If multiple locations, create a new row for each location
        for location in locations:
            new_row = row.copy()
            new_row['Location'] = location
            result_df = result_df.append(new_row, ignore_index=True)
    else:
        # If only one location, keep the row as is
        result_df = result_df.append(row, ignore_index=True)

# Print the updated DataFrame
print (result_df.head())
print (result_df.shape)

  result_df = result_df.append(new_row, ignore_index=True)
  result_df = result_df.append(row, ignore_index=True)


                    Name                                      Cuisine  \
0  1 Tyrwhitt Bistro Bar  American, Bar, Italian, Vegetarian Friendly   
1  1 Tyrwhitt Bistro Bar  American, Bar, Italian, Vegetarian Friendly   
2       1-Altitude Coast                         Bar, Southeast Asian   
3                1-Arden                                          Bar   
4              10 SCOTTS    Bar, Japanese, Seafood, British, High Tea   

        Location Price Vegan/Meat  
0    Jalan Besar    $$      Vegan  
1       Lavender    $$      Vegan  
2        Sentosa   $$$       Meat  
3  Raffles Place  $$$$       Meat  
4        Orchard   $$$       Meat  
(2002, 5)


In [16]:
#getting Latitude and Longitude
# Define the Nominatim base URL
base_url = 'https://nominatim.openstreetmap.org/search?'

# Initialize empty lists to store geolocation data
latitudes = []
longitudes = []
addresses = []
postal_codes = []

# Loop through each location in the CSV
for location in result_df['Location']:
    # Construct the URL for the geocoding request
    url = f'{base_url}q={location}&format=json&addressdetails=1'
    
    # Make the HTTP request to Nominatim
    response = requests.get(url)
    
    # Parse the JSON response
    data = response.json()
    
    if data:
        # Extract latitude, longitude, and other relevant data
        lat = data[0]["lat"]
        lon = data[0]["lon"]
        address = data[0]["display_name"]
        postal_code = data[0]["address"]["postcode"] if "postcode" in data[0]["address"] else None
        
        # Append the geolocation data to the lists
        latitudes.append(lat)
        longitudes.append(lon)
        addresses.append(address)
        postal_codes.append(postal_code)
    else:
        # Handle cases where no results are found by appending None values
        latitudes.append(None)
        longitudes.append(None)
        addresses.append(None)
        postal_codes.append(None)

# Add new columns to the original DataFrame with the geolocation data
result_df['Latitude'] = latitudes
result_df['Longitude'] = longitudes
result_df['Address'] = addresses
result_df['PostalCode'] = postal_codes

# Save the updated DataFrame to a new CSV file or use it as needed
result_df.to_csv('geocoded_location_final.csv', index=False)

In [17]:
print (result_df.head(5))
print (result_df.shape)

                    Name                                      Cuisine  \
0  1 Tyrwhitt Bistro Bar  American, Bar, Italian, Vegetarian Friendly   
1  1 Tyrwhitt Bistro Bar  American, Bar, Italian, Vegetarian Friendly   
2       1-Altitude Coast                         Bar, Southeast Asian   
3                1-Arden                                          Bar   
4              10 SCOTTS    Bar, Japanese, Seafood, British, High Tea   

        Location Price Vegan/Meat            Latitude           Longitude  \
0    Jalan Besar    $$      Vegan           2.0203597         103.3194074   
1       Lavender    $$      Vegan          1.30734955  103.86315267326367   
2        Sentosa   $$$       Meat          1.24894585  103.83430564159272   
3  Raffles Place  $$$$       Meat  1.2835416999999998  103.85146023266938   
4        Orchard   $$$       Meat          29.6041269         -95.9696778   

                                             Address PostalCode  
0        Jalan Besar, Kluang, Jo

In [18]:
# Some locations were not giving SG based adderesses (Orchard, for instance is giving a Texas address).
# Rather than running the whole code later, it was easier to store the info to a new df. 
# This was then mapped back to the main df using VLOOKUP.
Locations = [
    'Jalan Besar, Singapore',
    'Orchard, Singapore',
    'City Hall, Singapore',
    'Chinatown, Singapore',
    'Woodlands, Singapore',
    'West Coast, Singapore',
    'Changi, Singapore',
    'Jurong, Singapore',
    'Marina Central, Singapore',
    'East Coast, Singapore',
    'Queenstown, Singapore',
    'Dempsey, Singapore',
    'Duxton, Singapore',
    'Newton, Singapore',
    'Rochester, Singapore',
    'Club Street, Singapore',
    'Fullerton, Singapore',
    'Thomson, Singapore',
    'Bukit Merah, Singapore'
]

# Initialize an empty list to store geolocation data
geolocation_data = []

# Define the Nominatim base URL
base_url = 'https://nominatim.openstreetmap.org/search?'

# Loop through each location in the list
for location in Locations:
    # Construct the URL for the geocoding request
    url = f'{base_url}q={location}&format=json&addressdetails=1'
    
    # Make the HTTP request to Nominatim
    response = requests.get(url)
    
    # Parse the JSON response
    data = response.json()
    
    if data:
        # Extract latitude, longitude, and other relevant data
        lat = data[0]["lat"]
        lon = data[0]["lon"]
        address = data[0]["display_name"]
        postal_code = data[0]["address"]["postcode"] if "postcode" in data[0]["address"] else None
        
        # Store the geolocation data in a dictionary
        geolocation_data.append({
            'Location': location,
            'Latitude': lat,
            'Longitude': lon,
            'Address': address,
            'PostalCode': postal_code
        })
    else:
        # Handle cases where no results are found
        geolocation_data.append({
            'Location': location,
            'Latitude': None,
            'Longitude': None,
            'Address': None,
            'PostalCode': None
        })

# Create a DataFrame with the geolocation data
geolocation_df3 = pd.DataFrame(geolocation_data)

# Display the first 5 rows of the DataFrame
geolocation_df3.head(5)
geolocation_df3.to_csv('geocoded_locations3.csv', index=False)