# Website Scraper

# Import all packages

In [2]:
import re
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import csv

# Scrape a Single Page
- Uses Beautiful Soup.
- Extract the following properties from each listing container:
    - heading
    - pricing
    - numberOfBedrooms = container.find(itemprop="numberOfBedrooms")
    - numberOfBathrooms = container.find(itemprop="numberOfBathroomsTotal")
    - floorSize = container.find(itemprop="floorSize")

In [57]:
def scrape_page(url):
    """Scrape one listings page and return one row per listing.

    Args:
        url: Address of the listings page to download.

    Returns:
        A list of rows, each ``[heading, price, bedrooms, bathrooms,
        floor_size]`` as stripped strings. A field that is absent from a
        listing is reported as an empty string. Listings without a heading
        are skipped. An empty list is returned when the server does not
        answer with HTTP 200, so callers can always extend a list with the
        result.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, or no response within the timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        print("Failed to retrieve page:", response.status_code)
        return []

    # The class names carry a generated suffix, hence the regex matching.
    # Compile the patterns once instead of once per listing.
    container_class = re.compile(r'ListItemCard_container__[a-zA-Z0-9]+')
    heading_class = re.compile(r'CellHeading_heading__[a-zA-Z0-9]+')
    price_class = re.compile(r'CellPrice_text__[a-zA-Z0-9]+')

    def text_of(node):
        # find() yields None for a missing element; report that as ''.
        return node.text.strip() if node is not None else ''

    soup = BeautifulSoup(response.content, 'html.parser')
    all_buildings = []

    for container in soup.find_all(class_=container_class):
        heading = container.find(class_=heading_class)
        if heading is None:
            # Not a listing card we can label; skip it.
            continue

        all_buildings.append([
            text_of(heading),
            text_of(container.find(class_=price_class)),
            text_of(container.find(itemprop="numberOfBedrooms")),
            text_of(container.find(itemprop="numberOfBathroomsTotal")),
            text_of(container.find(itemprop="floorSize")),
        ])

    return all_buildings

# Selenium to navigate page

- The href of the next page could not be found in the page's HTML.
- Selenium is used instead: it clicks the "Next page" button and reads the URL the browser ends up on.

In [54]:
def get_next_page_url(url):
    """Return the URL reached by clicking "Next page" on ``url``.

    A Chrome WebDriver session is started for the call and always shut down
    before returning.

    Args:
        url: Address of the listings page to open.

    Returns:
        The browser's URL after the click, or ``None`` when no visible
        "Next page" link appears within 10 seconds or the URL does not
        change within 10 seconds of the click.
    """
    # Imported here so this function brings its own dependency into scope.
    from selenium.common.exceptions import TimeoutException

    # Create a WebDriver instance (requires a local Chrome installation)
    driver = webdriver.Chrome()

    try:
        driver.get(url)
        # The browser may redirect; compare against what was actually loaded.
        loaded_url = driver.current_url

        try:
            next_page_button = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.XPATH, '//a[@aria-label="Next page"]'))
            )
            next_page_button.click()

            # Reading current_url right after click() can still yield the old
            # address, so wait until the navigation is reflected in the URL.
            WebDriverWait(driver, 10).until(EC.url_changes(loaded_url))
        except TimeoutException:
            # No next page (or the click led nowhere): signal "stop".
            return None

        return driver.current_url

    finally:
        # Close the WebDriver
        driver.quit()

# Search a few pages automatically

In [59]:
def get_pages(url, number, result=None):
    """Scrape up to ``number`` consecutive listing pages starting at ``url``.

    Args:
        url: Address of the first page to scrape.
        number: Maximum number of pages to scrape; values <= 0 scrape nothing.
        result: Optional list to which the scraped rows are appended in
            place. A fresh list is created when omitted, so separate calls
            never share rows.

    Returns:
        The list holding all collected rows (``result`` itself when given).
    """
    if result is None:
        result = []

    while number > 0:
        # `or []` tolerates a scraper that reports failure by returning None.
        result += scrape_page(url) or []
        number -= 1

        if number <= 0:
            # Page budget spent: do not start a browser for a URL we'd discard.
            break

        url = get_next_page_url(url)
        if not url:
            break

    return result
    
# Demo run: scrape the first pages of the HDB sale listings.
starting_url = "https://www.99.co/singapore/sale?main_category=hdb"
number_of_pages_to_scrape = 2
result = get_pages(starting_url, number_of_pages_to_scrape)

# Show every scraped row on its own line.
for listing in result:
    print(listing)

['3 Room HDB in 116 Pending Road', '$410,000', '2 Beds', '2 Baths', '786 sqft / 73.02 sqm']
['2 Room HDB in 76 Telok Blangah Drive', '$320,000', '1 Bed', '1 Bath', '474 sqft / 44.04 sqm']
['4 Room HDB in 623 Jurong West Street 61', '$488,000', '3 Beds', '2 Baths', '979 sqft / 90.95 sqm']
['3 Room HDB in 728 Clementi West Street 2', '$398,000', '2 Beds', '2 Baths', '732 sqft / 68 sqm']
['4 Room HDB in 93B Telok Blangah Street 31', '$980,000', '3 Beds', '2 Baths', '1,001 sqft / 93 sqm']
['4 Room HDB in 811A Choa Chu Kang Avenue 7', '$586,000', '3 Beds', '2 Baths', '1,001 sqft / 93 sqm']
['4 Bed HDB in 229 Choa Chu Kang Central', '$590,000', '4 Beds', '2 Baths', '1,313 sqft / 121.98 sqm']
['5 Room HDB in 297 Yishun Street 20', '$658,000', '3 Beds', '2 Baths', '1,205 sqft / 111.95 sqm']
['5 Room HDB in 309A Ang Mo Kio Street 31', '$1,200,000', '3 Beds', '2 Baths', '1,184 sqft / 110 sqm']
['3 Room HDB in 6 Holland Close', '$580,000', '2 Beds', '1 Bath', '786 sqft / 73.02 sqm']
['4 Room HDB 

# Save to CSV

In [None]:
def save_to_csv(data, filename):
    """Write listing rows to ``filename`` as UTF-8 CSV with a header row.

    Args:
        data: Iterable of rows; each row is an iterable of field values.
        filename: Path of the CSV file to create or overwrite.
    """
    column_names = ['Title', 'Price', 'Bedrooms', 'Bathrooms', 'Floor Size']

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(column_names)
        csv_out.writerows(data)

# Destination file for the exported listings.
csv_filename = "hdb_listings.csv"

# Scrape the first pages of the HDB sale listings.
starting_url = "https://www.99.co/singapore/sale?main_category=hdb"
number_of_pages_to_scrape = 2
result = get_pages(starting_url, number_of_pages_to_scrape)

# Write the scraped rows to the CSV file.
save_to_csv(result, csv_filename)