In [15]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from time import sleep
import pandas as pd
import os

In [None]:
# Get all the property links for a specific search

def getAll_Links(target_url = "https://www.furnishedfinder.com/housing/us--ca--san-francisco?max-price=5500&move-in-date=2025-05-23&map-ne-lat=37.82979&map-ne-lon=-122.34956&map-sw-lat=37.68968&map-sw-lon=-122.52607&map-zoom=12.509&sleeps-count=2", output_path = "ff_links.txt"):
    """Collect the unique property links from every page of a search.

    Opens ``target_url`` in a new Chrome session, gathers every anchor whose
    ``href`` contains ``furnishedfinder.com/property/``, and keeps clicking
    the "next page" control until it is missing, hidden, disabled, or styled
    with ``cursor-not-allowed``.

    Args:
        target_url: Search-results URL to start from.
        output_path: Text file that receives the links, one per line.

    Returns:
        The property links in first-seen order, without duplicates. If
        scraping fails part-way through, the links gathered up to that point
        are returned (and saved) instead of raising.

    Raises:
        KeyboardInterrupt / SystemExit: propagated after the browser has been
            closed and the partial results have been written.
    """
    driver = webdriver.Chrome()
    all_links = []
    # Set mirror of all_links so the duplicate check is O(1) per link.
    seen_links = set()

    try:
        driver.get(target_url)
        while True:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "a"))
            )

            # Read each href exactly once: every get_attribute call is a
            # round-trip to the browser.
            for anchor in driver.find_elements(By.TAG_NAME, "a"):
                href = anchor.get_attribute('href')
                # Filter out unrelated links, we only want the property links
                if not href or "furnishedfinder.com/property/" not in href:
                    continue
                if href in seen_links:
                    print(f"Link already in list: {href}")
                else:
                    seen_links.add(href)
                    all_links.append(href)

            print(f"Collected {len(all_links)} links so far.")

            sleep(2)

            try:
                # Try to find and click the next page button
                next_button = driver.find_element(By.XPATH, "//div[@data-testid='pagination-next' and @aria-label='next page']")

                # The last page is signalled either by a hidden/disabled
                # button or by the "cursor-not-allowed" class on it.
                if (
                    not next_button.is_displayed()
                    or not next_button.is_enabled()
                    or "cursor-not-allowed" in str(next_button.get_attribute("class"))
                ):
                    print("No more pages to scrape")
                    break

                next_button.click()

                # Wait for the new page to load.
                # NOTE(review): if pagination is rendered client-side,
                # readyState may already be 'complete' here - confirm the
                # listing has really changed before relying on this wait.
                WebDriverWait(driver, 10).until(
                    lambda driver: driver.execute_script('return document.readyState') == 'complete'
                )

            except (NoSuchElementException, TimeoutException):
                # No more pages to scrape
                print("done with all pages")
                break

    except Exception as exc:
        # Best-effort scrape: report the failure and fall through so the
        # links gathered so far are still saved and returned.
        print(f"Scraping stopped early, keeping partial results: {exc!r}")

    finally:
        driver.quit()

        print(f"Total links collected: {len(all_links)}")
        for link in all_links:
            print(link)

        # Save the links to a file (also on interruption, so work is not lost)
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(link + "\n" for link in all_links)

    # Returning outside ``finally`` lets KeyboardInterrupt/SystemExit propagate.
    return all_links


In [20]:
def getPageDetails(links, output_path = "ff_properties.csv", page_delay = 10):
    """Scrape title, price, bedrooms and bathrooms from each property page.

    Visits every URL in ``links`` with a single Chrome session, reads the
    fields through fixed absolute XPaths, prints them, and writes all rows to
    ``output_path`` as CSV.

    Args:
        links: Iterable of property page URLs. The text after the last "/"
            is used as the property ID.
        output_path: CSV file that receives the scraped rows.
        page_delay: Seconds to sleep after a page has loaded and again after
            it has been scraped.

    Returns:
        A pandas DataFrame with one row per link and the columns
        ``ID_furnished_finder``, ``Title``, ``Price``, ``Link``, ``Bedrooms``
        and ``Bathrooms``.

    Raises:
        selenium.common.exceptions.TimeoutException: a page did not show the
            expected elements within 10 seconds.
        selenium.common.exceptions.NoSuchElementException: one of the XPaths
            did not match on a page.
        In both cases the browser is closed before the error propagates and
        no CSV is written.
    """
    # Absolute XPaths into the property page layout; these are brittle and
    # must be updated whenever the site's markup changes.
    title_xpath = "/html/body/div[1]/main/div[1]/div[2]/div[1]/div/div/h1"
    price_xpath = "/html/body/div[1]/main/div[1]/div[2]/div[2]/div/div[1]/div/div[1]/div[1]/div[1]"
    bedrooms_xpath = "/html/body/div[1]/main/div[1]/div[2]/div[1]/div/div/div[9]/span[1]"
    bathrooms_xpath = "/html/body/div[1]/main/div[1]/div[2]/div[1]/div/div/div[9]/span[2]"

    # Fixed column order, so an empty ``links`` still yields a CSV header.
    columns = ["ID_furnished_finder", "Title", "Price", "Link", "Bedrooms", "Bathrooms"]

    driver = webdriver.Chrome()
    data = []

    try:
        for link in links:
            driver.get(link)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "h1"))
            )
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, price_xpath))
            )

            sleep(page_delay)

            # Get the property details. Stripping a trailing "/" keeps the
            # ID from being empty for URLs such as ".../property/123_1/".
            property_id = link.rstrip("/").split("/")[-1]
            title = driver.find_element(By.XPATH, title_xpath).text
            price = driver.find_element(By.XPATH, price_xpath).text
            bedrooms = driver.find_element(By.XPATH, bedrooms_xpath).text
            bathrooms = driver.find_element(By.XPATH, bathrooms_xpath).text

            print(f"ID: {property_id}")
            print(f"Title: {title}")
            print(f"Price: {price}")
            print(f"Link: {link}")
            print(f"Bedrooms: {bedrooms}")
            print(f"Bathrooms: {bathrooms}")
            print("=====================================")

            data.append({
                "ID_furnished_finder": property_id,
                "Title": title,
                "Price": price,
                "Link": link,
                "Bedrooms": bedrooms,
                "Bathrooms": bathrooms
            })

            sleep(page_delay)
    finally:
        # Always close the browser, even when a page fails to load or parse;
        # otherwise every failed run leaves a Chrome process behind.
        driver.quit()

    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_path, index=False)
    return df

            

In [None]:
# Choose ONE source for `all_links` and leave the other options commented out.
#
# Option 1 - scrape the links live from the website:
# all_links = getAll_Links()
#
# Option 2 - read the links back from the ff_links.txt file:
# with open("ff_links.txt", "r") as f:
#     all_links = [line.strip() for line in f.readlines()]
#
# Option 3 - use a small hard-coded list of links:
all_links = [
    "https://www.furnishedfinder.com/property/372953_1",
    "https://www.furnishedfinder.com/property/243064_1",
    "https://www.furnishedfinder.com/property/460938_1",
]

# Scrape the details for every selected link.
details_df = getPageDetails(all_links)