#### Get the HTML content after all content is loaded (infinite scroll)

In [10]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import time
import csv

def scrape_website(url):
    # Set up the Selenium webdriver
    driver = webdriver.Chrome()
    driver.get(url)

    # Simulate scrolling to load more content
    scroll_pause_time = 2
    last_height = driver.execute_script("return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script(
            "return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")

        if new_height == last_height:
            # If no new content is loaded, try triggering additional requests
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight + 1000);")
            time.sleep(scroll_pause_time)
            new_height = driver.execute_script(
                "return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);")

            if new_height == last_height:
                break

        last_height = new_height

    # Get the HTML content after all content is loaded
    html = driver.page_source
    driver.quit()

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find all articles on the page
    articles = soup.find_all('div', class_='css-1m1bruh')

    return articles


##### Use regular expressions to extract relevant information

In [6]:
import re

def clean_and_extract(text):
    area_match = re.search(r'Build up area(\d+\.?\d*) sq\.ft', text)
    price_match = re.search(r'Average Price[₹$]([\d,.]+(?:\.\d{1,2})?)\s?[Kk]/sq\.ft', text)
    facing_match = re.search(r'Main entrance facing([a-zA-Z-]+) Facing', text)
    project_name_match = re.search(r'Project Name([^B]+)', text)
    possession_date_match = re.search(r'Possession Date([a-zA-Z, ]+ \d{4})', text)
    
    # extract information or set to None if not found
    area = float(area_match.group(1)) if area_match else None
    price = float(price_match.group(1).replace(',', '')) if price_match else None
    facing = facing_match.group(1) if facing_match else None
    project_name = project_name_match.group(1).strip() if project_name_match else None
    possession_date = possession_date_match.group(1).strip() if possession_date_match else None

    return {'Build up area': f'{area} sq.ft', 'Average price': f'₹{price} K/sq.ft', 'facing': facing, 'Property name': project_name, 'Posession date': possession_date}

#### Extract relevant information from the entire html content for every property

In [7]:
def extract_info(details):
    project_details_list = []

    for article in details:
        proj_list = article.find_all('div', class_='_mkh2mm _1asa1q9c _rlozgrho _2hx11btx')
        
        for proj in proj_list:
            price_element = proj.find('div', class_='_csbfng _c8f6fq _g3gktf _ldyh40 _7l1ulh')
            price = price_element.text if price_element else None
            
            property_name_element = proj.find('div', class_='_9s1txw _gqyh40 _0h1q9y')
            property_name = property_name_element.text if property_name_element else None
            
            flat_details_tag = proj.find('h3', class_='_sq1l2s _vv1q9c _ks15vq _5vy24jg8 _blas14la _csbfng _g3dlk8 _c81fwx _h3ftgi')
            flat_details = flat_details_tag.text if flat_details_tag else None

            section = proj.find('section', class_='_1enfn7od _12el1ule _12il1osq _11ar1l2s _1vd01q9c _orqr15vq _cbben7od _3nng1e54 _12eccj1k _dgd0cs5v _gcpm15vq _9j73ad _gnftgi _9s1txw')
            more_info = clean_and_extract(section.text)

            project_details_list.append({'price range': price, 'property info': property_name, 'flat_details': flat_details, **more_info})

    return project_details_list


#### Results

##### print the results

In [11]:
if __name__ == "__main__":
    # Replace 'your_website_url' with the actual URL of the real estate website
    website_url = 'https://housing.com/in/buy/searches/Pfkgym5kvlhge7gn'
    details = scrape_website(website_url)
    info = extract_info(details)
    for project in info:
        print(project)

{'price range': '₹45.0 L', 'property info': '3 BHK Independent Builder Floor for sale in Dwarka Mor', 'flat_details': None, 'Build up area': 'None sq.ft', 'Average price': '₹5.0 K/sq.ft', 'facing': 'South-west', 'Property name': None, 'Posession date': None}
{'price range': '₹86.0 L - 1.38 Cr', 'property info': 'Signature Global City 92 Phase 2RERA', 'flat_details': '2, 3 BHK Flats for sale in Sector 92', 'Build up area': '959.0 sq.ft', 'Average price': '₹8.98 K/sq.ft', 'facing': None, 'Property name': None, 'Posession date': 'Aug, 2026'}
{'price range': '₹24.0 L - 40.0 L', 'property info': 'Metroview By S gambhir Buildtech Dwarka MorRERA', 'flat_details': '2, 3 BHK Builder Floors for sale in Dwarka Mor', 'Build up area': 'None sq.ft', 'Average price': '₹4.62 K/sq.ft', 'facing': None, 'Property name': None, 'Posession date': 'Nov, 2022'}
{'price range': '₹28 L - 60 L', 'property info': 'RP Luxury Floors', 'flat_details': '2, 3, 4 BHK Apartments for sale in Dwarka Mor', 'Build up area':

##### export in csv

In [12]:
csv_file_path = 'project_details.csv'
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Property name', 'property info','price range', 'flat_details', 'Average price', 'facing', 'Build up area', 'Posession date']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write the header
        writer.writeheader()

        # Write the data
        for project in info:
            writer.writerow(project)

print(f'Data exported to {csv_file_path}')

Data exported to project_details.csv
