In [76]:
# !pip install selenium beautifulsoup4
# !pip install pandas numpy matplotlib
url = "https://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Pune"

In [77]:
# import libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [78]:

def scroll_and_wait(driver, timeout=10):
    try:
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the document to be in 'complete' state
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, '//body[not(@style="cursor: wait;")]/@data-pageloaded]')))
    except Exception as e:
        print("Scrolling and waiting failed: {}".format(e))


In [79]:
import re

def extract_bhk(description,title):
    # Regular expression to find Bhk information
    bhk_pattern = re.compile(r'(\d+)\s*BHK', re.IGNORECASE)
    # Searching for Bhk information in the description
    match_dec = bhk_pattern.search(description)
    # Searching for Bhk information in the title
    match_tlt = bhk_pattern.search(title) 


    if match_dec:
        return int(match_dec.group(1))
    elif match_tlt:
        return int(match_tlt.group(1))
    else:          
        return "None"

In [80]:
def extract_values_from_html(property_div):
    super_area = None
    floor = None
    total_floors = None
    bathrooms = None

    try:
        summary_list = property_div.find('div', class_='mb-srp__card__summary__list')

        if summary_list:
            summary_list_items = summary_list.find_all('div', class_='mb-srp__card__summary__list--item')

            for item in summary_list_items:
                label = item.find('div', class_='mb-srp__card__summary--label')
                value = item.find('div', class_='mb-srp__card__summary--value')

                if label and value:
                    label_text = label.text.strip().lower().strip()
                    value_text = value.text.strip().lower().strip()

                    if 'super area' or "super_area" in label_text:
                        super_area = int(value_text.split(' ')[0])
                    elif 'floor' or "Floor" in label_text:
                        floor_data = value_text.split(' out of ')
                        floor = floor_data[0].strip()
                        total_floors = int(floor_data[1].strip())
                    elif 'bathroom' in label_text:
                        bathrooms = int(value_text) if value_text.isdigit() else None
    except Exception as e:
        print("Error in extract_values_from_html function:", e)

    # print(super_area, floor, total_floors, bathrooms)
    return super_area, floor, total_floors, bathrooms


In [81]:
import csv

def save_to_csv(data=[], filename='output.csv'):
    try:
        with open(filename, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # Check if the file is empty, if so, write the header
            if file.tell() == 0:
                header = ['Property_Title', 'Price', 'Rate', 'Bhk', 'Super_area' , 'Floor' , 'Total_floor' ]
                writer.writerow(header)
                data=["null"]*len(header)
                writer.writerow(data)
                
            if data:
                writer.writerow(data)
            else:
                print("No data to write!")
                pass
        print(f"Data written successfully!".center(50, '-') + "\n")
    except Exception as e:
        print(f"Error writing to CSV: {e}")

save_to_csv()


No data to write!
------------Data written successfully!------------



In [82]:
def extraction(property_divs):
    try:
        c = 1
        for property_div in property_divs:
            try:  # extracting required data from each div class "property_div"
                print(f"{c}/{len(property_divs)}")
                property_title = property_div.find('h2', class_='mb-srp__card--title').text.strip()
                property_price = property_div.find('div', class_='mb-srp__card__price--amount').text.strip()
                property_price=property_price.replace("₹","Rs ")
                property_rate_elem = property_div.find('div', class_='mb-srp__card__price--size')
                property_rate = property_rate_elem.text.strip().split(' ')[0] if property_rate_elem else 'N/A'
                property_rate=property_rate.replace("₹","Rs ")
                property_description = property_div.find('p', class_='two-line-truncated').text.strip()
                property_bhk = extract_bhk(property_description,property_title)
                super_area, floor, total_floors, bathrooms = extract_values_from_html(property_div)

                print("\n" + "-"*50 + "\n")
                print("Property Title:", property_title)
                print("Price: {0}".format(property_price).strip().center(50, '-'))
                print("BHK: {0}".format(property_bhk).strip().center(50, '-'))
                print("Rate: {0}".format(property_rate).strip().center(50, '-'))
                print("Super Area: {0} sqft".format(super_area).strip().center(50, '-'))
                print("Floor: {0}".format(floor).strip().center(50, '-'))
                print("Total Floors: {0}".format(total_floors).strip().center(50, '-'))
                print("Bathrooms: {0}".format(bathrooms or "No data").strip().center(50, '-'))
                print("\n" + "-"*50 + "\n")

                save_to_csv([property_title, property_price, property_rate, property_bhk, super_area, floor, total_floors])

            except Exception as e:
                print("\n" + "-"*50 + "\n")
                print("Extraction failed ".center(50, '-'))
                print(e, "\n")
                print("\n" + "-"*50 + "\n")
            finally:
                c += 1
                pass
    except Exception as e:
        print("Caused by :: {e}".format(e=e).strip().center(50, '-'))
        pass
    finally:
        driver.quit()


In [83]:
driver = webdriver.Chrome()
driver.get(url)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
property_divs = soup.find_all('div', class_='mb-srp__list')
extraction(property_divs)
# Wait for the new content to load
while len(property_divs)!= 2500:
    time.sleep(5)
    scroll_and_wait(driver)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    property_divs = soup.find_all('div', class_='mb-srp__list')
    extraction(property_divs)
    print(len(property_divs))

1/30
Error in extract_values_from_html function: invalid literal for int() with base 10: 'poss.'
882 None None None

--------------------------------------------------

Property Title: 2 BHK Flat for Sale in Chandrakamal, Shukrawar Peth, Pune
----------------Price: Rs 1.65 Cr-----------------
----------------------BHK: 2----------------------
------------------Rate: Rs 13854------------------
---------------Super Area: 882 sqft---------------
-------------------Floor: None--------------------
----------------Total Floors: None----------------
----------------Bathrooms: No data----------------

--------------------------------------------------

------------Data written successfully!------------

2/30
Error in extract_values_from_html function: invalid literal for int() with base 10: 'poss.'
846 None None None

--------------------------------------------------

Property Title: 2 BHK Flat for Sale in ANP Autograph, Punawale, Pimpri Chinchwad, Pune
----------------Price: Rs 93.3 Lac-----

MaxRetryError: HTTPConnectionPool(host='localhost', port=59728): Max retries exceeded with url: /session/be211be8bf3a585beae8f6b3c202d9d6/source (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000223BA2C4A10>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))