In [11]:
# !pip install selenium beautifulsoup4
# !pip install pandas numpy matplotlib
url = "https://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Pune"

In [12]:
# import libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [13]:
def scroll_and_wait(driver, timeout=10):
    try:
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the new content to load
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.CLASS_NAME, 'mb-srp__list')))
    except Exception as e:
        print("Scrolling and waiting failed: {}".format(e))


In [14]:
import re

def extract_bhk(description):
    # Regular expression to find Bhk information
    bhk_pattern = re.compile(r'(\d+)\s*BHK', re.IGNORECASE)

    # Searching for Bhk information in the description
    match = bhk_pattern.search(description)

    if match:
        return int(match.group(1))
    else:
        
        return None

In [15]:
def extract_values_from_html(data):
    super_area = None
    floor = None
    total_floors = None
    bathrooms = None

    # Find all items with the specified class
    summary_items = data.find_all('div', class_='mb-srp__card__summary__list--item')

    for item in summary_items:
        label = item.find('div', class_='mb-srp__card__summary--label')
        value = item.find('div', class_='mb-srp__card__summary--value')

        if label and value:
            label_text = label.text.strip().lower()
            value_text = value.text.strip()

            if 'super area' in label_text:
                super_area = int(value_text.split(' ')[0])
            elif 'floor' in label_text:
                floor_data = value_text.split(' out of ')
                floor = floor_data[0].strip()
                total_floors = int(floor_data[1].strip())
            elif 'bathroom' in label_text:
                bathrooms = int(value_text)

    return super_area, floor, total_floors, bathrooms


In [16]:
import csv

def save_to_csv(data=["null"]*7, filename='output.csv'):
    try:
        with open(filename, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # Check if the file is empty, if so, write the header
            if file.tell() == 0:
                header = ['Property_Title', 'Price', 'Rate', 'Bhk', 'Super_area' , 'Floor' , 'Total_floor' ]
                writer.writerow(header)

            writer.writerow(data)
        print(f"Data written successfully!".center(50, '-') + "\n")
    except Exception as e:
        print(f"Error writing to CSV: {e}")

save_to_csv()


------------Data written successfully!------------



In [17]:
def extraction(property_divs):
    try:
        c=1
        for property_div in property_divs:
            try: #extracting required data from each div class" property_div"
                print(f"{c}/{len(property_divs)}")
                property_title = property_div.find('h2', class_='mb-srp__card--title').text.strip()
                property_price = property_div.find('div', class_='mb-srp__card__price--amount').text.strip()
                property_rate = property_div.find('div', class_='mb-srp__card__price--size').text.strip()
                property_rate = property_rate.split(' ')[0]
                property_description = property_div.find('p', class_='two-line-truncated').text.strip()
                # property_bhk = extract_bhk(property_description) 
                super_area, floor, total_floors, bathrooms = extract_values_from_html(property_div)
                # if property_bhk==None:    
                #     property_bhk = "0"                
                print("\n" + "-"*50 + "\n")
                print("Property Title:", property_title)
                print("Price:  Rs {0}".format(property_price).strip().center(50, '-'))
                # print("BHK: {0}".format(property_bhk).strip().center(50, ''))
                
                # print("Description:", property_description)
                print("\n" + "-"*50 + "\n")
                save_to_csv(property_title, property_price, property_rate, 1, super_area, floor, total_floors)
                
            except Exception as e:
                print("\n".center(50, '--==--'))
                print("Extraction failed ".center(50, '-'))
                print(e, "\n")
                print("\n".center(50, '--==--'))
            finally:
                c+=1
                pass
    except Exception as e:
        print("Caused by {e}".format(e=e).strip().center(50, '-'))
        pass
    finally:
        driver.quit()


In [18]:
driver = webdriver.Chrome()
driver.get(url)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
property_divs = soup.find_all('div', class_='mb-srp__list')
extraction(property_divs)
# Wait for the new content to load
while True or len(property_divs)!= 250:
    time.sleep(5)
    scroll_and_wait(driver)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    property_divs = soup.find_all('div', class_='mb-srp__list')
    extraction(property_divs)
    print(len(property_divs))

1/30

--------------------------------------------------

Property Title: 2 BHK Flat for Sale in Chandrakamal, Shukrawar Peth, Pune
---------------Price:  Rs ₹1.65 Cr----------------

--------------------------------------------------

Caused by The fill character must be exactly one character long
