In [9]:
# !pip install selenium beautifulsoup4
# !pip install pandas numpy matplotlib
url = "https://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Pune"

In [10]:
# import libraries

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [11]:

def scroll_and_wait(driver, timeout=10):
    try:
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the document to be in 'complete' state
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, '//body[not(@style="cursor: wait;")]/@data-pageloaded]')))
    except Exception as e:
        print("Scrolling and waiting failed: {}".format(e))


In [12]:
import re

def extract_bhk(description,title):
    # Regular expression to find Bhk information
    bhk_pattern = re.compile(r'(\d+)\s*BHK', re.IGNORECASE)
    # Searching for Bhk information in the description
    match_dec = bhk_pattern.search(description)
    # Searching for Bhk information in the title
    match_tlt = bhk_pattern.search(title) 


    if match_dec:
        return int(match_dec.group(1))
    elif match_tlt:
        return int(match_tlt.group(1))
    else:          
        return "None"

In [13]:
from bs4 import BeautifulSoup

def extract_values_from_html(property_div):
    super_area = None
    floor = None
    total_floors = None
    bathrooms = None

    floor_data = None
    try:
        summary_list_items = property_div.find('div', class_='mb-srp__card__summary__list').find_all('div', class_='mb-srp__card__summary__list--item')

        for item in summary_list_items:
            label = item.find('div', class_='mb-srp__card__summary--label')
            value = item.find('div', class_='mb-srp__card__summary--value')

            if label and value:
                label_text = label.text.strip().lower()
                value_text = value.text.strip().lower().replace(',', '')

                if 'super area' in label_text  or "super_area" in label_text or "superarea" in label_text:
                    super_area = value_text.split('')
                if 'floor' in label_text or 'floors' in label_text or 'flooring' in label_text:
                    floor_data = value_text
                    # floor = floor_data[0].strip()
                    # total_floors = int(floor_data[1].strip())
                if 'bathroom' in label_text:
                    bathrooms = int(value_text) if value_text.isdigit() else None
    except Exception as e:
        print("Error in extract_values_from_html function:", e)

    # return super_area, floor, total_floors, bathrooms
    return super_area, floor_data, 0, bathrooms


In [14]:
import csv

def save_to_csv(data=[], filename='output.csv'):
    try:
        with open(filename, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)

            # Check if the file is empty, if so, write the header
            if file.tell() == 0:
                header = ['Property_Title', 'Price', 'Rate', 'Bhk', 'Super_area' , 'Floor' , 'Total_floor' ]
                writer.writerow(header)
                data=["null"]*len(header)
                writer.writerow(data)
                
            if data:
                writer.writerow(data)
            else:
                print("No data to write!")
                pass
        print(f"Data written successfully!".center(50, '-') + "\n")
    except Exception as e:
        print(f"Error writing to CSV: {e}")

save_to_csv()


No data to write!
------------Data written successfully!------------



In [15]:
def extraction(property_divs):
    try:
        c = 1
        for property_div in property_divs:
            try:  # extracting required data from each div class "property_div"
                print(f"{c}/{len(property_divs)}")
                property_title = property_div.find('h2', class_='mb-srp__card--title').text.strip()
                property_price = property_div.find('div', class_='mb-srp__card__price--amount').text.strip()
                property_price=property_price.replace("₹","Rs ")
                property_rate_elem = property_div.find('div', class_='mb-srp__card__price--size')
                property_rate = property_rate_elem.text.strip().split(' ')[0] if property_rate_elem else 'N/A'
                property_rate=property_rate.replace("₹","Rs ")
                property_description = property_div.find('p', class_='two-line-truncated').text.strip()
                property_bhk = extract_bhk(property_description,property_title)
                super_area, floor, total_floors, bathrooms = extract_values_from_html(property_div)

                print("\n" + "-"*50 + "\n")
                print("Property Title:", property_title)
                print("Price: {0}".format(property_price).strip().center(50, '-'))
                print("BHK: {0}".format(property_bhk).strip().center(50, '-'))
                print("Rate: {0}".format(property_rate).strip().center(50, '-'))
                print("Super Area: {0} sqft".format(super_area).strip().center(50, '-'))
                print("Floor: {0}".format(floor).strip().center(50, '-'))
                print("Total Floors: {0}".format(total_floors).strip().center(50, '-'))
                print("Bathrooms: {0}".format(bathrooms or "No data").strip().center(50, '-'))
                print("\n" + "-"*50 + "\n")

                save_to_csv([property_title, property_price, property_rate, property_bhk, super_area, floor, total_floors])

            except Exception as e:
                print("\n" + "-"*50 + "\n")
                print("Extraction failed ".center(50, '-'))
                print(e, "\n")
                print("\n" + "-"*50 + "\n")
            finally:
                c += 1
                pass
    except Exception as e:
        print("Caused by :: {e}".format(e=e).strip().center(50, '-'))
        pass
    finally:
        driver.quit()


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# ... (other functions and imports)

driver = webdriver.Chrome()
driver.get(url)

try:
    # Wait for the initial content to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'mb-srp__list')))

    while True:
        # Scroll and wait for new content
        scroll_and_wait(driver)

        # Get the updated page source
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract property divs
        property_divs = soup.find_all('div', class_='mb-srp__list')

        # Perform extraction
        extraction(property_divs)
        
        print(len(property_divs))

        # Check for an exit condition (customize this based on your requirements)
        if len(property_divs) >= 2500:
            break

except Exception as e:
    print("An error occurred:", e)
finally:
    driver.quit()


 #### Make a linear regression model, random forest, decision tree model of the data extracted to estimate Property value. input variables (x variable): Project, specification(Bhk), Floor, area output (y variable): Property value Note the properties extracted must be unique (Duplicates must be removed in final CSV) 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load data from CSV file
filename = './output.csv'  # Replace with your actual file name
df = pd.read_csv(filename)

# Preprocess data
X = df[['bhk', 'total_floors', 'carpet area', 'super area']]
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_metrics = {'Mean Squared Error': mean_squared_error(y_test, y_pred_linear),
                  'R-squared': r2_score(y_test, y_pred_linear)}

# Random Forest model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_metrics = {'Mean Squared Error': mean_squared_error(y_test, y_pred_rf),
               'R-squared': r2_score(y_test, y_pred_rf)}

# Decision Tree model
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_metrics = {'Mean Squared Error': mean_squared_error(y_test, y_pred_dt),
               'R-squared': r2_score(y_test, y_pred_dt)}

# Gradient Boosting model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
gb_metrics = {'Mean Squared Error': mean_squared_error(y_test, y_pred_gb),
               'R-squared': r2_score(y_test, y_pred_gb)}

# Display metrics
print("Linear Regression Metrics:")
print(linear_metrics)

print("\nRandom Forest Metrics:")
print(rf_metrics)

print("\nDecision Tree Metrics:")
print(dt_metrics)

print("\nGradient Boosting Metrics:")
print(gb_metrics)
