### Code to extract query parameters from URL

In Chrome, I opened the magicbricks site and searched for the properties with desired filters like city, budget, sorting by price, etc. Then, after inspecting the network tab in dev tools, I found the API they use to request property details in JSON form. I used the same parameters to make requests to the API.

In [1]:
from urllib.parse import urlparse, parse_qs

# Original URL
# url = "https://www.magicbricks.com/mbsrp/propertySearch.html?editSearch=Y&category=S&propertyType=10002,10003,10021,10022&budgetMax=45000000&city=2951&page=2&sortBy=Highest_Price&postedSince=-1&pType=10002,10003,10021,10022&isNRI=N&multiLang=en"

# url = "https://www.magicbricks.com/mbsrp/propertySearch.html?editSearch=Y&category=S&propertyType=10002,10003,10021,10022,10001,10017&bedrooms=11700,11703,11704,11705,11706,11707,11708,11709,11710,11701,11702&city=2951&page=2&groupstart=30&offset=0&maxOffset=263&sortBy=premiumRecent&postedSince=-1&pType=10002,10003,10021,10022,10001,10017&isNRI=N&multiLang=en"
# Parse the URL

# url = "https://www.magicbricks.com/mbsrp/propertySearch.html?editSearch=Y&category=S&propertyType=10002,10003,10021,10022,10001,10017&bedrooms=11700,11703,11704,11705,11706,11707,11708,11709,11710,11701,11702&city=2951&page=2&sortBy=Highest_Price&postedSince=-1&pType=10002,10003,10021,10022,10001,10017&isNRI=N&multiLang=en"



# Search URL captured from the browser's network tab
url = "https://www.magicbricks.com/mbsrp/propertySearch.html?editSearch=Y&category=S&propertyType=10002,10003,10021,10022,10001,10017&budgetMin=3000000&budgetMax=45000000&bedrooms=11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710&city=6403&page=2&sortBy=Lowest_Price&postedSince=-1&pType=10002,10003,10021,10022,10001,10017&isNRI=N&multiLang=en"

# Split the URL into components so the query string can be decoded on its own
parsed_url = urlparse(url)

# parse_qs maps every key to a list of values; collapse single-element lists
# to the bare string and leave keys that occur several times as lists.
query_params = {
    name: (values if len(values) != 1 else values[0])
    for name, values in parse_qs(parsed_url.query).items()
}

# Display the resulting dictionary as the cell output
query_params


{'editSearch': 'Y',
 'category': 'S',
 'propertyType': '10002,10003,10021,10022,10001,10017',
 'budgetMin': '3000000',
 'budgetMax': '45000000',
 'bedrooms': '11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710',
 'city': '6403',
 'page': '2',
 'sortBy': 'Lowest_Price',
 'postedSince': '-1',
 'pType': '10002,10003,10021,10022,10001,10017',
 'isNRI': 'N',
 'multiLang': 'en'}

## Scraping resultLists to get property details in bulk

After this data is scraped, the urls for each unique property are stored in a list. The list is then used to scrape the property details for each property separately.

In [None]:
import os
import time
import json
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
import random

# Record scraper errors (ERROR level and above) in an append-mode log file
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraping_errors.log',
    filemode='a',
)

# Path to the locally stored ChromeDriver executable
driver_path = r'C:\Users\soura\OneDrive\Desktop\Datascience\web_scraping\projects\magicbricks_realestate\chromedriver.exe'

# Chrome command-line switches, applied in the order listed
chrome_options = Options()
for chrome_flag in (
    "--disable-http2",
    "--incognito",
    "--disable-blink-features=AutomationControlled",
    "--ignore-certificate-errors",
    "--enable-features=NetworkServiceInProcess",
    "--disable-features=NetworkService",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
):
    chrome_options.add_argument(chrome_flag)

# Start Chrome through the explicit driver binary and maximise its window
service = Service(driver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)
driver.maximize_window()



In [92]:
# Endpoint that returns the search results
base_url = "http://www.magicbricks.com/mbsrp/propertySearch.html"

# propertyType and pType are sent with the same comma-separated code list
_property_type_codes = '10002,10003,10021,10022,10001,10017'

# Filters sent with every request; all values are strings.
# 'page' is overwritten by the scraping loop before each request.
query_params = dict(
    editSearch='Y',
    category='S',
    propertyType=_property_type_codes,
    budgetMin='60000000',
    budgetMax='100000000',
    bedrooms='11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710',
    city='7045',
    page='2',
    sortBy='Lowest_Price',
    postedSince='-1',
    pType=_property_type_codes,
    isNRI='N',
    multiLang='en',
)

In [78]:
# Make sure the folder that receives the per-page JSON dumps is present;
# an already existing folder is not an error.
os.makedirs(name='greaternoida', exist_ok=True)

In [93]:
# Download the search-result pages one by one and store each response as JSON.
#
# Relies on `driver`, `base_url` and `query_params` from the cells above.
# Every loop pass is one attempt: a failed attempt leaves `page` unchanged, so
# the same page is retried on the next pass. The run stops after
# MAX_FAILED_ATTEMPTS consecutive failures or after 101 attempts in total.
from urllib.parse import urlencode

# Error tracking
failed_attempts = 0
MAX_FAILED_ATTEMPTS = 5

# Main scraping loop
page = 1
counter = 1  # Incremented after every saved page (file names use `page`)

try:
    for _ in range(1, 102):
        try:
            # Update page number in query parameters
            query_params['page'] = str(page)
            # (Earlier experiments also sent 'groupstart' = (page - 1) * 30
            # and, for page > 2, 'maxOffset' = '305'.)

            # Build the query string with proper escaping; safe=',' keeps the
            # comma-separated code lists literal, as in the URLs the site
            # itself issues.
            url_with_params = f"{base_url}?{urlencode(query_params, safe=',')}"

            # Navigate to the URL
            driver.get(url_with_params)

            # Fixed pause to let the response render before reading it
            time.sleep(5)  # Adjust the sleep time if necessary

            # The endpoint answers with JSON, which the browser shows as the
            # page's body text.
            json_content = driver.execute_script("return document.body.innerText;")

            # Raises if the body is not JSON; handled below as a failed
            # attempt.
            response_json = json.loads(json_content)

            # Save JSON data
            file_name = f"greaternoida/list8LacMinus_lowest_Price4_{page}.json"
            with open(file_name, 'w', encoding='utf-8') as json_file:
                json.dump(response_json, json_file, ensure_ascii=False, indent=4)

            print(f"Saved: {file_name}, with page={page}")
            counter += 1

            # Reset failed attempts counter on success
            failed_attempts = 0

            # Increment page
            page += 1

            # Wait between requests
            time.sleep(2)

        except Exception as e:
            # Log the error and retry the same page on the next pass
            logging.error(f"Failed for page={page} with error: {e}")
            failed_attempts += 1

            # If too many failures, exit the loop
            if failed_attempts >= MAX_FAILED_ATTEMPTS:
                print("Too many consecutive failures. Stopping the scraping process.")
                break
finally:
    # Close the WebDriver even when the run is interrupted (e.g. Ctrl+C) or
    # an unexpected error escapes the loop.
    driver.quit()

Saved: greaternoida/list8LacMinus_lowest_Price4_1.json, with page=1
Saved: greaternoida/list8LacMinus_lowest_Price4_2.json, with page=2
Saved: greaternoida/list8LacMinus_lowest_Price4_3.json, with page=3
Saved: greaternoida/list8LacMinus_lowest_Price4_4.json, with page=4
Saved: greaternoida/list8LacMinus_lowest_Price4_5.json, with page=5
Saved: greaternoida/list8LacMinus_lowest_Price4_6.json, with page=6
Saved: greaternoida/list8LacMinus_lowest_Price4_7.json, with page=7
Saved: greaternoida/list8LacMinus_lowest_Price4_8.json, with page=8
Saved: greaternoida/list8LacMinus_lowest_Price4_9.json, with page=9
Saved: greaternoida/list8LacMinus_lowest_Price4_10.json, with page=10
Saved: greaternoida/list8LacMinus_lowest_Price4_11.json, with page=11
Saved: greaternoida/list8LacMinus_lowest_Price4_12.json, with page=12
Saved: greaternoida/list8LacMinus_lowest_Price4_13.json, with page=13
Saved: greaternoida/list8LacMinus_lowest_Price4_14.json, with page=14
Saved: greaternoida/list8LacMinus_lowe

### Code to check for number of unique listings

In [110]:
import os
import json

# Scan every saved result page and report how many distinct listings exist.
#
# For each *.json file in `data_directory` the cell:
#   * flags the file if its 'resultList' does not hold exactly 30 entries,
#   * collects the 'id' and 'url' of every entry whose 'price' is not None,
#   * records ids / urls that were already seen as duplicates.
# Files that cannot be read or parsed are reported in `file_issues`.

# Directory where JSON files are saved
data_directory = 'resultLists1'

# Initialize variables
all_ids = []  # Unique property IDs, in first-seen order
file_issues = []  # (file name, listing count or error text) for odd files
duplicate_ids = set()  # IDs that occurred more than once
count = 0  # Number of JSON files that were read and parsed successfully
all_urls = []  # Unique property URLs, in first-seen order
duplicate_urls = set()  # URLs that occurred more than once

# Set mirrors of the two lists: membership tests on a list are O(n), which
# makes the whole scan quadratic over tens of thousands of listings.
seen_ids = set()
seen_urls = set()

# Process each JSON file (sorted so the first-seen order is reproducible)
for file_name in sorted(os.listdir(data_directory)):
    if not file_name.endswith('.json'):
        continue

    file_path = os.path.join(data_directory, file_name)

    try:
        # Read the JSON file
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Extract resultList
        result_list = data.get('resultList', [])

        # Check if resultList has exactly 30 items
        if len(result_list) != 30:
            file_issues.append((file_name, len(result_list)))

        count += 1

        # Extract IDs of priced listings and check for duplicates
        ids_in_file = [property_data['id'] for property_data in result_list if property_data.get('price') is not None]
        for property_id in ids_in_file:
            if property_id in seen_ids:
                duplicate_ids.add(property_id)
            else:
                seen_ids.add(property_id)
                all_ids.append(property_id)

        # Extract URLs of priced listings and check for duplicates
        urls_in_file = [property_data['url'] for property_data in result_list if property_data.get('price') is not None]
        for property_url in urls_in_file:
            if property_url in seen_urls:
                duplicate_urls.add(property_url)
            else:
                seen_urls.add(property_url)
                all_urls.append(property_url)

    except Exception as e:
        # Unreadable file, invalid JSON or a listing without 'id' / 'url'
        file_issues.append((file_name, f"Error reading file: {e}"))

# Check for duplicates
duplicate_ids_list = list(duplicate_ids)
duplicate_urls_list = list(duplicate_urls)

# Report results; `count` covers only the JSON files that parsed successfully
print(f"Total JSON files processed: {count}")
print(f"Files with issues (not 30 listings or errors): {file_issues}")
print(f"Total unique IDs collected: {len(all_ids)}")
print(f"Duplicate IDs found: {len(duplicate_ids_list)}")

print(f"Total unique URLs collected: {len(all_urls)}")
print(f"Duplicate URLs found: {len(duplicate_urls_list)}")

if duplicate_ids_list:
    print("Duplicate IDs:")
    # print(duplicate_ids_list)
else:
    print("No duplicate IDs found.")

if duplicate_urls_list:
    print("Duplicate URLs:")
    # print(duplicate_urls_list)
else:
    print("No duplicate URLs found.")


Total JSON files processed: 1499
Files with issues (not 30 listings or errors): [('list8LacMinus_lowest_Price1_99.json', 10), ('list8LacMinus_lowest_Price5_82.json', 16), ('list8LacMinus_lowest_Price5_83.json', 0), ('list8LacMinus_lowest_Price5_84.json', 0), ('list8LacMinus_lowest_Price9_89.json', 28), ('list8LacMinus_lowest_Price9_90.json', 0)]
Total unique IDs collected: 22996
Duplicate IDs found: 11326
Total unique URLs collected: 22996
Duplicate URLs found: 11326
Duplicate IDs:
Duplicate URLs:


In [19]:
# Display how many entries the `all_urls` list currently holds
len(all_urls)

9962

### Saving unique listings separately

In [112]:
import os
import json

# Keep the first occurrence of every listing id found in the saved result
# pages, write each such listing to its own JSON file and write all of them
# to one combined JSON file keyed by listing id.

# Directories
input_directory = 'resultLists1'  # Replace with your input directory path
unique_json_directory = 'Data/gurgaonuniqueResults'  # Directory to save unique JSONs
combined_json_file = 'Data/gurgaonuniqueResultsCombined.json'  # Path for combined JSON

# Ensure output directories exist (including the combined file's folder)
os.makedirs(unique_json_directory, exist_ok=True)
os.makedirs(os.path.dirname(combined_json_file) or '.', exist_ok=True)

# Dictionary to store unique properties by their ID
unique_properties = {}

# Process the files in sorted order so that "first occurrence" is the same on
# every run; os.listdir alone returns entries in arbitrary order.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(input_directory, filename)
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        # Skip files that do not contain valid JSON
        print(f"Error decoding JSON in file {filename}: {e}")
        continue

    # `listing` instead of `property`, which would shadow the builtin
    for listing in data.get('resultList', []):
        property_id = listing.get('id')
        # Entries with a missing / falsy id and already-seen ids are skipped
        if not property_id or property_id in unique_properties:
            continue

        # Add to unique properties
        unique_properties[property_id] = listing

        # Save this property as a separate JSON file
        unique_file_path = os.path.join(unique_json_directory, f"{property_id}.json")
        with open(unique_file_path, 'w', encoding='utf-8') as unique_file:
            json.dump(listing, unique_file, indent=4)

# Save the combined JSON (non-string ids become string keys in the file)
with open(combined_json_file, 'w', encoding='utf-8') as combined_file:
    json.dump(unique_properties, combined_file, indent=4)

print(f"Processing complete. Unique JSONs saved to {unique_json_directory}. Combined JSON saved to {combined_json_file}.")


Processing complete. Unique JSONs saved to Data/gurgaonuniqueResults. Combined JSON saved to Data/gurgaonuniqueResultsCombined.json.


### Extracting list of property ids and urls for further scraping

In [113]:
import json
import pandas as pd

# Input: combined JSON keyed by listing id.  Output: two-column CSV (id, url).
combined_json_file = 'Data/gurgaonuniqueResultsCombined.json'  # Replace with the actual path
csv_output_file = 'Data/gurgaonidsAndUrls.csv'  # Replace with the desired CSV file path

# Read the whole combined file into memory
with open(combined_json_file, 'r', encoding='utf-8') as file:
    combined_data = json.load(file)

# One row per listing; 'url' is None when a listing carries no such key
csv_data = []
for listing_id, listing in combined_data.items():
    csv_data.append({'id': listing_id, 'url': listing.get('url')})

# Tabulate the rows and write them out without the index column
df = pd.DataFrame(csv_data)
df.to_csv(csv_output_file, index=False)

print(f"CSV file with id and url has been created at {csv_output_file}.")


CSV file with id and url has been created at Data/gurgaonidsAndUrls.csv.


In [114]:
# Display the (rows, columns) dimensions of the id/url DataFrame
df.shape

(22996, 2)

In [31]:
# Display the 'id' value of the row at position 1 (the second row)
df.iloc[1]['id']

np.int64(56722769)

## Scraping details from each property's webpage

The data obtained from the resultLists scraping is incomplete. However, it contains the URL of each property, so we can use these URLs to scrape the full details of each property.

In [27]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import os
import pandas as pd
import time

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"  # Prefix joined with each listing's 'url' value
output_directory = "Data/propertyDetails"  # One JSON file per property is written here

# Load the id/url table that drives the detail scraper
df = pd.read_csv("Data/idsAndUrls.csv")

# Ensure output directory exists; exist_ok avoids the separate existence check
# (and its check-then-create race) and matches the other cells of this notebook.
os.makedirs(output_directory, exist_ok=True)

In [34]:
# Visit each property's page and save the JSON object the page exposes as
# `window.SERVER_PRELOADED_STATE_DETAILS`.
#
# Relies on `df`, `base_url` and `output_directory` from the previous cell and
# on `Options` imported in the list-scraper cell. Ids whose page could not be
# scraped are accumulated in ERROR_IDS_FILE across runs.

# Initialize WebDriver
driver_path = r'C:\Users\soura\OneDrive\Desktop\Datascience\web_scraping\projects\magicbricks_realestate\chromedriver.exe'

START_INDEX = 733   # First DataFrame row (position) scraped in this run
END_INDEX = 9963    # One past the last DataFrame row scraped in this run
ERROR_IDS_FILE = 'error_ids.txt'

# Load the ids that failed in earlier runs before starting the browser, so a
# problem with the file cannot leave a Chrome process behind. A missing file
# simply means there are no earlier failures.
try:
    with open(ERROR_IDS_FILE, 'r') as f:
        error_ids = f.read().splitlines()
except FileNotFoundError:
    error_ids = []

# (An earlier, commented-out variant of this cell reused the list scraper's
# Chrome flags instead of headless mode.)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Runs Chrome in headless mode.
chrome_options.add_argument('--no-sandbox')  # Bypass OS security model
chrome_options.add_argument('--disable-gpu')  # Applicable to Windows OS only

service = Service(driver_path)

driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    driver.maximize_window()

    # One wait helper is enough; it polls for up to 10 seconds per page
    wait = WebDriverWait(driver, 10)

    # Never index past the end of the DataFrame
    for i in range(START_INDEX, min(END_INDEX, len(df))):
        # Reset per iteration so the error handler never reports values left
        # over from the previous property.
        property_id = None
        full_url = None
        try:
            # Get property details
            property_id = df.iloc[i]['id']
            property_url = df.iloc[i]['url']
            full_url = f"{base_url}{property_url}"

            # Navigate to the URL
            driver.get(full_url)

            # Wait until the JavaScript variable holds a truthy value
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            # Save the JSON data to a file
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

        except Exception as e:
            # Store ids as text so they match the entries read from the file
            if property_id is not None:
                error_ids.append(str(property_id))
            print(f"Error processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n\n")

finally:
    try:
        # Close the driver
        driver.quit()
    finally:
        # Drop blanks and repeats while keeping first-seen order, then rewrite
        # the file. Appending the whole in-memory list (which already contains
        # the file's previous content) would duplicate every old id on each
        # run and fuse the last old line with the first new one.
        error_ids = list(dict.fromkeys(
            str(error_id).strip() for error_id in error_ids if str(error_id).strip()
        ))
        print(f"Total Error Ids: {len(error_ids)}\nError Ids: {error_ids}")
        with open(ERROR_IDS_FILE, 'w') as f:
            f.write('\n'.join(error_ids))
            if error_ids:
                f.write('\n')

Error processing iteration: 733 with property ID 81398445: Message: 

URL: https://www.magicbricks.com/propertyDetails/3-BHK-3471-Sq-ft-Multistorey-Apartment-FOR-Sale-Sector-54-in-Gurgaon&id=4d423831333938343435


Error processing iteration: 734 with property ID 81396913: Message: 

URL: https://www.magicbricks.com/propertyDetails/3-BHK-2997-Sq-ft-Multistorey-Apartment-FOR-Sale-Sector-54-in-Gurgaon&id=4d423831333936393133


Total Error Ids: 2651
Error Ids: ['81042495', '81496685', '81035517', '77611115', '80826177', '80778125', '80868193', '81258135', '79981875', '74750997', '80797091', '79008187', '78876137', '68031327', '81586599', '78995705', '76166215', '80614487', '78714903', '78894059', '80273311', '68354449', '81068565', '81481689', '68042457', '67945899', '81586513', '75819711', '78355437', '69686449', '81513805', '79263753', '81529999', '70191341', '81006729', '80480189', '81183183', '72305595', '81053257', '81577199', '81580685', '81647695', '79680837', '76882733', '79385785'

KeyboardInterrupt: 