In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_magicbricks(url, max_records=3000):
    records = []
    page_number = 1

    while len(records) < max_records:
        page_url = f"{url}&pageNum={page_number}"
        response = requests.get(page_url)
        soup = BeautifulSoup(response.text, 'html.parser')

        property_cards = soup.find_all('div', class_='mb-srp__card__container')
        estimate_cards = soup.find_all('div', class_='mb-srp__card__estimate')


        for property_card, estimate_card in zip(property_cards, estimate_cards):
            super_area_div = property_card.find('div', {'data-summary': 'super-area'})
            carpet_area_div = property_card.find('div', {'data-summary': 'carpet-area'})

            if super_area_div:
                super_area = super_area_div.find('div', class_='mb-srp__card__summary--value').text.strip()
                super_area = int(super_area.replace(' sqft', '').strip())
            else:
                surface_area = 0

            if carpet_area_div:
                carpet_area = carpet_area_div.find('div', class_='mb-srp__card__summary--value').text.strip()
                carpet_area = int(carpet_area.replace(' sqft', '').strip())
            else:
                carpet_area = 0
            price_span = estimate_card.find('span', class_='rupees')
            price = price_span.next_sibling.strip() if price_span else 0
            
            if 'Cr' in price:
                price = float(price.replace('Cr', '')) * 10**7
            elif 'Lac' in price:
                price = float(price.replace('Lac', '')) * 10**5

            size_div = estimate_card.find('div', class_='mb-srp__card__price--size')
            size = size_div.text.strip() if size_div else 0

            if size != 0:
                size = int(size.replace('₹', '').replace('per sqft', '').replace(',', '').strip())

            record = {
                'Title': get_text(property_card.find('h2', class_='mb-srp__card--title')),
                'Price': price,
                'Price per sqft': size,
                'Carpet area (sqft)': carpet_area,
                'Super area (sqft)': surface_area
            }
            records.append(record)


        page_number += 1

    return pd.DataFrame(records)

def get_text(element):
    try:
        return element.text.strip() if element else None
    except AttributeError:
        return None

url = 'https://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Pune'
df = scrape_magicbricks(url, max_records=300) 

df.head()

Unnamed: 0,Title,Price,Price per sqft,Carpet area (sqft),Super area (sqft)
0,"4 BHK Flat for Sale in Rahul Arcus, Baner, Pune",26700000.0,9780,2100,0
1,"2 BHK Flat for Sale in Vishal Leela Heights, W...",8200000.0,7455,786,0
2,"4 BHK Flat for Sale in Amar Landmark, Baner, Pune",90000000.0,19569,3407,0
3,"3 BHK Flat for Sale in Godrej Hillside, Mahalu...",10000000.0,7692,980,0
4,"1 BHK Flat for Sale in Paranjape Blue Ridge, H...",4500000.0,7858,440,0


In [2]:
import re
location_regex = re.compile(r',\s*(\w+),\s*Pune')
project_regex = re.compile(r'in\s(.+?),\s*\w+,?\s*Pune')
bhk_regex = re.compile(r'(\d+)\sBHK')

df['Location'] = df['Title'].apply(lambda x: location_regex.search(x).group(1) if location_regex.search(x) else None)
df['Project'] = df['Title'].apply(lambda x: project_regex.search(x).group(1) if project_regex.search(x) else None)
df['BHK'] = df['Title'].apply(lambda x: bhk_regex.search(x).group(1) if bhk_regex.search(x) else None)

df

Unnamed: 0,Title,Price,Price per sqft,Carpet area (sqft),Super area (sqft),Location,Project,BHK
0,"4 BHK Flat for Sale in Rahul Arcus, Baner, Pune",26700000.0,9780,2100,0,Baner,Rahul Arcus,4
1,"2 BHK Flat for Sale in Vishal Leela Heights, W...",8200000.0,7455,786,0,Wakad,Vishal Leela Heights,2
2,"4 BHK Flat for Sale in Amar Landmark, Baner, Pune",90000000.0,19569,3407,0,Baner,Amar Landmark,4
3,"3 BHK Flat for Sale in Godrej Hillside, Mahalu...",10000000.0,7692,980,0,Mahalunge,Godrej Hillside,3
4,"1 BHK Flat for Sale in Paranjape Blue Ridge, H...",4500000.0,7858,440,0,Hinjawadi,Paranjape Blue Ridge,1
...,...,...,...,...,...,...,...,...
295,"2 BHK Flat for Sale in ITrend Homes, Mahalunge...",7800000.0,7647,750,0,Mahalunge,ITrend Homes,2
296,"3 BHK Flat for Sale in Wakad, Pune",10000000.0,7407,0,0,,,3
297,"4 BHK Flat for Sale in Yoo Pune, Magarpatta, Pune",76000000.0,13509,5626,0,Magarpatta,Yoo Pune,4
298,"3 BHK Flat for Sale in Gera World of Joy, Khar...",14400000.0,8597,1240,0,Kharadi,Gera World of Joy,3


In [None]:
df.to_csv('magic_file.csv', index=False)