In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Column accumulators. Every listing card appends exactly one value to each
# list, so all columns of the final DataFrame end up with the same length.
project = []
carpet_area = []
super_area = []
bhk = []
area = []
price = []
total_floor = []

# Seconds to wait for a single page before giving up on it.
REQUEST_TIMEOUT = 30


def _summary_text(card, key, default):
    """Return the stripped text of the card's `data-summary=key` item, or `default` if it is absent."""
    elem = card.find('div', {'data-summary': key, 'class': 'mb-srp__card__summary__list--item'})
    return elem.text.strip() if elem else default


# NOTE(review): the loop requests page=0..99; whether page=0 and page=1 return
# the same listings is not visible here -- confirm to avoid duplicated rows.
for page in range(100):
    url = "https://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom=&proptype=Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment&cityName=Pune&page={}".format(page)
    try:
        # Without a timeout a single stalled connection would hang the whole scrape.
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Best effort: keep what was already collected and move on to the next page.
        print("Skipping page {}: {}".format(page, exc))
        continue
    soup = BeautifulSoup(r.content, 'html.parser')

    cards = soup.find_all('div', class_='mb-srp__card__container')
    for i in cards:
        project_elem = i.find('span', class_="mb-srp__card__developer--name--highlight")
        bhk_elem = i.find('h2', class_="mb-srp__card--title")

        project.append(project_elem.text.strip() if project_elem else 'Unavailable')

        # Missing areas keep the original 0 placeholder; the preprocessing
        # cells turn both the text labels and the placeholder into integers.
        carpet_area.append(_summary_text(i, 'carpet-area', 0))
        super_area.append(_summary_text(i, 'super-area', 0))

        title = bhk_elem.text.strip() if bhk_elem else ''
        # The first character of the title is stored as the bhk value
        # (presumably the bedroom count of an "N BHK ..." title -- confirm).
        bhk.append(title[0] if title else 'N/A')
        # Everything after the FIRST 'in ' is the address. partition() avoids
        # the IndexError of split('in ')[1] on titles without 'in ', and does
        # not cut the address at a later 'in ' (e.g. inside "Main Road").
        _, sep, rest = title.partition('in ')
        area.append(rest.strip() if sep else 'N/A')

        total_floor.append(_summary_text(i, 'floor', 'N/A'))

    # Prices live in separate 'estimate' blocks. Collect them per page and
    # pad/truncate to the number of cards on that page, so a page with a
    # missing or extra estimate block cannot leave `price` with a different
    # length than the other columns (which would make pd.DataFrame raise).
    # NOTE(review): this assumes estimate blocks appear in the same order as
    # the cards -- confirm against the page markup.
    page_prices = []
    for j in soup.find_all('div', class_="mb-srp__card__estimate"):
        price_elem = j.find('div', class_="mb-srp__card__price--amount")
        page_prices.append(price_elem.text.strip() if price_elem else 'N/A')
    n_cards = len(cards)
    page_prices = page_prices[:n_cards] + ['N/A'] * (n_cards - len(page_prices))
    price.extend(page_prices)

# Create a DataFrame using the extracted data
data = {
    'project': project,
    'carpet_area': carpet_area,
    'super_area' : super_area,
    'bhk': bhk,
    'area': area,
    'price': price,
    'floors' : total_floor
}

df = pd.DataFrame(data)
print(df.head())

             project           carpet_area           super_area bhk  \
0        Gagan Aviva   Carpet Area356 sqft                    0   2   
1  Purva Emerald Bay                     0  Super Area1860 sqft   3   
2       Chandrakamal   Carpet Area938 sqft                    0   2   
3           MJ Opera   Carpet Area825 sqft                    0   2   
4  Kingston Atlantis  Carpet Area1145 sqft                    0   3   

                                            area      price floors  
0                     Gagan Aviva, Kesnand, Pune  ₹42.4 Lac    N/A  
1  Purva Emerald Bay, Keshav Nagar Mundhwa, Pune   ₹1.30 Cr    N/A  
2             Chandrakamal, Shukrawar Peth, Pune   ₹1.76 Cr    N/A  
3                          MJ Opera, Wakad, Pune    ₹91 Lac    N/A  
4      Kingston Atlantis, NIBM Annexe Area, Pune   ₹1.09 Cr    N/A  


## Preprocessing data

In [2]:
# Keep only the first run of digits from labels such as 'Carpet Area356 sqft'.
# Casting to str first lets the 0 placeholders used for missing areas (and an
# already-converted integer column, if this cell is re-run) pass through `.str`.
df['carpet_area'] = (
    df['carpet_area']
    .astype(str)
    # Raw string: '\d' is an invalid escape sequence in a plain literal.
    # expand=False returns a Series rather than a one-column DataFrame.
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(0)  # values without digits become 0
    .astype(int)
)

In [3]:
# Keep only the first run of digits from labels such as 'Super Area1860 sqft'.
# Casting to str first lets the 0 placeholders used for missing areas (and an
# already-converted integer column, if this cell is re-run) pass through `.str`.
df['super_area'] = (
    df['super_area']
    .astype(str)
    # Raw string: '\d' is an invalid escape sequence in a plain literal.
    # expand=False returns a Series rather than a one-column DataFrame.
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(0)  # values without digits become 0
    .astype(int)
)

In [4]:
def _second_address_part(address):
    """Return the second ', '-separated part of `address` when it has more than two parts.

    Addresses with one or two parts are returned unchanged.
    """
    parts = address.split(', ')
    if len(parts) > 2:
        return parts[1]
    return address


# e.g. 'Gagan Aviva, Kesnand, Pune' -> 'Kesnand'
df['area'] = df['area'].apply(_second_address_part)

In [5]:
def convert_currency_to_numeric(price, default=0.0):
    """Convert a price label such as '₹1.30 Cr' or '₹42.4 Lac' to a float amount.

    Parameters
    ----------
    price : str or number
        Price text. The '₹' symbol and thousands separators are ignored;
        a 'Cr' suffix multiplies by 10,000,000 and a 'Lac' suffix by 100,000.
        A value that is already numeric is returned as a float, so the
        conversion can be applied again to an already-converted column.
    default : float, optional
        Value returned when `price` cannot be parsed (for example the 'N/A'
        placeholder). Defaults to 0.0.

    Returns
    -------
    float
        The numeric amount, or `default` when the input is not parseable.
    """
    if not isinstance(price, str):
        # Already numeric (or None/unsupported): convert if possible.
        try:
            return float(price)
        except (TypeError, ValueError):
            return default

    text = price.replace('₹', '').replace(',', '').strip()

    multiplier = 1
    # 'Cr' is checked before 'Lac', matching the original branch order.
    for unit, factor in (('Cr', 10000000), ('Lac', 100000)):
        if unit in text:
            text = text.replace(unit, '')
            multiplier = factor
            break

    try:
        return float(text) * multiplier
    except ValueError:
        # Placeholders such as 'N/A' or free text carry no numeric amount.
        return default
    
# Apply the conversion function to the 'price' column.
# Round before the integer cast: float products such as 8.7 * 100000 come out
# as 869999.9999999999, which a bare astype('int') would truncate to 869999.
df['price'] = df['price'].apply(convert_currency_to_numeric).round().astype('int')

In [7]:
def _last_token(text):
    """Return the last whitespace-separated token of `text`."""
    tokens = text.split()
    return tokens[-1]


# Keep only the final word of each floor label ('N/A' stays 'N/A').
df['floors'] = df['floors'].apply(_last_token)

In [8]:
# Write the cleaned listings to disk without the DataFrame index column.
output_path = 'house_prices.csv'
df.to_csv(output_path, index=False)