In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time

# Request headers sent with every HTTP call: a desktop-Chrome User-Agent
# plus an Accept-Language preference.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

## Overview 
This code sets up the necessary tools and headers to scrape web data. It imports libraries for sending HTTP requests, parsing HTML, and handling data. Additionally, it defines a user agent header to mimic a web browser, which can be useful to avoid getting blocked by some websites.
- **import requests**
This line imports the requests module, which is a popular Python module used to send HTTP requests to websites.
- **from bs4 import BeautifulSoup** 
This line imports BeautifulSoup from the bs4 module. BeautifulSoup is a library that is used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree that can be used to extract data in a hierarchical and more readable manner.
- **import os**
This line imports the os module, which provides a way of interacting with the operating system. It can be used for tasks like creating directories or reading environment variables; in this notebook it is used to check whether the output CSV file already exists.
- **headers = {...}**
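This line defines a dictionary called headers with two keys: 'User-Agent' and 'Accept-Language'. The value of the 'User-Agent' key is a string that represents a user agent string, and the 'Accept-Language' value tells the server which languages the client prefers.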
The user agent string is used to tell the server about the browser and operating system of the user. Some websites serve different content based on the user agent or even block certain user agents (often to prevent scraping). By defining a common browser's user agent string, this code is trying to mimic a real browser request to potentially avoid blocks or get the same content a real user would see.
  

### Extracting Flats/Apartments

In [5]:
# In-memory table of every scraped listing; the scraping loop further down
# appends rows to it. Starts empty.
flats = pd.DataFrame()

In [7]:
# --- Scrape configuration -------------------------------------------------
# City slug used both in the listing URL and in the output file name.
city = 'gurgaon'

# Page range to scrape. The scraping loop runs while `pageNumber < end`,
# so `end` itself is NOT fetched (half-open range [start, end)).
start = 127
end = 200

# Output CSV for this page range. The name is derived from `city` so that
# changing the city cannot silently write into a file labelled for another one.
csv_file = f"Real Estate Project/flats_{city}_data-p{start}-{end}.csv"

# Mutable loop state, kept at module level so an interrupted run can be
# resumed from the page it stopped on.
pageNumber = start  # next listing page to fetch
req = 0             # number of HTTP requests issued so far (drives throttling)

In [None]:
# Seconds to wait for a server response before giving up on a request.
REQUEST_TIMEOUT = 30


def _text_or_blank(node, selector):
    """Return the stripped text of the first element under `node` matching
    `selector`, or '' when nothing matches."""
    match = node.select_one(selector)
    return match.text.strip() if match is not None else ''


def _nth_match(node, selector, index):
    """Return the `index`-th element under `node` matching `selector`,
    or None when there are not that many matches."""
    matches = node.select(selector)
    return matches[index] if index < len(matches) else None


def _texts_within(container, item_selector):
    """Return the stripped text of every `item_selector` match inside `container`.

    Returns '' (not an empty list) when `container` is None, which is the
    placeholder this notebook writes to the CSV for a missing section.
    """
    if container is None:
        return ''
    return [item.text.strip() for item in container.select(item_selector)]


# Create the output folder up front so the first CSV append cannot fail
# because the directory does not exist yet.
os.makedirs(os.path.dirname(csv_file) or '.', exist_ok=True)

# Scrape listing pages [pageNumber, end). Network errors on a listing page are
# allowed to propagate: `pageNumber` is only advanced after a page completes,
# so re-running the cell resumes from the page that failed.
while pageNumber < end:
    # 1-based position of the next row to be saved from this page
    # (so the value printed at the end is rows saved + 1).
    i = 1
    url = f"https://www.99acres.com/flats-in-{city}-ffid-page-{pageNumber}"
    page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    pageSoup = BeautifulSoup(page.content, 'html.parser')
    req += 1

    # select_one returns None when the container is absent (e.g. blocked
    # request or a page past the last one); stop with a clear message
    # instead of failing on None.
    results = pageSoup.select_one('div[data-label="SEARCH"]')
    if results is None:
        print(f'{pageNumber} -> search results container not found (HTTP {page.status_code}); stopping')
        break

    page_records = []
    try:
        for soup in results.select('section[data-hydration-on-demands="true"]'):
            # Property name, detail-page link and society are mandatory:
            # tuples missing any of them are skipped.
            name_tag = soup.select_one('a.srpTuple__propertyName')
            society_tag = soup.select_one('#srp_tuple_society_heading')
            link = name_tag.get('href') if name_tag is not None else None
            if not link or society_tag is None:
                continue
            property_name = name_tag.text.strip()
            society = society_tag.text.strip()

            # Detail page. A failed request skips this listing only.
            try:
                detail_page = requests.get(link, headers=headers, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                req += 1
                print(f'{pageNumber} -> skipped {link}: {exc}')
                continue
            req += 1
            dpageSoup = BeautifulSoup(detail_page.content, 'html.parser')

            # Furnish details decide which '#features' block holds the
            # feature list: the second match when furnish details are
            # present, otherwise the first.
            furnishDetails = _texts_within(dpageSoup.select_one('#FurnishDetails'), 'li')
            features_index = 1 if furnishDetails else 0
            features = _texts_within(_nth_match(dpageSoup, '#features', features_index), 'li')

            # One record per listing; optional fields fall back to ''.
            property_data = {
                'property_name': property_name,
                'link': link,
                'society': society,
                'price': _text_or_blank(dpageSoup, '#pdPrice2'),
                'area': _text_or_blank(dpageSoup, '#srp_tuple_price_per_unit_area'),
                'areaWithType': _text_or_blank(dpageSoup, '#factArea'),
                'bedRoom': _text_or_blank(dpageSoup, '#bedRoomNum'),
                'bathroom': _text_or_blank(dpageSoup, '#bathroomNum'),
                'balcony': _text_or_blank(dpageSoup, '#balconyNum'),
                'additionalRoom': _text_or_blank(dpageSoup, '#additionalRooms'),
                'address': _text_or_blank(dpageSoup, '#address'),
                'floorNum': _text_or_blank(dpageSoup, '#floorNumLabel'),
                'facing': _text_or_blank(dpageSoup, '#facingLabel'),
                'agePossession': _text_or_blank(dpageSoup, '#agePossessionaLbl'),
                'nearbyLocations': _texts_within(
                    dpageSoup.select_one('div.NearByLocation__tagWrap'),
                    'span.NearByLocation__infoText',
                ),
                'description': _text_or_blank(dpageSoup, '#description'),
                'furnishDetails': furnishDetails,
                'features': features,
                'rating': _texts_within(
                    dpageSoup.select_one('div.review__rightSide>div>ul>li>div'),
                    'div.ratingByFeature__circleWrap',
                ),
                'property_id': _text_or_blank(dpageSoup, '#Prop_Id'),
            }
            page_records.append(property_data)
            i += 1

            # Persist each row immediately so an interrupted run loses nothing.
            # The header is written only when the file is being created.
            temp_df = pd.DataFrame.from_records([property_data])
            temp_df.to_csv(
                csv_file,
                mode='a',
                header=not os.path.isfile(csv_file),
                index=False,
            )

            # Throttling: pause 10 s after every 4th request and a further
            # 50 s after every 15th (both apply when both conditions hold).
            if req % 4 == 0:
                time.sleep(10)
            if req % 15 == 0:
                time.sleep(50)
    finally:
        # Merge this page's rows into `flats` once per page rather than once
        # per row; runs even on interruption so `flats` matches the CSV.
        if page_records:
            flats = pd.concat(
                [flats, pd.DataFrame.from_records(page_records)],
                ignore_index=True,
            )

    print(f'{pageNumber} -> {i}')
    pageNumber += 1
