In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
# User Agent
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}

**Overview**<br><br>
This code sets up the necessary tools and headers to scrape web data. It imports libraries for sending HTTP requests, parsing HTML, and handling data. Additionally, it defines a user agent header to mimic a web browser, which can be useful to avoid getting blocked by some websites.



- **import requests**

    This line imports the requests module, which is a popular Python module used to send HTTP requests to websites.

- **from bs4 import BeautifulSoup**
    This line imports BeautifulSoup from the bs4 module. BeautifulSoup is a library that is used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree that can be used to extract data in a hierarchical and more readable manner.
    
- **import os**

    This line imports the os module, which provides a way of interacting with the operating system. This could be used for tasks like creating directories, reading environment variables, etc.
    
- **headers = {...}**

    This line defines a dictionary called headers with a 'User-Agent' key. The value of this key is a string that represents a user agent string.

    The user agent string is used to tell the server about the browser and operating system of the user. Some websites serve different content based on the user agent or even block certain user agents (often to prevent scraping). By defining a common browser's user agent string, this code is trying to mimic a real browser request to potentially avoid blocks or get the same content a real user would see.

### Extracting Flats/Apartments


In [None]:
flats = pd.DataFrame()

In [None]:

# Put start page number and end page number
start = 1 # Starting Page
end = 50 # End Page

csv_file = f"/content/drive/MyDrive/DSMP/Case Studies/Real estate/flats_appartment/flats_gurgaon_data-p{start}-{end}.csv"

pageNumber = start
req=0

city = 'gurgaon'

while pageNumber < end:
    i=1
    url = f'https://www.99acres.com/flats-in-{city}-ffid-page-{pageNumber}'
    page = requests.get(url, headers=headers)
    pageSoup = BeautifulSoup(page.content, 'html.parser')
    req+=1
    for soup in pageSoup.select_one('div[data-label="SEARCH"]').select('section[data-hydration-on-demand="true"]'):

    # Extract property name and property sub-name
        try:
            property_name = soup.select_one('a.srpTuple__propertyName').text.strip()
            # Extract link
            link = soup.select_one('a.srpTuple__propertyName')['href']
            society = soup.select_one('#srp_tuple_society_heading').text.strip()
        except:
            continue
        # Detail Page
        page = requests.get(link, headers=headers)
        dpageSoup = BeautifulSoup(page.content, 'html.parser')
        req += 1
        try:
            #price Range
            price = dpageSoup.select_one('#pdPrice2').text.strip()
        except:
            price = ''

        # Area
        try:
            area = soup.select_one('#srp_tuple_price_per_unit_area').text.strip()
        except:
            area =''
        # Area with Type
        try:
            areaWithType = dpageSoup.select_one('#factArea').text.strip()
        except:
            areaWithType = ''


        # Configuration
        try:
            bedRoom = dpageSoup.select_one('#bedRoomNum').text.strip()
        except:
            bedRoom = ''
        try:
            bathroom = dpageSoup.select_one('#bathroomNum').text.strip()
        except:
            bathroom = ''
        try:
            balcony = dpageSoup.select_one('#balconyNum').text.strip()
        except:
            balcony = ''

        try:
            additionalRoom = dpageSoup.select_one('#additionalRooms').text.strip()
        except:
            additionalRoom = ''


        # Address

        try:
            address = dpageSoup.select_one('#address').text.strip()
        except:
            address = ''
        # Floor Number
        try:
            floorNum = dpageSoup.select_one('#floorNumLabel').text.strip()
        except:
            floorNum = ''

        try:
            facing = dpageSoup.select_one('#facingLabel').text.strip()
        except:
            facing = ''

        try:
            agePossession = dpageSoup.select_one('#agePossessionLbl').text.strip()
        except:
            agePossession = ''

        # Nearby Locations

        try:
            nearbyLocations = [i.text.strip() for i in dpageSoup.select_one('div.NearByLocation__tagWrap').select('span.NearByLocation__infoText')]
        except:
            nearbyLocations = ''

        # Descriptions
        try:
            description = dpageSoup.select_one('#description').text.strip()
        except:
            description = ''

        # Furnish Details
        try:
            furnishDetails = [i.text.strip() for i in dpageSoup.select_one('#FurnishDetails').select('li')]
        except:
            furnishDetails = ''

        # Features
        if furnishDetails:
            try:
                features = [i.text.strip() for i in dpageSoup.select('#features')[1].select('li')]
            except:
                features = ''
        else:
            try:
                features = [i.text.strip() for i in dpageSoup.select('#features')[0].select('li')]
            except:
                features = ''



        # Rating by Features
        try:
            rating = [i.text for i in dpageSoup.select_one('div.review__rightSide>div>ul>li>div').select('div.ratingByFeature__circleWrap')]
        except:
            rating = ''
        # print(top_f)

        try:
            # Property ID
            property_id = dpageSoup.select_one('#Prop_Id').text.strip()
        except:
            property_id = ''

        # Create a dictionary with the given variables
        property_data = {
        'property_name': property_name,
        'link': link,
        'society': society,
        'price': price,
        'area': area,
        'areaWithType': areaWithType,
        'bedRoom': bedRoom,
        'bathroom': bathroom,
        'balcony': balcony,
        'additionalRoom': additionalRoom,
        'address': address,
        'floorNum': floorNum,
        'facing': facing,
        'agePossession': agePossession,
        'nearbyLocations': nearbyLocations,
        'description': description,
        'furnishDetails': furnishDetails,
        'features': features,
        'rating': rating,
        'property_id': property_id
    }


        temp_df = pd.DataFrame.from_records([property_data])
        # print(temp_df)
        flats = pd.concat([flats, temp_df], ignore_index=True)
        i += 1
        if os.path.isfile(csv_file):
        # Append DataFrame to the existing file without header
            temp_df.to_csv(csv_file, mode='a', header=False, index=False)
        else:
            # Write DataFrame to the file with header
            temp_df.to_csv(csv_file, mode='a', header=True, index=False)

        if req % 4==0:
            time.sleep(10)
        if req % 15 == 0:
            time.sleep(50)
    print(f'{pageNumber} -> {i}')
    pageNumber += 1



127 -> 26
128 -> 26
129 -> 26


AttributeError: ignored

**Overview**<br><br>
The code scrapes property data from the website "99acres.com" for apartments in Gurgaon. It navigates through a range of pages, extracts details of each property, and saves the data to a CSV file. The script is designed to handle potential errors gracefully, using try and except blocks to manage missing data, and introduces pauses to avoid making rapid requests and potentially getting blocked by the website.




- **Initialization of Variables**:

    - start and end specify the range of web pages to scrape.
    - csv_file defines the path to the CSV file where data will be saved.
    - pageNumber starts from the initial value of start and will be incremented to navigate through the pages.
    - req counts the number of HTTP requests made.

- **Loop for Page Navigation**:

    - The while loop is used to navigate through each page in the range from start to end.
    - Inside this loop, the URL of the page to be scraped is constructed using the pageNumber.
    - An HTTP GET request is made to retrieve the content of the page, and the content is then parsed using BeautifulSoup.

- **Loop for Property Extraction**:

    - The nested for loop navigates through individual property sections on the current page.
    - The script attempts to extract the property name, its link, and its society name.
    - If any of these attributes are missing, it skips to the next property.

- **Detail Extraction**:

    - For each property, an HTTP request is made to its detail page.
    - The code then attempts to extract various property details like price, area, bedroom count, bathroom count, balcony count, address, and many other attributes. If any attribute is missing, the code handles it gracefully, assigning an empty string or an empty list as appropriate.

- **Creating and Saving Data**:

    - All extracted details are stored in a dictionary named property_data.
    - This dictionary is then converted to a temporary DataFrame temp_df.
    - The data is appended to a main DataFrame flats and also saved to the CSV file. If the file already exists, the new data is appended without writing the headers again.

- **Request Management**:

    - To avoid making too many rapid requests (which can lead to IP bans), the script introduces pauses.
    - Every 4 requests, it pauses for 10 seconds. Every 15 requests, it pauses for 50 seconds.

- **Page Counter and Loop Increment**:

    - After scraping all properties on a page, the code prints the page number and the number of properties processed.
    - pageNumber is incremented to move to the next page.


In [None]:
flats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   property_name    75 non-null     object
 1   link             75 non-null     object
 2   society          75 non-null     object
 3   price            75 non-null     object
 4   area             75 non-null     object
 5   areaWithType     75 non-null     object
 6   bedRoom          75 non-null     object
 7   bathroom         75 non-null     object
 8   balcony          75 non-null     object
 9   additionalRoom   75 non-null     object
 10  address          75 non-null     object
 11  floorNum         75 non-null     object
 12  facing           75 non-null     object
 13  agePossession    75 non-null     object
 14  nearbyLocations  75 non-null     object
 15  description      75 non-null     object
 16  furnishDetails   75 non-null     object
 17  features         75 non-null     obje

In [None]:
flats

Unnamed: 0,property_name,link,society,price,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id
0,1 BHK Flat in Sector 40 Gurgaon,https://www.99acres.com/1-bhk-bedroom-apartmen...,LIG Flat,1.5 Crore,"₹ 50,000/sq.ft.",Built Up area: 300 (27.87 sq.m.),1 Bedroom,1 Bathroom,No Balcony,,"Lig Flat, Sector 40, Gurgaon, Sector 40 Gurgao...",Ground of 1 Floors,,undefined,"[Huda city centre metro station, Axis bank ATM...",Looking for a 1 bhk apartment for sale in gurg...,"[1 Light, No AC, No Bed, No Chimney, No Curtai...",,"[Safety4 out of 5, Lifestyle4 out of 5, Enviro...",E64375676
1,2 BHK Flat in Sector 3 Gurgaon,https://www.99acres.com/2-bhk-bedroom-apartmen...,Apna Enclave,55 Lac,"₹ 5,789/sq.ft.",Super Built up area 950(88.26 sq.m.)Built Up a...,2 Bedrooms,2 Bathrooms,2 Balconies,,"Near Railway Station, Sector 3 Gurgaon, Gurgao...",2nd of 3 Floors,North-East,10+ Year Old,"[Palam Vihar Vyapar kendra, Palam triangle, Ch...",Residential apartment for sale in apna enclave...,"[2 Wardrobe, 4 Fan, 1 Exhaust Fan, 1 Geyser, 5...","[Feng Shui / Vaastu Compliant, Security / Fire...","[Safety4.5 out of 5, Lifestyle4 out of 5, Envi...",L64373292
2,3 BHK Flat in Sector 108 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Raheja Vedaanta3.6 ★,1.25 Crore,"₹ 5,681/sq.ft.",Super Built up area 2200(204.39 sq.m.),3 Bedrooms,2 Bathrooms,No Balcony,,"Sector 108 Gurgaon, Gurgaon, Haryana",Ground of 22 Floors,,1 to 5 Year Old,,Residential apartment for sell.Located on grou...,"[1 Wardrobe, 1 Fan, 1 Exhaust Fan, 1 Geyser, 1...","[Power Back-up, Feng Shui / Vaastu Compliant, ...","[Management3 out of 5, Green Area4 out of 5, C...",C59958296
3,2 BHK Flat in Dharam Colony,https://www.99acres.com/2-bhk-bedroom-apartmen...,Shri Balaji Apartment,35 Lac,"₹ 4,037/sq.ft.",Super Built up area 867(80.55 sq.m.)Built Up a...,2 Bedrooms,2 Bathrooms,1 Balcony,,Flat No 9 Balaji Apartment 2 Shiv Temple Road ...,1st of 4 Floors,North,5 to 10 Year Old,"[Palam triangle, Palam Vihar Vyapar kendra, HU...","Location is very prime.Market, schools,public ...",[],"[Security / Fire Alarm, Water purifier, Water ...","[Safety4 out of 5, Lifestyle4 out of 5, Enviro...",V39804453
4,3 BHK Flat in Sector 70 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Krrish Florence Estate,1.3 Crore,"₹ 5,600/sq.ft.",Built Up area: 1865 (173.26 sq.m.),3 Bedrooms,3 Bathrooms,3 Balconies,,"Sector 70 Gurgaon, Gurgaon, Haryana",17th of 25 Floors,,Under Construction,"[Omaxe Gurgaon Mall, NH248A, The Vivekananda S...",Residential apartment for sell. The property h...,"[4 Wardrobe, 1 Exhaust Fan, 3 Geyser, 5 AC, 1 ...",,"[Safety3.5 out of 5, Lifestyle4 out of 5, Envi...",N25060831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,3 BHK Flat in Sector 59 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Conscient Elevate,,"₹ 17,148/sq.ft.",,,,,,,,,,,,,,,
71,3 BHK Flat in Sector 61 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Puri The Aravallis,,"₹ 16,940/sq.ft.",,,,,,,,,,,,,,,
72,4 BHK Flat in Sector 54 Gurgaon,https://www.99acres.com/4-bhk-bedroom-apartmen...,DLF Park Place4.1 ★,,"₹ 22,969/sq.ft.",,,,,,,,,,,,,,,
73,3 BHK Flat in Sector 59 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Mahindra Luminare4.0 ★,,"₹ 16,419/sq.ft.",,,,,,,,,,,,,,,


In [None]:
def combine_csv_files(folder_path, combined_file_path):
    combined_data = pd.DataFrame()  # Create an empty DataFrame to hold the combined data

    # Iterate through all CSV files in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            print('file_path')
            # Read the data from the current CSV file
            df = pd.read_csv(file_path)

            # Append the data to the combined DataFrame
            combined_data = combined_data.append(df, ignore_index=True)

            # Delete the original CSV file
            os.remove(file_path)

    # Save the combined data to a new CSV file
    combined_data.to_csv(combined_file_path, index=False)

# Example usage:

# Replace with the actual folder path
folder_path = '/content/drive/MyDrive/DSMP/Case Studies/Real estate/flats_appartment'

# Replace with the desired combined file path
combined_file_path = '/content/drive/MyDrive/DSMP/Case Studies/Real estate/flats_appartment/flats.csv'

combine_csv_files(folder_path, combined_file_path)


file_path


  combined_data = combined_data.append(df, ignore_index=True)
  combined_data = combined_data.append(df, ignore_index=True)


file_path


**Overview**:
The function combine_csv_files combines all the CSV files located in a specified folder into a single CSV file. After appending the data from each individual file to the combined file, the original file is deleted.


**Function Definition**:

_combine_csv_files(folder_path, combined_file_path)_:<br>
- _folder_path_: Path to the folder containing the CSV files you want to combine.
- _combined_file_path_: Path where the combined CSV file should be saved.

**Initialize an Empty DataFrame**:

- combined_data = pd.DataFrame(): An empty DataFrame combined_data is created to hold all the data from the individual CSV files.

**Iterate Through CSV Files**:

- The for loop iterates over each file in the directory specified by folder_path.
- Within the loop, the code checks if the current file ends with .csv to ensure that only CSV files are processed.

**Read and Append Data**:

- file_path = os.path.join(folder_path, file_name): Constructs the full path to the current CSV file.
- df = pd.read_csv(file_path): Reads the data from the current CSV file into a DataFrame df.
- combined_data = combined_data.append(df, ignore_index=True): Appends the data from df to the combined_data DataFrame. The ignore_index=True parameter ensures that the index is reset and continuous in the combined data.

**Delete the Original CSV File**:

- os.remove(file_path): Deletes the original CSV file after its data has been appended to the combined data. This step helps in conserving storage space.

**Save the Combined Data**:

- combined_data.to_csv(combined_file_path, index=False): Writes the combined_data DataFrame to a new CSV file at the specified combined_file_path. The parameter index=False ensures that the DataFrame's index is not written to the CSV.

**Example Usage**:

- The provided paths (folder_path and combined_file_path) specify the location of the individual CSV files and the path for the combined CSV file, respectively.
- Calling the combine_csv_files function with these paths will combine all CSV files in the specified folder and save the combined data to the desired location.

In [None]:
pd.read_csv(combined_file_path)

Unnamed: 0,property_name,link,society,price,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id
0,2 BHK Flat in Krishna Colony,https://www.99acres.com/2-bhk-bedroom-apartmen...,maa bhagwati residency,45 Lac,"₹ 5,000/sq.ft.",Carpet area: 900 (83.61 sq.m.),2 Bedrooms,2 Bathrooms,1 Balcony,,"Krishna Colony, Gurgaon, Haryana",4th of 4 Floors,West,1 to 5 Year Old,"['Chintapurni Mandir', 'State bank ATM', 'Pear...",So with lift.Maa bhagwati residency is one of ...,"['3 Fan', '4 Light', '1 Wardrobe', 'No AC', 'N...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Environment4 out of 5', 'Safety4 out of 5', ...",C68850746
1,2 BHK Flat in Ashok Vihar,https://www.99acres.com/2-bhk-bedroom-apartmen...,Apna Enclave,50 Lac,"₹ 7,692/sq.ft.",Carpet area: 650 (60.39 sq.m.),2 Bedrooms,2 Bathrooms,1 Balcony,,"46b, Ashok Vihar, Gurgaon, Haryana",1st of 3 Floors,West,10+ Year Old,"['Chintapurni Mandir', 'Sheetla Mata Mandir', ...","Property situated on main road, railway statio...","['3 Wardrobe', '4 Fan', '1 Exhaust Fan', '1 Ge...","['Security / Fire Alarm', 'Maintenance Staff',...","['Environment4 out of 5', 'Safety4 out of 5', ...",H68850564
2,2 BHK Flat in Sohna,https://www.99acres.com/2-bhk-bedroom-apartmen...,Tulsiani Easy in Homes,40 Lac,"₹ 6,722/sq.ft.",Carpet area: 595 (55.28 sq.m.),2 Bedrooms,2 Bathrooms,3 Balconies,,"Sohna, Gurgaon, Haryana",12nd of 14 Floors,,0 to 1 Year Old,"['Huda City Metro', 'Golf Course extn road', '...","This property is 15 km away from badshapur, gu...",,"['Power Back-up', 'Feng Shui / Vaastu Complian...","['Environment4 out of 5', 'Safety4 out of 5', ...",J68850120
3,2 BHK Flat in Sector 61 Gurgaon,https://www.99acres.com/2-bhk-bedroom-apartmen...,Smart World Orchard,1.47 Crore,"₹ 12,250/sq.ft.",Carpet area: 1200 (111.48 sq.m.),2 Bedrooms,2 Bathrooms,2 Balconies,Study Room,"Sector 61 Gurgaon, Gurgaon, Haryana",2nd of 4 Floors,,Dec 2023,"['Sector 55-56 Metro station', 'Bestech Centra...",Near to metro station of sector 56 and opposit...,,"['Security / Fire Alarm', 'Private Garden / Te...",,S68849476
4,2 BHK Flat in Sector 92 Gurgaon,https://www.99acres.com/2-bhk-bedroom-apartmen...,Parkwood Westend,70 Lac,"₹ 5,204/sq.ft.",Super Built up area 1345(124.95 sq.m.),2 Bedrooms,2 Bathrooms,3 Balconies,Study Room,"Sector 92 Gurgaon, Gurgaon, Haryana",5th of 8 Floors,,Under Construction,"['Yadav Clinic', 'Bangali Clinic', 'Dr. J. S. ...",We are the proud owners of this 2 bhk alongwit...,[],,"['Environment5 out of 5', 'Safety3 out of 5', ...",L47956793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3033,3 BHK Flat in Sector 59 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Conscient Elevate,,"₹ 17,148/sq.ft.",,,,,,,,,,,,,,,
3034,3 BHK Flat in Sector 61 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Puri The Aravallis,,"₹ 16,940/sq.ft.",,,,,,,,,,,,,,,
3035,4 BHK Flat in Sector 54 Gurgaon,https://www.99acres.com/4-bhk-bedroom-apartmen...,DLF Park Place4.1 ★,,"₹ 22,969/sq.ft.",,,,,,,,,,,,,,,
3036,3 BHK Flat in Sector 59 Gurgaon,https://www.99acres.com/3-bhk-bedroom-apartmen...,Mahindra Luminare4.0 ★,,"₹ 16,419/sq.ft.",,,,,,,,,,,,,,,
