## 1. Import Necessary Libraries

In [1]:
import timeit
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

### 2. Scraping the Properties

In [2]:
start = 101
stop  = 111

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

#### 2.1 Define a function for each Colums

In [3]:
def property_nane(soupy_object):   # return house or property name
    try:
        name = soupy_object.find('span', attrs = {'class':'undefined'}).text
    except:
        name = None
    return name

def address_details(soupy_object):   # return address of property
    try:
        address = soupy_object.find('i', attrs = {'id':'address'}).text
    except:
        address = None
    return address

def total_price(soupy_object):   # return total price of property
    try:
        price = soupy_object.find('span', attrs = {'id':"pdPrice2"}).text
    except:
        price = None
    return price

def rate_sqft(soupy_object):   # return total price of property
    try:
        rate = soupy_object.find('div', attrs = {'id':"pricePerUnitArea"}).text.split(' ')[1]
    except:
        rate = None
    return rate

def area_type(soupy_object):   # return area parameters
    try:
        areatyp = soupy_object.find('div', attrs = {'id':'factArea'}).text
    except:
        areatyp = None
    return areatyp

def bedroom_count(soupy_object):   # return number of bedrooms
    try:
        bedroom = soupy_object.find('span', attrs={"id":"bedRoomNum"}).text.split(' ')[0]
    except:
        bedroom = None
    return bedroom

def bathroom_count(soupy_object):   # return number of bathrooms
    try:
        bathroom =  soupy_object.find('span', attrs= {'id':'bathroomNum'}).text.split(' ')[0]
    except:
        bathroom = None
    return bathroom

def floor_num(soupy_object):   # return number of floor
    try:
        floornum = soupy_object.find('span', attrs = {'id':'floorNumLabel'}).text.split(' ')[0]
    except:
        floornum = None
    return floornum

def property_age(soupy_object):   # return age of property
    try:
        age = soupy_object.find('span', attrs ={'id':'agePossessionLbl'}).text
    except:
        age = None
    return age

def availability(soupy_object):   # return area parameters
    try:
        avail = soupy_object.find('span', attrs = {'id':'Availability_Lbl'}).text
    except:
        avail = None
    return avail

#### 2.2 Return a Complete DataFrame into CSV File

In [4]:
data_list = []
def get_all(start, stop):
    for pagenubmer in range(start, stop):
        url = f'https://www.99acres.com/property-in-mumbai-ffid-page-{pagenubmer}'
        req = get(url, headers = headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        links = soup.find_all('a', attrs= {"class":"body_med srpTuple__propertyName"})
        
        for k, item in enumerate(links):
            main_url = 'https://www.99acres.com'
            sub_url = item.get('href')
            data_url = main_url + sub_url
            request = get(data_url, headers=headers)
            soup_get = BeautifulSoup(request.content, 'html.parser')
            
            name = property_nane(soup_get)
            address = address_details(soup_get)
            price = total_price(soup_get)
            rate = rate_sqft(soup_get)
            areatyp = area_type(soup_get)
            bedroom = bedroom_count(soup_get)
            bathroom = bathroom_count(soup_get)
            floornum = floor_num(soup_get)
            age = property_age(soup_get)
            avail = availability(soup_get)
        
            data = {'Property_Name': name, 'Location': address, 'Price':price, 'Rate_SqFt':rate, 'Area_Tpye':areatyp,
                    'Bedroom': bedroom, 'Bathroom':bathroom, 'Floor_No':floornum, 'Property_Age':age, 'Availability':avail}
            data_list.append(data)

        timestart = timeit.default_timer()
        timestop = timeit.default_timer()
        print(f'You scraped page no : {pagenubmer}')
        print('Time :', timestop - timestart)
        
    return data_list

#### 2.3 Define a DataFrame

In [5]:
df1 = pd.DataFrame(get_all(start, stop))

You scraped page no : 101
Time : 9.999999406318238e-08
You scraped page no : 102
Time : 4.999999987376214e-07
You scraped page no : 103
Time : 6.999999868639861e-07
You scraped page no : 104
Time : 5.000000555810402e-07
You scraped page no : 105
Time : 6.000000212225132e-07
You scraped page no : 106
Time : 6.999999868639861e-07
You scraped page no : 107
Time : 8.000000661922968e-07
You scraped page no : 108
Time : 8.99999918146932e-07
You scraped page no : 109
Time : 9.000000318337698e-07
You scraped page no : 110
Time : 9.000000318337698e-07


#### 2.4 Data Understanding

In [6]:
df1.head()

Unnamed: 0,Property_Name,Location,Price,Rate_SqFt,Area_Tpye,Bedroom,Bathroom,Floor_No,Property_Age,Availability
0,Yogsiddhi Sumukh Hills,"703, Kandivali East, Mumbai Andheri-Dahisar, M...",1.65 Crore,26612,Carpet area: 620 (57.6 sq.m.),2,2,7th,0 to 1 Year Old,Ready to move
1,Raunak Unnathi Woods Phase 6,"Thane West, Thane, Mumbai",85 Lac,8594,Super Built up area 989(91.88 sq.m.)Carpet are...,2,2,5th,1 to 5 Year Old,Ready to move
2,Prasham Vishal 2,"602, Borivali West, Mumbai Andheri-Dahisar, Mu...",3.05 Crore,33738,Carpet area: 904 (83.98 sq.m.),3,3,6th,1 to 5 Year Old,Ready to move
3,Toll Free 1800 41 99099,"Malad West, Mumbai Andheri-Dahisar, Mumbai",72 Lac,14400,Carpet area: 500 (46.45 sq.m.),1,1,5th,10+ Year Old,Ready to move
4,Vijay Park,"Kasar vadavali, Thane, Mumbai",48.5 Lac,8083,Super Built up area 600(55.74 sq.m.)Carpet are...,1,1,3rd,10+ Year Old,Ready to move


In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175 entries, 0 to 174
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Property_Name  160 non-null    object
 1   Location       160 non-null    object
 2   Price          160 non-null    object
 3   Rate_SqFt      160 non-null    object
 4   Area_Tpye      160 non-null    object
 5   Bedroom        159 non-null    object
 6   Bathroom       159 non-null    object
 7   Floor_No       159 non-null    object
 8   Property_Age   160 non-null    object
 9   Availability   160 non-null    object
dtypes: object(10)
memory usage: 13.8+ KB


In [8]:
df1.duplicated().sum()

14

In [9]:
df1.nunique()

Property_Name     95
Location         121
Price            111
Rate_SqFt        155
Area_Tpye        144
Bedroom            5
Bathroom           5
Floor_No          35
Property_Age       5
Availability       3
dtype: int64

#### 2.5 Create a CSV File

In [10]:
df1.to_csv('Prop_101to110.csv', index_label = False)

In [11]:
projectlist = pd.read_csv("Prop_101to110.csv")
projectlist.head(16)

Unnamed: 0,Property_Name,Location,Price,Rate_SqFt,Area_Tpye,Bedroom,Bathroom,Floor_No,Property_Age,Availability
0,Yogsiddhi Sumukh Hills,"703, Kandivali East, Mumbai Andheri-Dahisar, M...",1.65 Crore,26612,Carpet area: 620 (57.6 sq.m.),2.0,2.0,7th,0 to 1 Year Old,Ready to move
1,Raunak Unnathi Woods Phase 6,"Thane West, Thane, Mumbai",85 Lac,8594,Super Built up area 989(91.88 sq.m.)Carpet are...,2.0,2.0,5th,1 to 5 Year Old,Ready to move
2,Prasham Vishal 2,"602, Borivali West, Mumbai Andheri-Dahisar, Mu...",3.05 Crore,33738,Carpet area: 904 (83.98 sq.m.),3.0,3.0,6th,1 to 5 Year Old,Ready to move
3,Toll Free 1800 41 99099,"Malad West, Mumbai Andheri-Dahisar, Mumbai",72 Lac,14400,Carpet area: 500 (46.45 sq.m.),1.0,1.0,5th,10+ Year Old,Ready to move
4,Vijay Park,"Kasar vadavali, Thane, Mumbai",48.5 Lac,8083,Super Built up area 600(55.74 sq.m.)Carpet are...,1.0,1.0,3rd,10+ Year Old,Ready to move
5,Omkar Alta Monte,"Omkar Alta Monte Malad East, Malad East, Mumba...",3.18 Crore,25645,Super Built up area 1750(162.58 sq.m.)Built Up...,3.0,3.0,39th,0 to 1 Year Old,Ready to move
6,Haware Tiara,"Sector 13 Kharghar, Navi Mumbai, Mumbai",1.7 Crore,10967,Built Up area: 1550 (144 sq.m.)Carpet area: 11...,3.0,3.0,11st,5 to 10 Year Old,Ready to move
7,Toll Free 1800 41 99099,"Kharghar, Navi Mumbai, Mumbai",92 Lac,7666,Super Built up area 1200(111.48 sq.m.)Built Up...,2.0,2.0,4th,10+ Year Old,Ready to move
8,Sai Ashirwad,"Mira Road, Mira Road And Beyond, Mumbai",36.5 Lac,10138,Super Built up area 360(33.45 sq.m.)Carpet are...,1.0,1.0,Ground,10+ Year Old,Ready to move
9,Advance Heights,"Kharghar, Navi Mumbai, Mumbai",1 Crore,8849,Super Built up area 1130(104.98 sq.m.)Carpet a...,2.0,2.0,13rd,0 to 1 Year Old,Ready to move


### 3. Import the all Datasets and Concating

In [12]:
df1 = pd.read_csv("Prop_001to050.csv")
df2 = pd.read_csv("Prop_051to100.csv")
df3 = pd.read_csv("Prop_101to150.csv")

In [13]:
df = pd.concat([df1, df2, df3], ignore_index=False)

In [14]:
df

Unnamed: 0,Property_Name,Location,Price,Rate_SqFt,Area_Tpye,Bedroom,Bathroom,Floor_No,Property_Age,Availability
0,Omkar Alta Monte,"W.E.Highway, Malad East, Mumbai Andheri-Dahisa...",5 Crore,17241,Super Built up area 2900(269.42 sq.m.)Built Up...,3.0,4.0,14th,0 to 1 Year Old,Ready to move
1,T Bhimjyani Neelkanth Woods,"Manpada, Thane, Mumbai",2.4 Crore,12631,Super Built up area 1900(176.52 sq.m.)Built Up...,3.0,3.0,8th,1 to 5 Year Old,Ready to move Property
2,Legend 1 Pramila Nagar,"Dahisar West, Mumbai Andheri-Dahisar, Mumbai",95 Lac,15966,Super Built up area 595(55.28 sq.m.),1.0,2.0,3rd,10+ Year Old,Ready to move
3,Toll Free 1800 41 99099,"Vidyavihar West, Vidyavihar West, Central Mumb...",3.75 Crore,25862,Built Up area: 1450 (134.71 sq.m.),3.0,3.0,1st,5 to 10 Year Old,Ready to move
4,Toll Free 1800 41 99099,"176 Cst Road, Kalina, Mumbai 400098, Santacruz...",3.5 Crore,39954,Carpet area: 876 (81.38 sq.m.),2.0,2.0,5th,5 to 10 Year Old,Ready to move
...,...,...,...,...,...,...,...,...,...,...
1184,Shagun White Woods,"Sector 23 Ulwe, Navi Mumbai, Mumbai",1.22 Crore,10338,Built Up area: 1180 (109.63 sq.m.),2.0,2.0,2nd,1 to 5 Year Old,Ready to move
1185,Guru Anant,"Sector 2 Ulwe, Navi Mumbai, Mumbai",88 Lac,8073,Built Up area: 1090 (101.26 sq.m.),2.0,2.0,11st,0 to 1 Year Old,Ready to move
1186,Balaji Mayuresh Delta,"Ulwe, Navi Mumbai, Mumbai",1.37 Crore,10579,Built Up area: 1295 (120.31 sq.m.),2.0,2.0,6th,1 to 5 Year Old,Ready to move
1187,Balaji Mayuresh Delta,"Ulwe, Navi Mumbai, Mumbai",1.71 Crore,9243,Built Up area: 1850 (171.87 sq.m.),3.0,3.0,6th,1 to 5 Year Old,Ready to move


In [15]:
df["Property_Name"] = df["Property_Name"].str.replace('Toll Free 1800 41 99099','Unnamed Property')

In [16]:
df.isna().sum()

Property_Name     3
Location          5
Price             5
Rate_SqFt         3
Area_Tpye         3
Bedroom          82
Bathroom         82
Floor_No         72
Property_Age     24
Availability     11
dtype: int64

In [17]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [18]:
df.isna().sum()

Property_Name    0
Location         0
Price            0
Rate_SqFt        0
Area_Tpye        0
Bedroom          0
Bathroom         0
Floor_No         0
Property_Age     0
Availability     0
dtype: int64

In [19]:
df.duplicated().sum()

9

In [20]:
df = df.drop_duplicates(ignore_index=True)

In [21]:
df.to_csv('Raw_Property.csv', index=False)

In [22]:
df = pd.read_csv('Raw_Property.csv')

In [23]:
print('Shape of Data :', df.shape)
df

Shape of Data : (2581, 10)


Unnamed: 0,Property_Name,Location,Price,Rate_SqFt,Area_Tpye,Bedroom,Bathroom,Floor_No,Property_Age,Availability
0,Omkar Alta Monte,"W.E.Highway, Malad East, Mumbai Andheri-Dahisa...",5 Crore,17241,Super Built up area 2900(269.42 sq.m.)Built Up...,3.0,4.0,14th,0 to 1 Year Old,Ready to move
1,T Bhimjyani Neelkanth Woods,"Manpada, Thane, Mumbai",2.4 Crore,12631,Super Built up area 1900(176.52 sq.m.)Built Up...,3.0,3.0,8th,1 to 5 Year Old,Ready to move Property
2,Legend 1 Pramila Nagar,"Dahisar West, Mumbai Andheri-Dahisar, Mumbai",95 Lac,15966,Super Built up area 595(55.28 sq.m.),1.0,2.0,3rd,10+ Year Old,Ready to move
3,Unnamed Property,"Vidyavihar West, Vidyavihar West, Central Mumb...",3.75 Crore,25862,Built Up area: 1450 (134.71 sq.m.),3.0,3.0,1st,5 to 10 Year Old,Ready to move
4,Unnamed Property,"176 Cst Road, Kalina, Mumbai 400098, Santacruz...",3.5 Crore,39954,Carpet area: 876 (81.38 sq.m.),2.0,2.0,5th,5 to 10 Year Old,Ready to move
...,...,...,...,...,...,...,...,...,...,...
2576,Shagun White Woods,"Sector 23 Ulwe, Navi Mumbai, Mumbai",1.22 Crore,10338,Built Up area: 1180 (109.63 sq.m.),2.0,2.0,2nd,1 to 5 Year Old,Ready to move
2577,Guru Anant,"Sector 2 Ulwe, Navi Mumbai, Mumbai",88 Lac,8073,Built Up area: 1090 (101.26 sq.m.),2.0,2.0,11st,0 to 1 Year Old,Ready to move
2578,Balaji Mayuresh Delta,"Ulwe, Navi Mumbai, Mumbai",1.37 Crore,10579,Built Up area: 1295 (120.31 sq.m.),2.0,2.0,6th,1 to 5 Year Old,Ready to move
2579,Balaji Mayuresh Delta,"Ulwe, Navi Mumbai, Mumbai",1.71 Crore,9243,Built Up area: 1850 (171.87 sq.m.),3.0,3.0,6th,1 to 5 Year Old,Ready to move


# The End !!!