## Extract Data from Domain.com.au

#### 1. Parse URL to extract Auction/Sales results for individual suburbs (URL specific) 

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup


In [345]:

soup = requests.get('https://www.domain.com.au/sold-listings/mernda-vic-3754/?page=1')
result = BeautifulSoup(soup.content,'html5lib')

### 1st Page of Search Results

In [346]:
# Extract Sold Price
price = result.find_all('p', attrs={"class":"css-mgq8yx"})
prices = []
for i in price:
    prices.append(i.text)
Sold_Price = pd.DataFrame(prices, columns=["Sold_Price"])
#Sold_Price

In [347]:
# Extract Property Address
st_address = result.find_all('span', attrs={"itemprop":"streetAddress"})
st_addresses = []
for i in st_address:
    cleaned = i.text.rsplit(',')[0]
    st_addresses.append(cleaned)
Address = pd.DataFrame(st_addresses, columns=["Address"])
#Address

In [348]:
# Extract Property Suburb
suburb = result.find_all('span', attrs={"itemprop":"addressLocality"})
suburb
suburbs = []
for i in suburb:
    suburbs.append(i.text)
Suburb = pd.DataFrame(suburbs, columns=["Suburb"])

In [349]:
# Extract Property Type

proptype_finder = result.find_all(class_ = 'css-11n8uyu')
proptype_finder

proptype_list = []
for i in proptype_finder:
    x = i.text.rsplit(' ')[0]
    proptype_list.append(x)

proptype_list
    
PropType = pd.DataFrame(proptype_list, columns=["PropType"])
PropType.replace('.css-693528{max-width:100%;text-overflow:ellipsis;white-space:nowrap;overflow:hidden;font-weight:bold;font-size:14px;line-height:24px;}House','House', True)
PropType.replace('Vacant','Vacant land', True)

#PropType

In [354]:
PropList = pd.concat([Address,Suburb,Sold_Price,PropType],axis=1,sort=False)
PropList["Bed"] = np.nan
PropList["Bath"] = np.nan
PropList["Parking"] = np.nan
PropList["LandSize"] = np.nan

# Set index to start @ 1 instead of 0
#PropList.index = PropList.index + 1
#PropList.iloc[0]['Bed']
PropList

Unnamed: 0,Address,Suburb,Sold_Price,PropType,Bed,Bath,Parking,LandSize
0,6 Jackaroo Street,MERNDA,"$546,000",House,,,,
1,Lot 111/125 Regent Street,MERNDA,"$312,500",Vacant land,,,,
2,13 Chanticleer Drive,MERNDA,"$332,000",Townhouse,,,,
3,4 Balfour Drive,MERNDA,"$575,000",House,,,,
4,3/20 Stourhead Avenue,MERNDA,"$461,000",House,,,,
5,23 Herschel Way,MERNDA,"$780,000",House,,,,
6,52 Balerno Way,MERNDA,"$500,000",House,,,,
7,23 Marlowe Grange,MERNDA,"$645,000",House,,,,
8,31 Ragusa Terrace,MERNDA,"$310,000",Vacant land,,,,
9,175 Everard Road,MERNDA,"$510,000",House,,,,


In [355]:
# Extract Property Features
feature_finder = result.find_all(class_ = 'css-1rzse3v')
feature_finder

feature_list = []
for i in feature_finder:
    x = i.text.rsplit(' ')
    feature_list.append(x)


    
Features = pd.DataFrame(feature_list, columns=['Spec','Feature'])
Features.replace('.css-9fxapx{position:absolute;width:1px;height:1px;margin:-1px;padding:0;-webkit-clip:rect(1px,1px,1px,1px);clip:rect(1px,1px,1px,1px);border:0;overflow:hidden;-webkit-clip-path:inset(100%);clip-path:inset(100%);-webkit-clip-path:none;}Beds','Beds',inplace=True)
Features.replace('Baths','Bath',inplace=True)
Features.replace('Beds','Bed',inplace=True)
Features.replace('','LandSize',inplace=True)



#Features


In [356]:
# Fill in blanks (e.g. vacant land w/o BedBathParking, props w/o LandSize). 

if j != (80-len(Features)): 
    #print('j',j,'len(Features)')
    for i in range(80): 
        if i == 79:
            #print(i,'break')
            break
        elif Features.loc[i, 'Feature'] == 'LandSize' and Features.loc[i+1, 'Feature'] == 'LandSize':
            #print(i, 'land')
            blank1 = pd.DataFrame({'Spec': 0, 'Feature': 'Bed'}, index=[i+1])
            blank2 = pd.DataFrame({'Spec': 0, 'Feature': 'Bath'}, index=[i+2])
            blank3 = pd.DataFrame({'Spec': 0, 'Feature': 'Parking'}, index=[i+3])
            Features = pd.concat([Features.iloc[:i+1], blank1, blank2, blank3, Features.iloc[i+1:]]).reset_index(drop=True)
            #print('elif',i,'here',len(Features))
            #print(Features)
        elif Features.loc[i, 'Feature'] == 'Parking' and Features.loc[i+1, 'Feature'] == 'Bed':
            blank = pd.DataFrame({'Spec': 0, 'Feature': 'LandSize'}, index=[i+1])
            Features = pd.concat([Features.iloc[:i+1], blank, Features.iloc[i+1:]]).reset_index(drop=True)
            #print('2nd elif',i,'here',len(Features))
            #print(Features)
# If last row = no LandSize, manual fill. 
        elif Features.loc[(len(Features)-1), 'Feature'] == 'Parking':
            Features = Features.append(blank).reset_index(drop=True)

#Features

In [357]:
# Allocate Feature Specs to PropList

# iterate every 4x Features rows into 1x PropList row

for i in range(20):
    PropList.loc[i, 'Bed'] = Features.loc[4*i, 'Spec']
    PropList.loc[i, 'Bath'] = Features.loc[4*i+1, 'Spec']
    PropList.loc[i, 'Parking'] = Features.loc[4*i+2, 'Spec']
    PropList.loc[i, 'LandSize'] = Features.loc[4*i+3, 'Spec']
    


#for i in range(len(Features)): 
#    if Features.loc[i, 'Feature'] == 'Bed':
#        PropList.loc[i, 'Bed'] = Features.loc[i, 'Spec'] 
#    elif Features.loc[i, 'Feature'] == 'Bath':
#        PropList.loc[i, 'Bath'] = Features.loc[i, 'Spec'] 
#    elif Features.loc[i, 'Feature'] == 'Parking':
#        PropList.loc[i, 'Parking'] = Features.loc[i, 'Spec'] 
#    elif Features.loc[i, 'Feature'] == 'LandSize':
#        PropList.loc[i, 'LandSize'] = Features.loc[i, 'Spec'] 

PropList

Unnamed: 0,Address,Suburb,Sold_Price,PropType,Bed,Bath,Parking,LandSize
0,6 Jackaroo Street,MERNDA,"$546,000",House,3,2,2,409m²
1,Lot 111/125 Regent Street,MERNDA,"$312,500",Vacant land,0,0,0,376m²
2,13 Chanticleer Drive,MERNDA,"$332,000",Townhouse,2,1,1,0
3,4 Balfour Drive,MERNDA,"$575,000",House,3,2,2,448m²
4,3/20 Stourhead Avenue,MERNDA,"$461,000",House,3,2,2,172m²
5,23 Herschel Way,MERNDA,"$780,000",House,4,2,2,439m²
6,52 Balerno Way,MERNDA,"$500,000",House,4,2,2,0
7,23 Marlowe Grange,MERNDA,"$645,000",House,3,2,2,462m²
8,31 Ragusa Terrace,MERNDA,"$310,000",Vacant land,0,0,0,343m²
9,175 Everard Road,MERNDA,"$510,000",House,4,2,1,0


In [343]:
# Remove 'm2', rename LandSize to incl measurement, convert Nan to 0
PropList['LandSize'] = PropList['LandSize'].str.extract('(\d+)')
PropList.rename(columns = {'LandSize':'LandSize(m2)'},inplace = True)
PropList.fillna(0)

KeyError: 'LandSize'

In [344]:
#PropList = pd.concat([Address,Suburb,Sold_Price,PropType],axis=1,sort=False)

# Set index to start @ 1 instead of 0

print(PropList)

                      Address  Suburb       Sold_Price     PropType Bed Bath  \
2           6 Jackaroo Street  MERNDA        $546,000         House   3    2   
3   Lot 111/125 Regent Street  MERNDA        $312,500   Vacant land   0    0   
4        13 Chanticleer Drive  MERNDA        $332,000     Townhouse   2    1   
5             4 Balfour Drive  MERNDA        $575,000         House   3    2   
6       3/20 Stourhead Avenue  MERNDA        $461,000         House   3    2   
7             23 Herschel Way  MERNDA        $780,000         House   4    2   
8              52 Balerno Way  MERNDA        $500,000         House   4    2   
9           23 Marlowe Grange  MERNDA        $645,000         House   3    2   
10          31 Ragusa Terrace  MERNDA        $310,000   Vacant land   0    0   
11           175 Everard Road  MERNDA        $510,000         House   4    2   
12              22 Arum  Walk  MERNDA        $593,000         House   3    2   
13         22 Delbridge Drive  MERNDA  P

## Test Test