## Extract Data from Domain.com.au

#### 1. Parse URL to extract Auction/Sales results for individual suburbs (URL specific) 

In [1]:
import pandas as pd
import numpy as np
import requests
import re

from bs4 import BeautifulSoup


In [153]:
soup = requests.get('https://www.domain.com.au/sold-listings/mernda-vic-3754/?page=2')
result = BeautifulSoup(soup.content,'html5lib')

### 1st Page of Search Results

In [154]:
# Extract Sold Price
price = result.find_all('p', attrs={"class":"css-mgq8yx"})
prices = []
for i in price:
    prices.append(i.text)
Sold_Price = pd.DataFrame(prices, columns=["Sold_Price"])
# Sold_Price

In [155]:
# Extract Property Address
st_address = result.find_all('span', attrs={"itemprop":"streetAddress"})
st_addresses = []
for i in st_address:
    cleaned = i.text.rsplit(',')[0]
    st_addresses.append(cleaned)
Address = pd.DataFrame(st_addresses, columns=["Address"])
#Address

In [156]:
# Extract Property Suburb
suburb = result.find_all('span', attrs={"itemprop":"addressLocality"})
suburb
suburbs = []
for i in suburb:
    suburbs.append(i.text)
Suburb = pd.DataFrame(suburbs, columns=["Suburb"])

In [157]:
# Extract Property Type

proptype_finder = result.find_all(class_ = 'css-11n8uyu')
proptype_finder

proptype_list = []
for i in proptype_finder:
    x = i.text.rsplit(' ')[0]
    proptype_list.append(x)

proptype_list
    
PropType = pd.DataFrame(proptype_list, columns=["PropType"])
PropType.replace('.css-693528{max-width:100%;text-overflow:ellipsis;white-space:nowrap;overflow:hidden;font-weight:bold;font-size:14px;line-height:24px;}House','House', True)
PropType.replace('Vacant','Vacant land', True)

#PropType

In [158]:
PropList = pd.concat([Address,Suburb,Sold_Price,PropType],axis=1,sort=False)
PropList["Bed"] = np.nan
PropList["Bath"] = np.nan
PropList["Parking"] = np.nan
PropList["LandSize"] = np.nan

# Set index to start @ 1 instead of 0
#PropList.index = PropList.index + 1
#PropList.iloc[0]['Bed']
# PropList

In [159]:
# Extract Property Features
feature_finder = result.find_all(class_ = 'css-1rzse3v')
feature_finder

feature_list = []
for i in feature_finder:
    x = i.text.rsplit(' ')#.extract()
    feature_list.append(x)

for i in range(len(feature_list)):
    x = feature_list[i][1]
    if re.search('Bed', x):
        feature_list[i][1] = re.search('Bed', x).group()
#         print(i,'th is true')
    elif re.search('Bath', x):
        feature_list[i][1] = re.search('Bath', x).group()
    elif re.search('Parking', x):
        feature_list[i][1] = re.search('Parking', x).group()
    elif re.search('', x):
        feature_list[i][1] = 'LandSize'

feature_list

[['3', 'Bed'],
 ['2', 'Bath'],
 ['1', 'Parking'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['375m²', 'LandSize'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['−', 'Parking'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['580m²', 'LandSize'],
 ['4', 'Bed'],
 ['3', 'Bath'],
 ['8', 'Parking'],
 ['5,665m²', 'LandSize'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['3', 'Parking'],
 ['829m²', 'LandSize'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['3', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['313m²', 'LandSize'],
 ['3', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['1.25ha', 'LandSize'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['351m²', 'LandSize'],
 ['424m²', 'LandSize'],
 ['3', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['325m²', 'LandSize'],
 ['405m²', 'LandSize'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['2', 'Parking'],
 ['4', 'Bed'],
 ['2', 'Bath'],
 ['3', 'Parking'],
 ['392m²', 'LandSize'],
 ['375m²', 'LandSize'],
 ['4', 'B

In [148]:
len(feature_list)

58

In [160]:
while len(feature_list) < 80:
    blank_bed = [['0', 'Bed']]
    blank_bath = [['0', 'Bath']]
    blank_park = [['0', 'Parking']]
    blank_land = [['0', 'LandSize']]
    for i in range(len(feature_list)):
        if feature_list[i][1] == 'Bed' and feature_list[i+1][1] != 'Bath':
            feature_list[i+1:i+1] = blank_bath
            print(i,'Bath', len(feature_list))
        elif feature_list[i][1] == 'Bath' and feature_list[i+1][1] != 'Parking':
            feature_list[i+1:i+1] = blank_park
            print(i,'Park', len(feature_list))
        elif feature_list[i][1] == 'Parking' and feature_list[i+1][1] != 'LandSize':
            feature_list[i+1:i+1] = blank_land
            print(i,'Land', len(feature_list))
        elif feature_list[i][1] == 'LandSize' and feature_list[i+1][1] != 'Bed':
            feature_list[i+1:i+1] = blank_bed
            print(i,'Bed', len(feature_list))
            print(feature_list)

2 Land 59
10 Land 60
26 Land 61
39 Bed 62
[['3', 'Bed'], ['2', 'Bath'], ['1', 'Parking'], ['0', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['375m²', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['−', 'Parking'], ['0', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['580m²', 'LandSize'], ['4', 'Bed'], ['3', 'Bath'], ['8', 'Parking'], ['5,665m²', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['3', 'Parking'], ['829m²', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['0', 'LandSize'], ['3', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['313m²', 'LandSize'], ['3', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['1.25ha', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['351m²', 'LandSize'], ['0', 'Bed'], ['424m²', 'LandSize'], ['3', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['325m²', 'LandSize'], ['405m²', 'LandSize'], ['4', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['4', 'Bed'], ['2', 'Bath'], ['2', 'Parking'], ['4', 'Bed'], ['2', 'Bath'], ['3', 'Parkin

IndexError: list index out of range

In [None]:
# # Fill in blanks (e.g. vacant land w/o BedBathParking, props w/o LandSize). 

# for i in range(len(feature_list)):
#     x = feature_list[i][1]
#     if re.search('Bed', x):
#         feature_list[i][1] = re.search('Bed', x).group()
# #         print(i,'th is true')
#     elif re.search('Bath', x):
#         feature_list[i][1] = re.search('Bath', x).group()
#     elif re.search('Parking', x):
#         feature_list[i][1] = re.search('Parking', x).group()
#     elif re.search('', x):
#         feature_list[i][1] = 'LandSize'

In [161]:
Features = pd.DataFrame(feature_list, columns=['Spec','Feature'])
# Features
# Features[0]
# Features.replace('Baths','Bath',inplace=True)
# Features.replace('Beds','Bed',inplace=True)
# Features.replace('','LandSize',inplace=True)
Features

Unnamed: 0,Spec,Feature
0,3,Bed
1,2,Bath
2,1,Parking
3,0,LandSize
4,4,Bed
5,2,Bath
6,2,Parking
7,375m²,LandSize
8,4,Bed
9,2,Bath


In [99]:
# Fill in blanks (e.g. vacant land w/o BedBathParking, props w/o LandSize). 

for i in range(80): 
    if i == 79:
        #print(i,'break')
        break
    elif Features.loc[i, 'Feature'] == 'LandSize' and Features.loc[i+1, 'Feature'] == 'LandSize':
        #print(i, 'land')
        blank1 = pd.DataFrame({'Spec': 0, 'Feature': 'Bed'}, index=[i+1])
        blank2 = pd.DataFrame({'Spec': 0, 'Feature': 'Bath'}, index=[i+2])
        blank3 = pd.DataFrame({'Spec': 0, 'Feature': 'Parking'}, index=[i+3])
        Features = pd.concat([Features.iloc[:i+1], blank1, blank2, blank3, Features.iloc[i+1:]]).reset_index(drop=True)
        #print('elif',i,'here',len(Features))
        #print(Features)
    elif Features.loc[i, 'Feature'] == 'Parking' and Features.loc[i+1, 'Feature'] == 'Bed':
        blank = pd.DataFrame({'Spec': 0, 'Feature': 'LandSize'}, index=[i+1])
        Features = pd.concat([Features.iloc[:i+1], blank, Features.iloc[i+1:]]).reset_index(drop=True)
        #print('2nd elif',i,'here',len(Features))
        #print(Features)
# If last row = no LandSize, manual fill. 
    elif Features.loc[(len(Features)-1), 'Feature'] == 'Parking':
        Features = Features.append(blank).reset_index(drop=True)

#Features

KeyError: 72

In [44]:
# Allocate Feature Specs to PropList

# iterate every 4x Features rows into 1x PropList row

for i in range(20):
    PropList.loc[i, 'Bed'] = Features.loc[4*i, 'Spec']
    PropList.loc[i, 'Bath'] = Features.loc[4*i+1, 'Spec']
    PropList.loc[i, 'Parking'] = Features.loc[4*i+2, 'Spec']
    PropList.loc[i, 'LandSize'] = Features.loc[4*i+3, 'Spec']
    


#for i in range(len(Features)): 
#    if Features.loc[i, 'Feature'] == 'Bed':
#        PropList.loc[i, 'Bed'] = Features.loc[i, 'Spec'] 
#    elif Features.loc[i, 'Feature'] == 'Bath':
#        PropList.loc[i, 'Bath'] = Features.loc[i, 'Spec'] 
#    elif Features.loc[i, 'Feature'] == 'Parking':
#        PropList.loc[i, 'Parking'] = Features.loc[i, 'Spec'] 
#    elif Features.loc[i, 'Feature'] == 'LandSize':
#        PropList.loc[i, 'LandSize'] = Features.loc[i, 'Spec'] 

PropList

Unnamed: 0,Address,Suburb,Sold_Price,PropType,Bed,Bath,Parking,LandSize
0,6 Jackaroo Street,MERNDA,"$546,000",House,3,2,2,409m²
1,Lot 111/125 Regent Street,MERNDA,"$312,500",Vacant land,0,0,0,376m²
2,13 Chanticleer Drive,MERNDA,"$332,000",Townhouse,2,1,1,0
3,4 Balfour Drive,MERNDA,"$575,000",House,3,2,2,448m²
4,84 Breadalbane Avenue,MERNDA,"$690,000",House,4,3,2,519m²
5,3/20 Stourhead Avenue,MERNDA,"$461,000",House,3,2,2,172m²
6,23 Herschel Way,MERNDA,"$780,000",House,4,2,2,439m²
7,52 Balerno Way,MERNDA,"$500,000",House,4,2,2,0
8,23 Marlowe Grange,MERNDA,"$645,000",House,3,2,2,462m²
9,31 Ragusa Terrace,MERNDA,"$310,000",Vacant land,0,0,0,343m²


In [27]:
# Remove 'm2', rename LandSize to incl measurement, convert Nan to 0
PropList['LandSize'] = PropList['LandSize'].str.extract('(\d+)')
PropList.rename(columns = {'LandSize':'LandSize(m2)'},inplace = True)
PropList.fillna(0)

Unnamed: 0,Address,Suburb,Sold_Price,PropType,Bed,Bath,Parking,LandSize(m2)
0,148 Everard Road,MERNDA,"$470,000",House,3,2,1,0
1,3 Brandybuck Lane,MERNDA,"$545,000",House,4,2,2,375
2,81 Kerrabee Drive,MERNDA,"$524,500",House,4,2,−,4
3,5 Jondarvan Drive,MERNDA,"$704,250",House,2,2,580m²,4
4,116 Schotters Road,MERNDA,Price Withheld,House,3,8,"5,665m²",4
5,514 Masons Road,MERNDA,"$875,000",House,2,3,829m²,4
6,6 Chiswick Place,MERNDA,"$610,000",House,2,2,0,3
7,8 Borrack Crescent,MERNDA,"$530,000",House,2,2,313m²,3
8,23 Sunridge Drive,MERNDA,"$515,000",House,2,2,1.25ha,4
9,13 Hayes Road,MERNDA,Price Withheld,Vacant land,2,2,351m²,0


In [359]:
#PropList = pd.concat([Address,Suburb,Sold_Price,PropType],axis=1,sort=False)

# Set index to start @ 1 instead of 0

print(PropList)

                      Address  Suburb       Sold_Price     PropType Bed Bath  \
0           6 Jackaroo Street  MERNDA        $546,000         House   3    2   
1   Lot 111/125 Regent Street  MERNDA        $312,500   Vacant land   0    0   
2        13 Chanticleer Drive  MERNDA        $332,000     Townhouse   2    1   
3             4 Balfour Drive  MERNDA        $575,000         House   3    2   
4       3/20 Stourhead Avenue  MERNDA        $461,000         House   3    2   
5             23 Herschel Way  MERNDA        $780,000         House   4    2   
6              52 Balerno Way  MERNDA        $500,000         House   4    2   
7           23 Marlowe Grange  MERNDA        $645,000         House   3    2   
8           31 Ragusa Terrace  MERNDA        $310,000   Vacant land   0    0   
9            175 Everard Road  MERNDA        $510,000         House   4    2   
10              22 Arum  Walk  MERNDA        $593,000         House   3    2   
11         22 Delbridge Drive  MERNDA  P

## Test Test