# Scraping data from a real estate agency website

## 0) Import all required libraries

In [2]:
# to collect the URLs of all properties (a real browser is needed because the result pages rely on JavaScript)
from selenium import webdriver 

# to access the html content of a single property url
import requests 

# to select parts of an XML or HTML text using CSS or XPath and extract data from it
from parsel import Selector 

# to select parts of an XML or HTML using BeautifulSoup (XPath not supported)
from bs4 import BeautifulSoup 

# to use regular expressions
import re 

# to build the dataframe
import pandas as pd 

# to build a dictionary from a string
import json 

# to build a defaultdict
from collections import defaultdict

## 1) Obtain about 10,000 house URLs with webdriver (apartments below)

In [None]:
driver = webdriver.Chrome(executable_path='chromedriver.exe')

# An implicit wait tells WebDriver how long to keep polling the DOM when an
#   element lookup does not succeed immediately. It only needs to be set once,
#   so it is configured here instead of on every loop iteration.
# NOTE(review): the loop below reads driver.page_source right after get();
#   the implicit wait presumably does not delay that read. If pages come back
#   without listings, an explicit wait may be needed. TODO confirm.
driver.implicitly_wait(10)

# The url of each house that resulted from the search will be stored in the
#   "houses_url" list: one inner list of urls per result page.
houses_url = []

# XPath selecting the link (href) of every house listed on a result page.
xpath_houses = '//*[@id="main-content"]/li//h2//a/@href'

# Iterate through result pages 1..333 and collect the house urls of each page.
for i in range(1, 334):
    url = ('https://www.immoweb.be/en/search/house/for-sale?countries=BE&page='
           + str(i) + '&orderBy=relevance')

    # Navigate the browser to the result page.
    driver.get(url)

    # Parse the rendered page so that it can be queried with XPath.
    sel = Selector(text=driver.page_source)

    # All house urls found on this page.
    page_houses_url = sel.xpath(xpath_houses).extract()

    # Keep the per-page grouping: houses_url[page][position].
    houses_url.append(page_houses_url)

# Number of pages collected, followed by one sample url (page 2, 5th listing).
print(len(houses_url))
print(houses_url[1][4])

In [None]:
# Write every collected house url to the csv file, one url per line.
# Mode 'w' starts the file from scratch.
with open('houses_apartments_urls.csv', 'w') as file:
    file.writelines(url + '\n' for page_url in houses_url for url in page_url)

### Repeat the procedure with apartments

In [None]:
# The url of each apartment that resulted from the search will be stored in the
#   "apartments_url" list: one inner list of urls per result page.
apartments_url = []

# An implicit wait tells WebDriver how long to keep polling the DOM when an
#   element lookup does not succeed immediately. It only needs to be set once,
#   so it is configured here instead of on every loop iteration.
driver.implicitly_wait(10)

# XPath selecting the link (href) of every apartment listed on a result page.
xpath_apartments = '//*[@id="main-content"]/li//h2//a/@href'

# 'i' is the page number: it is used to build the url of each of the
#   333 result pages.
for i in range(1, 334):
    url = ('https://www.immoweb.be/en/search/apartment/for-sale?countries=BE&page='
           + str(i) + '&orderBy=relevance')

    # Navigate the browser to the result page.
    driver.get(url)

    # Parse the rendered page so that it can be queried with XPath.
    sel = Selector(text=driver.page_source)

    # All apartment urls found on this page.
    page_apartments_url = sel.xpath(xpath_apartments).extract()

    # Keep the per-page grouping: apartments_url[page][position].
    apartments_url.append(page_apartments_url)

# Number of pages collected, followed by one sample url (page 2, 5th listing).
print(len(apartments_url))
print(apartments_url[1][4])

In [None]:
# Add the apartment urls to the same csv file, one url per line.
# Mode 'a' appends after the house urls that are already there.
with open('houses_apartments_urls.csv', 'a') as file:
    file.writelines(url + '\n' for page_url in apartments_url for url in page_url)

## 2) Define a class through which properties' data will be scraped

#### Each url represents a property (house or apartment), each of which has a number of attributes (e.g., locality, type_property etc.). We thus create a class defining the attributes of each property.

In [None]:
class HouseApartmentScraping:
    """Download one property page and extract its attributes.

    On construction the page at ``url`` is fetched, the JSON object embedded
    in the ``<script>`` tag that mentions ``window.classified`` is parsed, and
    every attribute listed below is computed once.

    Each extractor method is replaced *on the instance* by the value it
    returned, because ``__init__`` assigns the result to an attribute of the
    same name. Callers therefore read ``obj.price``, not ``obj.price()``.
    Every extractor returns ``None`` when the value is absent or unusable.

    Args:
        url: Address of the property page.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Raises:
        Whatever ``requests.get`` raises on network failure or timeout; these
        errors are not swallowed.
    """

    # Default number of seconds to wait for the server before giving up.
    DEFAULT_TIMEOUT = 30

    # Errors meaning "the requested value is absent or cannot be converted".
    _MISSING_DATA_ERRORS = (KeyError, IndexError, TypeError, ValueError,
                            OverflowError)

    def __init__(self, url, timeout=DEFAULT_TIMEOUT):
        self.url = url

        # Raw html (bytes) and its parsed form used to locate the data script.
        # A timeout keeps one unresponsive page from blocking the whole run.
        self.html = requests.get(self.url, timeout=timeout).content
        self.soup = BeautifulSoup(self.html, 'html.parser')

        # Parsed listing data; must be set first, the extractors below read it.
        self.house_dict = self.house_dict()

        # From here on each name holds the extracted value, not the method.
        self.type_property = self.type_property()
        self.locality = self.locality()
        self.subtype = self.subtype()
        self.price = self.price()
        self.type_sale = self.type_sale()
        self.num_rooms = self.num_rooms()
        self.area = self.area()
        self.kitchen = self.kitchen()
        self.furnished = self.furnished()
        self.fire = self.fire()
        self.terrace_area = self.terrace_area()
        self.garden_area = self.garden_area()
        self.land = self.land()
        self.num_facade = self.num_facade()
        self.pool = self.pool()
        self.state = self.state()

    def house_dict(self):
        """Return the listing data embedded in the page as a dict, or None.

        The data sits in a ``<script type="text/javascript">`` tag whose text
        contains ``window.classified``; the text between the first ``{`` and
        the last ``}`` of that script is parsed as JSON.
        """
        try:
            scripts = self.soup.find_all('script',
                                         attrs={"type": "text/javascript"})

            # Stop at the first script that carries the listing data.
            for tag in scripts:
                if 'window.classified' in str(tag.string):
                    script_text = tag.string
                    break
            else:
                return None

            # Keep only the JSON object; surrounding JavaScript and
            # whitespace fall outside the slice.
            start = script_text.find("{")
            end = script_text.rfind("}")
            return json.loads(script_text[start:end + 1])
        except (AttributeError, TypeError, ValueError):
            # Unexpected page structure or malformed JSON.
            return None

    def type_property(self):
        """Return the property type stored under property/type, or None."""
        try:
            return self.house_dict['property']['type']
        except self._MISSING_DATA_ERRORS:
            return None

    def locality(self):
        """Return the postal code of the property, or None."""
        try:
            return self.house_dict['property']['location']['postalCode']
        except self._MISSING_DATA_ERRORS:
            return None

    def subtype(self):
        """Return the property subtype stored under property/subtype, or None."""
        try:
            return self.house_dict['property']['subtype']
        except self._MISSING_DATA_ERRORS:
            return None

    def price(self):
        """Return the sale price as an int, or None."""
        try:
            return int(self.house_dict['transaction']['sale']['price'])
        except self._MISSING_DATA_ERRORS:
            return None

    def type_sale(self):
        """Return the kind of sale as a label, or None.

        The flags are checked in the order public, notary, interactive; None
        is returned when none of them is set or when a flag is missing.
        """
        try:
            flags = self.house_dict['flags']
            if flags['isPublicSale'] is True:
                return 'Public Sale'
            if flags['isNotarySale'] is True:
                return 'Notary Sale'
            if flags['isAnInteractiveSale'] is True:
                # Spelling kept unchanged so that newly scraped rows stay
                # comparable with data produced by earlier runs.
                return 'Intractive Sale'
            return None
        except self._MISSING_DATA_ERRORS:
            return None

    def num_rooms(self):
        """Return the number of bedrooms as an int, or None."""
        try:
            return int(self.house_dict['property']['bedroomCount'])
        except self._MISSING_DATA_ERRORS:
            return None

    def area(self):
        """Return the net habitable surface as an int, or None."""
        try:
            return int(self.house_dict['property']['netHabitableSurface'])
        except self._MISSING_DATA_ERRORS:
            return None

    def kitchen(self):
        """Return 1 if a kitchen type is given, 0 if it is empty, else None."""
        try:
            kitchen_type = self.house_dict['property']['kitchen']['type']
        except self._MISSING_DATA_ERRORS:
            return None
        return 1 if kitchen_type else 0

    def furnished(self):
        """Return 1 if the property is flagged as furnished, 0 if not, else None."""
        try:
            is_furnished = self.house_dict['transaction']['sale']['isFurnished']
        except self._MISSING_DATA_ERRORS:
            return None
        return 1 if is_furnished is True else 0

    def fire(self):
        """Return 1 if a fireplace is flagged, 0 if not, else None."""
        try:
            has_fireplace = self.house_dict['property']['fireplaceExists']
        except self._MISSING_DATA_ERRORS:
            return None
        return 1 if has_fireplace is True else 0

    def terrace_area(self):
        """Return the terrace surface as an int, 0 without terrace, else None.

        None is also returned when a terrace is flagged but its surface is
        not given.
        """
        try:
            if self.house_dict['property']['hasTerrace'] is True:
                return int(self.house_dict['property']['terraceSurface'])
            return 0
        except self._MISSING_DATA_ERRORS:
            return None

    def garden_area(self):
        """Return the garden surface as stored, 0 without garden, else None."""
        try:
            if self.house_dict['property']['hasGarden'] is True:
                return self.house_dict['property']['gardenSurface']
            return 0
        except self._MISSING_DATA_ERRORS:
            return None

    def land(self):
        """Return the land surface as stored, 0 when no land entry, else None."""
        try:
            if self.house_dict['property']['land'] is not None:
                return self.house_dict['property']['land']['surface']
            return 0
        except self._MISSING_DATA_ERRORS:
            return None

    def num_facade(self):
        """Return the number of facades as an int, or None."""
        try:
            return int(self.house_dict['property']['building']['facadeCount'])
        except self._MISSING_DATA_ERRORS:
            return None

    def pool(self):
        """Return 1 if the page text mentions 'swimming pool', else 0.

        None is returned only when the html has not been loaded.
        """
        try:
            page_text = str(self.html)
        except AttributeError:
            return None
        return 1 if 'swimming pool' in page_text else 0

    def state(self):
        """Return the building condition stored under property/building, or None."""
        try:
            return self.house_dict['property']['building']['condition']
        except self._MISSING_DATA_ERRORS:
            return None

## 3) Collect all data in a defaultdict

In [None]:
# Column name -> list of values, one entry per scraped property.
houses_apartments_dict = defaultdict(list)

# Output column name -> name of the HouseApartmentScraping attribute that
#   holds its value. The order here is the column order of the dataset.
column_to_attribute = {
    'Locality': 'locality',
    'Type of property': 'type_property',
    'Subtype of property': 'subtype',
    'Price': 'price',
    'Type of sale': 'type_sale',
    'Number of rooms': 'num_rooms',
    'Living surface area': 'area',
    'Kitchen': 'kitchen',
    'Furnished': 'furnished',
    'Open fire': 'fire',
    'Terrace': 'terrace_area',
    'Garden': 'garden_area',
    'Surface of the land': 'land',
    'Number of facades': 'num_facade',
    'Swimming pool': 'pool',
    'State of the building': 'state',
}

with open('houses_apartments_urls.csv', 'r') as file:
    for line in file:
        # Drop the trailing newline so the request gets a clean url, and
        #   skip blank lines instead of requesting an empty address.
        url = line.strip()
        if not url:
            continue

        # Downloads the page and extracts all attributes of this property.
        houses_class = HouseApartmentScraping(url)

        for column, attribute in column_to_attribute.items():
            houses_apartments_dict[column].append(getattr(houses_class, attribute))

## 3) Store all data in a csv file via a pandas DataFrame

In [None]:
# Turn the column -> values mapping into a table and save it to disk.
df = pd.DataFrame(data=houses_apartments_dict)
df.to_csv(path_or_buf='all_data_of_the_houses.csv')

## 4) Display the data with a pandas DataFrame

In [3]:
# Reload the saved dataset; index_col=0 uses the first csv column as the row index.
df = pd.read_csv('all_data_of_the_houses.csv', index_col=0)

In [4]:
# (number of rows, number of columns) of the dataset
df.shape

(11523, 16)

In [5]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Living surface area,Kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Number of facades,Swimming pool,State of the building
0,1150.0,HOUSE,HOUSE,990000.0,,6.0,485.0,1.0,0.0,0.0,16.0,0.0,251.0,2.0,0,AS_NEW
1,4180.0,HOUSE,HOUSE,195000.0,,3.0,242.0,1.0,0.0,0.0,36.0,100.0,475.0,3.0,0,GOOD
2,7500.0,HOUSE_GROUP,HOUSE_GROUP,,,,,,0.0,0.0,0.0,0.0,0.0,,0,
3,4180.0,HOUSE,MIXED_USE_BUILDING,295000.0,,3.0,242.0,1.0,0.0,0.0,36.0,1000.0,1403.0,3.0,0,GOOD
4,4570.0,HOUSE,HOUSE,549000.0,,5.0,365.0,1.0,0.0,0.0,15.0,1850.0,25157.0,4.0,0,GOOD


In [6]:
# Preview the last rows of the dataset
df.tail()

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Living surface area,Kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Number of facades,Swimming pool,State of the building
11518,1000.0,APARTMENT,APARTMENT,210000.0,,1.0,60.0,1.0,0.0,0.0,16.0,0.0,0.0,,0,JUST_RENOVATED
11519,9700.0,APARTMENT,SERVICE_FLAT,210000.0,,1.0,80.0,1.0,0.0,0.0,3.0,0.0,0.0,4.0,0,
11520,3000.0,APARTMENT,FLAT_STUDIO,215000.0,,1.0,26.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0,JUST_RENOVATED
11521,9950.0,APARTMENT,APARTMENT,215000.0,,2.0,72.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0,GOOD
11522,1090.0,APARTMENT,APARTMENT,219000.0,,1.0,83.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0,
