In [14]:
from selenium import webdriver
import requests
from parsel import Selector
from bs4 import BeautifulSoup
import re
import pandas as pd
import json

## 1) We obtain the URLs of about 10,000 properties with the web driver.

In [2]:
# Start a Chrome session driven by the local chromedriver binary. The same
#   `driver` object is reused by the apartment-search cell further down.
# NOTE(review): `executable_path` is deprecated in Selenium 4 -- confirm the
#   installed Selenium version before upgrading.
driver = webdriver.Chrome(executable_path='chromedriver.exe')

# An implicit wait tells WebDriver to poll the DOM for up to 10 seconds when
#   trying to find an element that is not immediately available. It only
#   needs to be set once per session, so it is configured before the loop
#   instead of on every iteration.
driver.implicitly_wait(10)

# Result pages to visit. Only pages 1 and 2 are fetched here; the original
#   notes mention 333 result pages, so widen this range for a full crawl.
house_pages = range(1, 3)

# Search URL of one result page; `{}` receives the page number.
house_search_url = ('https://www.immoweb.be/en/search/house/for-sale'
                    '?countries=BE&page={}&orderBy=relevance')

# XPath query returning the href of every listing title on a result page.
xpath_houses = '//*[@id="main-content"]/li//h2//a/@href'

# One inner list of listing URLs per visited result page (a list of lists).
houses_url = []

for page_number in house_pages:
    # Navigate to the result page and let the browser render it.
    driver.get(house_search_url.format(page_number))

    # `Selector` lets us query the rendered HTML with XPath expressions.
    sel = Selector(text=driver.page_source)

    # Keep the URLs of this page together so houses_url stays page-shaped.
    page_houses_url = sel.xpath(xpath_houses).extract()
    houses_url.append(page_houses_url)

# Sanity check: number of pages collected and one sample URL. The second
#   print assumes at least two pages with at least five links on page two.
print(len(houses_url))
print(houses_url[1][4])

2
https://www.immoweb.be/en/classified/villa/for-sale/oreye/4360/8728125?searchId=5f6dac2aa7076


In [3]:
# Persist every collected house URL, one per line. Mode 'w' truncates the
#   file, so this cell always starts the URL list afresh.
with open('houses_apartments_urls.csv', 'w') as url_file:
    url_file.writelines(
        house_link + '\n'
        for page_links in houses_url
        for house_link in page_links
    )

In [4]:
# Collect the URL of every apartment listing found on the immoweb search
#   result pages. Reuses the `driver` session opened in the house-search cell.

# An implicit wait tells WebDriver to poll the DOM for up to 10 seconds when
#   trying to find an element that is not immediately available. It only
#   needs to be set once per session, so it is configured before the loop
#   instead of on every iteration.
driver.implicitly_wait(10)

# Result pages to visit. Only pages 1 and 2 are fetched here; the original
#   notes mention 333 result pages, so widen this range for a full crawl.
apartment_pages = range(1, 3)

# Search URL of one result page; `{}` receives the page number.
apartment_search_url = ('https://www.immoweb.be/en/search/apartment/for-sale'
                        '?countries=BE&page={}&orderBy=relevance')

# XPath query returning the href of every listing title on a result page.
xpath_apartments = '//*[@id="main-content"]/li//h2//a/@href'

# One inner list of listing URLs per visited result page (a list of lists).
apartments_url = []

for page_number in apartment_pages:
    # Navigate to the result page and let the browser render it.
    driver.get(apartment_search_url.format(page_number))

    # `Selector` lets us query the rendered HTML with XPath expressions.
    sel = Selector(text=driver.page_source)

    # Keep the URLs of this page together so apartments_url stays page-shaped.
    page_apartments_url = sel.xpath(xpath_apartments).extract()
    apartments_url.append(page_apartments_url)

# Sanity check: number of pages collected and one sample URL. The second
#   print assumes at least two pages with at least five links on page two.
print(len(apartments_url))
print(apartments_url[1][4])

2
https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/neufchateau-longlier/6840/8925951?searchId=5f6dadf123db6


In [6]:
# Add the apartment URLs below the house URLs already in the file. Mode 'a'
#   appends, so running this cell twice writes the apartment URLs twice.
with open('houses_apartments_urls.csv', 'a') as url_file:
    for page_links in apartments_url:
        # One write per result page: every link followed by a newline.
        url_file.write(''.join(link + '\n' for link in page_links))

## 2) We scrape all the data of each property with requests

#### Each URL represents a house, and a house has many attributes such as locality, type_property, etc.  

In [19]:
import re
class HouseApartmentScraping:
    """Scrape one listing page and expose its attributes.

    The constructor downloads ``url`` with ``requests``, looks for the
    ``window.classified`` JSON object embedded in a ``<script>`` tag and then
    runs every extractor method exactly once.

    Gotcha: each result is stored on the instance under the SAME name as the
    method that produced it (``self.price = self.price()``). After
    construction ``instance.price`` is therefore the scraped value, not a
    callable; the methods stay reachable only through the class.

    The extractors ``type_sale``, ``area``, ``furnished``, ``fire``,
    ``terrace_area``, ``garden_area``, ``land_plot``, ``num_facade`` and
    ``state`` are placeholders that always yield ``None``.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    types : str
        Label stored unchanged as ``type_property`` (the notebook passes
        'House' or 'Apartment').

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded (including a timeout).
    ValueError
        If a ``window.classified`` script is found but its payload is not
        valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """

    # Seconds to wait for the server; without a timeout a stalled connection
    # would block the whole scraping loop forever.
    REQUEST_TIMEOUT = 30

    # Errors that mean "this field is absent from the listing JSON": missing
    # key, empty list, or an intermediate value that is None / not a container.
    _MISSING = (KeyError, IndexError, TypeError)

    def __init__(self, url, types):
        self.url = url
        self.type_property = types

        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        self.html = response.content
        # Give parsel the decoded text; str(bytes) would hand it the "b'...'"
        # repr with escaped newlines instead of the real document.
        self.sel = Selector(text=response.text)
        self.soup = BeautifulSoup(self.html, 'html.parser')

        # Must come first: every extractor below reads self.house_dict.
        self.house_dict = self.house_dict()

        self.locality = self.locality()
        self.subtype = self.subtype()
        self.price = self.price()
        self.type_sale = self.type_sale()
        self.num_rooms = self.num_rooms()
        self.area = self.area()
        self.kitchen = self.kitchen()
        self.furnished = self.furnished()
        self.fire = self.fire()
        self.terrace_area = self.terrace_area()
        self.garden_area = self.garden_area()
        self.land = self.land()
        self.land_plot = self.land_plot()
        self.num_facade = self.num_facade()
        self.pool = self.pool()
        self.state = self.state()

    def _lookup(self, *path):
        """Follow ``path`` (keys / indexes) into ``self.house_dict``.

        Returns the value found, or ``None`` as soon as one step is missing.
        """
        node = self.house_dict
        try:
            for step in path:
                node = node[step]
        except self._MISSING:
            return None
        return node

    def house_dict(self):
        """Return the ``window.classified`` object of the page as a dict.

        The first ``<script type="text/javascript">`` whose text mentions
        ``window.classified`` is used; everything from its first ``{`` to its
        last ``}`` is parsed as JSON. When no such script exists an empty
        dict is returned, so every extractor simply yields ``None``.
        """
        scripts = self.soup.find_all('script', attrs={"type": "text/javascript"})
        for tag in scripts:
            script_text = tag.string
            # tag.string is None for empty tags or tags with several children.
            if script_text is None or 'window.classified' not in script_text:
                continue
            # Drop the JavaScript assignment around the JSON literal.
            payload = script_text[script_text.find("{"):script_text.rfind("}") + 1]
            return json.loads(payload)
        return {}

    def locality(self):
        """Locality of the first entry in 'customers', or None."""
        return self._lookup('customers', 0, 'location', 'locality')

    def subtype(self):
        """Value of property.subtype, or None."""
        return self._lookup('property', 'subtype')

    def price(self):
        """Value of price.mainDisplayPrice, or None."""
        return self._lookup('price', 'mainDisplayPrice')

    def type_sale(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def num_rooms(self):
        """Value of property.bedroomCount, or None."""
        return self._lookup('property', 'bedroomCount')

    def area(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def kitchen(self):
        """Return 1 for an installed kitchen, 0 otherwise, None if unknown.

        None is returned only when property.kitchen.type cannot be reached;
        a reachable value that is not "installed" (including null) gives 0.
        """
        try:
            kitchen_type = self.house_dict['property']['kitchen']['type']
        except self._MISSING:
            return None
        # NOTE(review): the feed appears to use upper-case enum values (the
        # recorded subtypes are 'HOUSE', 'PENTHOUSE'), so the comparison is
        # case-insensitive -- confirm the exact kitchen values on live data.
        if isinstance(kitchen_type, str) and kitchen_type.lower() == 'installed':
            return 1
        return 0

    def furnished(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def fire(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def terrace_area(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def garden_area(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def land(self):
        """Value of property.land.surface, or None."""
        return self._lookup('property', 'land', 'surface')

    def land_plot(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def num_facade(self):
        """Placeholder: not extracted yet, always None."""
        return None

    def pool(self):
        """Return 1 if the text 'swimming pool' occurs in the page, else 0.

        The search runs over the whole downloaded page (case-sensitive), not
        over the structured listing data.
        """
        return 1 if 'swimming pool' in str(self.html) else 0

    def state(self):
        """Placeholder: not extracted yet, always None."""
        return None

#### We collect all the data in `houses_apartments_dict`.

In [20]:
from collections import defaultdict

# Column-oriented store: each key is an output column and each value is the
#   list of cell values gathered so far (one entry per scraped listing).
houses_apartments_dict = defaultdict(list)

# (output column, attribute of the scraper object), in output column order.
house_columns = (
    ('Locality', 'locality'),
    ('Type of property', 'type_property'),
    ('Subtype of property', 'subtype'),
    ('Price', 'price'),
    ('Type of sale', 'type_sale'),
    ('Number of rooms', 'num_rooms'),
    ('Area', 'area'),
    ('Kitchen', 'kitchen'),
    ('Furnished', 'furnished'),
    ('Open fire', 'fire'),
    ('Terrace', 'terrace_area'),
    ('Garden', 'garden_area'),
    ('Surface of the land', 'land'),
    ('Surface area of the plot of land', 'land_plot'),
    ('Number of facades', 'num_facade'),
    ('Swimming pool', 'pool'),
    ('State of the building', 'state'),
)

for page_list in houses_url:
    # Only the first two listings of every result page are scraped.
    for url_a_house in page_list[:2]:
        houses_class = HouseApartmentScraping(url_a_house, 'House')

        # Append one value to every column so all lists stay the same length.
        for column_name, attribute_name in house_columns:
            houses_apartments_dict[column_name].append(
                getattr(houses_class, attribute_name))

In [21]:
# Scrape the apartment listings and add them below the house rows already
#   stored in houses_apartments_dict.
for page_list in apartments_url:
    # Only the first two listings of every result page are scraped.
    for url_an_apartment in page_list[:2]:
        apartments_class = HouseApartmentScraping(url_an_apartment, 'Apartment')

        # (output column, scraped value) pairs, in output column order.
        apartment_row = (
            ('Locality', apartments_class.locality),
            ('Type of property', apartments_class.type_property),
            ('Subtype of property', apartments_class.subtype),
            ('Price', apartments_class.price),
            ('Type of sale', apartments_class.type_sale),
            ('Number of rooms', apartments_class.num_rooms),
            ('Area', apartments_class.area),
            ('Kitchen', apartments_class.kitchen),
            ('Furnished', apartments_class.furnished),
            ('Open fire', apartments_class.fire),
            ('Terrace', apartments_class.terrace_area),
            ('Garden', apartments_class.garden_area),
            ('Surface of the land', apartments_class.land),
            ('Surface area of the plot of land', apartments_class.land_plot),
            ('Number of facades', apartments_class.num_facade),
            ('Swimming pool', apartments_class.pool),
            ('State of the building', apartments_class.state),
        )

        # Append one value to every column so all lists stay the same length.
        for column_name, cell_value in apartment_row:
            houses_apartments_dict[column_name].append(cell_value)

## 3) We store all the data in a CSV file using a DataFrame.

In [22]:
# Turn the column lists into a table (one row per scraped listing) and save
#   it. The row index is written as the first, unnamed CSV column.
output_csv = 'all_data_of_the_houses.csv'
df = pd.DataFrame(data=houses_apartments_dict)
df.to_csv(path_or_buf=output_csv)

In [23]:
# Show the table that was just written to the CSV file.
df

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Area,Kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Surface area of the plot of land,Number of facades,Swimming pool,State of the building
0,Woluwe-St-Pierre,House,HOUSE,"€990,000",,6,,0,,,,,251.0,,,0,
1,Trooz,House,HOUSE,"€195,000",,3,,0,,,,,475.0,,,0,
2,Woluwe-Saint-Lambert,House,HOUSE,"€500,000",,6,,0,,,,,95.0,,,0,
3,Woluwe-Saint-Lambert,House,HOUSE,"€500,000",,6,,0,,,,,95.0,,,0,
4,Uccle,Apartment,APARTMENT,"€380,000",,2,,0,,,,,,,,0,
5,Uccle,Apartment,APARTMENT,"€295,000",,1,,0,,,,,,,,0,
6,Ixelles,Apartment,APARTMENT,"€630,000",,3,,0,,,,,,,,0,
7,Ixelles,Apartment,PENTHOUSE,"€1,500,000",,3,,0,,,,,,,,0,
