In [2]:
from selenium import webdriver
import requests
from parsel import Selector
from bs4 import BeautifulSoup
import re
import pandas as pd

## 1) We collect the URLs of 10,000 properties with a web driver.

In [3]:
# Start the Chrome session that the URL-collecting cells share.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
#   Service object; left unchanged because the installed Selenium version is
#   not visible from this notebook.
driver = webdriver.Chrome(executable_path='chromedriver.exe')

# An implicit wait tells WebDriver to poll the DOM for a certain amount of
#   time when trying to find any element (or elements) not immediately
#   available. It is a session-level setting, so it is configured once here
#   rather than being re-sent on every loop iteration.
driver.implicitly_wait(10)

# Search-result page for houses; only the page number varies between requests.
houses_search_url = ('https://www.immoweb.be/en/search/house/for-sale'
                     '?countries=BE&page={page}&orderBy=relevance')

# XPath query selecting the link of every house listed on a result page.
xpath_houses = '//*[@id="main-content"]/li//h2//a/@href'

# houses_url holds one list of links per result page (a list of lists).
houses_url = []

# range(1, 3) fetches result pages 1 and 2 only. immoweb is said to expose
#   333 result pages; widen the range to crawl more of them.
for page in range(1, 3):
    # Navigate the browser to the result page.
    driver.get(houses_search_url.format(page=page))

    # `Selector` allows you to select parts of an XML or HTML text using CSS
    #   or XPath expressions and extract data from it.
    sel = Selector(text=driver.page_source)

    # Find nodes matching the XPath query and return the matched hrefs.
    page_houses_url = sel.xpath(xpath_houses).extract()

    # There are approximately 30 houses on each page; the page's links are
    #   appended as one nested list.
    houses_url.append(page_houses_url)

# Sanity check: number of pages collected and one sample link.
print(len(houses_url))
print(houses_url[1][4])

2
https://www.immoweb.be/en/classified/house/for-sale/uccle/1180/8883806?searchId=5f6d8eadc20c3


In [4]:
# Persist the house links, one URL per line. Mode 'w' truncates any existing
#   'houses_apartments_urls.csv', so the file starts fresh with the houses.
with open('houses_apartments_urls.csv', 'w') as urls_file:
    urls_file.writelines(
        link + '\n'
        for links_on_page in houses_url
        for link in links_on_page
    )

In [5]:
# Collect the link of every apartment listed on the immoweb result pages.
#   apartments_url holds one list of links per result page (a list of lists).
apartments_url = []

# Search-result page for apartments; only the page number varies.
apartments_search_url = ('https://www.immoweb.be/en/search/apartment/for-sale'
                         '?countries=BE&page={page}&orderBy=relevance')

# XPath query selecting the link of every apartment listed on a result page.
xpath_apartments = '//*[@id="main-content"]/li//h2//a/@href'

# An implicit wait tells WebDriver to poll the DOM for a certain amount of
#   time when trying to find any element (or elements) not immediately
#   available. It is a session-level setting, so it is set once, before the
#   loop, instead of on every iteration.
driver.implicitly_wait(10)

# range(1, 3) fetches result pages 1 and 2 only. immoweb is said to expose
#   333 result pages; widen the range to crawl more of them.
for page in range(1, 3):
    # Navigate the browser to the result page.
    driver.get(apartments_search_url.format(page=page))

    # `Selector` allows you to select parts of an XML or HTML text using CSS
    #   or XPath expressions and extract data from it.
    sel = Selector(text=driver.page_source)

    # Find nodes matching the XPath query and return the matched hrefs.
    page_apartments_url = sel.xpath(xpath_apartments).extract()

    # There are approximately 30 apartments on each page; the page's links
    #   are appended as one nested list.
    apartments_url.append(page_apartments_url)

# This is the last cell that drives the browser, so close it here; otherwise
#   the Chrome window and the chromedriver process outlive the notebook.
#   Re-running this cell afterwards requires re-creating `driver` first.
driver.quit()

# Sanity check: number of pages collected and one sample link.
print(len(apartments_url))
print(apartments_url[1][4])

2
https://www.immoweb.be/en/classified/new-real-estate-project-apartments/for-sale/neufchateau-longlier/6840/8925951?searchId=5f6d8edc3ef18


In [6]:
# Append the apartment links, one URL per line, after whatever is already in
#   'houses_apartments_urls.csv' (mode 'a' never truncates the file).
with open('houses_apartments_urls.csv', 'a') as urls_file:
    for links_on_page in apartments_url:
        urls_file.write(''.join(link + '\n' for link in links_on_page))

## 2) We scrape all the data of each house or apartment with `requests`

#### Each URL represents a house, and a house has many attributes such as locality, type_property, etc.

In [41]:
import re
class HouseApartmentScraping:
    """Download one classified page and expose its fields as attributes.

    The constructor fetches ``url`` and parses it eagerly. Every parser
    method below is then replaced *on the instance* by its own result, so
    after construction ``obj.price`` is the parsed value (not a method).

    Parameters
    ----------
    url : str
        Address of the classified page to download.
    types : str
        Label stored unchanged in ``type_property`` (e.g. 'House').

    Attributes set by ``__init__``
    ------------------------------
    general, interior, exterior : dict
        Header-text -> value-text pairs of three detail tables.
    locality : str or None      (first run of digits of the address cell)
    price, num_rooms, area, terrace_area, garden_area, land_plot,
    num_facade : int or None
    kitchen, furnished, fire : 1, 0 or None (None when the row is absent)
    pool : 1 or 0
    state : str or None
    subtype, type_sale, land : always None (not implemented)
    """

    # Detail tables differ only by the index of their container <div> and by
    #   the cell tag (th = row header, td = row value).
    _SECTION_XPATH = ('/html/body/div[1]/div[2]/div/div/main/div[2]/div[{index}]'
                      '/div/div/div/iw-accordion/template/table/tbody/tr/{cell}/text()')

    def __init__(self, url, types):
        self.url = url
        self.type_property = types

        # NOTE(review): str() of a bytes object yields its "b'...'" repr, so
        #   line breaks show up as the two characters backslash + n (which is
        #   why the cleaners below strip '\\n') and non-ASCII bytes stay
        #   escaped. Left unchanged because the absolute XPaths may be tuned
        #   to the tree this produces; consider `response.text` after checking.
        self.html = str(requests.get(self.url).content)
        self.sel = Selector(text=self.html)
        # The parser is named explicitly to avoid bs4's "no parser was
        #   explicitly specified" warning; lxml is the parser parsel uses too.
        self.soup = BeautifulSoup(self.html, 'lxml')

        # The three tables must be parsed first: the field parsers read them.
        self.general = self.general()
        self.interior = self.interior()
        self.exterior = self.exterior()

        self.locality = self.locality()
        self.subtype = self.subtype()
        self.price = self.price()
        self.type_sale = self.type_sale()
        self.num_rooms = self.num_rooms()
        self.area = self.area()
        self.kitchen = self.kitchen()
        self.furnished = self.furnished()
        self.fire = self.fire()
        self.terrace_area = self.terrace_area()
        self.garden_area = self.garden_area()
        self.land = self.land()
        self.land_plot = self.land_plot()
        self.num_facade = self.num_facade()
        self.pool = self.pool()
        self.state = self.state()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _section(self, index):
        """Return the header -> value dict of the table in container ``index``."""
        keys = self.format_items(self._SECTION_XPATH.format(index=index, cell='th'))
        values = self.format_items(self._SECTION_XPATH.format(index=index, cell='td'))
        return self.format_dict(keys, values)

    @staticmethod
    def _int_or_none(table, key):
        """Return ``int(table[key])``, or None if the key is missing or not an integer."""
        try:
            return int(table[key])
        except (KeyError, ValueError):
            return None

    @staticmethod
    def _flag_or_none(table, key, expected):
        """Return 1 if ``table[key] == expected``, 0 if it differs, None if absent."""
        try:
            return 1 if table[key] == expected else 0
        except KeyError:
            return None

    # ------------------------------------------------------------------
    # Raw page sections
    # ------------------------------------------------------------------
    def description(self):
        """Return the text nodes of the description container joined by spaces."""
        descriptions_xpath = '/html/body/div[1]/div[2]/div/div/main/div[2]/div[2]/div/div/div//text()'
        fragments = self.sel.xpath(descriptions_xpath).extract()
        return " ".join(fragment.replace('\\n', '').strip() for fragment in fragments)

    def general(self):
        """Return the table found in container div[3] as a dict."""
        return self._section(3)

    def interior(self):
        """Return the table found in container div[4] as a dict."""
        return self._section(4)

    def exterior(self):
        """Return the table found in container div[5] as a dict."""
        return self._section(5)

    def format_items(self, xpaths):
        """Extract the nodes matching ``xpaths``; return the non-empty cleaned strings."""
        cleaned = (item.replace('\\n', '').strip()
                   for item in self.sel.xpath(xpaths).extract())
        return [item for item in cleaned if item]

    def format_dict(self, keys, values):
        """Pair ``keys`` with ``values`` by position, up to the shorter list.

        NOTE(review): format_items drops blank cells before pairing, so a row
        with an empty value could shift every later pair - confirm on real pages.
        """
        return dict(zip(keys, values))

    # ------------------------------------------------------------------
    # Individual fields
    # ------------------------------------------------------------------
    def locality(self):
        """Return the first run of digits of the address cell's second text node, or None."""
        locality_values_xpath = '/html/body/div[1]/div[2]/div/div/main/div[3]/section/div/iw-accordion/template[2]/div/div/div/table/tbody/tr[1]/td/text()'
        try:
            locality_values = self.sel.xpath(locality_values_xpath).extract()
            locality_text = locality_values[1].replace('\\n', '').strip()
            return re.findall(r'\d+', locality_text)[0]
        except IndexError:
            # Fewer than two text nodes, or no digits in the node.
            return None

    def subtype(self):
        """Not implemented: always None."""
        return None

    def price(self):
        """Return the price as an int, or None when it cannot be located.

        The price table is looked up in container div[8], then div[9].
        NOTE(review): only the first run of digits is used, so a price shown
        with thousands separators would be truncated to its first group.
        """
        try:
            price_values_xpath = '/html/body/div[1]/div[2]/div/div/main/div[2]/div[8]/div/div/div/iw-accordion/template/table/tbody/tr[1]/td/span[2]/text()'
            price_values = self.sel.xpath(price_values_xpath).extract()

            if len(price_values) == 0:
                price_values_xpath = '/html/body/div[1]/div[2]/div/div/main/div[2]/div[9]/div/div/div/iw-accordion/template/table/tbody/tr[1]/td/span[2]/text()'
                price_values = self.sel.xpath(price_values_xpath).extract()
            return int(re.findall(r'\d+', price_values[0])[0])
        except (IndexError, ValueError):
            return None

    def type_sale(self):
        """Not implemented: always None."""
        return None

    def num_rooms(self):
        """Return the interior 'Bedrooms' value as an int, or None."""
        return self._int_or_none(self.interior, 'Bedrooms')

    def area(self):
        """Return the interior 'Living area' value as an int, or None."""
        return self._int_or_none(self.interior, 'Living area')

    def kitchen(self):
        """Return 1 if 'Kitchen type' is exactly 'Installed', 0 otherwise, None if absent."""
        return self._flag_or_none(self.interior, 'Kitchen type', 'Installed')

    def furnished(self):
        """Return 1 if 'Furnished' is exactly 'Yes', 0 otherwise, None if absent."""
        return self._flag_or_none(self.interior, 'Furnished', 'Yes')

    def fire(self):
        """Return 1 if the interior table has a non-empty 'Fireplace' value, None if absent."""
        try:
            return 1 if self.interior['Fireplace'] else 0
        except KeyError:
            return None

    def terrace_area(self):
        """Return the exterior 'Terrace surface' value as an int, or None."""
        return self._int_or_none(self.exterior, 'Terrace surface')

    def garden_area(self):
        """Return the exterior 'Garden surface' value as an int, or None."""
        return self._int_or_none(self.exterior, 'Garden surface')

    def land(self):
        """Not implemented: always None."""
        return None

    def land_plot(self):
        """Return the exterior 'Surface of the plot' value as an int, or None."""
        return self._int_or_none(self.exterior, 'Surface of the plot')

    def num_facade(self):
        """Return the general 'Facades' value as an int, or None."""
        return self._int_or_none(self.general, 'Facades')

    def pool(self):
        """Return 1 if the raw page text contains 'swimming pool', else 0."""
        return 1 if re.findall('swimming pool', self.html) else 0

    def state(self):
        """Return the general 'Building condition' value, or None if absent."""
        try:
            return self.general['Building condition']
        except KeyError:
            return None

#### We collect all the data in `houses_apartments_dict`.

In [42]:
from collections import defaultdict
# One list per output column; each scraped listing appends exactly one entry
#   to every list, so position i across the lists describes the i-th listing.
houses_apartments_dict = defaultdict(list)

# (output column, HouseApartmentScraping attribute) pairs, in the order the
#   columns are first inserted into the dict.
house_columns = (
    ('Locality', 'locality'),
    ('Type of property', 'type_property'),
    ('Subtype of property', 'subtype'),
    ('Price', 'price'),
    ('Type of sale', 'type_sale'),
    ('Number of rooms', 'num_rooms'),
    ('Area', 'area'),
    ('Kitchen', 'kitchen'),
    ('Furnished', 'furnished'),
    ('Open fire', 'fire'),
    ('Terrace', 'terrace_area'),
    ('Garden', 'garden_area'),
    ('Surface of the land', 'land'),
    ('Surface area of the plot of land', 'land_plot'),
    ('Number of facades', 'num_facade'),
    ('Swimming pool', 'pool'),
    ('State of the building', 'state'),
)

for page_list in houses_url:
    # Only the first two links of each result page are scraped.
    for url_a_house in page_list[:2]:
        houses_class = HouseApartmentScraping(url_a_house, 'House')
        for column_name, attribute_name in house_columns:
            houses_apartments_dict[column_name].append(
                getattr(houses_class, attribute_name)
            )

In [43]:
# Output columns and the HouseApartmentScraping attribute feeding each one.
#   The two sequences are aligned by position.
apartment_column_names = (
    'Locality', 'Type of property', 'Subtype of property', 'Price',
    'Type of sale', 'Number of rooms', 'Area', 'Kitchen', 'Furnished',
    'Open fire', 'Terrace', 'Garden', 'Surface of the land',
    'Surface area of the plot of land', 'Number of facades',
    'Swimming pool', 'State of the building',
)
apartment_attribute_names = (
    'locality', 'type_property', 'subtype', 'price',
    'type_sale', 'num_rooms', 'area', 'kitchen', 'furnished',
    'fire', 'terrace_area', 'garden_area', 'land',
    'land_plot', 'num_facade',
    'pool', 'state',
)

# Append one row per apartment to the lists already held in
#   houses_apartments_dict; running this cell again appends the rows again.
for page_list in apartments_url:
    # Only the first two links of each result page are scraped.
    for url_an_apartment in page_list[:2]:
        apartments_class = HouseApartmentScraping(url_an_apartment, 'Apartment')
        for column_name, attribute_name in zip(apartment_column_names,
                                               apartment_attribute_names):
            value = getattr(apartments_class, attribute_name)
            houses_apartments_dict[column_name].append(value)

## 3) We store all the data in a CSV file using a DataFrame.

In [44]:
# Turn the per-column lists into a table (one column per dict key), then
#   save it with to_csv's default settings to 'all_data_of_the_houses.csv'.
df = pd.DataFrame(data=houses_apartments_dict)
df.to_csv(path_or_buf='all_data_of_the_houses.csv')

In [39]:
# Bare last expression: the notebook renders the whole DataFrame as this
#   cell's output.
df

Unnamed: 0,Locality,Type of property,Subtype of property,Price,Type of sale,Number of rooms,Area,Kitchen,Furnished,Open fire,Terrace,Garden,Surface of the land,Surface area of the plot of land,Number of facades,Swimming pool,State of the building
0,1150,House,,990000.0,,6,485,0,,,16,,,251.0,2,0,As new
1,4870,House,,195000.0,,3,242,0,,,36,100.0,,475.0,3,0,Good
2,1040,House,,500000.0,,7,298,0,,,28,,,90.0,2,0,Good
3,1200,House,,,,6,225,0,,,30,30.0,,95.0,3,0,Good
4,1180,Apartment,,380000.0,,2,87,0,,,13,,,,2,0,As new
5,1180,Apartment,,295000.0,,1,65,0,,,12,25.0,,,2,0,As new
6,1050,Apartment,,630000.0,,3,160,0,,,23,,,,4,0,As new
7,1050,Apartment,,1500000.0,,3,220,0,,,60,,,,3,0,As new
