In [22]:
import time
from curl_cffi import requests as cureq
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

In [23]:
headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"}

In [24]:
#First check for the number of houses and pages to be scraped
url = f"https://www.immoweb.be/nl/search-results/huis/te-koop?countries=BE&page=1&orderBy=relevance"

resp = cureq.get(url, headers=headers, impersonate="chrome")

print(resp.status_code)

data = resp.json()

total_houses_1_page = int(data['range'].split('-')[1])

total_number_of_houses= data['totalItems']

number_of_pages = total_number_of_houses//(total_houses_1_page + 1)

print(f'Immoweb contains {total_number_of_houses} listings on {number_of_pages} pages')


200
Immoweb contains 9969 listings on 332 pages


In [25]:
#FIRST SCRAPE: Scrape function

def collect_data(pages: int, headers: dict):
    """
    This function collects data from the specified number of pages and returns a list.

    Args:
    pages (int): Number of pages to collect data from.
    headers (dict): HTTP headers to include in the requests.
    """
    if not headers:
        raise ValueError("Headers are required to make the request.")

    data_collection = []

    # Start the timer
    start_time = time.perf_counter()
    print(f"Start scraping {pages} pages...")

    # Loop through the pages and collect data
    for page in range(1, pages + 1):
        url = f"https://www.immoweb.be/nl/search-results/huis/te-koop?countries=BE&page={page}&orderBy=relevance"
        
        try:
            # Send the request
            print(f"Scraping page {page}...")
            resp = cureq.get(url, headers=headers, impersonate="chrome")
            resp.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)

            # Try to parse the JSON response
            data = resp.json()
            data_collection.append(data)
            print(f"Page {page} scraped successfully.")
        
        except requests.exceptions.RequestException as req_err:
            # Handle network-related errors
            print(f"Network error on page {page}: {req_err}")
        except ValueError:
            # Handle JSON decoding errors
            print(f"Error decoding JSON on page {page}.")
        except Exception as e:
            # Catch any other exceptions
            print(f"An error occurred on page {page}: {e}")

    # Stop the timer and print the elapsed time
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f"Data scraping completed in {elapsed_time:.2f} seconds.")

    return data_collection


In [26]:
#FIRST SCRAPE: Parse function

def parse_data(listings: list):
    """
    Parses the listings data to extract relevant property details.

    Args:
    listings (list): List of raw listings data.

    Returns:
    all_properties (list): List of dictionaries with parsed property data.
    """
    all_properties = []

    for listing in listings:
        # Get the results list (default to an empty list if not found)
        results_list = listing.get('results', [])

        for result in results_list:
            # Extract property details using nested get() for safety
            property_details = result.get('property', {})
            location = property_details.get('location', {})
            transaction = result.get('transaction', {}).get('sale', {})

            property_data = {
                'id': result.get('id'),
                'type': property_details.get('type'),
                'subtype': property_details.get('subtype'),
                'country': location.get('country'),
                'region': location.get('region'),
                'locality_name': location.get('locality'),
                'locality_code': location.get('postalCode'),
                'street': location.get('street'),
                'number': location.get('number'),
                'latitude': location.get('latitude'),
                'longitude': location.get('longitude'),
                'bedroom_count': property_details.get('bedroomCount', "None"),
                'net_habitable_surface': property_details.get('netHabitableSurface'),
                'land_surface': property_details.get('landSurface'),
                'room_count': property_details.get('roomCount', "None"),
                'transaction_type': result.get('transaction', {}).get('type'),
                'sale_annuity': transaction.get('lifeAnnuity'),
                'price': transaction.get('price'),
                'old_price': transaction.get('oldPrice')
            }

            all_properties.append(property_data)

    print(f"Extracted {len(all_properties)} properties from data.")
    return all_properties


In [27]:
#Testing and starting the first scrape with the scrape function

listings = collect_data(number_of_pages, headers)


Start scraping 332 pages...
Scraping page 1...
Page 1 scraped successfully.
Scraping page 2...
Page 2 scraped successfully.
Scraping page 3...
Page 3 scraped successfully.
Scraping page 4...
Page 4 scraped successfully.
Scraping page 5...
Page 5 scraped successfully.
Scraping page 6...
Page 6 scraped successfully.
Scraping page 7...
Page 7 scraped successfully.
Scraping page 8...
Page 8 scraped successfully.
Scraping page 9...
Page 9 scraped successfully.
Scraping page 10...
Page 10 scraped successfully.
Scraping page 11...
Page 11 scraped successfully.
Scraping page 12...
Page 12 scraped successfully.
Scraping page 13...
Page 13 scraped successfully.
Scraping page 14...
Page 14 scraped successfully.
Scraping page 15...
Page 15 scraped successfully.
Scraping page 16...
Page 16 scraped successfully.
Scraping page 17...
Page 17 scraped successfully.
Scraping page 18...
Page 18 scraped successfully.
Scraping page 19...
Page 19 scraped successfully.
Scraping page 20...
Page 20 scraped succ

In [28]:
#Here i create a json file with one element of listings so we can have a look of the json data from one listing on the overview pages that was scraped during the first scrape
with open('listings_first_scrape_test.json', 'w') as f:
    json.dump(listings[0], f, indent=4)

In [29]:
#Parse all the data from the first scrape and put it in a list

all_properties = parse_data(listings)

Extracted 9930 properties from data.


In [30]:
#An overview of the data that was kept from one listing

all_properties[18]

{'id': 20257810,
 'type': 'HOUSE',
 'subtype': 'EXCEPTIONAL_PROPERTY',
 'country': 'België',
 'region': 'Wallonië',
 'locality_name': 'Libin (Transinne)',
 'locality_code': '6890',
 'street': "Rue de l'Eglise",
 'number': '2',
 'latitude': 50.0000839,
 'longitude': 5.203685699999999,
 'bedroom_count': 2,
 'net_habitable_surface': 232,
 'land_surface': 969,
 'room_count': None,
 'transaction_type': 'FOR_SALE',
 'sale_annuity': None,
 'price': 349000,
 'old_price': None}

In [31]:
#create a first listings list 

listings_1 = []  

for house in all_properties:
    listing_1 = {}  
    listing_1['id'] = house.get('id')
    listing_1['locality_name'] = house.get('locality_name')
    listing_1['Postal_code'] = house.get('locality_code')
    listing_1['Price'] = house.get('price')
    listing_1['Subtype'] = house.get('subtype')
    listing_1['Number_of_rooms'] = house.get('room_count')
    listing_1['Number_of_bedrooms'] = house.get('bedroom_count')
    listing_1['Living_area'] = house.get('net_habitable_surface')
    listing_1['sale_annuity'] = house.get('sale_annuity')
    listing_1['Type_of_sale'] = house.get('transaction_type')
    listing_1['street'] = house.get('street')
    listing_1['number'] = house.get('number')
    listing_1['latitude'] = house.get('latitude')
    listing_1['longitude'] = house.get('longitude')
    listing_1['landSurface'] = house.get('land_surface')
    
    
    listings_1.append(listing_1)  

listings_1

    
    

[{'id': 20264804,
  'locality_name': 'Wetteren',
  'Postal_code': '9230',
  'Price': None,
  'Subtype': 'HOUSE_GROUP',
  'Number_of_rooms': None,
  'Number_of_bedrooms': None,
  'Living_area': None,
  'sale_annuity': None,
  'Type_of_sale': 'FOR_SALE',
  'street': 'Cooppallaan',
  'number': '30',
  'latitude': 51.009508,
  'longitude': 3.876983,
  'landSurface': None},
 {'id': 20260393,
  'locality_name': 'Schaerbeek',
  'Postal_code': '1030',
  'Price': 495000,
  'Subtype': 'HOUSE',
  'Number_of_rooms': None,
  'Number_of_bedrooms': 3,
  'Living_area': 120,
  'sale_annuity': None,
  'Type_of_sale': 'FOR_SALE',
  'street': 'Rue Joseph Wauters',
  'number': '77',
  'latitude': 50.8689682,
  'longitude': 4.3946084,
  'landSurface': 100},
 {'id': 20260850,
  'locality_name': 'Woluwe-Saint-Pierre',
  'Postal_code': '1150',
  'Price': 575000,
  'Subtype': 'HOUSE',
  'Number_of_rooms': None,
  'Number_of_bedrooms': 5,
  'Living_area': 177,
  'sale_annuity': None,
  'Type_of_sale': 'FOR_SALE'

In [32]:
# Convert listings_1 to DataFrame and export to CSV
dataframe_first_scrape = pd.DataFrame(listings_1)
csv_path = r"C:\Users\Rik\Desktop\immoeliza\scraper\immo_scraper_2_land.csv"
dataframe_first_scrape.to_csv(csv_path, index=False)

print(f"Data from first scrape saved to {csv_path}")

Data from first scrape saved to C:\Users\Rik\Desktop\immoeliza\scraper\immo_scraper_2_land.csv


In [None]:
## SECOND SCRAPE: This is the scrape of the individual house pages

print("Start second scraping")
    
data_collection_2 = []

for property in listings_1:
    
    base_url = "https://www.immoweb.be/nl/zoekertje/"
    property_url = base_url + str(property['id'])
    
    try:
        # Send request to property URL
        r = requests.get(property_url, headers=headers)
        r.raise_for_status()  
        
        # Parse the page content
        soup = BeautifulSoup(r.content, "html.parser")
        print(f"Page {property_url} scraped successfully.")
        
        # Try to extract the specific script tag
        script_tag = soup.select_one('div.classified script[type="text/javascript"]')
        
        if script_tag:
            # Process the script_tag here
            print("Script tag found and processed.")
        else:
            print(f"No script tag found on {property_url}")
    
    except Exception as e:
        # Print the error message and continue to the next property
        print(f"Error on page {property_url}: {e}")
        continue
    
    # Extract the JavaScript object
    js_content = script_tag.string
    
    # Find the start and end of the JSON object
    start = js_content.find('{')
    end = js_content.rfind('}') + 1
    
    # Extract and parse the JSON data
    json_data = json.loads(js_content[start:end])
    
    #append json_data to the list data_collection2
    data_collection_2.append(json_data)




Start second scraping
Page https://www.immoweb.be/nl/zoekertje/20264804 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20260393 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20260850 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20254029 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20262954 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20264095 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20264349 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20263033 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/20257839 scraped successfully.
Script tag found and processed.
Page https://www.immoweb.be/nl/zoekertje/2

In [15]:
#Here i create a json file with the data for one individual listing/house that i scraped during the second scrape
with open('listings_second_scrape_test.json', 'w') as f:
    json.dump(data_collection_2[15], f, indent=4)

In [16]:
#parse en merge the data from data_collection_2 in listings_2
listings_2 = []

for house in data_collection_2:
    listing_2 = {}
    
    
    listing_2['id'] = house.get('id', None)
    
    
    property_details = house.get('property', {})

    listing_2['Open_fire'] = property_details.get('fireplaceExists', None)
    
    listing_2['Swimming_Pool'] = property_details.get('hasSwimmingPool', None)
    
    listing_2['hasTerrace'] = property_details.get('hasTerrace', None)
    
    listing_2['terraceSurface'] = property_details.get('terraceSurface', None)
    
    listing_2['hasGarden'] = property_details.get('hasGarden', None)
    
    listing_2['gardenSurface'] = property_details.get('gardenSurface', None)
    
    
    kitchen_details = property_details.get('kitchen', {})
    
    if isinstance(kitchen_details, dict):
        listing_2['Kitchen_type'] = kitchen_details.get('type', None)
    else:
        listing_2['Kitchen_type'] = None
        
    
    building_details = property_details.get('building',{})
    
    if isinstance(building_details, dict):
        listing_2['Number_of_facades'] = building_details.get('facadeCount', None)
    else:
        listing_2['Number_of_facades'] = None
    
    if isinstance(building_details, dict):
        listing_2['State_of_building'] = building_details.get('condition', None)
    else:
        listing_2['State_of_building'] = None
    
    
    transaction_details = house.get('transaction',{})
    
    
    sale_details = transaction_details.get('sale',{})
    
    if isinstance(sale_details, dict):
        listing_2['Furnished'] = sale_details.get('isFurnished', None)
    else:
        listing_2['Furnished'] = None
    
    if isinstance(sale_details, dict):
        listing_2['Starting_price'] = sale_details.get('hasStartingPrice', None)
    else:
        listing_2['Starting_price'] = None
    
    certificates = transaction_details.get('certificates', {})
    
    if isinstance(certificates, dict):
        listing_2['epc'] = certificates.get('epcScore', None)
    else:
        listing_2['epc'] = None
    
    listings_2.append(listing_2)
    
    

In [17]:
#Here i test one element (= one id/house) in listings_2
listings_2[24]

{'id': 20260384,
 'Open_fire': False,
 'Swimming_Pool': None,
 'hasTerrace': None,
 'terraceSurface': None,
 'hasGarden': True,
 'gardenSurface': 24,
 'Kitchen_type': 'INSTALLED',
 'Number_of_facades': 2,
 'State_of_building': 'GOOD',
 'Furnished': None,
 'Starting_price': None,
 'epc': 'D'}

In [18]:
#Creating a dataframe of listings_2

dataframe_second_scrape = pd.DataFrame(listings_2)
csv_path = r"C:\Users\Rik\Desktop\immoeliza\scraper\immo_scraper_land.csv"
dataframe_second_scrape.to_csv(csv_path, index=False)

print(f"Data from second scrape saved to {csv_path}")

Data from second scrape saved to C:\Users\Rik\Desktop\immoeliza\scraper\immo_scraper_land.csv


In [19]:
# Merge all the data (listings_1 & listings_2) on key id

merged_df = pd.merge(dataframe_first_scrape, dataframe_second_scrape, on='id', how='inner')

print(merged_df)
csv_path = r"C:\Users\Rik\Desktop\immoeliza\scraper\immo_scraper_merged_land.csv"
merged_df.to_csv(csv_path, index=False)

print(f"Merged data saved to {csv_path}")


          id             locality_name Postal_code      Price  \
0   20264804                  Wetteren        9230        NaN   
1   20260393                Schaerbeek        1030   495000.0   
2   20260850       Woluwe-Saint-Pierre        1150   575000.0   
3   20254029        SINT-PIETERS-LEEUW        1600   449000.0   
4   20262954  Geraardsbergen Ophasselt        9500        NaN   
5   20264095            Oostnieuwkerke        8840        NaN   
6   20264349                 HARELBEKE        8530   379000.0   
7   20263033                   Ixelles        1050  1995000.0   
8   20257839                   Bertrix        6880   175000.0   
9   20257844    Habay (Habay-la-Neuve)        6720   425000.0   
10  20264924           Wezembeek-Oppem        1970   695000.0   
11  20260710                     Ronse        9600        NaN   
12  20260196                Oudsbergen        3670        NaN   
13  20261055                   Beersel        1650        NaN   
14  20261307             

In [20]:
#Showing the first 50 rows of the merged dataframe
merged_df.head(50)

Unnamed: 0,id,locality_name,Postal_code,Price,Subtype,Number_of_rooms,Number_of_bedrooms,Living_area,sale_annuity,Type_of_sale,...,hasTerrace,terraceSurface,hasGarden,gardenSurface,Kitchen_type,Number_of_facades,State_of_building,Furnished,Starting_price,epc
0,20264804,Wetteren,9230,,HOUSE_GROUP,,,,,FOR_SALE,...,,,,,,,,,,
1,20260393,Schaerbeek,1030,495000.0,HOUSE,,3.0,120.0,,FOR_SALE,...,True,34.0,True,12.0,HYPER_EQUIPPED,2.0,AS_NEW,False,,F
2,20260850,Woluwe-Saint-Pierre,1150,575000.0,HOUSE,,5.0,177.0,,FOR_SALE,...,True,30.0,True,60.0,NOT_INSTALLED,2.0,TO_RENOVATE,False,,F
3,20254029,SINT-PIETERS-LEEUW,1600,449000.0,HOUSE,,2.0,121.0,,FOR_SALE,...,True,,,,HYPER_EQUIPPED,3.0,JUST_RENOVATED,,,D
4,20262954,Geraardsbergen Ophasselt,9500,,HOUSE_GROUP,,,,,FOR_SALE,...,,,,,,,,,,
5,20264095,Oostnieuwkerke,8840,,HOUSE_GROUP,,,,,FOR_SALE,...,,,,,,,,,,
6,20264349,HARELBEKE,8530,379000.0,VILLA,,3.0,175.0,,FOR_SALE,...,True,,,,INSTALLED,4.0,GOOD,,,C
7,20263033,Ixelles,1050,1995000.0,HOUSE,,4.0,544.0,,FOR_SALE,...,True,90.0,True,250.0,USA_HYPER_EQUIPPED,2.0,AS_NEW,False,False,D
8,20257839,Bertrix,6880,175000.0,HOUSE,,3.0,85.0,,FOR_SALE,...,True,20.0,True,107.0,SEMI_EQUIPPED,2.0,GOOD,,,C
9,20257844,Habay (Habay-la-Neuve),6720,425000.0,HOUSE,,2.0,128.0,,FOR_SALE,...,True,,True,372.0,INSTALLED,4.0,GOOD,,,F


In [21]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  60 non-null     int64  
 1   locality_name       60 non-null     object 
 2   Postal_code         60 non-null     object 
 3   Price               40 non-null     float64
 4   Subtype             60 non-null     object 
 5   Number_of_rooms     0 non-null      object 
 6   Number_of_bedrooms  40 non-null     float64
 7   Living_area         40 non-null     float64
 8   sale_annuity        0 non-null      object 
 9   Type_of_sale        60 non-null     object 
 10  street              59 non-null     object 
 11  number              54 non-null     object 
 12  latitude            59 non-null     float64
 13  longitude           59 non-null     float64
 14  Open_fire           60 non-null     bool   
 15  Swimming_Pool       7 non-null      object 
 16  hasTerrace