In [1]:
# Search URL to scrape. The query string selects two areaIds and filters on the
# buildingHasElevator amenity; get_all_listings appends "&page=N" to it for
# every page after the first.
# NOTE(review): "slutpriser" presumably means the sold-prices search - confirm.
url = "https://www.booli.se/sok/slutpriser?areaIds=81688,81683&amenities=buildingHasElevator"

In [2]:
import csv
import datetime
import re
from dataclasses import dataclass, asdict
from typing import Optional

import requests
from bs4 import BeautifulSoup



In [3]:
@dataclass
class Object:
    """One listing card, as parsed by ``get_info_from_listing``.

    The annotations describe the values the parser actually stores. When a
    listing's detail list has an unexpected length the parser stores 0 for
    ``sqm_price``, ``rooms`` and ``total_area``.
    """
    address: str
    date: datetime.date  # produced by strptime(...).date(), so no time part
    price: int  # listing price with whitespace and the trailing "kr" removed
    size: float  # parsed with float() after turning the decimal "," into "."
    sqm_price: int
    rooms: Optional[float]  # apparently, you can have e.g. 4.5 rooms; None when the card has no rooms entry
    total_area: int  # taken from the detail entry containing "tomt", otherwise 0

In [4]:
def get_url(url: str, timeout: float = 30):
    """Download ``url`` and return the response body parsed with BeautifulSoup.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes using the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # The HTTP status is intentionally not checked: whatever body comes back is
    # parsed, so callers that treat "no listings found" as the end of
    # pagination keep working even if the site answers with an error page.
    page = requests.get(url, timeout=timeout)
    return BeautifulSoup(page.content, 'html.parser')

In [5]:
def get_listings_from_soup(soup):
    """Return every ``article`` element in ``soup`` that has the class ``relative``."""
    tag_name = 'article'
    css_class = 'relative'
    matches = soup.find_all(tag_name, class_=css_class)
    return matches

In [6]:
def get_all_listings(url_):
    """Collect the listing elements from every result page of ``url_``.

    Page 1 is fetched from ``url_`` unchanged; each following page is fetched
    by adding a ``page=N`` query parameter. Fetching stops at the first page
    that contains no listings. A "Found X on page N" line is printed for every
    page fetched, including the final empty one.

    Args:
        url_: Search URL, with or without an existing query string.

    Returns:
        A flat list with the listing elements of all pages, in page order.
    """
    # Join with "?" when url_ has no query string yet; blindly using "&" would
    # produce a malformed address such as ".../slutpriser&page=2".
    separator = "&" if "?" in url_ else "?"

    all_listings = []
    page = 1
    while True:
        page_url = url_ if page == 1 else f"{url_}{separator}page={page}"
        listings = get_listings_from_soup(get_url(page_url))
        print(f"Found {len(listings)} on page {page}")
        if not listings:
            # An empty page marks the end of the result set.
            break
        all_listings.extend(listings)
        page += 1
    return all_listings

In [7]:
def get_info_from_listing(listing):
    """Parse one listing element into an ``Object``.

    Address, date and price are read from dedicated elements. The remaining
    values come from the ``<li>`` entries of the card's data list, which are
    interpreted by position:

    * 3 entries: size, total area / floor, price per square metre
    * 4 entries: size, rooms, total area / floor, price per square metre

    Any other number of entries cannot be mapped to fields reliably. In that
    case the entries are printed after "Details broken:" and the returned
    object has 0 for every value that would have come from the data list,
    including ``size``.

    Args:
        listing: Element for a single listing card, as returned by
            ``get_listings_from_soup``.

    Returns:
        An ``Object``. ``rooms`` is ``None`` for 3-entry cards; ``total_area``
        is 0 unless the corresponding entry contains "tomt".

    Raises:
        ValueError: If a date or number does not have the expected text form.
    """
    address = listing.find('a', class_='expanded-link').text

    date = listing.find('span', class_='object-card__date__logo').text
    date = datetime.datetime.strptime(date, "%Y-%m-%d").date()

    price = listing.find('span', class_='object-card__price__logo').text
    price = int(re.sub(r'\s+', '', price)[:-2])  # Remove whitespace + kr at the end

    details = listing.find("ul", class_="object-card__data-list").find_all('li')

    # Validate the layout before touching any entry: with an unexpected count
    # the first entry is not guaranteed to be the size (it can be the room
    # count), and an empty list would fail on details[0].
    if len(details) not in [3, 4]:
        print("Details broken:", details)
        return Object(
            address=address,
            date=date,
            price=price,
            size=0.0,
            sqm_price=0,
            rooms=0,
            total_area=0,
        )

    # remove m2 and convert "," to "."
    size = float(details[0].text[:-3].replace(",", "."))

    if len(details) == 3:
        rooms = None
        total_area = details[1]
        sqm_price = details[2]
    else:
        rooms = float(details[1].text[:-3].replace(",", "."))  # remove "rum"
        total_area = details[2]
        sqm_price = details[3]

    if "tomt" in total_area.text:  # This can also be floor (like floor 3). Ignoring that
        total_area = int(re.sub(r'\s+', '', total_area.text[:-7]))
    else:
        total_area = 0

    # Drop the 6-character unit suffix, then the whitespace used as thousands separator.
    sqm_price = int(re.sub(r'\s+', '', sqm_price.text[:-6]))

    return Object(
        address=address,
        date=date,
        price=price,
        size=size,
        sqm_price=sqm_price,
        rooms=rooms,
        total_area=total_area,
    )

In [8]:
# Fetch the listing elements from every result page of `url`; the number of
# listings found on each page is printed as the pages are read.
listings = get_all_listings(url)

Found 35 on page 1
Found 35 on page 2
Found 13 on page 3
Found 0 on page 4


In [9]:
# Turn every scraped listing element into a parsed record, keeping page order.
parsed_listings = list(map(get_info_from_listing, listings))

Details broken: [<li>3,5 rum</li>, <li>vån 6</li>]


In [10]:
# Write one CSV row per parsed listing, using the dataclass field names as header.
# newline='' hands line-ending control to the csv module; without it, platforms
# that translate "\n" on write end up with an extra blank line after every row.
with open('output.csv', 'w', newline='') as csvfile:
    fieldnames = list(asdict(parsed_listings[0]).keys())
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    remapped = map(asdict, parsed_listings)
    writer.writerows(remapped)