In [2]:
import json
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import date
pd.options.display.max_columns = 500 

# BoligScraper Class

In [None]:
class BoligScraper(object):
    
    r"""Class for scraping www.boligsiden.dk"""
    
    def __init__(self, num_listings_per_page=5000):
        
        self.num_listings_per_page = num_listings_per_page
        self.base_url = 'https://www.boligsiden.dk/resultat/1f923c02d4bf4c0ca6b0e7320ee8daee?s=12&sd=false&d=1&p={}&i={}'
    
    def scrape(self):
        
        print('Scraping..')
        
        dfs = []
        for i in range(10000):
            
            print(f'Scraping page {i+1}')
            # Get url
            url = self.base_url.format(i, self.num_listings_per_page)
            df = self._get_listing_page_df(url)

            if df.empty:
                break
            else:
                dfs.append(df)
        
        
        # Concat all dfs
        print('Concatenating DataFrames..')
        self.df = pd.concat(dfs)
        
        print('Saving scraped data to disk..')
        self._save_df()
        
        print('Scraping finished!')    
        
        return self.df
    
    def _get_listing_page_df(self, url):
        
        # Get all script tags
        html = requests.get(url).content
        soup = BeautifulSoup(html, 'html.parser')
        scripts = soup.find_all('script')
        
        # Find script tag with json string
        for s in scripts:
            if '__bs_propertylist_result__ ' in str(s):
                script = str(s)

        # Locate JSON string (find better solution)
        script = script[45:]
        script = script[:-16]
        
        # Get df
        data =json.loads(script)['result']['properties'] # convert string to json
        df = pd.DataFrame(data) # convert to pandas df
                
        return df
        
    def _save_df(self):
        self.df.to_pickle(f'./data/boligsiden_{date.today()}.pkl', decimal=',')
    
scraper = BoligScraper()
df = scraper.scrape()



# Boligsiden Data Cleaning

In [6]:
# Load todays data
df = pd.read_pickle(f'./data/boligsiden_{date.today()}.pkl')
df.head()

Unnamed: 0,id,itemTypeName,itemType,propertyLink,isFavorite,rating,priceDevelopment,hasOpenHouse,nextOpenHouse,nextOpenHouseSignup,imageLink300X200,energyMark,energyMarkLink,address,postal,city,paymentCash,downPayment,paymentExpenses,paymentGross,paymentNet,areaResidential,numberOfRooms,areaParcel,salesPeriod,redirectLink,openHouseRedirectLink,agentsLogoLink,financing,areaPaymentCash,areaWeighted,uniqueNumber,agentChainName,isArchive,dateRemoved,placeName,canShowSalesPeriodTotal,salesPeriodTotal,dateAnnounced,calculateLoanAgentChain,label
0,26b739159d444b779808003608580b71,Villa,100,~/salg/871006342,False,"{'ratings': {'conditionRating': None, 'kitchen...",0%,False,01. jan. 00:00,False,https://pic.boligsiden.dk/property/300x200/1/2...,c,https://sparenergi.dk/forbruger/vaerktoejer/fi...,Niels Ebbesens Vej 2,7100,Vejle,2.295.000,115.0,3.28,9.321,8.277,200,6,1.155,0,https://www.boligsiden.dk/viderestilling/26b73...,,https://pic.boligsiden.dk/agent/23ed916c7b6840...,{'link': 'https://raadgivning.boligsiden.dk/bo...,11.591,198,871006342,danbolig,False,,,True,-,27-07-2020,Nordea,Ny
1,e35e199ff3ac46fbb9b0036ae924a432,Villa,100,~/salg/335543235,False,"{'ratings': {'conditionRating': None, 'kitchen...",0%,False,01. jan. 00:00,False,https://pic.boligsiden.dk/property/300x200/1/e...,e,https://sparenergi.dk/forbruger/vaerktoejer/fi...,Kertemindevej 12,5540,Ullerslev,865.000,45.0,1.562,3.355,2.89,99,4,473.0,0,https://www.boligsiden.dk/viderestilling/e35e1...,,https://pic.boligsiden.dk/agent/555ca0c3739f4d...,{'link': 'https://raadgivning.boligsiden.dk/bo...,8.009,108,335543235,BOLIGmægleren,False,,Flødstrup,True,255,27-07-2020,Nordea,Ny
2,147d5f775b634a978f7103f05f88cf28,Ejerlejlighed,300,~/salg/322209509,False,"{'ratings': {'conditionRating': None, 'kitchen...",0%,False,01. jan. 00:00,False,https://pic.boligsiden.dk/property/300x200/1/1...,a2015,https://sparenergi.dk/forbruger/vaerktoejer/fi...,"Smaragdvej 7, 1. mf",7100,Vejle,2.445.000,125.0,2.286,9.891,8.784,93,3,0.0,0,https://www.boligsiden.dk/viderestilling/147d5...,,https://pic.boligsiden.dk/agent/01f208c5b7b246...,{'link': 'https://raadgivning.boligsiden.dk/bo...,26.29,93,322209509,Nybolig,False,,,True,-,27-07-2020,,Ny
3,de786d8071974795be23043afefda1e2,Ejerlejlighed,300,~/salg/461764123,False,"{'ratings': {'conditionRating': None, 'kitchen...",0%,False,01. jan. 00:00,False,https://pic.boligsiden.dk/property/300x200/1/d...,d,https://sparenergi.dk/forbruger/vaerktoejer/fi...,"Marselis Boulevard 69, st. th",8000,Aarhus C,2.598.000,130.0,2.452,10.546,9.31,64,2,0.0,0,https://www.boligsiden.dk/viderestilling/de786...,,https://pic.boligsiden.dk/agent/01f208c5b7b246...,{'link': 'https://raadgivning.boligsiden.dk/bo...,40.594,64,461764123,Nybolig,False,,,True,-,27-07-2020,,Ny
4,21e67465846b44c4a73604875b010d33,Ejerlejlighed,300,~/salg/896694287,False,"{'ratings': {'conditionRating': None, 'kitchen...",0%,False,01. jan. 00:00,False,https://pic.boligsiden.dk/property/300x200/1/2...,b,https://sparenergi.dk/forbruger/vaerktoejer/fi...,"Grøfthøjparken 165, 3. 24",8260,Viby J,1.198.000,60.0,2.255,4.888,4.312,75,2,0.0,0,https://www.boligsiden.dk/viderestilling/21e67...,,https://pic.boligsiden.dk/agent/01f208c5b7b246...,{'link': 'https://raadgivning.boligsiden.dk/bo...,15.973,75,896694287,Nybolig,False,,,True,-,24-07-2020,,


In [22]:
# Fix types
df['paymentCash'].apply(lambda x: x.replace('.', ''))

0       2295000
1        865000
2       2445000
3       2598000
4       1198000
         ...   
1342     135300
1343     186800
1344     133100
1345     185500
1346     128800
Name: paymentCash, Length: 56347, dtype: object

In [None]:
df.to_pickle('./boligsiden_27-07-2020.pkl')

In [None]:
if not ['1']: print(4)

In [None]:

# Get response
url = f'https://www.boligsiden.dk/resultat/1f923c02d4bf4c0ca6b0e7320ee8daee?s=12&sd=false&d=1&p={}&i={NUM_LISTINGS_PER_PAGE}'
r = requests.get(url)
html = r.content

# Soup it!
soup = BeautifulSoup(html, 'html.parser')
scripts = soup.find_all('script')



# Find script tag w. json data
for s in scripts:
    if '__bs_propertylist_result__ ' in str(s):
        script = str(s)

# Locate JSON string (find better solution)
script = script[45:]
script = script[:-16]

# Convert to json
data =json.loads(script)['result']



In [None]:
data['properties']

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
df  = pd.DataFrame(data['properties'])

In [None]:
for i in range(1000):
    print(i)