In [None]:
# Import libraries to be used
import os
import random
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [None]:
# Scrape configuration shared by the cells below

year_min = 1992  # value placed after 'salesDateMin=' in the search url
year_max = 2022  # value placed after 'salesDateMax=' in the search url

# Fragments of the search url; the cells below join them as
# url_0 + year_min + url_1 + year_max + url_2 + <municipality code> + url_end + <page number>
url_0 = 'https://www.boliga.dk/salg/resultater?searchTab=1&propertyType=1,2,3&salesDateMin='
url_1 = '&salesDateMax='
url_2 = '&municipality='
url_end = '&sort=date-d&page='

# Output folder 'data' inside the current working directory of this notebook;
# it is created when missing and reused when it already exists
fp = Path.cwd().joinpath('data')
fp.mkdir(exist_ok=True)

In [None]:
# Codes that are inserted one at a time after '&municipality=' in the search url;
# the scrape loop harvests them in exactly this order
municipality_list = [
    165, 201, 151, 400, 153, 155, 240, 210, 147, 250,
    190, 157, 159, 161, 270, 260, 217, 163, 219, 167,
    169, 223, 183, 101, 173, 230, 175, 185, 187, 320,
    253, 376, 316, 326, 259, 350, 360, 370, 306, 329,
    265, 330, 269, 340, 336, 390, 420, 530, 561, 563,
    607, 430, 510, 440, 621, 482, 410, 480, 450, 461,
    479, 540, 550, 573, 575, 630, 492, 580, 710, 766,
    657, 661, 615, 756, 665, 707, 727, 730, 760, 741,
    740, 746, 779, 671, 706, 791, 751, 810, 813, 860,
    849, 825, 846, 773, 840, 787, 820, 851,
]

In [None]:
# Log function: appends one row of request information per call to a ';'-separated log file
def log(response, logfile, url=None):
    """Append one log row describing ``response`` to ``logfile`` inside the folder ``fp``.

    The file is created with a header row the first time it is written to.
    Each row holds: local timestamp, HTTP status code, length of the response
    text, and the url of the request.

    Parameters
    ----------
    response : object with ``status_code`` and ``text`` attributes
        The request result to describe (a ``requests`` response in this notebook).
    logfile : str
        File name of the log, relative to the notebook-level folder ``fp``.
    url : str, optional
        Url to record. When omitted, ``response.url`` is used, so the function
        no longer depends on a notebook-level variable named ``url``.
    """
    log_path = os.path.join(fp, logfile)
    # Decide before opening: opening in append mode creates the file
    needs_header = not os.path.isfile(log_path)

    if url is None:
        url = getattr(response, 'url', '')

    # Gather log information
    status_code = response.status_code #Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) #Local time
    length = len(response.text) #Length of the HTML-string

    # A single handle writes both header and row, and is always closed by the with-block
    with open(log_path, 'a') as log_file:
        if needs_header:
            # NOTE: the last column is still named 'output_file' (it holds the url),
            # kept so that log files written by earlier runs stay consistent
            header = ['timestamp','status_code','length','output_file']
            log_file.write(';'.join(header) + "\n")
        log_file.write(f'{timestamp};{status_code};{length};{url}' + "\n")

In [None]:
# Find the last page number of a search based on a given municipality code
def get_last_page_no(url, timeout=30):
    """Read the page count shown in the pagination of the search page at ``url``.

    Downloads ``url``, locates the element with class 'nav-right' and converts
    the text of the first 'page-button' element inside it to an integer.
    NOTE(review): that this button holds the *last* page number is taken from
    the function name - confirm against the live markup.

    Parameters
    ----------
    url : str
        Address of a search result page.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up; without a
        timeout an unresponsive server would block the notebook indefinitely.

    Returns
    -------
    int or None
        The parsed page number, or None when the 'nav-right' element or the
        'page-button' inside it is not present in the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or times out.
    ValueError
        If the button text is not an integer.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    nav_right_element = soup.find(class_='nav-right')
    if nav_right_element is None:
        return None

    # Guard against a navigation bar without any page button
    page_button_element = nav_right_element.find(class_='page-button')
    if page_button_element is None:
        return None

    return int(page_button_element.text)

In [None]:
# Harvest every municipality in turn: walk through all result pages, keep the first
# HTML table of each page, write a log row per harvested page and save the combined
# table to a csv-file named according to years and municipality code.

# Headers sent with every page request (identical for all requests, so built once)
request_headers = {'name':'Daniel Pryn'
                   ,'email':'knl810@alumni.ku.dk'
                   ,'message':'This is solely used for a university project. If we cause you any inconvenience, please let us know.'}
request_timeout = 30 #seconds to wait for a page before treating the request as failed

for code in municipality_list:

    municipality_code = code
    logfile = f'log_{municipality_code}.csv' #name of logfile based on municipality_code
    file_title = f'sales_{year_min}_{year_max}_{municipality_code}' #create string to be used in file names
    csv_path = fp / f'{file_title}.csv'
    log_path = fp / logfile
    page_frames = [] #one dataframe per harvested page; combined at each checkpoint instead of on every page

    globals()[file_title] = pd.DataFrame() #create df named from file_title
    url = f'{url_0}{year_min}{url_1}{year_max}{url_2}{municipality_code}{url_end}1' #set start url based on municipality_code
    pages_total = get_last_page_no(url) #find last page number using get_last_page_no() function

    if pages_total is None:
        # No pagination found: still try page 1 instead of failing on range(None)
        print(url)
        print(f'No page count found for {municipality_code}, trying a single page')
        pages_total = 1

    # This loop runs through all pages based on municipality_code, pages_total, year_min and year_max
    for page_no in tqdm.tqdm(range(1, pages_total + 1)):

        url = f'{url_0}{year_min}{url_1}{year_max}{url_2}{municipality_code}{url_end}{page_no}' #overwrites url for each page iteration
        #random_no =  random.randrange(2, 7, 1) / 10

        try:
            response = requests.get(url, headers=request_headers, timeout=request_timeout)
        except Exception as e:
            print(url) #Print url
            print(e) #Print error
            continue #Continue to next iteration of the loop

        if not response.ok: #If the request failed, print the status_code and continue to next iteration of the loop
            print(url)
            print(response.status_code)
            continue

        try:
            # Parse the response that was just downloaded (and is about to be logged)
            # rather than letting pandas fetch the same url a second time
            html_text = response.content.decode('utf-8', errors='replace')
            tables = pd.read_html(StringIO(html_text))
        except ValueError as e: #read_html raises ValueError when the page holds no table
            print(url)
            print(e)
            continue

        page_frames.append(tables[0]) #Keep this iteration's first table
        log(response, logfile) #call log() and write to log_{municipality_code}.csv

        if page_no % 10 == 0: #checkpoint: save results to csv for every 10 pages
            globals()[file_title] = pd.concat(page_frames, axis=0, ignore_index=True)
            globals()[file_title].to_csv(csv_path, index=False)

        #time.sleep(random_no) #Sleep between 0.2 and 0.7 seconds

    # Final save after the loop, so the tail of the harvest is written even when the last page failed
    if page_frames:
        globals()[file_title] = pd.concat(page_frames, axis=0, ignore_index=True)
        globals()[file_title].to_csv(csv_path, index=False)

    #The code below gives a print after each full municipality harvest, so that progress can be monitored and validated
    #The files are read back from disk; a missing file (nothing harvested) counts as zero rows
    results_listings = pd.read_csv(csv_path) if page_frames else pd.DataFrame()
    results_log = pd.read_csv(log_path, sep=';') if log_path.is_file() else pd.DataFrame()

    # count no. of pages and listings
    print(f'Number of pages harvested for {municipality_code}:-',
          len(results_log))
    print(f'Number of listings harvested for {municipality_code}: ',
          len(results_listings))


In [None]:
#globals()[file_title]