In [None]:
# Import libraries to be used
import io
import os
import random
import time

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

In [None]:
# Construct variables to be used later
# municipality_code and pages_total must be updated for each scrape

municipality_code = 101 #number code of municipality
pages_total = 5953 #amount of pages in search results

year_min = 1992 #start year for scrape
year_max = 2022 #end year for scrape

url_0 = 'https://www.boliga.dk/salg/resultater?searchTab=1&propertyType=1,2,3&salesDateMin=' # first part of link to the website
url_1 = '&salesDateMax='
url_2 = '&municipality='
url_end = '&sort=date-d&page=' # second part of link to the website

logfile = f'log_{municipality_code}.csv' #name of logfile based on municipality_code
list_htmls = [] #list for storing scraped urls

file_title = f'sales_{year_min}_{year_max}_{municipality_code}' #create string to be used in file names
globals()[file_title] = pd.DataFrame() #create df based on file_title and municipality_code

#%who #see which variables are created

In [None]:
# First define the log function to gather the log information
def log(response,logfile,output_path=os.getcwd()):
    # Open or create the csv file
    if os.path.isfile(logfile): #If the log file exists, open it and allow for changes     
        log = open(logfile,'a')
    else: #If the log file does not exist, create it and make headers for the log variables
        log = open(logfile,'w')
        header = ['timestamp','status_code','length','output_file']
        log.write(';'.join(header) + "\n") #Make the headers and jump to new line
        
    # Gather log information
    status_code = response.status_code #Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) #Local time
    length = len(response.text) #Length of the HTML-string
    
    # Open the log file and append the gathered log information
    with open(logfile,'a') as log:
        log.write(f'{timestamp};{status_code};{length};{url}' + "\n") #Append the information and jump to new line

In [None]:
# This loop runs through all pages based on municipality_code, pages_total, year_min and year_max
# Each page's first HTML table is appended to the dataframe stored in globals()[file_title];
# the dataframe is saved to '<file_title>.csv' after every 10th page, after the last page,
# and once more after the loop if pages were scraped since the last save

first_page_index = 5350  # zero-based index of the first page to fetch (5350 starts at page 5351)
request_timeout = 30  # seconds to wait for the server before giving up on a page

# Identification headers sent with every request; loop-invariant, so built once
request_headers = {'name':'Daniel Pryn'
                   ,'email':'knl810@alumni.ku.dk'
                   ,'message':'This is solely used for a university project. If we cause you any inconvenience, please let us know.'}

unsaved_pages = False  # True while the dataframe holds pages not yet written to the csv file

for i in tqdm.tqdm(range(first_page_index, pages_total)):
    page_no = i + 1
    url = f'{url_0}{year_min}{url_1}{year_max}{url_2}{municipality_code}{url_end}{page_no}'
    random_no = random.randrange(5, 10, 1) / 10  # delay for the optional sleep at the end of the loop body

    try:
        response = requests.get(url, headers=request_headers, timeout=request_timeout)
    except Exception as e:  # deliberately broad: one failing page must not stop the whole scrape
        print(url) #Print url
        print(e) #Print error
        continue #Continue to next iteration of the loop

    if not response.ok: #If the request failed, print the status_code and continue to next iteration of the loop
        print(url)
        print(response.status_code)
        continue

    try:
        # Parse the body that was already downloaded instead of fetching the url a second time
        tables = pd.read_html(io.BytesIO(response.content), encoding='utf-8')
    except ValueError as e:  # read_html raises ValueError when the page contains no table
        print(url)
        print(e)
        continue

    result_df = pd.DataFrame(tables[0]) #Convert this iteration's first table to a dataframe
    globals()[file_title] = pd.concat([globals()[file_title],result_df], axis=0, ignore_index=True) #Append to the rest of the data
    log(response, logfile)
    unsaved_pages = True

    if (page_no % 10 == 0) or (page_no == pages_total): #save results to csv for every 10 pages and after last page
        globals()[file_title].to_csv(f'{file_title}.csv', index=False)
        unsaved_pages = False

    #time.sleep(random_no) #Sleep between 0.5 and 0.9 seconds

# Pages scraped after the last checkpoint would otherwise be lost if the final page failed
if unsaved_pages:
    globals()[file_title].to_csv(f'{file_title}.csv', index=False)

In [None]:
# Read the scrape log and the scraped results back from disk
# The log file is written with ';' between fields (and the logged urls contain commas),
# so it must be parsed with sep=';' to get one column per logged field
results_log = pd.read_csv(f'log_{municipality_code}.csv', sep=';')
results = pd.read_csv(f'{file_title}.csv')

# Count the rows in the results file and the entries in the log file
print("Number of lines present:-", 
      len(results))
print("Number of urls present:-", 
      len(results_log))

In [None]:
#globals()[file_title]