# Comparing file list with municipality codes to check that all data has been scraped

In [1]:
import pandas as pd
import os
from pathlib import Path
import re
import requests
from bs4 import BeautifulSoup

In [2]:
fp = Path.cwd() / 'data' # path of files to be found

In [3]:
# Initialize a list to store the last three letters as integers
municipality_code_sales = [] #contains all the municipality codes from sales files sorted by number
municipality_code_log = [] #contains all the municipality codes from log files sorted by number

# Iterate through log files in the folder
for filename in sorted(os.listdir(fp)):
    if filename.startswith('sales_1992_2022_') and filename.endswith('.csv'):
        mun_code = filename.split('_')[-1].split('.')[0]
        municipality_code_sales.append(int(mun_code))
    if filename.startswith('log_') and filename.endswith('.csv'):
        mun_code = filename.split('_')[-1].split('.')[0]
        municipality_code_log.append(int(mun_code))

print(f'Amount of log files: {len(municipality_code_log)}')
print(f'Amount of sales files: {len(municipality_code_sales)}')

Amount of log files: 98
Amount of sales files: 98


In [17]:
# Initialize variables to store the total lengths
total_log_length = 0
number_in_logs_dict = {}

# Iterate through files in the folder
for filename in sorted(os.listdir(fp)):

    if filename.startswith('log_') and filename.endswith('.csv'):
        results_log = pd.read_csv(f'{fp}/{filename}')
        #print(filename, len(results_log))
        total_log_length += len(results_log)
        
        number = re.search(r'\d+', filename).group()
        number_in_logs_dict[int(number)] = len(results_log)

print(f"Total length of 'log' files: {total_log_length}") #65365 pages on Boliga
print(f'Boliga listings: 65365. Missing:', 65365 - total_log_length) #there are more in our files since some pages don't have 50 listings

Total length of 'log' files: 65414
Boliga listings: 65365. Missing: -49


In [5]:
# Initialize variables to store the total lengths
total_sales_length = 0

# Iterate through sales files in the folder
for filename in sorted(os.listdir(fp)):

    if filename.startswith('sales_1992_2022_') and filename.endswith('.csv'):
            sales_listings = pd.read_csv(f'{fp}/{filename}')
            #print(filename, len(sales_listings))
            total_sales_length += len(sales_listings)

print(f"Total length of 'log' files: {total_sales_length}") #3268236 pages on Boliga
print(f'Boliga listings: 3268236. Missing:', 3268236 - total_sales_length)

  sales_listings = pd.read_csv(f'{fp}/{filename}')
  sales_listings = pd.read_csv(f'{fp}/{filename}')
  sales_listings = pd.read_csv(f'{fp}/{filename}')
  sales_listings = pd.read_csv(f'{fp}/{filename}')


Total length of 'log' files: 3268236
Boliga listings: 3268236. Missing: 0


  sales_listings = pd.read_csv(f'{fp}/{filename}')


In [6]:
# Find the last page number of a search based on a given municipality code
def get_last_page_no(url):
    
    response = requests.get(url)
    html_content = response.content
    
    soup = BeautifulSoup(html_content, 'html.parser')
    nav_right_element = soup.find(class_='nav-right')
    
    if nav_right_element:
        page_button_element = nav_right_element.find(class_='page-button')
        last_page = int(page_button_element.text)
        return last_page

In [7]:
# All municipality numbers:
numbers = [
    101, 147, 151, 153, 155, 157, 159, 161, 163, 165, 167, 169, 173, 175, 183, 185, 187,
    190, 201, 210, 217, 219, 223, 230, 240, 250, 253, 259, 260, 265, 269, 270, 306, 316,
    320, 326, 329, 330, 336, 340, 350, 360, 370, 376, 390, 400, 410, 420, 430, 440, 450,
    461, 479, 480, 482, 492, 510, 530, 540, 550, 561, 563, 573, 575, 580, 607, 615, 621,
    630, 657, 661, 665, 671, 706, 707, 710, 727, 730, 740, 741, 746, 751, 756, 760, 766,
    773, 779, 787, 791, 810, 813, 820, 825, 840, 846, 849, 851, 860
]

base_url = "https://www.boliga.dk/salg/resultater?searchTab=1&propertyType=1,2,3&salesDateMin=1992&salesDateMax=2022&sort=date-d&page=1&municipality="

number_of_pages_dict = {}
for number in numbers:
    url = base_url + str(number)
    pages = get_last_page_no(url)
    number_of_pages_dict[int(number)] = pages
    #print(f"Number of pages for municipality {number}: {pages}")
    #print()
    #print(number_of_pages_dict)

In [18]:
number_in_logs_dict == number_of_pages_dict

True

In [19]:
dict1 = number_in_logs_dict
dict2 = number_of_pages_dict

In [22]:
for key in dict1:
    if key in dict2:
        value1 = dict1[key]
        value2 = dict2[key]
        if value1 == value2:
            print(f"Key {key}: Values the same - Dict1: {value1}, Dict2: {value2}")
        else:
            print(f"Key {key}: Values NOT the same - Dict1: {value1}, Dict2: {value2}")
    else:
        print(f"Key {key} not found in Dict2")

Key 101: Values the same - Dict1: 5953, Dict2: 5953
Key 147: Values the same - Dict1: 1127, Dict2: 1127
Key 151: Values the same - Dict1: 364, Dict2: 364
Key 153: Values the same - Dict1: 190, Dict2: 190
Key 155: Values the same - Dict1: 163, Dict2: 163
Key 157: Values the same - Dict1: 1130, Dict2: 1130
Key 159: Values the same - Dict1: 612, Dict2: 612
Key 161: Values the same - Dict1: 185, Dict2: 185
Key 163: Values the same - Dict1: 147, Dict2: 147
Key 165: Values the same - Dict1: 153, Dict2: 153
Key 167: Values the same - Dict1: 400, Dict2: 400
Key 169: Values the same - Dict1: 569, Dict2: 569
Key 173: Values the same - Dict1: 554, Dict2: 554
Key 175: Values the same - Dict1: 378, Dict2: 378
Key 183: Values the same - Dict1: 180, Dict2: 180
Key 185: Values the same - Dict1: 359, Dict2: 359
Key 187: Values the same - Dict1: 219, Dict2: 219
Key 190: Values the same - Dict1: 362, Dict2: 362
Key 201: Values the same - Dict1: 247, Dict2: 247
Key 210: Values the same - Dict1: 451, Dict2