In [1]:
import requests
import time
import tqdm
import pandas as pd
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

import re

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

## saving all the urls for flats in copenhagen

In [2]:
# Base URL using an f-string
base_url = 'https://www.boligsiden.dk/kommune/glostrup/solgte/ejerlejlighed?sortAscending=false&yearSoldFrom=2015&yearSoldTo=2023&registrationTypes=normal&page={page_number}'

# Create an empty list to store all article URLs
list_of_article_urls = []

number_pages = 32

# Loop through pages from 1 to 500
for page_number in tqdm.tqdm(range(1, number_pages+1)):
    # Define the URL for the current page using the f-string
    url = base_url.format(page_number=page_number)
    
    # Connect to the page
#     response = requests.get(url)
    
    #
    try:
        response = requests.get(url)
    except Exception as e:
        print(url) #Print url
        print(e) #Print error
        with open("list_of_article_urls", "w") as l: #Save the list_htmls as a json file to retrieve at another time
            json.dump(list_of_article_urls, l)
        continue #Continue to next iteration of the loop
    #
    
    
    # Parse data with BeautifulSoup
    soup = BeautifulSoup(response.content, 'lxml')
    
    # Find all articles on the page
    articles = soup.find_all('div', class_='shadow overflow-hidden mx-4')
    
    # Append the article URLs to the list
    for article in articles:
        article_url = article.find('a')['href']
        list_of_article_urls.append(article_url)

# Now, list_of_article_urls contains all the article URLs from pages 1 to 500
print("Total number of articles:", len(list_of_article_urls))


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:40<00:00,  1.26s/it]

Total number of articles: 638





## saving the right url code and delete duplicates

In [3]:
base_url = 'https://www.boligsiden.dk'

list_of_article_urls = [base_url + url for url in list_of_article_urls]

list_of_article_urls = list(set(list_of_article_urls))

## scrappe standard information

In [4]:
import requests
from bs4 import BeautifulSoup
import re
import tqdm

# Create empty lists for the information we want to extract for every article
adresse_list = []
kvdm_list = []

# Iterate through each article URL
for article_url in tqdm.tqdm(list_of_article_urls):
    try:
        # Fetch HTML content from the article URL
        response = requests.get(article_url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Find elements with the specified classes and extract their text content
        element_1 = soup.find(class_="font-bold text-sm md:text-base")
        text_content_1 = element_1.get_text(strip=True) if element_1 else None

        element_2 = soup.find(class_="mt-1 text-xs md:text-sm text-gray-600")
        text_content_2 = element_2.get_text(strip=True) if element_2 else None

        # Combine the text content with a space between them
        combined_text = ' '.join(filter(None, [text_content_1, text_content_2]))

        # Append combined_text to adresse_list
        adresse_list.append(combined_text)

        # Regular expression pattern to find numbers before "m²"
        pattern = r'(\d+)\s*m²'  # Grouping the digits

        # Find all matches
        matches = re.findall(pattern, response.text)

        # Append only the first number from each match to kvdm_list
        if matches:
            kvdm_list.append(int(matches[0]))
        else:
            kvdm_list.append(None)

    except requests.RequestException as e:
        print(f"Error fetching content for {article_url}: {e}")

# Now, adresse_list contains the text content of the elements with classes
# "font-bold text-sm md:text-base" and "mt-1 text-xs md:text-sm text-gray-600"
# with a space between them for each article URL
# print(adresse_list)

# Now, kvdm_list contains the first number found before "m²" for each article URL
# Converted to integers where possible, and None where no match was found
# print(kvdm_list)

100%|████████████████████████████████████████████████████████████████████████████████| 638/638 [06:12<00:00,  1.71it/s]


## DataFrame 

In [5]:
# df.to_csv('København_kom.csv', index=False)

In [6]:
# Regular expression pattern to capture the desired part of each address
pattern = r'^[\wæøåÆØÅ\s]+\s\d+'

# Extract the desired part of each address
extracted_addresses = []
for address in adresse_list:
    match = re.match(pattern, address)
    if match:
        extracted_addresses.append(match.group(0))
    else:
        extracted_addresses.append(None)

        
short_addresses = list(set(extracted_addresses))

# Print the length of extracted addresses and adresse_list
# print(len(extracted_addresses), len(adresse_list))
# print(extracted_addresses)

In [7]:
print(len(short_addresses), len(adresse_list))

137 638


In [8]:
# Assuming that all lists have the same length
data = {
    'URL': list_of_article_urls,
    'Adresse': adresse_list,
    'Kort adresse': extracted_addresses,
    'Kvdm': kvdm_list
}

# Create DataFrame
df = pd.DataFrame(data)
df = df[df['Kort adresse'].notna()]
# Print DataFrame
df[:6]
# df.to_csv('København_kom.csv', index=False)

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm
0,https://www.boligsiden.dk/adresse/tranemosevej...,"Tranemosevej 7, st. tv. 2600 Glostrup",Tranemosevej 7,77
1,https://www.boligsiden.dk/adresse/oestervej-37...,"Østervej 37, 2. tv. 2600 Glostrup",Østervej 37,77
2,https://www.boligsiden.dk/adresse/tranemosevej...,"Tranemosevej 5, 1. th. 2600 Glostrup",Tranemosevej 5,77
3,https://www.boligsiden.dk/adresse/christiansve...,"Christiansvej 15, st. tv. 2600 Glostrup",Christiansvej 15,69
4,https://www.boligsiden.dk/adresse/dalvangsvej-...,"Dalvangsvej 13, st. 11. 2600 Glostrup",Dalvangsvej 13,49
5,https://www.boligsiden.dk/adresse/magnoliavej-...,"Magnoliavej 12, st. tv. 2600 Glostrup",Magnoliavej 12,75


## optimere korteadresser så man ikke slår lejligheder i i samme opgang op to gange

In [9]:
data1 = {
    'Kort adresse': short_addresses
}
df1 = pd.DataFrame(data1)
# df1

In [10]:
import googlemaps
# Initialize Google Maps API client
gmaps = googlemaps.Client(key='AIzaSyAkzxBl6dpAhqaPG3WQEDPsaK3pRsJoMjw')  # Replace 'YOUR_API_KEY' with your actual API key

# Function to geocode addresses and retrieve coordinates
def geocode_address(address):
    try:
        # Geocode address
        geocode_result = gmaps.geocode(address)
        if geocode_result:
            # Extract latitude and longitude from geocode result
            location = geocode_result[0]['geometry']['location']
            return location['lat'], location['lng']
        else:
            return None, None
    except Exception as e:
        print(f"Error geocoding address '{address}': {e}")
        return None, None

# Apply geocode_address function to 'Kort adresse' column and store results in new columns 'Latitude' and 'Longitude'
geocoded_results = df1['Kort adresse'].apply(geocode_address)

# Add Latitude and Longitude to DataFrame
df1['Latitude'], df1['Longitude'] = zip(*geocoded_results)

# Save df1 as a CSV file
df1.to_csv('df1_geocoded.csv', index=False)

# Print DataFrame with new coordinates columns
df1

Unnamed: 0,Kort adresse,Latitude,Longitude
0,Sydvestvej 45,55.662894,12.391186
1,Gyvelvej 16,55.636519,12.108263
2,Edithsvej 1,55.664491,12.402499
3,Østervej 27,55.667980,12.391084
4,Ejbytoften 8 2600,55.696473,12.409269
...,...,...,...
132,Magnoliavej 24,55.669963,12.514043
133,Magnoliavej 6,55.669645,12.516241
134,Klinteholm 2,55.677977,12.419327
135,Skolevej 16,55.416516,10.358297


In [11]:
# Merge the two DataFrames on the 'Kort adresse' column using a left join
combined_df = pd.merge(df, df1, on='Kort adresse', how='left')

# Print the combined DataFrame
combined_df

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm,Latitude,Longitude
0,https://www.boligsiden.dk/adresse/tranemosevej...,"Tranemosevej 7, st. tv. 2600 Glostrup",Tranemosevej 7,77,55.751895,12.378978
1,https://www.boligsiden.dk/adresse/oestervej-37...,"Østervej 37, 2. tv. 2600 Glostrup",Østervej 37,77,55.668495,12.390570
2,https://www.boligsiden.dk/adresse/tranemosevej...,"Tranemosevej 5, 1. th. 2600 Glostrup",Tranemosevej 5,77,55.752318,12.379647
3,https://www.boligsiden.dk/adresse/christiansve...,"Christiansvej 15, st. tv. 2600 Glostrup",Christiansvej 15,69,55.753745,12.565578
4,https://www.boligsiden.dk/adresse/dalvangsvej-...,"Dalvangsvej 13, st. 11. 2600 Glostrup",Dalvangsvej 13,49,55.668251,12.415242
...,...,...,...,...,...,...
633,https://www.boligsiden.dk/adresse/broendbyvest...,"Brøndbyvestervej 20, 1. 22. 2600 Glostrup",Brøndbyvestervej 20,72,55.659345,12.406448
634,https://www.boligsiden.dk/adresse/solvangsvej-...,"Solvangsvej 37, 1. 2600 Glostrup",Solvangsvej 37,84,55.671310,12.395484
635,https://www.boligsiden.dk/adresse/hovedvejen-1...,"Hovedvejen 154, 4. 31. 2600 Glostrup",Hovedvejen 154,73,55.665732,12.389936
636,https://www.boligsiden.dk/adresse/hortensiavej...,"Hortensiavej 9, 1. 2600 Glostrup",Hortensiavej 9,75,55.676934,12.535202


In [12]:
combined_df['Kommune'] = 'Glostrup'
combined_df.to_csv('Glostrup_kom.csv', index=False)
combined_df

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm,Latitude,Longitude,Kommune
0,https://www.boligsiden.dk/adresse/tranemosevej...,"Tranemosevej 7, st. tv. 2600 Glostrup",Tranemosevej 7,77,55.751895,12.378978,Glostrup
1,https://www.boligsiden.dk/adresse/oestervej-37...,"Østervej 37, 2. tv. 2600 Glostrup",Østervej 37,77,55.668495,12.390570,Glostrup
2,https://www.boligsiden.dk/adresse/tranemosevej...,"Tranemosevej 5, 1. th. 2600 Glostrup",Tranemosevej 5,77,55.752318,12.379647,Glostrup
3,https://www.boligsiden.dk/adresse/christiansve...,"Christiansvej 15, st. tv. 2600 Glostrup",Christiansvej 15,69,55.753745,12.565578,Glostrup
4,https://www.boligsiden.dk/adresse/dalvangsvej-...,"Dalvangsvej 13, st. 11. 2600 Glostrup",Dalvangsvej 13,49,55.668251,12.415242,Glostrup
...,...,...,...,...,...,...,...
633,https://www.boligsiden.dk/adresse/broendbyvest...,"Brøndbyvestervej 20, 1. 22. 2600 Glostrup",Brøndbyvestervej 20,72,55.659345,12.406448,Glostrup
634,https://www.boligsiden.dk/adresse/solvangsvej-...,"Solvangsvej 37, 1. 2600 Glostrup",Solvangsvej 37,84,55.671310,12.395484,Glostrup
635,https://www.boligsiden.dk/adresse/hovedvejen-1...,"Hovedvejen 154, 4. 31. 2600 Glostrup",Hovedvejen 154,73,55.665732,12.389936,Glostrup
636,https://www.boligsiden.dk/adresse/hortensiavej...,"Hortensiavej 9, 1. 2600 Glostrup",Hortensiavej 9,75,55.676934,12.535202,Glostrup


## scrappe priserne