In [1]:
import requests
import time
import tqdm
import pandas as pd
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

import re

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

## saving all the urls for flats in copenhagen

In [2]:
# Base URL using an f-string
base_url = 'https://www.boligsiden.dk/kommune/lyngby-taarbaek/solgte/ejerlejlighed?sortAscending=false&yearSoldFrom=2015&yearSoldTo=2023&registrationTypes=normal&page={page_number}'

# Create an empty list to store all article URLs
list_of_article_urls = []

number_pages = 73

# Loop through pages from 1 to 500
for page_number in tqdm.tqdm(range(1, number_pages+1)):
    # Define the URL for the current page using the f-string
    url = base_url.format(page_number=page_number)
    
    # Connect to the page
#     response = requests.get(url)
    
    #
    try:
        response = requests.get(url)
    except Exception as e:
        print(url) #Print url
        print(e) #Print error
        with open("list_of_article_urls", "w") as l: #Save the list_htmls as a json file to retrieve at another time
            json.dump(list_of_article_urls, l)
        continue #Continue to next iteration of the loop
    #
    
    
    # Parse data with BeautifulSoup
    soup = BeautifulSoup(response.content, 'lxml')
    
    # Find all articles on the page
    articles = soup.find_all('div', class_='shadow overflow-hidden mx-4')
    
    # Append the article URLs to the list
    for article in articles:
        article_url = article.find('a')['href']
        list_of_article_urls.append(article_url)

# Now, list_of_article_urls contains all the article URLs from pages 1 to 500
print("Total number of articles:", len(list_of_article_urls))


100%|██████████████████████████████████████████████████████████████████████████████████| 73/73 [01:22<00:00,  1.13s/it]

Total number of articles: 1445





## saving the right url code and delete duplicates

In [3]:
base_url = 'https://www.boligsiden.dk'

list_of_article_urls = [base_url + url for url in list_of_article_urls]

list_of_article_urls = list(set(list_of_article_urls))

## scrappe standard information

In [4]:
import requests
from bs4 import BeautifulSoup
import re
import tqdm

# Create empty lists for the information we want to extract for every article
adresse_list = []
kvdm_list = []

# Iterate through each article URL
for article_url in tqdm.tqdm(list_of_article_urls):
    try:
        # Fetch HTML content from the article URL
        response = requests.get(article_url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Find elements with the specified classes and extract their text content
        element_1 = soup.find(class_="font-bold text-sm md:text-base")
        text_content_1 = element_1.get_text(strip=True) if element_1 else None

        element_2 = soup.find(class_="mt-1 text-xs md:text-sm text-gray-600")
        text_content_2 = element_2.get_text(strip=True) if element_2 else None

        # Combine the text content with a space between them
        combined_text = ' '.join(filter(None, [text_content_1, text_content_2]))

        # Append combined_text to adresse_list
        adresse_list.append(combined_text)

        # Regular expression pattern to find numbers before "m²"
        pattern = r'(\d+)\s*m²'  # Grouping the digits

        # Find all matches
        matches = re.findall(pattern, response.text)

        # Append only the first number from each match to kvdm_list
        if matches:
            kvdm_list.append(int(matches[0]))
        else:
            kvdm_list.append(None)

    except requests.RequestException as e:
        print(f"Error fetching content for {article_url}: {e}")

# Now, adresse_list contains the text content of the elements with classes
# "font-bold text-sm md:text-base" and "mt-1 text-xs md:text-sm text-gray-600"
# with a space between them for each article URL
# print(adresse_list)

# Now, kvdm_list contains the first number found before "m²" for each article URL
# Converted to integers where possible, and None where no match was found
# print(kvdm_list)

100%|██████████████████████████████████████████████████████████████████████████████| 1445/1445 [10:43<00:00,  2.24it/s]


## DataFrame 

In [5]:
# df.to_csv('København_kom.csv', index=False)

In [6]:
# Regular expression pattern to capture the desired part of each address
pattern = r'^[\wæøåÆØÅ\s]+\s\d+'

# Extract the desired part of each address
extracted_addresses = []
for address in adresse_list:
    match = re.match(pattern, address)
    if match:
        extracted_addresses.append(match.group(0))
    else:
        extracted_addresses.append(None)

        
short_addresses = list(set(extracted_addresses))

# Print the length of extracted addresses and adresse_list
# print(len(extracted_addresses), len(adresse_list))
# print(extracted_addresses)

In [7]:
print(len(short_addresses), len(adresse_list))

335 1445


In [8]:
# Assuming that all lists have the same length
data = {
    'URL': list_of_article_urls,
    'Adresse': adresse_list,
    'Kort adresse': extracted_addresses,
    'Kvdm': kvdm_list
}

# Create DataFrame
df = pd.DataFrame(data)
df = df[df['Kort adresse'].notna()]
# Print DataFrame
df[:6]
# df.to_csv('København_kom.csv', index=False)

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm
0,https://www.boligsiden.dk/adresse/eremitagepar...,"Eremitageparken 309, 1. c. 2800 Kongens Lyngby",Eremitageparken 309,48
1,https://www.boligsiden.dk/adresse/askevaenget-...,"Askevænget 15, 3. th. 2830 Virum",Askevænget 15,75
2,https://www.boligsiden.dk/adresse/noergaardsve...,"Nørgaardsvej 22E, st. tv. 2800 Kongens Lyngby",Nørgaardsvej 22,56
3,https://www.boligsiden.dk/adresse/ulrikkenborg...,"Ulrikkenborg Plads 10D, 2. th. 2800 Kongens Ly...",Ulrikkenborg Plads 10,80
4,https://www.boligsiden.dk/adresse/cedervaenget...,"Cedervænget 11, st. mf. 2830 Virum",Cedervænget 11,50
5,https://www.boligsiden.dk/adresse/johan-wilman...,"Johan Wilmanns Vej 21, 2. tv. 2800 Kongens Lyngby",Johan Wilmanns Vej 21,57


## optimere korteadresser så man ikke slår lejligheder i i samme opgang op to gange

In [9]:
data1 = {
    'Kort adresse': short_addresses
}
df1 = pd.DataFrame(data1)
# df1

In [10]:
import googlemaps
# Initialize Google Maps API client
gmaps = googlemaps.Client(key='AIzaSyAkzxBl6dpAhqaPG3WQEDPsaK3pRsJoMjw')  # Replace 'YOUR_API_KEY' with your actual API key

# Function to geocode addresses and retrieve coordinates
def geocode_address(address):
    try:
        # Geocode address
        geocode_result = gmaps.geocode(address)
        if geocode_result:
            # Extract latitude and longitude from geocode result
            location = geocode_result[0]['geometry']['location']
            return location['lat'], location['lng']
        else:
            return None, None
    except Exception as e:
        print(f"Error geocoding address '{address}': {e}")
        return None, None

# Apply geocode_address function to 'Kort adresse' column and store results in new columns 'Latitude' and 'Longitude'
geocoded_results = df1['Kort adresse'].apply(geocode_address)

# Add Latitude and Longitude to DataFrame
df1['Latitude'], df1['Longitude'] = zip(*geocoded_results)

# Save df1 as a CSV file
df1.to_csv('df1_geocoded.csv', index=False)

# Print DataFrame with new coordinates columns
df1

Error geocoding address 'None': HTTP Error: 400


Unnamed: 0,Kort adresse,Latitude,Longitude
0,Lyngby Hovedgade 13,55.772450,12.501599
1,Taarbæk Strandvej 105K 2930,55.789305,12.592285
2,Niels Skrivers Vej 10A 2830,55.795078,12.438859
3,Lystoftevej 12,55.794194,12.510994
4,Skovvej 6,55.421215,11.571186
...,...,...,...
330,Grønnevej 23,55.798673,12.471048
331,Hollandsvej 25,55.763564,12.505768
332,Om Kæret 14A 2800,55.799590,12.525881
333,Skovvej 1,55.952255,12.527871


In [11]:
# Merge the two DataFrames on the 'Kort adresse' column using a left join
combined_df = pd.merge(df, df1, on='Kort adresse', how='left')

# Print the combined DataFrame
combined_df

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm,Latitude,Longitude
0,https://www.boligsiden.dk/adresse/eremitagepar...,"Eremitageparken 309, 1. c. 2800 Kongens Lyngby",Eremitageparken 309,48,55.794706,12.535218
1,https://www.boligsiden.dk/adresse/askevaenget-...,"Askevænget 15, 3. th. 2830 Virum",Askevænget 15,75,55.793509,12.477723
2,https://www.boligsiden.dk/adresse/noergaardsve...,"Nørgaardsvej 22E, st. tv. 2800 Kongens Lyngby",Nørgaardsvej 22,56,55.770192,12.509265
3,https://www.boligsiden.dk/adresse/ulrikkenborg...,"Ulrikkenborg Plads 10D, 2. th. 2800 Kongens Ly...",Ulrikkenborg Plads 10,80,55.767739,12.501527
4,https://www.boligsiden.dk/adresse/cedervaenget...,"Cedervænget 11, st. mf. 2830 Virum",Cedervænget 11,50,55.794660,12.478604
...,...,...,...,...,...,...
1428,https://www.boligsiden.dk/adresse/eremitagepar...,"Eremitageparken 323, 1. a. 2800 Kongens Lyngby",Eremitageparken 323,108,55.796116,12.535988
1429,https://www.boligsiden.dk/adresse/askevaenget-...,"Askevænget 13, 1. tv. 2830 Virum",Askevænget 13,86,55.793400,12.477761
1430,https://www.boligsiden.dk/adresse/groennevej-6...,"Grønnevej 62, 1. tv. 2830 Virum",Grønnevej 62,60,55.794727,12.471729
1431,https://www.boligsiden.dk/adresse/bagsvaerdvej...,"Bagsværdvej 69B, st. 3. 2800 Kongens Lyngby",Bagsværdvej 69,64,55.765731,12.486328


In [12]:
combined_df['Kommune'] = 'Lyngby-Taarbaek'
combined_df.to_csv('Lyngby-Taarbaek_kom.csv', index=False)
combined_df

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm,Latitude,Longitude,Kommune
0,https://www.boligsiden.dk/adresse/eremitagepar...,"Eremitageparken 309, 1. c. 2800 Kongens Lyngby",Eremitageparken 309,48,55.794706,12.535218,Lyngby-Taarbaek
1,https://www.boligsiden.dk/adresse/askevaenget-...,"Askevænget 15, 3. th. 2830 Virum",Askevænget 15,75,55.793509,12.477723,Lyngby-Taarbaek
2,https://www.boligsiden.dk/adresse/noergaardsve...,"Nørgaardsvej 22E, st. tv. 2800 Kongens Lyngby",Nørgaardsvej 22,56,55.770192,12.509265,Lyngby-Taarbaek
3,https://www.boligsiden.dk/adresse/ulrikkenborg...,"Ulrikkenborg Plads 10D, 2. th. 2800 Kongens Ly...",Ulrikkenborg Plads 10,80,55.767739,12.501527,Lyngby-Taarbaek
4,https://www.boligsiden.dk/adresse/cedervaenget...,"Cedervænget 11, st. mf. 2830 Virum",Cedervænget 11,50,55.794660,12.478604,Lyngby-Taarbaek
...,...,...,...,...,...,...,...
1428,https://www.boligsiden.dk/adresse/eremitagepar...,"Eremitageparken 323, 1. a. 2800 Kongens Lyngby",Eremitageparken 323,108,55.796116,12.535988,Lyngby-Taarbaek
1429,https://www.boligsiden.dk/adresse/askevaenget-...,"Askevænget 13, 1. tv. 2830 Virum",Askevænget 13,86,55.793400,12.477761,Lyngby-Taarbaek
1430,https://www.boligsiden.dk/adresse/groennevej-6...,"Grønnevej 62, 1. tv. 2830 Virum",Grønnevej 62,60,55.794727,12.471729,Lyngby-Taarbaek
1431,https://www.boligsiden.dk/adresse/bagsvaerdvej...,"Bagsværdvej 69B, st. 3. 2800 Kongens Lyngby",Bagsværdvej 69,64,55.765731,12.486328,Lyngby-Taarbaek


## scrappe priserne