In [9]:
import requests
import time
import tqdm
import pandas as pd
import os
import json
from bs4 import BeautifulSoup
import pandas as pd

import re

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

## saving all the urls for flats in copenhagen

In [14]:
# Base URL using an f-string
base_url = 'https://www.boligsiden.dk/kommune/frederiksberg/solgte/ejerlejlighed?sortAscending=false&yearSoldFrom=2015&yearSoldTo=2023&page={page_number}'

# Create an empty list to store all article URLs
list_of_article_urls = []

####################################################
# Insert the page number into the base URL using string formatting
url_with_page_number = base_url.format(page_number=1)

# Send a GET request to the URL
response = requests.get(url_with_page_number)
soup = BeautifulSoup(response.content, 'html.parser')

# Find all elements containing the number of pages
pagination_elements = soup.find_all('a', class_='w-10 h-10 block text-center')

# Extract the second last element if it exists
if len(pagination_elements) >= 2:
    # Convert the tag object to a string and then extract the text
    second_last_element_text = str(pagination_elements[-2].text)
    # Extract the number from the text
    number_pages = int(second_last_element_text)
else:
    number_pages = 500  # Default value if second last element is not found

print(number_pages)
####################################################

# Loop through pages from 1 to 500
for page_number in tqdm.tqdm(range(1, number_pages+1)):
    # Define the URL for the current page using the f-string
    url = base_url.format(page_number=page_number)
    
    # Connect to the page
#     response = requests.get(url)
    
    #
    try:
        response = requests.get(url)
    except Exception as e:
        print(url) #Print url
        print(e) #Print error
        with open("list_of_article_urls", "w") as l: #Save the list_htmls as a json file to retrieve at another time
            json.dump(list_of_article_urls, l)
        continue #Continue to next iteration of the loop
    #
    
    
    # Parse data with BeautifulSoup
    soup = BeautifulSoup(response.content, 'lxml')
    
    # Find all articles on the page
    articles = soup.find_all('div', class_='shadow overflow-hidden mx-4')
    
    # Append the article URLs to the list
    for article in articles:
        article_url = article.find('a')['href']
        list_of_article_urls.append(article_url)

# Now, list_of_article_urls contains all the article URLs from pages 1 to 500
print("Total number of articles:", len(list_of_article_urls))


439


  2%|█▎                                                                                | 7/439 [00:08<09:11,  1.28s/it]


KeyboardInterrupt: 

## saving the right url code and delete duplicates

In [3]:
base_url = 'https://www.boligsiden.dk'

list_of_article_urls = [base_url + url for url in list_of_article_urls]

list_of_article_urls = list(set(list_of_article_urls))

## scrappe standard information

In [4]:
import requests
from bs4 import BeautifulSoup
import re
import tqdm

# Create empty lists for the information we want to extract for every article
adresse_list = []
kvdm_list = []

# Iterate through each article URL
for article_url in tqdm.tqdm(list_of_article_urls):
    try:
        # Fetch HTML content from the article URL
        response = requests.get(article_url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Create a BeautifulSoup object to parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Find elements with the specified classes and extract their text content
        element_1 = soup.find(class_="font-bold text-sm md:text-base")
        text_content_1 = element_1.get_text(strip=True) if element_1 else None

        element_2 = soup.find(class_="mt-1 text-xs md:text-sm text-gray-600")
        text_content_2 = element_2.get_text(strip=True) if element_2 else None

        # Combine the text content with a space between them
        combined_text = ' '.join(filter(None, [text_content_1, text_content_2]))

        # Append combined_text to adresse_list
        adresse_list.append(combined_text)

        # Regular expression pattern to find numbers before "m²"
        pattern = r'(\d+)\s*m²'  # Grouping the digits

        # Find all matches
        matches = re.findall(pattern, response.text)

        # Append only the first number from each match to kvdm_list
        if matches:
            kvdm_list.append(int(matches[0]))
        else:
            kvdm_list.append(None)

    except requests.RequestException as e:
        print(f"Error fetching content for {article_url}: {e}")

# Now, adresse_list contains the text content of the elements with classes
# "font-bold text-sm md:text-base" and "mt-1 text-xs md:text-sm text-gray-600"
# with a space between them for each article URL
# print(adresse_list)

# Now, kvdm_list contains the first number found before "m²" for each article URL
# Converted to integers where possible, and None where no match was found
# print(kvdm_list)

100%|████████████████████████████████████████████████████████████████████████████| 8734/8734 [1:05:26<00:00,  2.22it/s]


## DataFrame 

In [5]:
# df.to_csv('København_kom.csv', index=False)

In [6]:
# Regular expression pattern to capture the desired part of each address
pattern = r'^[\wæøåÆØÅ\s]+\s\d+'

# Extract the desired part of each address
extracted_addresses = []
for address in adresse_list:
    match = re.match(pattern, address)
    if match:
        extracted_addresses.append(match.group(0))
    else:
        extracted_addresses.append(None)

        
short_addresses = list(set(extracted_addresses))

# Print the length of extracted addresses and adresse_list
# print(len(extracted_addresses), len(adresse_list))
# print(extracted_addresses)

In [13]:
print(len(short_addresses), len(adresse_list))

1380 8734


In [7]:
# Assuming that all lists have the same length
data = {
    'URL': list_of_article_urls,
    'Adresse': adresse_list,
    'Kort adresse': extracted_addresses,
    'Kvdm': kvdm_list
}

# Create DataFrame
df = pd.DataFrame(data)
df = df[df['Kort adresse'].notna()]
# Print DataFrame
df[:6]
# df.to_csv('København_kom.csv', index=False)

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm
0,https://www.boligsiden.dk/adresse/borups-alle-...,"Borups Alle 126A, 2. tv. 2000 Frederiksberg",Borups Alle 126,92
1,https://www.boligsiden.dk/adresse/holger-dansk...,"Holger Danskes Vej 30, 1. 6. 2000 Frederiksberg",Holger Danskes Vej 30,34
2,https://www.boligsiden.dk/adresse/munkensvej-6...,"Munkensvej 6, 3. tv. 2000 Frederiksberg",Munkensvej 6,69
3,https://www.boligsiden.dk/adresse/hospitalsvej...,"Hospitalsvej 1A, 1. tv. 2000 Frederiksberg",Hospitalsvej 1,85
4,https://www.boligsiden.dk/adresse/kristian-zah...,"Kristian Zahrtmanns Plads 81, 3. th. 2000 Fred...",Kristian Zahrtmanns Plads 81,122
5,https://www.boligsiden.dk/adresse/jakob-dannef...,"Jakob Dannefærds Vej 6B, 3. tv. 1973 Frederiks...",Jakob Dannefærds Vej 6,104


## optimere korteadresser så man ikke slår lejligheder i i samme opgang op to gange

In [8]:
data1 = {
    'Kort adresse': short_addresses
}
df1 = pd.DataFrame(data1)
# df1

In [11]:
import googlemaps
# Initialize Google Maps API client
gmaps = googlemaps.Client(key='AIzaSyAkzxBl6dpAhqaPG3WQEDPsaK3pRsJoMjw')  # Replace 'YOUR_API_KEY' with your actual API key

# Function to geocode addresses and retrieve coordinates
def geocode_address(address):
    try:
        # Geocode address
        geocode_result = gmaps.geocode(address)
        if geocode_result:
            # Extract latitude and longitude from geocode result
            location = geocode_result[0]['geometry']['location']
            return location['lat'], location['lng']
        else:
            return None, None
    except Exception as e:
        print(f"Error geocoding address '{address}': {e}")
        return None, None

# Apply geocode_address function to 'Kort adresse' column and store results in new columns 'Latitude' and 'Longitude'
geocoded_results = df1['Kort adresse'].apply(geocode_address)

# Add Latitude and Longitude to DataFrame
df1['Latitude'], df1['Longitude'] = zip(*geocoded_results)

# Save df1 as a CSV file
df1.to_csv('df1_geocoded.csv', index=False)

# Print DataFrame with new coordinates columns
df1

Error geocoding address 'None': HTTP Error: 400


Unnamed: 0,Kort adresse,Latitude,Longitude
0,Betty Nansens Alle 29,55.670625,12.498680
1,Ingemannsvej 30,55.402521,11.364764
2,Lykkesholms Allé 36,55.679223,12.549451
3,Nyelandsvej 33,55.683205,12.528452
4,La Cours Vej 18,55.685361,12.514286
...,...,...,...
1375,Nordre Fasanvej 155,55.692414,12.531476
1376,Godthåbsvej 121,55.689706,12.518744
1377,Fuglebakkevej 87,55.693596,12.528851
1378,Vagtelvej 17,55.691491,12.521658


In [14]:
# Merge the two DataFrames on the 'Kort adresse' column using a left join
combined_df = pd.merge(df, df1, on='Kort adresse', how='left')

# Print the combined DataFrame
combined_df

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm,Latitude,Longitude
0,https://www.boligsiden.dk/adresse/borups-alle-...,"Borups Alle 126A, 2. tv. 2000 Frederiksberg",Borups Alle 126,92,55.694147,12.531933
1,https://www.boligsiden.dk/adresse/holger-dansk...,"Holger Danskes Vej 30, 1. 6. 2000 Frederiksberg",Holger Danskes Vej 30,34,55.687085,12.537083
2,https://www.boligsiden.dk/adresse/munkensvej-6...,"Munkensvej 6, 3. tv. 2000 Frederiksberg",Munkensvej 6,69,55.693723,12.534592
3,https://www.boligsiden.dk/adresse/hospitalsvej...,"Hospitalsvej 1A, 1. tv. 2000 Frederiksberg",Hospitalsvej 1,85,55.679172,12.529267
4,https://www.boligsiden.dk/adresse/kristian-zah...,"Kristian Zahrtmanns Plads 81, 3. th. 2000 Fred...",Kristian Zahrtmanns Plads 81,122,55.692394,12.525977
...,...,...,...,...,...,...
7994,https://www.boligsiden.dk/adresse/blytsvej-5-0...,"Blytsvej 5, st. tv. 2000 Frederiksberg",Blytsvej 5,123,55.676534,12.513092
7995,https://www.boligsiden.dk/adresse/bentzonsvej-...,"Bentzonsvej 41, st. tv. 2000 Frederiksberg",Bentzonsvej 41,78,55.686072,12.529463
7996,https://www.boligsiden.dk/adresse/wilkensvej-1...,"Wilkensvej 15, 1. th. 2000 Frederiksberg",Wilkensvej 15,86,55.682186,12.515560
7997,https://www.boligsiden.dk/adresse/adilsvej-14-...,"Adilsvej 14, 3. th. 2000 Frederiksberg",Adilsvej 14,66,55.683940,12.533967


In [15]:
combined_df['Kommune'] = 'frederiksberg'
combined_df.to_csv('Frederiksberg_kom.csv', index=False)
combined_df

Unnamed: 0,URL,Adresse,Kort adresse,Kvdm,Latitude,Longitude,Kommune
0,https://www.boligsiden.dk/adresse/borups-alle-...,"Borups Alle 126A, 2. tv. 2000 Frederiksberg",Borups Alle 126,92,55.694147,12.531933,frederiksberg
1,https://www.boligsiden.dk/adresse/holger-dansk...,"Holger Danskes Vej 30, 1. 6. 2000 Frederiksberg",Holger Danskes Vej 30,34,55.687085,12.537083,frederiksberg
2,https://www.boligsiden.dk/adresse/munkensvej-6...,"Munkensvej 6, 3. tv. 2000 Frederiksberg",Munkensvej 6,69,55.693723,12.534592,frederiksberg
3,https://www.boligsiden.dk/adresse/hospitalsvej...,"Hospitalsvej 1A, 1. tv. 2000 Frederiksberg",Hospitalsvej 1,85,55.679172,12.529267,frederiksberg
4,https://www.boligsiden.dk/adresse/kristian-zah...,"Kristian Zahrtmanns Plads 81, 3. th. 2000 Fred...",Kristian Zahrtmanns Plads 81,122,55.692394,12.525977,frederiksberg
...,...,...,...,...,...,...,...
7994,https://www.boligsiden.dk/adresse/blytsvej-5-0...,"Blytsvej 5, st. tv. 2000 Frederiksberg",Blytsvej 5,123,55.676534,12.513092,frederiksberg
7995,https://www.boligsiden.dk/adresse/bentzonsvej-...,"Bentzonsvej 41, st. tv. 2000 Frederiksberg",Bentzonsvej 41,78,55.686072,12.529463,frederiksberg
7996,https://www.boligsiden.dk/adresse/wilkensvej-1...,"Wilkensvej 15, 1. th. 2000 Frederiksberg",Wilkensvej 15,86,55.682186,12.515560,frederiksberg
7997,https://www.boligsiden.dk/adresse/adilsvej-14-...,"Adilsvej 14, 3. th. 2000 Frederiksberg",Adilsvej 14,66,55.683940,12.533967,frederiksberg


## scrappe priserne