In [4]:
import os
import pandas as pd
from bs4 import BeautifulSoup

In [5]:
def extract_property_details(soup):
    """Collect the listing fields present in one parsed property page.

    Elements are located by the (obfuscated) CSS class names used in the
    saved pages. Every lookup is optional: a field whose element is missing
    or empty is skipped instead of raising, so one malformed page cannot
    abort a whole extraction run.

    Parameters
    ----------
    soup : object exposing ``find`` / ``find_all``
        The parsed page (a ``BeautifulSoup`` document when called from
        ``extract_from_path``).

    Returns
    -------
    dict
        May contain the fixed keys ``"address"``, ``"price_pcm"`` and
        ``"price_per_week"``, plus one entry per label/value pair found on
        the page, keyed by the label text as it appears in the markup.
        Empty when nothing matched.
    """
    property_details = {}

    address = soup.find("h1", class_="_2uQQ3SV0eMHL1P6t5ZDo2q")
    if address is not None:
        property_details["address"] = address.get_text(strip=True)

    price_pcm = soup.find("div", class_="_1gfnqJ3Vtd1z40MlC0MzXu")
    if price_pcm is not None:
        # The price text lives in the first <span>; it may be absent.
        price_span = price_pcm.find("span")
        if price_span is not None:
            property_details["price_pcm"] = price_span.get_text(strip=True)

    price_per_week = soup.find("div", class_="HXfWxKgwCdWTESd5VaU73")
    # Only the first child node is read; guard against an empty div.
    if price_per_week is not None and price_per_week.contents:
        property_details["price_per_week"] = price_per_week.contents[0].get_text(strip=True)

    # <dt>/<dd> pairs: the value is the first child node of <dd>.
    for div in soup.find_all("div", class_="_2RnXSVJcWbWv4IpBC1Sng6"):
        dt = div.find("dt")
        dd = div.find("dd")
        if dt is None or dd is None:
            continue
        contents = dd.contents
        if not contents:
            value = None
        else:
            value = contents[0].get_text(strip=True)
            # A lone "£" node means the amount sits in the next sibling
            # node; join the two so the value reads e.g. "£500".
            if value == "£" and len(contents) > 1:
                value = f"{value}{contents[1].get_text(strip=True)}"
        property_details[dt.get_text(strip=True)] = value

    # Labels and values are separate element lists matched by position.
    # zip() stops at the shorter list, so a label without a value is
    # dropped instead of raising IndexError.
    key_elements = soup.find_all("span", class_="ZBWaPR-rIda6ikyKpB_E2")
    value_elements = soup.find_all("p", class_="_1hV1kqpVceE9m-QrX_hWDN")
    for key_element, value_element in zip(key_elements, value_elements):
        property_details[key_element.get_text(strip=True)] = value_element.get_text(strip=True)

    # Second family of <dt>/<dd> pairs: the key is the first child node of
    # <dt>; the value is the space-joined text of the <span>s inside <dd>,
    # or the whole <dd> text when it contains no <span>.
    for div in soup.find_all("div", class_="_9u6R9n55iQlZi-JF6H59W"):
        dt = div.find("dt")
        dd = div.find("dd")
        if dt is None or dd is None or not dt.contents or not dd.contents:
            continue
        key = dt.contents[0].get_text(strip=True)
        span_texts = [span.get_text(strip=True) for span in dd.find_all("span")]
        value = " ".join(span_texts) if span_texts else dd.get_text(strip=True)
        property_details[key] = value

    # Key-feature extraction (<li class="lIhZ24u1NHMa5Y6gDH90A"> items) is
    # currently disabled and therefore not part of the returned dict.

    return property_details

In [6]:
def extract_from_path(path):
    """Parse every saved page of one location into a DataFrame.

    Reads each regular ``*.html`` file in ``<path>/soups``, parses it with
    BeautifulSoup's ``html.parser`` and passes it to
    ``extract_property_details``. Pages that yield an empty dict are
    skipped.

    Parameters
    ----------
    path : str
        Directory that contains a ``soups`` sub-folder.

    Returns
    -------
    pandas.DataFrame
        One row per page that produced at least one field; empty when no
        page did.

    Raises
    ------
    OSError
        If ``<path>/soups`` cannot be listed (e.g. it does not exist) or a
        page cannot be read.
    """
    soups_folder = os.path.join(path, "soups")
    records = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # row order of the resulting frame (and CSV) reproducible across runs.
    for soup_file in sorted(os.listdir(soups_folder)):
        if not soup_file.endswith(".html"):
            continue
        soup_path = os.path.join(soups_folder, soup_file)
        if not os.path.isfile(soup_path):
            continue
        with open(soup_path, "r", encoding="utf-8") as file:
            markup = file.read()
        # Parse after the handle is closed so it is not held open longer
        # than the read itself.
        details = extract_property_details(BeautifulSoup(markup, "html.parser"))
        if details:
            records.append(details)

    return pd.DataFrame(records)

In [7]:
def extract_all(base_path):
    """Write a read-only ``data.csv`` for every location under ``base_path``.

    Each non-hidden sub-directory of ``base_path`` is treated as a location:
    its pages are extracted with ``extract_from_path`` and saved to
    ``<base_path>/<location>/data.csv``, which is then made read-only.

    The function can be re-run: an existing ``data.csv`` left read-only by a
    previous run is made writable again before it is overwritten.

    Parameters
    ----------
    base_path : str
        Directory whose sub-directories are the locations to process.
    """
    # Sorted so locations are processed (and reported) in a stable order.
    locations = sorted(
        entry
        for entry in os.listdir(base_path)
        if not entry.startswith(".") and os.path.isdir(os.path.join(base_path, entry))
    )
    for location in locations:
        location_path = os.path.join(base_path, location)
        output_csv = os.path.join(location_path, "data.csv")
        df = extract_from_path(location_path)
        if os.path.exists(output_csv):
            # A previous run set the file to 0o444; without restoring write
            # permission, to_csv() fails with PermissionError on re-run.
            os.chmod(output_csv, 0o644)
        df.to_csv(output_csv, index=False)
        os.chmod(output_csv, 0o444)  # Make file read-only
        print(f"Saved {output_csv}")

In [8]:
# Build data.csv for every location under each listing category.
for listing_category in ("rent", "sale"):
    extract_all(listing_category)

Saved rent/Nottingham/data.csv
Saved rent/Newcastle-Under-Lyme/data.csv
Saved rent/Lancaster/data.csv
Saved rent/York/data.csv
Saved rent/Loughborough/data.csv
Saved sale/Nottingham/data.csv
Saved sale/Newcastle-Under-Lyme/data.csv
Saved sale/Loughborough/data.csv
Saved sale/Lancaster/data.csv
Saved sale/York/data.csv
