In [8]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [70]:
def fetch_details(url, url_id, timeout=30):
    """Download one otodom.pl listing page and extract its key attributes.

    Args:
        url: Site-relative path of the listing; it is appended to
            ``https://www.otodom.pl`` to build the request URL.
        url_id: Identifier stored under the ``'id'`` key of the result and
            echoed in the log messages.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a single stalled connection would
            block the caller indefinitely.

    Returns:
        A dict with the keys ``id``, ``price``, ``area``, ``rooms``,
        ``floor``, ``rent``, ``outdoor``, ``parking``, ``build_year``,
        ``status`` and ``address``. Every value except ``id`` is the stripped
        text of the first element matching its selector, or ``None`` when the
        page has no such element. Returns ``None`` instead of a dict when the
        request fails (404, any other HTTP error status, timeout, or a
        connection error).
    """
    full_url = f'https://www.otodom.pl{url}'
    try:
        response = requests.get(
            full_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        if response.status_code == 404:
            print(f"URL not found, skipping ID {url_id}: {full_url}")
            return None  # Signal the caller to skip this URL
        response.raise_for_status()  # Raises an HTTPError for 4XX/5XX responses
    except requests.RequestException as e:
        # Timeouts and connection errors are RequestException subclasses too,
        # so every network-level failure ends up here.
        print(f"Request failed for ID {url_id}, URL: {full_url}, Error: {e}")
        return None  # Signal failure to the caller

    # Pause after each successful download before parsing the page.
    time.sleep(1)
    soup = BeautifulSoup(response.text, 'html.parser')

    def get_data(selector, attribute=None):
        """Return the text (or the given attribute) of the first match, else None."""
        element = soup.select_one(selector)
        if element is None:
            return None
        if attribute:
            # .get() yields None for a missing attribute instead of raising
            # KeyError, matching how a missing element is reported.
            return element.get(attribute)
        return element.get_text(strip=True)

    details = {
        'id': url_id,  # Use the passed index as ID
        'price': get_data('strong[data-cy="adPageHeaderPrice"]'),
        'area': get_data('div[data-testid="table-value-area"]'),
        'rooms': get_data('div[data-testid="table-value-rooms_num"]'),
        'floor': get_data('div[data-testid="table-value-floor"]'),
        'rent': get_data('div[data-testid="table-value-rent"]'),
        'outdoor': get_data('div[data-testid="table-value-outdoor"]'),
        'parking': get_data('div[data-testid="table-value-car"]'),
        'build_year': get_data('div[data-testid="table-value-build_year"]'),
        'status': get_data('div[data-testid="table-value-construction_status"]'),
        'address': get_data('a[aria-label="Adres"]'),
    }
    return details

In [24]:
def save_to_csv(data, filename):
    """Append a batch of records to a semicolon-delimited UTF-8 CSV file.

    Args:
        data: Sequence of dicts. The keys of the first dict define the column
            names and their order.
        filename: Path of the CSV file. It is created when missing and
            appended to otherwise.

    The header row is written only when the file is empty at the time of the
    call. An empty ``data`` is a no-op: the file is neither created nor
    modified.
    """
    if not data:
        # Nothing to write; data[0] below would raise IndexError.
        return

    # Append mode: earlier batches stay in place and new rows go after them.
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=list(data[0].keys()), delimiter=';')
        if file.tell() == 0:  # Position 0 means the file is empty -> emit header once
            writer.writeheader()
        writer.writerows(data)


In [23]:
def read_urls_from_csv(filename, start_index, end_index):
    """Read URLs from a CSV file and return them with their 1-based row numbers.

    Args:
        filename: Path of the CSV file; the URL is taken from the first column.
        start_index: First row number to include (1-based, inclusive).
        end_index: Last row number to include (inclusive).

    Returns:
        A list of ``(row_number, url)`` tuples for the rows whose number lies
        in ``[start_index, end_index]``. Blank rows still count towards the
        numbering but are left out of the result.
    """
    selected = []
    with open(filename, 'r', newline='') as file:
        for row_number, row in enumerate(csv.reader(file), start=1):
            if row_number > end_index:
                break  # Past the requested range; no need to read the rest
            if row_number < start_index:
                continue
            if not row:
                # csv.reader yields [] for a blank line; row[0] would raise.
                continue
            selected.append((row_number, row[0]))
    return selected


In [76]:
def scrape_details(start, end, urls_file='all_urls.csv', output_file='details.csv',
                   save_interval=10):
    """Scrape listing details for the URLs in rows ``start``..``end`` and save them.

    Args:
        start: First row number of ``urls_file`` to process (inclusive).
        end: Last row number of ``urls_file`` to process (inclusive).
        urls_file: CSV file the URLs are read from.
        output_file: CSV file the scraped records are appended to.
        save_interval: Number of successfully scraped records to collect
            before they are written to ``output_file``.

    URLs whose fetch fails are logged and skipped. Records that have been
    collected but not yet written are flushed even when the loop is
    interrupted by an exception or Ctrl-C, so an aborted run loses no
    already-scraped data.
    """
    urls_with_indices = read_urls_from_csv(urls_file, start, end)
    pending = []  # Records scraped since the last write

    try:
        for url_id, url in urls_with_indices:
            details = fetch_details(url, url_id)
            if details is None:
                print(f"Skipping ID {url_id} due to fetch failure.")
                continue  # Nothing to store for this ID

            pending.append(details)

            # Write a full batch as soon as save_interval records are collected
            if len(pending) >= save_interval:
                print(f"Saving up to ID {url_id}")
                save_to_csv(pending, output_file)
                pending = []  # Start a new batch
    finally:
        # Write the last partial batch, both on normal completion and when
        # the loop was aborted part-way through.
        if pending:
            print("Saving final batch of details.")
            save_to_csv(pending, output_file)



In [98]:
# Inclusive range of 1-based row numbers in the URL list to scrape in this run.
start, end = 13001, 13496

In [99]:
# Run the scrape for the configured row range; progress is printed as it goes.
scrape_details(start=start, end=end)

Saving up to ID 13010
Saving up to ID 13020
Saving up to ID 13030
Saving up to ID 13040
Request failed for ID 13049, URL: https://www.otodom.pl/pl/oferta/ciche-trzy-pokoje-na-zielonym-mokotowskim-osiedlu-ID4otDY, Error: 410 Client Error: Gone for url: https://www.otodom.pl/pl/oferta/ciche-trzy-pokoje-na-zielonym-mokotowskim-osiedlu-ID4otDY
Skipping ID 13049 due to fetch failure.
Saving up to ID 13051
Saving up to ID 13061
Request failed for ID 13070, URL: https://www.otodom.pl/pl/oferta/znakomita-inwestycja-24-minuty-do-centrum-ID4osg5, Error: 410 Client Error: Gone for url: https://www.otodom.pl/pl/oferta/znakomita-inwestycja-24-minuty-do-centrum-ID4osg5
Skipping ID 13070 due to fetch failure.
Saving up to ID 13072
Saving up to ID 13082
Saving up to ID 13092
Saving up to ID 13102
Saving up to ID 13112
Saving up to ID 13122
Saving up to ID 13132
Saving up to ID 13142
Saving up to ID 13152
Saving up to ID 13162
Saving up to ID 13172
Saving up to ID 13182
Saving up to ID 13192
Saving up 