In [13]:
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup
import pandas as pd
import os
import json
import asyncio
import aiohttp

In [77]:
# Place name -> region number used to build the "REGION^<number>"
# locationIdentifier query parameter for each search URL.
locations = dict(
    [
        ("Nottingham", "1019"),
        ("Newcastle-Under-Lyme", "982"),
        ("Loughborough", "871"),
        ("Lancaster", "769"),
        ("York", "1498"),
    ]
)

def get_location_url(base_url, location_number):
    """Append a region filter to a search URL prefix.

    Args:
        base_url: URL prefix to which the encoded query string is appended
            verbatim (no separator is inserted by this function).
        location_number: Value placed after ``REGION^`` in the
            ``locationIdentifier`` query parameter.

    Returns:
        ``base_url`` followed by the URL-encoded ``locationIdentifier`` pair.
    """
    query_string = urlencode([("locationIdentifier", f"REGION^{location_number}")])
    return base_url + query_string

# Search URL template: the placeholder receives the listing channel
# ('to-rent' or 'for-sale'); the query string is appended after the '?'.
base_url = "https://www.rightmove.co.uk/property-{}/find.html?"

# One entry per location, each holding that location's search URL.
rents = {
    place: {'search_url': get_location_url(base_url.format('to-rent'), region_id)}
    for place, region_id in locations.items()
}
sales = {
    place: {'search_url': get_location_url(base_url.format('for-sale'), region_id)}
    for place, region_id in locations.items()
}

# Show both dictionaries as the cell output.
rents, sales

({'Nottingham': {'search_url': 'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E1019'},
  'Newcastle-Under-Lyme': {'search_url': 'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E982'},
  'Loughborough': {'search_url': 'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E871'},
  'Lancaster': {'search_url': 'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E769'},
  'York': {'search_url': 'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E1498'}},
 {'Nottingham': {'search_url': 'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E1019'},
  'Newcastle-Under-Lyme': {'search_url': 'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E982'},
  'Loughborough': {'search_url': 'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E871'},


In [78]:
async def fetch_results(session, url, pages=None):
    """Collect property links from successive pages of a search URL.

    Each request appends ``&index=<offset>`` to ``url``; the offset grows by
    the number of property cards found on the previous page.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status``, ``raise_for_status()`` and
            an awaitable ``text()`` (e.g. an ``aiohttp.ClientSession``).
        url: Search URL that already contains a query string.
        pages: Maximum number of result pages to read, or ``None`` to keep
            going until the results run out.

    Returns:
        List of absolute property URLs in the order they were encountered.

    Raises:
        Whatever ``response.raise_for_status()`` raises for error statuses
        other than 400.
    """
    links = []
    page = 0
    index = 0

    while pages is None or page < pages:  # None for all pages
        res_url = f"{url}&index={index}"

        async with session.get(res_url) as response:
            # A 400 response is treated as "paged past the last result".
            if response.status == 400:
                print(f"Final page reached for URL {url}")
                break

            response.raise_for_status()
            html = await response.text()

        # Parse after the response has been released; only the text is needed.
        soup = BeautifulSoup(html, 'html.parser')

        # The first card is dropped on the assumption that it is the
        # featured listing.
        property_cards = soup.find_all("div", class_="propertyCard-details")[1:]

        if not property_cards:
            # Without cards the offset cannot advance, so another request
            # would fetch this same page again (forever when pages is None).
            print(f"No property cards found at {res_url}; stopping")
            break

        for card in property_cards:
            anchor = card.find('a', class_="propertyCard-link")
            # A card without a link anchor is skipped instead of raising
            # AttributeError on None.
            href = anchor.get("href") if anchor is not None else None
            if href:
                links.append(f"https://www.rightmove.co.uk{href}")

        page += 1
        index += len(property_cards)

    return links

In [79]:
async def update_results(session, _dict, pages):
    """Attach scraped links to every entry of ``_dict``, in place.

    Entries are processed one after another: each entry's ``'search_url'``
    is passed to ``fetch_results`` and the returned list is stored under
    ``'links'`` on that same entry.

    Args:
        session: HTTP session forwarded to ``fetch_results``.
        _dict: Mapping of name -> dict containing a ``'search_url'`` key.
        pages: Page limit forwarded to ``fetch_results``.
    """
    for entry in _dict.values():
        search_url = entry['search_url']
        entry['links'] = await fetch_results(session, search_url, pages)

In [80]:
async def run_update_results(pages):
    """Populate ``'links'`` for the module-level ``rents`` and ``sales`` dicts.

    A single ``aiohttp.ClientSession`` is opened and shared by both
    ``update_results`` coroutines, which are awaited together with
    ``asyncio.gather``.

    Args:
        pages: Page limit forwarded to ``update_results``.
    """
    async with aiohttp.ClientSession() as http:
        jobs = [update_results(http, listings, pages) for listings in (rents, sales)]
        await asyncio.gather(*jobs)
# Fetch links from at most two result pages per location. The bare `await`
# relies on the notebook supporting top-level await.
await run_update_results(pages=2)

In [81]:
# Sanity check: number of links collected for Nottingham (rentals, sales).
len(rents['Nottingham']['links']), len(sales['Nottingham']['links'])

(48, 48)

In [82]:
async def fetch_soup(url, session):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    The GET request is sent with a fixed set of browser-like headers and the
    body is decoded as UTF-8. Any exception raised while requesting, decoding
    or parsing is reported with ``print`` and converted into a ``None``
    result rather than propagated.

    Args:
        url: Address of the page to fetch.
        session: Object providing ``get(url, headers=...)`` as an async
            context manager (e.g. an ``aiohttp.ClientSession``).

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` on failure.
    """
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }

    try:
        async with session.get(url, headers=browser_headers) as resp:
            resp.raise_for_status()
            markup = await resp.text(encoding="utf-8")
            return BeautifulSoup(markup, 'html.parser')
    except Exception as exc:  # Exception already covers aiohttp.ClientError
        print(f"Failed to fetch {url}: {exc}")
        return None

In [83]:
async def update_soups(session, _dict):
    """Fetch every link of every entry and store the results, in place.

    Entries are handled one at a time; within an entry all of its
    ``'links'`` are passed to ``fetch_soup`` and awaited together with
    ``asyncio.gather``. The resulting list (same order as ``'links'``) is
    stored under ``'soups'`` on that entry.

    Args:
        session: HTTP session forwarded to ``fetch_soup``.
        _dict: Mapping of name -> dict containing a ``'links'`` list.
    """
    for entry in _dict.values():
        pending = (fetch_soup(href, session) for href in entry['links'])
        entry['soups'] = await asyncio.gather(*pending)

In [84]:
async def run_update_soups():
    """Populate ``'soups'`` for the module-level ``rents`` and ``sales`` dicts.

    One ``aiohttp.ClientSession`` is opened and shared by both
    ``update_soups`` coroutines, which are awaited together with
    ``asyncio.gather``.
    """
    async with aiohttp.ClientSession() as http:
        jobs = [update_soups(http, listings) for listings in (rents, sales)]
        await asyncio.gather(*jobs)
# Download and parse every collected link. The bare `await` relies on the
# notebook supporting top-level await.
await run_update_soups()

In [85]:
# Sanity check: number of fetch results stored for Nottingham (rentals, sales).
# Entries may be None where fetch_soup failed, so this counts attempts.
len(rents['Nottingham']['soups']), len(sales['Nottingham']['soups'])

(48, 48)

In [86]:
def save_urls(data, base_folder):
    """Write each location's links and fetched pages under ``base_folder``.

    Layout produced for every key ``location`` in ``data``::

        <base_folder>/<location>/links/links.json
        <base_folder>/<location>/soups/soup_<n>.html

    ``<n>`` is the 1-based position of the page in the ``"soups"`` sequence.
    Entries that are ``None`` (pages that could not be fetched) are skipped
    without renumbering, so ``soup_<n>.html`` still corresponds to position
    ``n`` of that sequence.

    Args:
        data: Mapping of location name to a dict with a ``"links"`` entry
            (JSON-serialisable) and a ``"soups"`` entry (iterable of objects
            exposing ``prettify()``, or ``None``).
        base_folder: Root output directory; created if it does not exist.

    Raises:
        KeyError: If a location's dict lacks ``"links"`` or ``"soups"``.
    """
    os.makedirs(base_folder, exist_ok=True)

    for location, details in data.items():
        location_folder = os.path.join(base_folder, location)

        # Save links
        links_folder = os.path.join(location_folder, "links")
        os.makedirs(links_folder, exist_ok=True)
        with open(os.path.join(links_folder, "links.json"), "w", encoding="utf-8") as f:
            json.dump(details["links"], f, indent=4)

        # Save soups
        soups_folder = os.path.join(location_folder, "soups")
        os.makedirs(soups_folder, exist_ok=True)
        for idx, soup in enumerate(details["soups"], start=1):
            if soup is None:
                # Failed download: nothing to write, but keep the numbering.
                continue
            soup_file = f"soup_{idx}.html"
            # Explicit UTF-8 so non-ASCII page content does not depend on the
            # platform's default encoding.
            with open(os.path.join(soups_folder, soup_file), "w", encoding="utf-8") as f:
                f.write(soup.prettify())

In [87]:
# Persist the scraped data: rentals under ./rent and sales under ./sale
# (paths are relative to the current working directory).
save_urls(rents, 'rent')
save_urls(sales, 'sale')