In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import re
import json
from datetime import datetime

### First Scraper

**Loops through all n rightmove results pages extracting link, price and id of each property.**

1. First we define the parameters of our search i.e. the max/min price and the radius around the station.
2. Next, since the URL changes for page 1 vs pages 2+, we reconfigure the request accordingly using `if` and `elif`. The URLs have the parameters of our search inserted. 
3. Requests.get fetches the specified webpage. r objects have `.text` attributes which returns the webpage's raw html.
4. BeautifulSoup is a package which parses html and returns a `soup` object.
5. `find_all` takes a html tag as an argument ("div" here). Any argument that’s not recognized (e.g. class_) will be turned into a filter on a tag’s attributes. Here the argument class_, is used to filter against each tag’s 'class_' attribute which identifies a new property.
6. Looping through the apartments, we extract the relevant information this time using `find` and looking for the relevant info indicated by 'class_' again.
7. Appending the info at the end of each loop means we compile all the info across the webpages.

https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E6095&maxPrice=800000&minPrice=400000&radius=3.0&sortType=6&index=24&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced&secondaryDisplayPropertyType=housesandflats&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=

Scrape 200 properties (i.e. 8 pages as there are 25 properties per page)
Apply filters to show properties that are: a) 2 beds, b) 1 bathroom, c) posted in last 7 days, d) a house/flat/apartment, e) 


In [None]:
![alt text](imagename.png "Title")

In [28]:
# Scraping the rightmove property search results webpages 
# Collates valid properties' weblinks, plus their price and id

def scrape_results_page(min_beds=2,max_beds=2,radius=1,noPages=2,days_since_added=7):
    all_apartment_links = [] # stores apartment links
    all_price = [] # stores the listing price of apartment
    all_id_no = []
    stations = ['STATION%5E7832','STATION%5E3509', 'STATION%5E9338', 'STATION%5E245']
    
    for station in stations:
        for i in range(noPages):
            if i==0:
                r= requests.get(f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier={station}&maxBedrooms={max_beds}&minBedrooms={min_beds}&radius={radius}&sortType=6&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced&maxDaysSinceAdded={days_since_added}&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=')
            else:
                r = ''
                while r == '':
                    try:
                        r = requests.get(f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier={station}&maxBedrooms={max_beds}&minBedrooms={min_beds}&radius={radius}&sortType=6&index={i*24}&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced&maxDaysSinceAdded={days_since_added}&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=')
                        break
                    except:
                        print(f'Connection refused by the server on page {i+1}... sleeping for 3 seconds')
                        time.sleep(3)
                        print("Was a nice sleep, now let me continue...")
                        continue

            soup = BeautifulSoup(r.text, 'lxml')

            apartments = soup.find_all("div", class_="l-searchResult is-list")

            for j in range(len(apartments)):

                # tracks which apartment we are on in the page
                apartment_no = apartments[j]

                # append link
                apartment_info = apartment_no.find("a", class_="propertyCard-link")
                link = "https://www.rightmove.co.uk" + apartment_info.attrs["href"]
                all_apartment_links.append(link)

                # append price
                price = (
                    apartment_no.find("div", class_="propertyCard-priceValue")
                    .get_text()
                    .strip()
                )
                all_price.append(int(price.replace(",","").replace("£","")))

                # append id
                id_no = (
                    apartment_no.find("div", id_="property-*")
                    #.get_text()
                    #.strip()
                )
                all_id_no.append(id_no)
    
    remove_indices = [0,25]
    all_apartment_links = [i for j, i in enumerate(all_apartment_links) if j not in remove_indices]
    all_price = [i for j, i in enumerate(all_price) if j not in remove_indices]
    all_id_no = [i for j, i in enumerate(all_id_no) if j not in remove_indices]
    
    return r.ok, all_apartment_links, all_price, all_id_no

In [30]:
prices = scrape_results_page()[2]

In [31]:
len(prices)

153