In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import re
import json
import geopy
from geopy import distance
from datetime import datetime

### First Scraper

**Loops through all n rightmove results pages extracting link, price and id of each property.**

1. First we define the parameters of our search i.e. the max/min price and the radius around the station.
2. Next, since the URL changes for page 1 vs pages 2+, we build the request accordingly using `if` and `else`. The search parameters are inserted into the URLs.
3. `requests.get` fetches the specified webpage. The response object `r` has a `.text` attribute, which returns the webpage's raw HTML.
4. BeautifulSoup is a package which parses html and returns a `soup` object.
5. `find_all` takes an HTML tag name as an argument ("div" here). Any keyword argument that’s not recognized (e.g. `class_`) will be turned into a filter on a tag’s attributes. Here the argument `class_` is used to filter on each tag’s `class` attribute, whose value identifies a new property.
6. Looping through the apartments, we extract the relevant information, this time using `find` and again filtering with `class_`.
7. Appending the info at the end of each loop means we compile all the info across the webpages.

https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E6095&maxPrice=800000&minPrice=400000&radius=3.0&sortType=6&index=24&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced&secondaryDisplayPropertyType=housesandflats&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=

In [2]:
# Scraping the rightmove property search results webpages
# Returns link, price and id of properties within a set radius from Marylebone station within the specified price bounds

# Matches the id attribute of a result card; group(1) is the part after "property-".
# NOTE(review): assumes Rightmove marks each card with id="property-<id>", either on the
# card div itself or on a nested div — confirm against the live markup.
_PROPERTY_ID_RE = re.compile(r'^property-(.+)$')


def _results_page_url(minPrice, maxPrice, radius, page):
    """Build the Rightmove search URL for the zero-based results page `page`."""
    # The first page carries no `index` parameter; later pages are offset by 24 results each.
    index_part = '' if page == 0 else f'&index={page*24}'
    return (
        'https://www.rightmove.co.uk/property-for-sale/find.html'
        '?locationIdentifier=STATION%5E7658'
        f'&maxPrice={maxPrice}&minPrice={minPrice}&radius={radius}'
        f'&sortType=6{index_part}'
        '&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced'
        '&secondaryDisplayPropertyType=housesandflats'
        '&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords='
    )


def _get_with_retry(url, page_number):
    """Fetch `url`, sleeping 3 seconds and retrying whenever requests raises an error."""
    while True:
        try:
            return requests.get(url)
        except requests.exceptions.RequestException:
            # Only request-level failures are retried, so Ctrl-C still interrupts the scrape.
            print(f'Connection refused by the server on page {page_number}... sleeping for 3 seconds')
            time.sleep(3)
            print("Was a nice sleep, now let me continue...")


def _extract_property_id(card):
    """Return the text after "property-" in the card's id attribute, or None if absent."""
    candidate = card.get("id")
    if not (candidate and _PROPERTY_ID_RE.match(candidate)):
        # `find` only searches descendants, so the card itself is checked separately above.
        nested = card.find("div", id=_PROPERTY_ID_RE)
        candidate = nested.get("id") if nested is not None else None
    if candidate is None:
        return None
    return _PROPERTY_ID_RE.match(candidate).group(1)


def scrape_results_page(minPrice=450000,maxPrice=800000,radius=3,noPages=42):
    """Scrape Rightmove search-result pages for links, prices and ids.

    Parameters
    ----------
    minPrice, maxPrice : inserted into the search URL as the price bounds.
    radius : inserted into the search URL as the search radius.
    noPages : number of result pages to request (24 results per page offset).

    Returns
    -------
    tuple
        ``(ok, all_apartment_links, all_price, all_id_no)`` where ``ok`` is the
        ``.ok`` flag of the last response (``True`` when ``noPages`` is 0 and no
        request was made), and the three lists hold, per result card, the
        absolute listing URL, the stripped price text, and the property id
        string (``None`` when the card carries no recognisable id).
    """
    all_apartment_links, all_price, all_id_no = [], [], []
    last_ok = True  # nothing has failed if no request is ever issued

    for page in range(noPages):
        url = _results_page_url(minPrice, maxPrice, radius, page)
        if page == 0:
            # As before, only pages after the first are retried; a failure here propagates.
            r = requests.get(url)
        else:
            r = _get_with_retry(url, page + 1)
        last_ok = r.ok

        soup = BeautifulSoup(r.text, 'lxml')

        for card in soup.find_all("div", class_="l-searchResult is-list"):
            # link
            anchor = card.find("a", class_="propertyCard-link")
            all_apartment_links.append("https://www.rightmove.co.uk" + anchor.attrs["href"])

            # price
            price = card.find("div", class_="propertyCard-priceValue").get_text().strip()
            all_price.append(price)

            # id
            all_id_no.append(_extract_property_id(card))

    return last_ok, all_apartment_links, all_price, all_id_no

### Second Scraper

**Loops through all listing specific weblinks identified above extracting data on each property.**

In [3]:
# Uses the links from the scrape_results_page function above to scrape individual property listings
# Gathers data relevant to a property's value such as its features and location

def scrape_listings(date_from, links):
    """Scrape individual listing pages and collect per-property data.

    Parameters
    ----------
    date_from : value compared against ``int(added)`` for each listing; listings
        whose 'added' value is less than or equal to it are skipped.
    links : sequence of listing URLs to request.

    Returns
    -------
    tuple
        ``(ok, all_links, all_features, all_prices, all_statname, all_statdist,
        all_outcodes, all_postcodes, all_centralities, all_dates)``.
        ``ok`` is the ``.ok`` flag of the last response (``True`` when ``links``
        is empty and no request was made). The lists are aligned, with one entry
        per listing that was not skipped.
    """
    all_links, all_features, all_prices, all_statname, all_statdist = [], [], [], [], []
    all_outcodes, all_postcodes, all_centralities, all_dates = [], [], [], []

    total = len(links)
    # Progress is only reported for more than 20 links, roughly every 5%.
    progress_step = total // 20 if total > 20 else None
    charingX = (51.507602, -0.127816)  # reference point for the centrality distance
    last_ok = True  # nothing has failed if no request is ever issued

    for position, listing_url in enumerate(links):
        # Progress tracker
        if progress_step is not None and position % progress_step == 0:
            percent = round(position*100/total)
            print(f'Code is {percent}% completed')

        r = requests.get(listing_url)
        last_ok = r.ok

        soup = BeautifulSoup(r.text, 'lxml')

        # Date uploaded
        # NOTE(review): relies on the page data being the 7th <script> tag with a
        # 25-character prefix before the JSON — brittle if the page layout changes.
        page_script = soup.find_all('script')[6]
        jsonobj = json.loads(page_script.text[25:])  # JSON text -> dictionary
        date = jsonobj.get("analyticsInfo").get('analyticsProperty').get('added')

        if int(date) <= date_from:   # Skip if from an already scraped date
            continue
        all_dates.append(date)

        # weblink
        all_links.append(listing_url)

        # physical features: strip the wrapping <dd> markup, then drop any entry
        # whose opening tag was not one of the two recognised class combinations
        feature_tags = soup.find_all("dd", class_="_1hV1kqpVceE9m-QrX_hWDN")
        features = [
            str(tag).replace('<dd class="_1hV1kqpVceE9m-QrX_hWDN">', '').replace('</dd>', '')
            .replace('<dd class="_1hV1kqpVceE9m-QrX_hWDN _2SpNNVW0fTYoFvPDmhKSt8 _3ixAp8byA3wC3qvii8d-kg">', '')
            for tag in feature_tags
        ]
        features = [text for text in features if "<dd class" not in text]
        all_features.append(features)

        # price, read from the value of the first <input> on the page
        price = soup.find('input').attrs['value']
        all_prices.append(int(price.replace(",", "")))

        # postcodes
        address = jsonobj.get("propertyData").get('address')
        outcode = address.get('outcode')
        postcode = outcode + address.get('incode')
        all_outcodes.append(outcode)
        all_postcodes.append(postcode)

        # distance to centre of London (km)
        location_info = jsonobj.get("propertyData").get('location')
        latitude = float(location_info.get('latitude'))
        longitude = float(location_info.get('longitude'))
        centrality = geopy.distance.geodesic((latitude, longitude), charingX).km
        all_centralities.append(centrality)

        # stations: the first matching distance and name spans on the page
        statdist = soup.find("span", class_="_1ZY603T1ryTT3dMgGkM7Lg").get_text().strip()
        all_statdist.append(float(statdist.replace(" miles", "")))

        statname = soup.find("span", class_="cGDiWU3FlTjqSs-F1LwK4").get_text().strip()
        all_statname.append(statname)

    return last_ok, all_links,all_features, all_prices, all_statname, all_statdist, all_outcodes,all_postcodes,all_centralities, all_dates

### Property Valuer

**Takes link to chosen property and predicts values using best trained models.**

In [4]:
def area_scrape(year_month):
    """Scrape outcode / average-price pairs from plumplot and save them as CSV.

    Reads table rows 4 to 959 of the London house prices page and writes the
    columns "Outcode3" and "Avg_price" to
    ``{location}/data/area_df_{year_month}.csv``. ``location`` is not defined in
    this function and must already exist in the notebook namespace.
    """
    # Fetch and parse the page
    r = requests.get('https://www.plumplot.co.uk/London-house-prices.html')
    soup = BeautifulSoup(r.text, 'lxml')
    rows = soup.find_all("tr")

    def _is_plain_mh2_cell(tag):
        # A <td> whose class list is exactly ['mh2'] (no extra classes)
        return tag.name == 'td' and tag.get('class') == ['mh2']

    outcode, avg_price = [], []
    for row_index in range(4, 960):
        row = rows[row_index]

        # Outcode text with all spaces removed
        outcode_text = row.find(_is_plain_mh2_cell).text
        outcode.append(outcode_text.replace(" ", ""))

        # Price text minus its final character, as an integer
        # (presumably a unit suffix is being dropped — TODO confirm)
        price_text = row.find("td", class_="mh2 text-right").text
        avg_price.append(int(price_text[:-1]))

    area_df = pd.DataFrame.from_dict({"Outcode3": outcode, "Avg_price": avg_price})
    area_df.to_csv(f'{location}/data/area_df_{year_month}.csv', index = False)