## RightMove Scrape

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import re
import json
import geopy
from geopy import distance
from datetime import datetime

### First Scraper

**Loops through all n rightmove results pages extracting link, price and id of each property.**

1. First we define the parameters of our search i.e. the max/min price and the radius around the station.
2. Next, since the URL for page 1 differs from that of pages 2+ (which carry an extra `index` offset), we build the request URL accordingly. The search parameters are inserted into each URL.
3. `requests.get` fetches the specified webpage. The response object `r` it returns has a `.text` attribute, which holds the webpage's raw HTML.
4. BeautifulSoup is a package which parses html and returns a `soup` object.
5. `find_all` takes an HTML tag name as its first argument ("div" here). Any keyword argument that is not recognised is turned into a filter on a tag's attributes. Because `class` is a reserved word in Python, BeautifulSoup accepts `class_` instead; here it filters on each tag's `class` attribute, whose value identifies a property card.
6. Looping through the apartments, we extract the relevant information this time using `find` and looking for the relevant info indicated by 'class_' again.
7. Appending the info at the end of each loop means we compile all the info across the webpages.

https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E6095&maxPrice=800000&minPrice=400000&radius=3.0&sortType=6&index=24&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced&secondaryDisplayPropertyType=housesandflats&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=

In [3]:
# Scraping the rightmove property search results webpages 
# Collates valid properties' weblinks, plus their price and id

def scrape_results_page(minPrice=450000,maxPrice=800000,radius=3,noPages=42):
    """Collect the link, price and id of every property card on the search result pages.

    Parameters
    ----------
    minPrice, maxPrice : int
        Price bounds inserted into the search URL.
    radius : int or float
        Value inserted into the URL's ``radius`` parameter.
    noPages : int
        Number of result pages to request. The first page is requested without
        an ``index`` parameter; page ``p`` (0-based) uses ``index = p * 24``.

    Returns
    -------
    tuple
        ``(ok, all_apartment_links, all_price, all_id_no)``. ``ok`` is the
        ``.ok`` flag of the last response received, or ``False`` when no page
        was requested. The three lists are index-aligned, one entry per card.
        ``all_id_no`` holds the card's ``property-...`` id string, or ``None``
        when no such id is present.
    """
    all_apartment_links = [] # stores apartment links
    all_price = [] # stores the listing price of apartment
    all_id_no = [] # stores the "property-<n>" id of each card
    last_ok = False
    id_pattern = re.compile(r'^property-')

    for page in range(noPages):
        # Only pages after the first carry an index offset
        index_param = '' if page == 0 else f'&index={page*24}'
        url = (
            'https://www.rightmove.co.uk/property-for-sale/find.html'
            '?locationIdentifier=STATION%5E7658'
            f'&maxPrice={maxPrice}&minPrice={minPrice}&radius={radius}'
            f'&sortType=6{index_param}'
            '&propertyTypes=detached%2Cflat%2Csemi-detached%2Cterraced'
            '&secondaryDisplayPropertyType=housesandflats'
            '&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords='
        )

        # Retry until the request goes through. Only request-level errors are
        # caught, so a KeyboardInterrupt still stops the cell.
        while True:
            try:
                r = requests.get(url)
                break
            except requests.exceptions.RequestException:
                print(f'Connection refused by the server on page {page+1}... sleeping for 3 seconds')
                time.sleep(3)
                print("Was a nice sleep, now let me continue...")

        last_ok = r.ok
        soup = BeautifulSoup(r.text, 'lxml')

        for card in soup.find_all("div", class_="l-searchResult is-list"):
            link_tag = card.find("a", class_="propertyCard-link")
            price_tag = card.find("div", class_="propertyCard-priceValue")
            if link_tag is None or price_tag is None:
                # Skip the whole card so the three lists stay index-aligned
                continue
            href = link_tag.attrs.get("href")
            if href is None:
                continue

            all_apartment_links.append("https://www.rightmove.co.uk" + href)
            all_price.append(price_tag.get_text().strip())

            # BeautifulSoup only special-cases `class_`; an `id_` keyword filters
            # on a (non-existent) "id_" attribute. Filter on `id` with a regex.
            # NOTE(review): the id may sit on the card div itself rather than a
            # descendant - both places are checked; confirm against live markup.
            own_id = card.get("id") or ""
            if isinstance(own_id, str) and id_pattern.match(own_id):
                id_no = own_id
            else:
                id_tag = card.find("div", id=id_pattern)
                id_no = id_tag["id"] if id_tag is not None else None
            all_id_no.append(id_no)

    return last_ok, all_apartment_links, all_price, all_id_no

In [4]:
# Run the results scraper ONCE and unpack its outputs. Calling it once per
# output downloads every results page three times, and the three lists could
# then come from different snapshots of the search and no longer line up.
results_ok, all_apartment_links, all_prices, all_id_no = scrape_results_page()

### Second Scraper

**Loops through all listing specific weblinks identified above extracting data on each property.**

In [5]:
# Cut-off used as the default `date_from` of scrape_listings: a listing whose
# 'added' value, converted to int, is <= this number is skipped.
most_recent = 20220000

In [6]:
# Scrapes individual property listing webpages based on links collated above

def scrape_listings(date_from = most_recent, links = all_apartment_links):
    """Scrape each individual listing page and collect its attributes.

    Parameters
    ----------
    date_from : int
        Listings whose 'added' value (converted to int) is <= this number are
        skipped.
    links : list of str
        Listing URLs to fetch. Note that the default is bound when this
        function is defined, so re-run this cell after rebuilding
        ``all_apartment_links`` or pass ``links`` explicitly.

    Returns
    -------
    tuple
        ``(ok, all_links, all_features, all_prices, all_statname, all_statdist,
        all_outcodes, all_postcodes, all_centralities, all_dates)``. ``ok`` is
        the ``.ok`` flag of the last response, or ``False`` when ``links`` is
        empty. The nine lists are index-aligned and contain one entry per
        listing that was not skipped.
    """
    all_links =[]
    all_features =[]
    all_prices =[]
    all_statname =[]
    all_statdist =[]
    all_outcodes = []
    all_postcodes = []
    all_centralities = []
    all_dates = []

    last_ok = False
    total = len(links)

    for i, url in enumerate(links):

        # Progress tracker, measured against the real number of links
        if i % 52 == 0:
            percent = round(i*100/total)
            print(f'Code is {percent}% complete')

        r = requests.get(url)
        last_ok = r.ok
        soup = BeautifulSoup(r.text, 'lxml')

        # Date uploaded: read from the fifth <script> tag, whose text is parsed
        # as JSON after dropping its first 25 characters.
        # NOTE(review): both the tag position and the 25-character offset are
        # tied to the page layout at scrape time - confirm if parsing fails.
        script_tag = soup.find_all('script')[4]
        jsonobj = json.loads(script_tag.text[25:]) # converts json into dictionary
        date = jsonobj.get("analyticsInfo").get('analyticsProperty').get('added')

        if int(date) <= date_from:   # Skip if from an already scraped date
            continue
        all_dates.append(date)

        # weblink
        all_links.append(url)

        # physical features: strip the two known <p> wrappers, then drop any
        # element that still carries a "<p class" tag (some other class combo)
        feature_tags = soup.find_all("p", class_="_1hV1kqpVceE9m-QrX_hWDN")
        features = [
            str(tag).replace('<p class="_1hV1kqpVceE9m-QrX_hWDN">', '').replace('</p>', '')
            .replace('<p class="_1hV1kqpVceE9m-QrX_hWDN _2SpNNVW0fTYoFvPDmhKSt8 _3ixAp8byA3wC3qvii8d-kg">' , '')
            for tag in feature_tags
        ]
        features = [feature for feature in features if "<p class" not in feature]
        all_features.append(features)

        # price: taken from the value attribute of the first <input> on the page
        price = soup.find('input').attrs['value']
        all_prices.append(int(price.replace(",","")))

        # postcodes
        address = jsonobj.get("propertyData").get('address')
        outcode = address.get('outcode')
        postcode = outcode + address.get('incode')
        all_outcodes.append(outcode)
        all_postcodes.append(postcode)

        # distance to centre of London (geodesic km to the charingX coordinates)
        location = jsonobj.get("propertyData").get('location')
        coords = (float(location.get('latitude')), float(location.get('longitude')))
        charingX = (51.507602, -0.127816)
        centrality = geopy.distance.geodesic(coords, charingX).km
        all_centralities.append(centrality)

        # stations: first station distance (text ending in " miles") and name
        statdist = (
            soup.find("span", class_="_1ZY603T1ryTT3dMgGkM7Lg")
            .get_text()
            .strip()
        )
        all_statdist.append(float(statdist.replace(" miles","")))

        statname = (
            soup.find("span", class_="cGDiWU3FlTjqSs-F1LwK4")
            .get_text()
            .strip()
        )
        all_statname.append(statname)

    return last_ok, all_links,all_features, all_prices, all_statname, all_statdist, all_outcodes,all_postcodes,all_centralities, all_dates

In [9]:
# Capture all outputs
# Run the listing scraper and keep its full result tuple
scraped = scrape_listings(date_from = most_recent)

# Element 0 is the response flag; the nine result lists follow in order
(all_links, all_features, all_prices,
 all_statname, all_statdist, all_outcodes,
 all_postcodes, all_centralities, all_dates) = scraped[1:10]

Code is 0% complete
Code is 5% complete
Code is 10% complete
Code is 15% complete
Code is 20% complete
Code is 25% complete
Code is 30% complete
Code is 35% complete
Code is 40% complete
Code is 45% complete


KeyboardInterrupt: 

### Wrangling

In [None]:
# Creating floor size (sq. ft.) variable

def _parse_floor_size(text):
    """Turn a feature string such as '608 sq. ft.' into an integer floor size.

    When the text holds two numbers (a range such as '439-1051 sq. ft.') their
    mean is used. Returns NaN when no number is found or the result exceeds
    3000.
    """
    numbers = [
        float(token.replace(',', ''))
        for token in re.findall(r'\d[\d,]*(?:\.\d+)?', text)
    ]
    if not numbers:
        return np.nan
    if len(numbers) >= 2:
        size = int((numbers[0] + numbers[1]) / 2)  # range -> midpoint
    else:
        size = int(numbers[0])
    return size if size <= 3000 else np.nan  # Remove floorsizes > 3000


# Exactly one entry per listing, so floor_size stays index-aligned with
# all_features even when a listing carries several 'sq. ft.' features
# (the first one is used) or none at all (NaN).
floor_size = []
for features in all_features:
    sized = [feature for feature in features if feature.endswith('sq. ft.')]
    floor_size.append(_parse_floor_size(sized[0]) if sized else np.nan)

floor_size[:10]

In [None]:
# Keywords and Floor size (different route)

units = ['Sq ft', 'sq ft', 'Sq Ft', 'SQ FT', 'SqFt', 'sqft', 'Sqft', 'Square Feet', 'square feet', 'Square feet', 
         'square foot', 'Square foot', 'Square Foot', 'Sq. Ft.', 'sq. ft.', 'Sq. ft.', 'Sq. Ft','Sq. ft','sq. ft']

floor_size2 = []

# Extracting keywords: porter/concierge/caretaker, balcony/garden/terrace 

porter_list = ['Porter','PORTER','porter', 'Concierge','CONCIERGE','concierge','Caretaker','caretaker','CARETAKER']
outdoor_list = ['Balcony','BALCONY','balcony', 'Terrace','TERRACE','terrace', 'Patio', 'PATIO', 'patio', 'Garden', 'GARDEN', 'garden']

porter = []
outdoor = []


def _get_with_retry(url, page_no):
    """Fetch `url`, sleeping 3 seconds and retrying whenever requests raises.

    Only request-level errors are caught, so interrupting the kernel still works.
    """
    while True:
        try:
            return requests.get(url)
        except requests.exceptions.RequestException:
            print(f'Connection refused by the server on page {page_no}... sleeping for 3 seconds')
            time.sleep(3)
            print("Was a nice sleep, now let me continue...")


# Iterate over all_links (the listings kept by scrape_listings) rather than
# all_apartment_links: floor_size2 is later merged index-by-index with
# floor_size, which is built from all_features and therefore follows all_links.
n_listings = len(all_links)
for i, listing_url in enumerate(all_links):
    # Progress tracker
    if i % 52 == 0:
        percent = round(i*100/n_listings)
        print(f'Code is {percent}% complete')

    r = _get_with_retry(listing_url, i+1)
    soup = BeautifulSoup(r.text, 'lxml')

    # Key-feature bullet points, with the <li> wrapper stripped
    lst = (soup.find_all("li", class_="lIhZ24u1NHMa5Y6gDH90A"))
    text =[str(x).replace('<li class="lIhZ24u1NHMa5Y6gDH90A">', '').replace('</li>', '') 
     for x in lst]

    # floor size: digits found before the unit marker; the last bullet that
    # mentions a unit wins, and values outside (200, 3000) become NaN
    size = np.nan
    for j in text:
        if not any(k in j for k in units):
            continue
        # Same lookup order as before: 'sq', then 'Sq', then 'SQ'
        marker = next((m for m in ('sq', 'Sq', 'SQ') if m in j), None)
        if marker is None:
            continue
        digits = re.sub("[^0-9]", "", j[0:j.index(marker)])
        if not digits:
            continue  # unit mentioned without a number in front of it
        clean1 = int(digits)
        if clean1 >= 3000 or clean1 <= 200:
            clean1 = np.nan
        size = clean1
    floor_size2.append(size)

    # keywords: 1 if any bullet mentions one of the keywords, else 0
    porter.append(int(any(k in j for j in text for k in porter_list)))
    outdoor.append(int(any(k in j for j in text for k in outdoor_list)))

In [7]:
# Merge floor size lists: keep the floor_size value where there is one,
# otherwise fall back to the floor_size2 entry at the same position
floor_size3 = [
    floor_size2[pos] if np.isnan(floor_size[pos]) else floor_size[pos]
    for pos in range(len(floor_size))
]

NameError: name 'floor_size' is not defined

In [11]:
# Proving the merge has worked: count the non-missing entries in each list
fs = sum(1 for pos in range(len(floor_size)) if not np.isnan(floor_size[pos]))
fs2 = sum(1 for pos in range(len(floor_size)) if not np.isnan(floor_size2[pos]))
fs3 = sum(1 for pos in range(len(floor_size)) if not np.isnan(floor_size3[pos]))

print(f'fs has {fs} entries, fs2 has {fs2} entries, combined they have {fs3} entries')

fs has 282 entries, fs2 has 125 entries, combined they have 365 entries


In [12]:
# Creating property type variable
# The first feature is used when it is one of the recognised property types;
# anything else (including a listing with no features at all) becomes ''.
known_prop_types = ('Apartment','Flat','Studio','Maisonette', 'House','Terraced','Mews')

prop_type = [
    features[0] if features and features[0] in known_prop_types else ''
    for features in all_features
]

In [13]:
# Creating bedrooms & bathrooms variable

def _room_count(label):
    """Return the whole number that follows a leading '×' in `label`, else NaN."""
    match = re.match(r'×\s*(\d+)', label)
    return int(match.group(1)) if match else np.nan


def _beds_and_baths(features):
    """Return (bedrooms, bathrooms) for one listing's feature list.

    Counts are read from '×<n>' features: positions 0 and 1 when the list
    starts with them, otherwise positions 1 and 2. A listing whose first
    feature is 'Studio' gets 0 bedrooms. Missing positions are treated as
    absent, so short feature lists yield NaN instead of raising.
    """
    first, second, third = (list(features[:3]) + ['', '', ''])[:3]

    if first == 'Studio':
        return 0, (_room_count(second) if second.startswith('×') else np.nan)
    if first.startswith('×') and second.startswith('×'):
        return _room_count(first), _room_count(second)
    if second.startswith('×') and third.startswith('×'):
        return _room_count(second), _room_count(third)
    return np.nan, np.nan


bedrooms =[]
bathrooms =[]

for features in all_features:
    beds, baths = _beds_and_baths(features)
    bedrooms.append(beds)
    bathrooms.append(baths)

bedrooms[:20] 
#bathrooms[:20]

[1, 1, 1, 2, 1, nan, 2, 2, 3, 2, 2, 1, 2, 1, 1, 3, 2, 1, 1, 2]

In [36]:
# Extracting keywords: porter/concierge/caretaker, balcony/garden/terrace 

porter = []
outdoor = []
porter_list = ['Porter','PORTER','porter', 'Concierge','CONCIERGE','concierge','Caretaker','caretaker','CARETAKER']
outdoor_list = ['Balcony','BALCONY','balcony', 'Terrace','TERRACE','terrace', 'Patio', 'PATIO', 'patio', 'Garden', 'GARDEN', 'garden']

n_listings = len(all_links)
for i in range(n_listings):
    
    # Progress tracker, measured against the real number of links
    if i % 52 == 0:
        percent = round(i*100/n_listings)
        print(f'Code is {percent}% complete')
    
    r= requests.get(all_links[i])

    soup = BeautifulSoup(r.text, 'lxml')
    
    # Key-feature bullet points, with the <li> wrapper stripped
    lst = (soup.find_all("li", class_="lIhZ24u1NHMa5Y6gDH90A"))
    text =[str(x).replace('<li class="lIhZ24u1NHMa5Y6gDH90A">', '').replace('</li>', '') 
     for x in lst]

    # 1 if any bullet mentions one of the keywords, else 0
    porter.append(int(any(k in j for j in text for k in porter_list)))
    outdoor.append(int(any(k in j for j in text for k in outdoor_list)))

Code is 0% complete
Code is 5% complete
Code is 10% complete
Code is 15% complete
Code is 20% complete
Code is 25% complete
Code is 30% complete
Code is 35% complete
Code is 40% complete
Code is 45% complete
Code is 50% complete
Code is 54% complete
Code is 59% complete
Code is 64% complete
Code is 69% complete
Code is 74% complete
Code is 79% complete
Code is 84% complete
Code is 89% complete
Code is 94% complete


In [37]:
# Creating contract type variable
# The last feature of a listing is read as its tenure. 'Share of Freehold' is
# grouped under 'Freehold'; a plain 'Freehold' label is mapped the same way so
# it no longer falls through to ''. Unrecognised labels and listings with no
# features give ''.
tenure_labels = {
    'Leasehold': 'Leasehold',
    'Share of Freehold': 'Freehold',
    'Freehold': 'Freehold',
}

contract = [
    tenure_labels.get(features[-1], '') if features else ''
    for features in all_features
]

In [38]:
# Adding new outcode variable: the outcode with a trailing letter, if any, removed

outcode2 = [
    code[:-1] if code[-1].isalpha() else code
    for code in all_outcodes
]

In [39]:
# North vs South variable: 0 for the outcodes listed in `south`, 1 for all others
south = ['SE1','SE5','SE11','SE16','SE15','SE17','SW8','SW11','SW9','SW4','SW2']
north = [0 if code in south else 1 for code in outcode2]

In [40]:
# convert to dataframe: one column per collected list, in display order
data = dict([
    ("Links", all_links),
    ("Date", all_dates),
    ("Price", all_prices),
    ("NearStat", all_statname),
    ("StatDist(miles)", all_statdist),
    ("Prop_Type", prop_type),
    ("Bedrooms", bedrooms),
    ("Bathrooms", bathrooms),
    ("Floor_Size", floor_size3),
    ("Outdoor", outdoor),
    ("Portered", porter),
    ("Contract_Type", contract),
    ("Outcode", all_outcodes),
    ("Outcode2", outcode2),
    ("Postcode", all_postcodes),
    ("Centrality", all_centralities),
    ("North", north),
])
df = pd.DataFrame(data)

# Save to csvs

#df.to_csv('/Users/Sim/Documents/Other/Programming/Personal Projects/Build.csv')

In [41]:
# Preview the first ten rows of the assembled listings table
df.head(10)

Unnamed: 0,Links,Date,Price,NearStat,StatDist(miles),Prop_Type,Bedrooms,Bathrooms,Floor_Size,Outdoor,Portered,Contract_Type,Outcode,Outcode2,Postcode,Centrality,North
0,https://www.rightmove.co.uk/properties/1263461...,20220824,630000,Kings Cross Thameslink Station,0.1,Apartment,1.0,1.0,608.0,1,0,Leasehold,N1,N1,N19DW,2.703983,1
1,https://www.rightmove.co.uk/properties/1265351...,20220830,525000,Tufnell Park Station,0.3,Flat,1.0,1.0,,1,0,Leasehold,N19,N19,N194PL,5.625042,1
2,https://www.rightmove.co.uk/properties/1231726...,20220509,765000,South Hampstead Station,0.3,Apartment,1.0,1.0,,1,0,Freehold,NW6,NW6,NW63PH,5.679951,1
3,https://www.rightmove.co.uk/properties/1233508...,20220513,500000,Elephant & Castle (Bakerloo) Station,0.1,Flat,2.0,1.0,,0,1,Leasehold,SE1,SE1,SE16BB,2.372957,0
4,https://www.rightmove.co.uk/properties/1244871...,20220629,800000,Gloucester Road Station,0.1,Flat,1.0,1.0,803.0,1,0,Freehold,SW7,SW7,SW74RW,4.200852,1
5,https://www.rightmove.co.uk/properties/1265331...,20220830,799000,Battersea Power Underground Station,0.3,Flat,,,,0,1,Leasehold,SW11,SW11,SW118EF,3.064252,0
6,https://www.rightmove.co.uk/properties/8539062...,20220531,500000,Elephant & Castle (Bakerloo) Station,0.1,Flat,2.0,1.0,,0,1,Leasehold,SE1,SE1,SE16BB,2.372957,0
7,https://www.rightmove.co.uk/properties/1265320...,20220830,739950,Swiss Cottage Station,0.2,Flat,2.0,1.0,,1,0,Freehold,NW3,NW3,NW33JH,4.985283,1
8,https://www.rightmove.co.uk/properties/1265319...,20220830,700000,Canonbury Station,0.2,Maisonette,3.0,2.0,,1,0,Leasehold,N1,N1,N12LL,4.923459,1
9,https://www.rightmove.co.uk/properties/1225507...,20220419,585000,Brondesbury Station,0.1,Apartment,2.0,1.0,,0,0,Leasehold,NW6,NW6,NW67XL,6.731176,1


### Pre-processing

In [42]:
# Ten most frequent values of the nearest-station column
df['NearStat'].value_counts().head(10)

Pimlico Station                                                45
Westbourne Park Station                                        42
Ladbroke Grove Station                                         30
Edgware Road (Circle, District, Hammersmith & City) Station    30
Essex Road Station                                             29
Old Street Station                                             26
Caledonian Road & Barnsbury Station                            26
Vauxhall Station                                               23
Queen's Park Station                                           22
Kilburn High Road Station                                      21
Name: NearStat, dtype: int64

In [43]:
# Number of listings per simplified outcode
df['Outcode2'].value_counts()

N1      93
NW6     87
W2      82
SW1     79
W9      67
SE1     64
NW1     58
N7      55
EC1     50
NW3     39
WC1     33
NW8     31
W1      30
SW11    25
SE11    25
W10     24
W11     24
SW3     22
NW5     19
SW8     16
SW5     13
EC2     13
SW7     13
W8       9
N19      8
WC2      7
N5       7
EC4      6
SW10     5
N6       3
SE17     2
NW10     1
E1       1
EC3      1
NW2      1
E2       1
Name: Outcode2, dtype: int64

In [44]:
# Getting dummies vars
# Columns are selected by name rather than by position, so the encoding still
# targets the right columns if the frame gains a leading column (for example
# an 'Unnamed: 0' index column after a CSV round-trip).
# NOTE(review): Prop_Type and Contract_Type can both contain '', which yields
# two dummy columns with the same empty name; no prefix is added here so the
# existing column names stay unchanged.
dummy_source_columns = ["Prop_Type", "NearStat", "Contract_Type", "Outcode2"]

df2 = pd.concat(
    [df] + [pd.get_dummies(df[col]) for col in dummy_source_columns],
    axis=1,
)

df2.head(3)

Unnamed: 0,Links,Date,Price,NearStat,StatDist(miles),Prop_Type,Bedrooms,Bathrooms,Floor_Size,Outdoor,...,Warwick Avenue Station,Waterloo Station,West Hampstead Station,West Hampstead Thameslink Station,Westbourne Park Station,0,1,Unnamed: 19,Freehold,Leasehold
0,https://www.rightmove.co.uk/properties/1263461...,20220824,630000,Kings Cross Thameslink Station,0.1,Apartment,1.0,1.0,608.0,1,...,0,0,0,0,0,0,1,0,0,1
1,https://www.rightmove.co.uk/properties/1265351...,20220830,525000,Tufnell Park Station,0.3,Flat,1.0,1.0,,1,...,0,0,0,0,0,0,1,0,0,1
2,https://www.rightmove.co.uk/properties/1231726...,20220509,765000,South Hampstead Station,0.3,Apartment,1.0,1.0,,1,...,0,0,0,0,0,0,1,0,1,0
