A web crawler is set up to pull sale data off of p24. 
# SET THE CSV FILE NAME IN LAST CELL

In [11]:
import requests
from bs4 import BeautifulSoup
import csv
from tqdm import tqdm 

# Define the number of pages
num_iterations = 160

pricelist = []
locationlist = []


# Create a tqdm progress bar
for x in tqdm(range(num_iterations), desc="Scraping Progress"):
    p24 = f"https://www.property24.com/for-sale/northern-cape/8/p{x}"
    req = requests.get(p24)
    soup = BeautifulSoup(req.text, "html.parser")

    for location in soup.find_all("span", attrs=["p24_location"]):
        locationlist.append(location.text)

    for price in soup.find_all("span", attrs=["p24_price"]):
        pricelist.append(price.text)   

pricelist = [price.strip().replace("\n", "").replace("\r", "").replace(" ", "").replace("R", "").replace("POA", "0") for price in pricelist]    

csvdata = [[addr, price] for addr, price in set(zip(pricelist, locationlist))]

print(f"CSVDATA: {len(csvdata), csvdata}")

Scraping Progress:   0%|          | 0/160 [00:00<?, ?it/s]

Scraping Progress: 100%|██████████| 160/160 [00:59<00:00,  2.68it/s]

CSVDATA: (3076, [['1700000', 'Homevale'], ['3750000', 'El Toro Park'], ['950000', 'Modderrivier'], ['403000', 'Oosterville'], ['1450000', "McDougall's Bay"], ['1275000', 'Port Nolloth'], ['1600000', 'Victoria West Rural'], ['1300000', 'Die Rand'], ['1450000', 'Brandvlei'], ['2354000', 'Hartswater'], ['319500', 'Monument Heights'], ['120000000', 'Roodepan'], ['890000', 'Calvinia'], ['4500000', 'Beaconsfield'], ['525000', 'Roodepan'], ['1070000', 'Victoria West'], ['1950000', 'Belgravia'], ['20000000', 'Orania'], ['0', 'Middelpos'], ['731400', 'Hadison Park'], ['1200000', 'Rhodesdene'], ['380000', 'Kleinsee'], ['14000000', 'Jan Kempdorp'], ['3895000', 'Moghul Park'], ['24150000', 'Port Nolloth'], ['3200000', 'Victoria West'], ['2150000', 'Upington Rural'], ['1185000', 'Koingnaas'], ['6440050', 'De Beers'], ['1800000', 'Augrabies'], ['1199000', 'Port Nolloth'], ['1643000', 'Homestead'], ['321000', 'Kuruman'], ['5500000', 'Oosterville'], ['960000', 'Kimberley Central'], ['1490000', 'Kimber




Using the propattr class the median and sd are calculated for the data. 

The data should be sorted by area name to avoid over use of the api. 

The data is sorted by name and pulled into its own list to find the co-ordinates using Bings Map Api.

In [12]:
import geocoder
from tqdm import tqdm

commonlocations = []


for data in csvdata:
    if data[1] not in commonlocations:
        commonlocations.append(data[1])

print(f"SIZE: {len(commonlocations)} {commonlocations}")

common_coords = []

failed = []

for addr in tqdm(commonlocations, desc="Finding Coordinates"):
    try:
        g = geocoder.bing(f'{addr + "northern cape" + "South Africa"}', key="YOURKEY")
        
        # Check if the geocoding request was successful
        if g.ok:
            results = g.json
            common_coords.append([results["lat"], results["lng"], addr])
        else:
            # Handle the case where the geocoding request failed
            print(f"Geocoding request failed for address: {addr}")
            failed.append(addr)
    except Exception as e:
        # Handle any exceptions that may occur during the geocoding request
        print(f"An error occurred while geocoding address: {addr}")
        print(f"Error details: {str(e)}")

SIZE: 200 ['Homevale', 'El Toro Park', 'Modderrivier', 'Oosterville', "McDougall's Bay", 'Port Nolloth', 'Victoria West Rural', 'Die Rand', 'Brandvlei', 'Hartswater', 'Monument Heights', 'Roodepan', 'Calvinia', 'Beaconsfield', 'Victoria West', 'Belgravia', 'Orania', 'Middelpos', 'Hadison Park', 'Rhodesdene', 'Kleinsee', 'Jan Kempdorp', 'Moghul Park', 'Upington Rural', 'Koingnaas', 'De Beers', 'Augrabies', 'Homestead', 'Kuruman', 'Kimberley Central', 'Retswelele', 'Warrenton', 'Kathu', 'Lemoendraai', 'Keidebees', 'Vosburg', 'Upington Central', 'Minerva Gardens', 'Hillcrest', 'Diamant Park', 'Fraserburg', 'Ritchie', 'Kimberley North', 'Floors', 'Vanderkloof', 'Lime Acres', 'Hondeklip Bay', 'Keimoes', 'Springbok', 'Olifantshoek', 'Nollothville', 'Kakamas Rural', 'Kakamas', 'Kimberley Rural', 'Albertynshof', 'Colville', 'West End', 'Carters Glen', 'Blydeville', 'Hopetown', 'Morning Glory', 'Laboria', 'Nieuwoudtville', 'Hopetown Rural', 'Bellvue', 'Cassandra', 'Memorial Road Area', 'Riviera

Finding Coordinates: 100%|██████████| 200/200 [01:51<00:00,  1.79it/s]


Enter data into csv for locations, price and coords.

In [13]:
import os
import csv

# Specify the target directory path
target_directory = r'C:\GitHub\Land-value-GIS\backups'

# Create the target directory if it doesn't exist
if not os.path.exists(target_directory):
    os.makedirs(target_directory)

# Define the file paths within the "backups" folder
csvdata_file_path = os.path.join(target_directory, "prop_northern_cape_locations_price.csv")
common_coords_file_path = os.path.join(target_directory, "northern_cape_area_coords.csv")

# Write to "prop_gauteng_locations_price.csv"
with open(csvdata_file_path, "w") as r:
    writer = csv.writer(r)
    writer.writerows(csvdata)

# Write to "gauteng_area_coords.csv"
with open(common_coords_file_path, "w") as w:
    writer = csv.writer(w)
    writer.writerows(common_coords)


Move the files into the backups dir