In [24]:
import requests
from bs4 import BeautifulSoup

# Sanity check: fetch the first results page and confirm that the price of the
# first listing can be extracted before scraping every page.

# Fetch the HTML. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces HTTP errors (4xx/5xx) here instead
# of letting an error page be parsed as if it contained listings.
r = requests.get(
    "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/",
    headers={'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'},
    timeout=30,
)
r.raise_for_status()
c = r.content

# Parse the response body as HTML using BeautifulSoup.
soup = BeautifulSoup(c, "html.parser")

# From inspecting the HTML, each listing is a div element with class propertyRow.
# The result is named `listings` (not `all`) so the built-in all() stays usable.
listings = soup.find_all("div", {"class": "propertyRow"})
assert listings, "no div.propertyRow elements found - has the page layout changed?"

# Price of the first listing, taken from its h4 tag with class propPrice.
# .text converts the Tag to a string; newlines and spaces are then stripped.
# This stays the last expression so the notebook displays it as the cell output.
listings[0].find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", "")

'$725,000'

In [25]:
# Number of result pages, read from the last pagination link (an <a> tag with
# class Page) on the page parsed in the previous cell. It drives the URL offsets
# below because the listings are spread across multiple pages. If the page has
# no pagination links, fall back to a single page instead of raising IndexError.
page_links = soup.find_all("a", {"class": "Page"})
page_no = page_links[-1].text if page_links else "1"

# Request headers sent with every page fetch.
headers = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}


def _clean_text(tag):
    """Return the tag's text with newlines and spaces removed, or None if tag is None."""
    if tag is None:
        return None
    return tag.text.replace("\n", "").replace(" ", "")


def _bold_text(listing, span_class):
    """Return the text of the <b> tag inside the span with class `span_class`.

    Returns None when either the span or its <b> child is missing (for example a
    listing without bedroom info). Checking for None explicitly replaces the
    former bare `except:`, which also hid unrelated errors and Ctrl-C.
    """
    span = listing.find("span", {"class": span_class})
    bold = span.find("b") if span is not None else None
    return bold.text if bold is not None else None


l = []  # list of dictionaries; each dictionary holds the details of one listing

# Only the trailing s= part of the URL changes (0, 10, 20, ...) with the page number.
base_url = "http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
for page in range(0, int(page_no) * 10, 10):
    url = base_url + str(page) + ".html"
    print(url)
    # The timeout avoids hanging on a stalled connection; raise_for_status()
    # stops the scrape on an HTTP error instead of parsing an error page.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    # Parsed into its own name so `soup` (the first page) is not overwritten and
    # re-running this cell computes the same page count.
    page_soup = BeautifulSoup(r.content, "html.parser")
    # Named `listings` rather than `all` so the built-in all() is not shadowed.
    listings = page_soup.find_all("div", {"class": "propertyRow"})

    for item in listings:
        d = {}  # details of the current listing

        # The address is stored in TWO span tags with class propAddressCollapse:
        # the first has the street and the second has the locality/zip.
        # A missing span yields None rather than aborting the whole scrape.
        address_spans = item.find_all("span", {"class": "propAddressCollapse"})
        d["Address"] = address_spans[0].text if len(address_spans) > 0 else None
        d["Locality"] = address_spans[1].text if len(address_spans) > 1 else None

        d["Price"] = _clean_text(item.find("h4", {"class": "propPrice"}))

        # Optional details: any of these tags can be absent for a listing.
        d["Beds"] = _bold_text(item, "infoBed")
        d["Area"] = _bold_text(item, "infoSqFt")
        d["Full Baths"] = _bold_text(item, "infoValueFullBath")
        d["Half Baths"] = _bold_text(item, "infoValueHalfBath")

        # Extract Lot Size from the features. Features live in div tags with class
        # columnGroup; inside, span.featureGroup holds the feature type (Lot Size,
        # Appliances etc.) and span.featureName holds the value. The key defaults
        # to None so every record has the same keys, like the fields above.
        d["Lot Size"] = None
        for column_group in item.find_all("div", {"class": "columnGroup"}):
            feature_groups = column_group.find_all("span", {"class": "featureGroup"})
            feature_names = column_group.find_all("span", {"class": "featureName"})
            for feature_group, feature_name in zip(feature_groups, feature_names):
                if "Lot Size" in feature_group.text:
                    d["Lot Size"] = feature_name.text

        l.append(d)

http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s=0.html
http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s=10.html
http://www.pyclass.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s=20.html


In [26]:
# Turn the scraped listing dictionaries into a DataFrame and write it out as CSV.
import pandas as pd

csv_path = "Page1_Data.csv"  # output file, written relative to the working directory

# One row per dictionary in `l`; the dictionary keys become the columns.
df = pd.DataFrame(data=l)
df.to_csv(csv_path)