# Scraping Real Estate Listings for Dataset Building

### Importing the needed libraries

In [1]:
import itertools
import os
import urllib.request
from random import randint
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
#For pulling data out of HTML and XML files
from bs4 import BeautifulSoup
from requests import get
sns.set()

### Setting up the stage

We need the following request headers so that our queries look like they come from an actual web browser.

In [2]:
#Passed along with each request so the queries mimic those of an actual browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}

After exploring the website, I found that the listings from Chicago, Illinois, and from Portland, Oregon, best fit what I am looking for (pictures with the facade of the building in the foreground, types of buildings, etc.), so I will start with these two cities.

In [None]:
#Quick check before scraping: request each city's listing page and print the
#response object to see how the site answers
page = 1  #NOTE(review): not read anywhere in this cell
bigCities = ["https://www.zillow.com/homes/Portland-OR_rb/", "https://www.zillow.com/homes/Chicago-IL_rb/"]
print(bigCities)
for city in bigCities:
    #Without a timeout a stalled connection would keep the cell waiting forever
    response = get(city, headers=headers, timeout=30)
    print(response)

### Getting the data we want

In [87]:
#Empty lists to store the relevant info.
#image_links, listing_titles, listing_types and locations are filled in lockstep
#(one entry per downloaded picture) so they can later be used as table columns.
image_links = []
years_built = []  #NOTE(review): never filled in this cell
locations = []
listing_titles = []
listing_types = []
n_pages = 0

#urlretrieve does not create missing folders, so make sure the target exists
os.makedirs("./Buildings", exist_ok=True)

for page in range(1,21):
    #Result page number `page` for each of the two cities
    bigCities = ["https://www.zillow.com/homes/Portland-OR_rb/"+str(page)+"_p", "https://www.zillow.com/homes/Chicago-IL_rb/"+str(page)+"_p"]
    n_pages += 1
    print(page)
    for city in bigCities:
        print(city)
        #A timeout keeps one stalled connection from freezing the whole run
        response = get(city, headers=headers, timeout=30)
        if response.status_code != 200:
            #An error/blocked page holds no listings; report it instead of parsing it silently
            print('Skipping {} (HTTP status {})'.format(city, response.status_code))
            continue
        html_soup = BeautifulSoup(response.text, 'html.parser')
        #NOTE(review): a multi-class string passed to class_ only matches tags whose
        #class attribute is exactly this string - confirm against the current page markup
        listings = html_soup.find_all('article', class_="list-card list-card-short list-card_not-saved")

        #Now we loop through each listing to get the data
        for listing in listings:
            #Look up every piece first; find() returns None instead of raising when a tag is absent
            card_top = listing.find('div', class_="list-card-top")
            listing_type = listing.find('div', class_="list-card-type")
            address = listing.find('h3', class_="list-card-addr")
            if card_top is None or listing_type is None or address is None:
                #Incomplete card: skip it rather than crash the whole run
                continue

            #The card top normally holds a single image (due to the structure of the tags tree)
            for image in card_top.find_all('img'):
                src = image.get('src', '')  #an <img> without src yields '' and is skipped below
                if "jpg" not in src:
                    continue

                filename = src.split('/')[-1]
                #Only add the extension when it is missing, to avoid "name.jpg.jpg"
                if not filename.lower().endswith(".jpg"):
                    filename = filename + ".jpg"
                path = "./Buildings/"+filename
                urllib.request.urlretrieve(src, path)

                #Record the listing only once the picture is on disk, all four lists
                #together, so they keep the same length even if the run is interrupted
                image_links.append(src)
                listing_titles.append(image.get('alt', ''))
                listing_types.append(listing_type.text)
                locations.append(address.text)

    #Sleeping to mimic a human search and keep the scraping within polite and acceptable limits
    sleep(randint(1,2))


print('You scraped {} pages.'.format(n_pages))


1
https://www.zillow.com/homes/Portland-OR_rb/1_p
https://www.zillow.com/homes/Chicago-IL_rb/1_p
2
https://www.zillow.com/homes/Portland-OR_rb/2_p
https://www.zillow.com/homes/Chicago-IL_rb/2_p
3
https://www.zillow.com/homes/Portland-OR_rb/3_p
https://www.zillow.com/homes/Chicago-IL_rb/3_p
4
https://www.zillow.com/homes/Portland-OR_rb/4_p
https://www.zillow.com/homes/Chicago-IL_rb/4_p
5
https://www.zillow.com/homes/Portland-OR_rb/5_p
https://www.zillow.com/homes/Chicago-IL_rb/5_p
6
https://www.zillow.com/homes/Portland-OR_rb/6_p
https://www.zillow.com/homes/Chicago-IL_rb/6_p
7
https://www.zillow.com/homes/Portland-OR_rb/7_p
https://www.zillow.com/homes/Chicago-IL_rb/7_p
8
https://www.zillow.com/homes/Portland-OR_rb/8_p
https://www.zillow.com/homes/Chicago-IL_rb/8_p
9
https://www.zillow.com/homes/Portland-OR_rb/9_p
https://www.zillow.com/homes/Chicago-IL_rb/9_p
10
https://www.zillow.com/homes/Portland-OR_rb/10_p
https://www.zillow.com/homes/Chicago-IL_rb/10_p
11
https://www.zillow.com/h

In [90]:
#Column order of the exported table
cols = ['Title', 'Location', 'Type', 'Image']

#Pair each column name with the list that was filled during scraping
scraped_columns = dict(zip(cols, [listing_titles, locations, listing_types, image_links]))
mydata = pd.DataFrame(scraped_columns)[cols]

#NOTE(review): writing the legacy .xls format depends on the installed pandas
#version/engine - confirm it is still supported in the target environment
mydata.to_excel('Chicago_Portland_raw.xls')

### Helpful links

https://towardsdatascience.com/looking-for-a-house-build-a-web-scraper-to-help-you-5ab25badc83e
https://www.crummy.com/software/BeautifulSoup/bs4/doc/
