In [3]:
#Import module
import pandas as pd
import numpy as np
import seaborn as sns
from lxml import html
import requests 
import unicodecsv as csv
import argparse
import json

In [39]:
# Create clean test 
def clean(text):
    if text:
        return ''.join(''.join(text).split())
# Create header 
def get_headers():
    headers = {'accept': 'text/html,application/xhtml+xml, application/xml;q=0.9,image/webp,*/*;q=0.8',
              'accept-encoding': 'gzip, deflate, sdch, br',
              'accept-language': 'en-GB,en;q=0.8,en-US;q-0.6,ml;q=0.4',
              'cache-conrol': 'max-age=0',
              'upgrade-insecure-requests': '1',
              'user-agent': 'Mozilla/5.0 (X11: Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
    return headers 
# Create Zillow URL based on filter 
def create_url(zipcode, filter):
    if filter == "newest":
        url = 'https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort'.format(zipcode)
    elif filter == "cheapest":
        url = 'https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/'.format(zipcode)
    else:
        url = 'https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy'.format(zipcode)
    print(url)
    return url
# Save webpage to an html
def save_to_file(response, path):
     with open("response.html",'w') as fp:
        fp.write(path+"response.text")
# Write file to csv
def write_data_to_csv(data, zipcode,path):
    with open(path+"properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title','address','city','state','postal_code','price','facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile,fieldnames = fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)
#Get response from zillow.com
def get_response(url):
    for i in range(5):
        response = requests.get(url, headers = get_headers())
        print("status code received:", response.status_code)
        if response.status_code !=200:
            #saving response to file for debugging purpose
            save_to_file(response,path)
            continue
        else:
            save_to_file(response,path)
            return response
#Get data from json (raw json data)    
def get_data_from_json(raw_json_data):
    cleaned_data = clean(raw_json_data).replace('<!--',"").replace("-->","")
    properties_list = []
    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults').get('listResults',[])
        
        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo')
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba, {area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')
            
            data = {'address' : address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider':broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)
        return properties_list
    except ValueError:
        print("Invalid Json")
        return None
#Parse zipcode
def parse(zipcode, filter= None):
    url = create_url(zipcode, filter)
    response = get_response(url)
    
    if not response:
        print("Failed to fetch the page, please check `response.html' to see the response received from zillow.com")
        return None
    
    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")
    
    if not search_results:
        print("Parsing from json data")
        #identified as type 2 page
        raw_json_data= parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)
    
    print("parsing from html page")
    
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()") 
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title= properties.xpath(".//h4/text()")
        
        address = clean(raw_address)
        city = clean (raw_city)
        state  = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info)
        broker_name = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://wwww.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('//span[@class="zsg-icon-for-sale"]')
        
        properties = {'address': address,
                     'city': city,
                     'state': state,
                     'postal_code': postal_code,
                     'price': price,
                     'facts and features': info,
                     'real estate provider': broker_name,
                     'url': property_url,
                     'title': title}
        if is_forsale:
            properties_list.append(properties)
    return properties_list
    
        

        

In [11]:
# Test definition
print(clean("To day is a wrong ! day"))
print(create_url(9016,"newest"))

T o d a y i s a w r o n g ! d a y
https://www.zillow.com/homes/for_sale/9016/0_singlestory/days_sort
https://www.zillow.com/homes/for_sale/9016/0_singlestory/days_sort


In [18]:
# This code is to use for running the program by itself. 
if __name__ == "__main__":
    #Reading arguments
    argparser = argparse.ArgumentParser(formatter_class = argparse.RawTextHelpFormatter)
    argparser.add_argument('zipcode', help='')
    sortorder_help =  """
    available sort orders are: newest: Latest property details, cheapest: Properties with cheapest prices"""
    
    argparser.add_argument('sort', nargs = '?', help = sortorder_help, default="Homes for You")
    args = argparser.parse_args()
    zipcode = args.zipcode 
    sort = args.sort
    print("Fetching data for %s" % (zipcode))
    scraped_data = parse(zipcode,sort)
    if scraped_data:
        print ("Writing data to output file")
        write_data_to_csv(scraped_data)

usage: ipykernel_launcher.py [-h] zipcode [sort]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [44]:
#Test for one single zipcode
#Define variable 
zipcode = '02126'
sort = "newest"
path = "C:\\Users\\nguqu781\\Dropbox\\Quyen Nguyen's Document\\6. Program\\HousePrice\\"
#Scrap data 
scraped_data = parse(zipcode,sort)
write_data_to_csv(scraped_data,zipcode,path)
#Read data 
final_data = pd.read_csv(path+'properties-'+zipcode+'.csv')
print(final_data.head(5))

https://www.zillow.com/homes/for_sale/02126/0_singlestory/days_sort
status code received: 200
Parsing from json data
              title                             address    city state  \
0  Apartmentforsale      1222BlueHillAve,Boston,MA02126  Boston    MA   
1      Houseforsale          10RockwaySt,Boston,MA02126  Boston    MA   
2      Houseforsale         15WestmoreRd,Boston,MA02126  Boston    MA   
3   Pre-foreclosure        36GrovelandSt,Boston,MA02126  Boston    MA   
4      Houseforsale  106WellingtonHillSt,Boston,MA02126  Boston    MA   

   postal_code     price         facts and features  \
0         2126  $989,000  13 bds, 5.0 ba, 4272 sqft   
1         2126  $399,900    2 bds, 2.0 ba, 851 sqft   
2         2126  $439,000   3 bds, 1.0 ba, 1916 sqft   
3         2126       NaN   4 bds, 2.0 ba, 2691 sqft   
4         2126  $589,000   4 bds, 4.0 ba, 2208 sqft   

                        real estate provider  \
0                                        NaN   
1                

In [50]:
#Load zip code US list
zipcode_file = pd.read_csv(path+'USzipcode.csv')
print(zipcode_file.head(5))
print(zipcode_file.shape)
zipcode_list =np.array(zipcode_file["ZIP code"])
print(len(zipcode_list))

   ZIP code      City   State Abbreviation
0     99546      Adak  Alaska           AK
1     99571      Adak  Alaska           AK
2     99615    Akhiok  Alaska           AK
3     99551  Akiachak  Alaska           AK
4     99552     Akiak  Alaska           AK
(74022, 4)
74022


In [None]:
#Scrap for the entire list
for zipcode in zipcode_list:
    false_list = []
    try:
        print("Start scrap for " + str(zipcode))
        scraped_data = parse(str(zipcode),sort)
        print ("Finish parsing")
        write_data_to_csv(scraped_data,str(zipcode),path)
        print ("Finish writing to csv")
    except:
        print("Cannot collect data for..." + str(zipcode))
        false_list.append(zipcode)

false_file = pd.DataFrame(false_list)
false_file.to_csv(path+"FalseZipcode.csv")
        


Start scrap for 99546
https://www.zillow.com/homes/for_sale/99546/0_singlestory/days_sort
status code received: 200
Parsing from json data
Cannot collect data for...99546
Start scrap for 99571
https://www.zillow.com/homes/for_sale/99571/0_singlestory/days_sort
status code received: 200
Parsing from json data
Finish parsing
Finish writing to csv
Start scrap for 99615
https://www.zillow.com/homes/for_sale/99615/0_singlestory/days_sort
status code received: 200
Parsing from json data
Finish parsing
Finish writing to csv
Start scrap for 99551
https://www.zillow.com/homes/for_sale/99551/0_singlestory/days_sort
status code received: 200
parsing from html page
Cannot collect data for...99551
Start scrap for 99552
https://www.zillow.com/homes/for_sale/99552/0_singlestory/days_sort
status code received: 200
Parsing from json data
Finish parsing
Finish writing to csv
Start scrap for 99553
https://www.zillow.com/homes/for_sale/99553/0_singlestory/days_sort
status code received: 200
Parsing from j