In [None]:
import requests
from lxml import html

import unicodecsv as csv
import pandas as pd
import numpy as np

import time
import random
import re
import json

from IPython.display import display, Markdown, HTML
import IPython.display as dsp

In [193]:
# Scraper configuration shared by the cells below.
base_url = 'https://www.zillow.com{}'  # '{}' receives a site path that starts with '/'

target = 'pittsburgh'

# Built the same way parse() builds its first URL: slash before the target, slash after it.
target_url = base_url.format('/' + target) + '/'
current_url = None
last_fetch = ''
last_jl = []

# ANSI escape sequences used by countdown() when it prints to the console.
LINE_UP = '\033[1A'     # CSI 1 A: move the cursor up one line
LINE_CLEAR = '\x1b[2K'  # CSI 2 K: erase the whole line (the final byte is an upper-case K)

# Listing fields exported to CSV, in the preferred column order.
# 'lat', 'long' and 'daysOnZillow' are not raw keys; flatten_json() derives them.
fields_of_interest = [
    'addressStreet',
    'unformattedPrice',
    'beds',
    'baths',
    'area',
    'daysOnZillow',
    'zestimate',
    'lotAreaString',
    'address',
    'addressCity',
    'addressState',
    'addressZipcode',  # the listing key is 'addressZipcode'; 'addressZip' never matched anything
    'detailUrl',
    'lat',
    'long',
    'zpid',
    'flexFieldType',
    'contentType',
    'statusType',
    'statusText',
    'rawHomeStatusCd',
    'imgSrc',
]

In [5]:
# XPath queries run against a search-results page. Every query returns a flat list.
xpath_string = {
    # Address text of each property card (list of str).
    'address': '//*[contains(@data-test, "property-card-")]/address/text()',
    # Link of each property card (list of str).
    'links': '//a[contains(@data-test, "property-card-")]/@href',
    # Path of the next results page (list holding a single str).
    'nxt': '//*[contains(@rel, "next")]/@href',
    # Interleaved [beds, baths, sf, beds, baths, sf, ...]; must be reshaped to rows of three.
    'bbs': '//ul/li/b/text()',
    # Price text of each property card (list of str).
    'price': '//span[contains(@data-test, "property-card-price")]/text()',
}

In [6]:
def write(name, data):
    """Write the text ``data`` to the file ``name``, replacing any existing content.

    The file is always encoded as UTF-8 so that text containing characters outside
    the platform's default code page (scraped HTML, for example) cannot raise
    ``UnicodeEncodeError``.
    """
    with open(name, 'w', encoding='utf-8') as f:
        f.write(data)

In [7]:
def random_ua():
    """Pick one of the known browser User-Agent strings at random, print it and return it."""
    user_agents = [
        # Chrome 130 on Windows. The Safari token is 537.36; it was previously truncated to 537.3.
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
        #'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        #'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        # Edge 130 on Windows.
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
    ]
    selected_ua = random.choice(user_agents)
    print(f'Selected UA: {selected_ua}')
    return selected_ua

In [197]:
def countdown(min, max=None, buffer=None, prefix='Sleeping', postfix='...'):
    '''Sleep for a number of seconds while showing the remaining time.

    With only ``min`` the delay is exactly ``min`` seconds; with ``min`` and ``max`` it is
    drawn from ``random.randint(min, max)``.  When ``buffer`` is None the countdown is
    printed to the console and erased afterwards; otherwise every tick is pushed to
    ``buffer.update`` as Markdown built from ``prefix`` and ``postfix``.'''
    upper = min if max is None else max
    delay = random.randint(min, upper)
    to_console = buffer is None

    for remaining in range(delay, 0, -1):
        if to_console:
            print(f'\rIn {remaining}', end='')
            time.sleep(1)
            # Blank out the counter that was just shown.
            print('\r                      ', end='\r')
        else:
            buffer.update(Markdown(f'{prefix} {remaining:>2}s / {delay:>2}s {postfix}'))
            time.sleep(1)

    if to_console:
        print(LINE_UP, LINE_CLEAR, end='')
    else:
        buffer.update(Markdown('Sleeping complete ...'))

In [9]:
def get_json_string(parser):
    """Extract the JSON object that holds ``searchResults`` from a parsed page.

    The page text is searched for the first occurrence of ``searchResults``; decoding
    starts at the nearest ``{`` before it and stops at the end of that JSON value, so
    whatever text follows the object is ignored.

    Args:
        parser: object exposing ``text_content()`` (an lxml HTML element in this notebook).

    Returns:
        The decoded JSON value (a dict whose first key is ``searchResults``).

    Raises:
        ValueError: if ``searchResults`` or an opening brace before it cannot be found,
            or if the text at that point is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass).
    """
    text = parser.text_content()  # can be large; fetch it only once
    marker = text.find('searchResults')
    if marker < 0:
        raise ValueError("'searchResults' not found in page text")
    start = text.rfind('{', 0, marker)
    if start < 0:
        raise ValueError("no '{' found before 'searchResults' in page text")
    # raw_decode parses one JSON value and tolerates trailing text.
    payload, _end = json.JSONDecoder().raw_decode(text[start:])
    return payload

In [201]:
def keep_fields(d, fields_to_keep, sort=True):
    """Drop every key that is not listed in ``fields_to_keep``.

    Args:
        d: a dict, or a list/tuple of dicts (nested lists/tuples are handled recursively).
        fields_to_keep: names of the keys to retain; its order is the preferred key order.
        sort: for list/tuple input, rebuild every dict with the keys in ``fields_to_keep``
            order and fill keys that are missing from a record with ``''`` so that all
            rows share the same columns.  A bare dict is only filtered, never reordered.

    Returns:
        A new dict for dict input, otherwise a new list.  The input is not modified.

    Raises:
        TypeError: if ``d`` (or a nested element) is not a dict, list or tuple.
    """
    if type(d) is dict:
        return {key: value for key, value in d.items() if key in fields_to_keep}

    if type(d) in (list, tuple):
        kept = [keep_fields(item, fields_to_keep, sort) for item in d]
        if sort:
            # Reinsert the keys in the preferred order; nested lists were already
            # handled by the recursive call above.
            kept = [
                {key: item.get(key, '') for key in fields_to_keep} if type(item) is dict else item
                for item in kept
            ]
        return kept

    raise TypeError(f'Received {type(d)}.  Expected: dict, list, or tuple.')

In [None]:
def flatten_json(json_string, to_keep, flat_only=False, verbose=False):
    """Merge per-page listing records into one list and reduce them to ``to_keep``.

    Args:
        json_string: iterable of pages, each page being a list of property dicts
            (one dict per property).
        to_keep: field names to retain, in the preferred column order.
        flat_only: when True the pages are only concatenated; no field is derived
            and no field is dropped.
        verbose: print the number of properties and any record without usable
            ``latLong`` data.

    Returns:
        ``(keys, properties)``: ``keys`` lists the entries of ``to_keep`` that occur
        in at least one record (in ``to_keep`` order) and ``properties`` is the list
        of property dicts.  The caller's dicts are copied, not modified.

    Unless ``flat_only`` is set, each record gets: ``carouselPhotos`` joined into one
    ``|``-separated string of URLs, ``daysOnZillow`` lifted out of
    ``hdpData.homeInfo`` (``''`` when absent), ``hdpData`` serialised to a JSON
    string, and ``latLong`` replaced by separate ``lat`` / ``long`` keys.
    """
    # One flat list of shallow copies so the raw JSON held by the caller stays intact.
    properties = [dict(prop) for page in json_string for prop in page]
    all_keys = set()

    if flat_only:
        for prop in properties:
            all_keys.update(prop.keys())
        return [key for key in to_keep if key in all_keys], properties

    for prop in properties:
        try:
            prop['carouselPhotos'] = '|'.join([photo['url'] for photo in prop['carouselPhotos']])
        except KeyError:
            pass  # no carousel, or a photo without 'url': leave the field untouched

        hdp_data = prop.get('hdpData')
        home_info = hdp_data.get('homeInfo', {}) if isinstance(hdp_data, dict) else {}
        prop['daysOnZillow'] = home_info.get('daysOnZillow', '')
        if hdp_data is not None:
            prop['hdpData'] = json.dumps(hdp_data)

        # pop() also removes the nested dict, which has no place in a flat CSV row.
        lat_long = prop.pop('latLong', None)
        if isinstance(lat_long, dict) and 'latitude' in lat_long and 'longitude' in lat_long:
            prop['lat'] = lat_long['latitude']
            prop['long'] = lat_long['longitude']
        elif verbose:
            print(f'Failed to find keys in latLong: {lat_long}')

        all_keys.update(prop.keys())

    if verbose:
        print(f'Found {len(properties)} properties.')

    ordered_keys = [key for key in to_keep if key in all_keys]
    # Filtering on ordered_keys keeps every row within the columns that are returned.
    properties = keep_fields(properties, ordered_keys)
    return ordered_keys, properties

In [12]:
def parse_page(parser, url):
    """Run every query in ``xpath_string`` against ``parser`` and collect the results.

    ``url`` is accepted for reference only; it is not used.

    Returns ``(next_link, data)``: ``next_link`` is the single 'nxt' match, or None when
    there is not exactly one; ``data`` maps 'beds', 'baths', 'sf' and every other query
    name to its list of values, with prices converted to int.
    """
    data = {}
    next_link = None

    for field, query in xpath_string.items():
        if field == 'nxt':
            matches = parser.xpath(query)
            next_link = matches[0] if len(matches) == 1 else None
            continue

        if field == 'bbs':
            raw = parser.xpath(query)
            try:
                beds, baths, sf = np.reshape(raw, (len(raw) // 3, 3)).T.tolist()
            except ValueError:
                # Not a multiple of three, e.g.
                # [lot1sf, lot2sf, bed1, baths1, sf1, lot3sf, bed2, baths2, sf2, ...];
                # follow every 'lot' entry with two blanks so it fills a row of its own.
                padded = []
                for item in raw:
                    padded.append(item)
                    if 'lot' in item:
                        padded.extend(['', ''])
                beds, baths, sf = np.reshape(padded, (len(padded) // 3, 3)).T.tolist()

            data['beds'] = beds
            data['baths'] = baths
            data['sf'] = [entry.replace(',', '') for entry in sf]
            continue

        values = parser.xpath(query)
        if field == 'price':
            # Strip '$', ',' and '+' before converting to int.
            values = [int(re.sub(r'(\$|,|\+)', '', text)) for text in values]
        data[field] = values

    print(f'SCRAPED: {data}')
    return next_link, data


In [13]:
def write_csv(data, target, fieldnames=None):
    """Save ``data`` (a list of row dicts) to ``properties-<target>.csv``.

    Args:
        data: list of dicts, one per CSV row.
        target: name inserted into the output file name.
        fieldnames: column names; when None they are taken from the keys of the
            first row.

    With no rows and no ``fieldnames`` there is nothing to derive a header from, so
    the file is left empty instead of failing with ``IndexError``.
    """
    with open(f'properties-{target}.csv', 'wb') as csvfile:
        if fieldnames is None:
            if not data:
                print('No rows and no fieldnames: wrote an empty file.')
                return
            fieldnames = list(data[0].keys())
        print(f'Fieldnames: {fieldnames}')
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

In [14]:
def json_to_list(js):
    """Concatenate the ``['searchResults']['listResults']`` entries of every page in ``js``."""
    return [
        listing
        for page in js
        for listing in page['searchResults']['listResults']
    ]

In [165]:
def pr(msg, buffer=None, end='\n', auto_format=True):
    """Show ``msg`` on the console or in a display buffer.

    Without a buffer the message is printed using ``end``.  With a buffer, a str is
    (optionally) run through the buffer's ``format`` template and shown as Markdown;
    anything else is passed to ``buffer.update`` unchanged.
    """
    if not buffer:
        print(msg, end=end)
        return

    if type(msg) is not str:
        buffer.update(msg)
        return

    if auto_format:
        try:
            msg = buffer.format.format(msg)
        except Exception:
            print('In pr(): buffer formatting failed')
    buffer.update(Markdown(msg))


In [None]:
def parse(target):
    """Scrape all result pages for ``target`` and write HTML- and JSON-based CSV files.

    Every page is fetched with a shared session, scraped with ``parse_page`` (the
    rendered cards only) and mined for its embedded JSON with ``get_json_string``.
    Paging stops on a non-200 response, when a page has no next link, or when the
    next link points at the page that was just fetched.

    Args:
        target: site path to scrape, e.g. ``'pittsburgh'``; a leading '/' is added
            when missing.

    Returns:
        The list of per-page dicts produced by ``parse_page``.

    Files written: 'log', 'json_part', 'json_strings', 'raw_out',
    ``properties-<target>.csv`` and ``properties-<target>_json.csv``.
    """
    if not target.startswith('/'):
        target = '/' + target
    url = base_url.format(target) + '/'
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        'accept-language': "en-US,en;q=0.9",
        'accept-encoding': 'gzip, deflate, br, zstd',
        "priority": "u=0, i",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",  # 'none' is the valid value; it was misspelled 'non'
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "referrerPolicy": 'strict-origin-when-cross-origin',
        'User-Agent': random_ua(),
    }

    json_strings = []
    properties_dict_list = []
    # The context manager closes the session's connections when scraping ends.
    with requests.Session() as s:
        s.headers.update(headers)
        while url is not None:
            # The timeout keeps a stalled connection from hanging the notebook forever.
            response = s.get(url, timeout=30)
            print(f'URL: {response.url}\tStatus code: {response.status_code}')
            write('log', response.text)

            if response.status_code != 200:
                print('Bad response')
                break

            parser = html.fromstring(response.text)
            # Only the cards present in the initial HTML are found here; the embedded
            # JSON below carries the full result list for the page.
            next_path, data = parse_page(parser, url)
            print(f"Got {len(data['address'])} properties.")

            page_json = get_json_string(parser)
            if page_json is None:
                print('No search JSON found on this page.')
            else:
                json_strings.append(page_json['searchResults']['listResults'])
            write('json_part', repr(json_strings))

            properties_dict_list.append(data)

            if next_path is None:
                # Formatting None into base_url would yield the bogus URL '...comNone'.
                print('Breaking: no next-page link.')
                break
            next_url = base_url.format(next_path)
            if next_url == url:
                print('Breaking: last_url == url.')
                break
            url = next_url

            # Delay to prevent 403
            delay = random.randint(3, 10)
            print(f' Sleeping {delay}s\tNext URL: {url}')
            countdown(delay)

    write('json_strings', json.dumps(json_strings))

    write('raw_out', repr(properties_dict_list))  # raw string representation of the per-page dicts

    # properties_dict_list is a list[dict[K, list[V]]]; convert it to list[dict[K, V]].
    properties_out = []
    for page_props in properties_dict_list:
        properties_out += [dict(zip(page_props, row)) for row in zip(*page_props.values())]
    print('Prop count:', len(properties_out))
    if properties_out:
        write_csv(properties_out, target[1:])
    else:
        print('No scraped rows; skipping the HTML-based CSV.')

    all_keys, flat_json = flatten_json(json_strings, fields_of_interest)
    flat_json = [keep_fields(fj, fields_of_interest) for fj in flat_json]
    write_csv(flat_json, target[1:] + '_json', all_keys)

    return properties_dict_list

In [233]:
def get_display_handles():
    """Create the three updatable output areas (top, mid, bot) and return them as a tuple.

    Each handle gets a ``name`` and a ``format`` template attribute; ``pr`` applies the
    template to str messages.
    """
    layout = (
        ('top', '#### {}'),
        ('mid', '{}'),
        ('bot', '{}'),
    )
    handles = []
    for name, template in layout:
        handle = display('...', display_id=True)
        handle.name = name
        handle.format = template
        handles.append(handle)
    return tuple(handles)

In [None]:
def parse_z_json(target):
    """Scrape the JSON embedded in every result page for ``target`` and export it to CSV.

    Progress is shown in three display areas created by ``get_display_handles``.
    Paging stops on a non-200 response, when a page has no next link, or when the
    next link points at the page that was just fetched.

    Args:
        target: site path to scrape, e.g. ``'pittsburgh'``; a leading '/' is added
            when missing.

    Returns:
        The flattened property dicts returned by ``flatten_json``.

    Files written: 'last_response.html', 'json_part', 'json_strings' and
    ``properties-<target>_json.csv``.
    """
    dh_top, dh_mid, dh_bot = get_display_handles()

    base_url = 'https://zillow.com{}'
    if not target.startswith('/'):
        target = '/' + target
    url = base_url.format(target) + '/'
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        'accept-language': "en-US,en;q=0.9",
        'accept-encoding': 'gzip, deflate, br, zstd',
        "priority": "u=0, i",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",  # 'none' is the valid value; it was misspelled 'non'
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "referrerPolicy": 'strict-origin-when-cross-origin',
        'User-Agent': random_ua(),
    }

    json_strings = []
    # The context manager closes the session's connections when scraping ends.
    with requests.Session() as s:
        s.headers.update(headers)
        while url is not None:
            # The timeout keeps a stalled connection from hanging the notebook forever.
            response = s.get(url, timeout=30)
            pr(f'**{response.status_code}** {url}', dh_top)

            time.sleep(0.5)
            write('last_response.html', response.text)  # Record response for debugging purposes

            if response.status_code != 200:
                pr('**Bad** response!', buffer=dh_top)
                break

            parser = html.fromstring(response.text)

            page_json = get_json_string(parser)
            if page_json is None:
                pr('No search JSON found on this page.')
            else:
                json_strings.append(page_json['searchResults']['listResults'])
            write('json_part', repr(json_strings))

            next_path = parser.xpath(xpath_string['nxt'])
            if len(next_path) != 1:
                # Formatting None into base_url would yield the bogus URL '...comNone'.
                pr('Breaking: no next-page link.')
                pr('**Scraped all links**', buffer=dh_mid)
                break

            next_url = base_url.format(next_path[0])
            if next_url == url:
                pr('Breaking: last_url == url.')
                pr('**Scraped all links**', buffer=dh_mid)
                break
            url = next_url

            # Delay to prevent 403
            countdown(min=3, max=10, buffer=dh_mid, postfix=f'Next URL: *{url}*')
            pr(' ', dh_mid)

    pr(f'Wrapping up and Writing Output', dh_top)

    # Write raw JSON to file
    pr(f'Writing raw JSON to file.', dh_bot)
    write('json_strings', json.dumps(json_strings))

    # Flatten raw JSON.  Extract pertinent strings and sort.
    all_keys, flat_json = flatten_json(json_strings, fields_of_interest)

    pr('Writing CSV based on JSON to file', dh_bot)
    write_csv(flat_json, target[1:] + '_json', all_keys)

    pr(f'Wrote **{len(flat_json)}** lines.', dh_bot)
    pr('Finished successfully!', dh_top)

    return flat_json

In [231]:
def parse_to_pd(target):
    """Scrape the JSON embedded in every result page for ``target`` into a DataFrame.

    Progress is shown in three display areas created by ``get_display_handles``.
    Paging stops on a non-200 response, when a page has no next link, or when the
    next link points at the page that was just fetched.

    Args:
        target: site path to scrape, e.g. ``'allegheny-county-pa'``; a leading '/'
            is added when missing.

    Returns:
        ``pandas.DataFrame`` built with ``pd.json_normalize`` from all listings
        (nested keys become dotted column names).

    Files written: 'last_response.html', 'json_part', 'json_strings' and
    'json_string.csv'.
    """
    dh_top, dh_mid, dh_bot = get_display_handles()

    base_url = 'https://zillow.com{}'
    if not target.startswith('/'):
        target = '/' + target
    url = base_url.format(target) + '/'
    headers = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        'accept-language': "en-US,en;q=0.9",
        'accept-encoding': 'gzip, deflate, br, zstd',
        "priority": "u=0, i",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",  # 'none' is the valid value; it was misspelled 'non'
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "referrerPolicy": 'strict-origin-when-cross-origin',
        'User-Agent': random_ua(),
    }

    json_strings = []  # one flat list holding the listings of every page
    # The context manager closes the session's connections when scraping ends.
    with requests.Session() as s:
        s.headers.update(headers)
        while url is not None:
            # The timeout keeps a stalled connection from hanging the notebook forever.
            response = s.get(url, timeout=30)
            pr(f'**{response.status_code}** {url}', dh_top)

            time.sleep(0.5)
            write('last_response.html', response.text)  # Record response for debugging purposes

            if response.status_code != 200:
                pr('**Bad** response!', buffer=dh_top)
                break

            parser = html.fromstring(response.text)

            page_json = get_json_string(parser)
            if page_json is None:
                pr('No search JSON found on this page.')
            else:
                json_strings += page_json['searchResults']['listResults']
            write('json_part', repr(json_strings))

            next_path = parser.xpath(xpath_string['nxt'])  # get next url part
            if len(next_path) != 1:
                # Formatting None into base_url would yield the bogus URL '...comNone'.
                pr('Breaking: no next-page link.')
                pr('**Scraped all links**', buffer=dh_mid)
                break

            next_url = base_url.format(next_path[0])  # Create next url
            if next_url == url:
                pr('Breaking: last_url == url.')
                pr('**Scraped all links**', buffer=dh_mid)
                break
            url = next_url

            # Delay to prevent 403
            countdown(min=3, max=10, buffer=dh_mid, postfix=f'Next URL: *{url}*')
            pr(' ', dh_mid)

    pr(f'Wrapping up and Writing Output', dh_top)

    # Write raw JSON to file
    pr(f'Writing raw JSON to file.', dh_bot)
    write('json_strings', json.dumps(json_strings))

    # Let pandas flatten the nested listing dicts into columns.
    df = pd.json_normalize(json_strings)

    df.to_csv('json_string.csv')
    pr(f'Wrote **{len(df)}** lines.', dh_bot)
    pr('Finished successfully!', dh_top)

    return df

In [None]:
#data = parse('pittsburgh')

Selected UA: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3
URL: https://www.zillow.com/pittsburgh/	Status code: 200
SCRAPED: {'address': ['124 Byron Rd, Pittsburgh, PA 15237', '403 Antenor Ave, Pittsburgh, PA 15210', '126 S Ohio St, Pittsburgh, PA 15202', '336 Maple Rd, Pittsburgh, PA 15239', '3820 Clements Rd, Pittsburgh, PA 15239', '5857 Solway St, Pittsburgh, PA 15217', '1967 McNary Blvd, Pittsburgh, PA 15221', '1543 Cumberland St, Pittsburgh, PA 15205', '125 Country Ln, Pittsburgh, PA 15229'], 'links': ['https://www.zillow.com/homedetails/124-Byron-Rd-Pittsburgh-PA-15237/11523885_zpid/', 'https://www.zillow.com/homedetails/403-Antenor-Ave-Pittsburgh-PA-15210/11648294_zpid/', 'https://www.zillow.com/homedetails/126-S-Ohio-St-Pittsburgh-PA-15202/11322771_zpid/', 'https://www.zillow.com/homedetails/336-Maple-Rd-Pittsburgh-PA-15239/11650928_zpid/', 'https://www.zillow.com/homedetails/3820-Clements-Rd-Pittsburgh-PA-15239/1

In [232]:
# Scrape the listings for the 'allegheny-county-pa' target; parse_to_pd returns a DataFrame.
# NOTE: this performs live requests with a 3-10 s pause between pages.
data = parse_to_pd('allegheny-county-pa')
#data

## Finished successfully!

**Scraped all links**

Wrote **820** lines.

Selected UA: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3
Breaking: last_url == url.


In [271]:
# Work on a copy so the scraped frame in `data` stays untouched.
df = data.copy()
list(df.columns)  # not displayed: only the last expression of a cell is shown
# Candidate column selection from the normalised frame.
# NOTE(review): cols_to_keep is defined here but not applied to df in this cell.
cols_to_keep = [
    'zpid',

    'rawHomeStatusCd',
    'marketingStatusSimplifiedCd',
    'imgSrc',
    'hasImage',
    'detailUrl',
    'statusType',
    'statusText',
    'countryCurrency',
    'price',
    'unformattedPrice',
    'address',
    'addressStreet',
    'addressCity',
    'addressState',
    'addressZipcode',
    'isUndisclosedAddress',
    'beds',
    'baths',
    'area',
    'isZillowOwned',
    'flexFieldText',
    'flexFieldType',
    'contentType',
    'isSaved',
    'isUserClaimingOwner',
    'isUserConfirmedClaim',
    'pgapt',
    'sgapt',
    'zestimate',
    'shouldShowZestimateAsPrice',
    'has3DModel',
    'hasVideo',
    'isHomeRec',
    'hasAdditionalAttributions',
    'isFeaturedListing',
    'isShowcaseListing',
    'list',
    'relaxed',
    'info6String',
    'carouselPhotos',
    'latLong.latitude',
    'latLong.longitude',
    'hdpData.homeInfo.zpid',
    'hdpData.homeInfo.streetAddress',
    'hdpData.homeInfo.zipcode',
    'hdpData.homeInfo.city',
    'hdpData.homeInfo.state',
    'hdpData.homeInfo.latitude',
    'hdpData.homeInfo.longitude',
    'hdpData.homeInfo.price',
    'hdpData.homeInfo.bathrooms',
    'hdpData.homeInfo.bedrooms',
    'hdpData.homeInfo.livingArea',
    'hdpData.homeInfo.homeType',
    'hdpData.homeInfo.homeStatus',
    'hdpData.homeInfo.daysOnZillow',
    'hdpData.homeInfo.isFeatured',
    'hdpData.homeInfo.shouldHighlight',
    'hdpData.homeInfo.zestimate',
    'hdpData.homeInfo.rentZestimate',
    'hdpData.homeInfo.listing_sub_type.is_FSBA',
    'hdpData.homeInfo.isUnmappable',
    'hdpData.homeInfo.isPreforeclosureAuction',
    'hdpData.homeInfo.homeStatusForHDP',
    'hdpData.homeInfo.priceForHDP',
    'hdpData.homeInfo.timeOnZillow',
    'hdpData.homeInfo.isNonOwnerOccupied',
    'hdpData.homeInfo.isPremierBuilder',
    'hdpData.homeInfo.isZillowOwned',
    'hdpData.homeInfo.currency',
    'hdpData.homeInfo.country',
    'hdpData.homeInfo.taxAssessedValue',
    'hdpData.homeInfo.lotAreaValue',
    'hdpData.homeInfo.lotAreaUnit',
    'hdpData.homeInfo.isShowcaseListing',
    'hdpData.homeInfo.datePriceChanged',
    'hdpData.homeInfo.priceReduction',
    'hdpData.homeInfo.priceChange',
    'hasOpenHouse',
    'openHouseStartDate',
    'openHouseEndDate',
    'openHouseDescription',
    'hdpData.homeInfo.listing_sub_type.is_openHouse',
    'hdpData.homeInfo.openHouse',
    'hdpData.homeInfo.open_house_info.open_house_showing',
    'hdpData.homeInfo.listing_sub_type.is_foreclosure',
    'hdpData.homeInfo.videoCount',
    'hdpData.homeInfo.unit',
    'hdpData.homeInfo.listing_sub_type.is_newHome',
    'hdpData.homeInfo.newConstructionType',
    'providerListingId',
    'hdpData.homeInfo.providerListingID',
    'lotAreaString',
    'info3String',
    'brokerName',
    'isPropertyResultCDP',
    'builderName',
    'hdpData.homeInfo.group_type',
    'hdpData.homeInfo.priceSuffix',
    'availabilityDate',
    'streetViewURL',
    'streetViewMetadataURL'
]

# Spot checks of the top-level and nested zpid of the first row (neither is displayed).
df.at[0,'zpid']
df.at[0,'hdpData.homeInfo.zpid']
# Count of missing values per column; this is the value the cell displays.
df.isnull().sum()

zpid                                                     0
id                                                       0
rawHomeStatusCd                                          0
marketingStatusSimplifiedCd                              0
imgSrc                                                   0
hasImage                                                 2
detailUrl                                                0
statusType                                               0
statusText                                               0
countryCurrency                                          0
price                                                    0
unformattedPrice                                         0
address                                                  0
addressStreet                                            0
addressCity                                              0
addressState                                             0
addressZipcode                                          

In [264]:
# Inspect the attributes of pandas' display-options object.
vars(pd.options.display)


{'d': {'date_dayfirst': False,
  'date_yearfirst': False,
  'encoding': 'UTF-8',
  'precision': 6,
  'float_format': None,
  'max_info_rows': 1690785,
  'max_rows': 60,
  'min_rows': 10,
  'max_categories': 8,
  'max_colwidth': 50,
  'max_columns': 20,
  'large_repr': 'truncate',
  'max_info_columns': 100,
  'colheader_justify': 'right',
  'notebook_repr_html': True,
  'pprint_nest_depth': 3,
  'multi_sparse': True,
  'expand_frame_repr': True,
  'show_dimensions': 'truncate',
  'chop_threshold': None,
  'max_seq_items': 100,
  'width': 200,
  'memory_usage': True,
  'unicode': {'east_asian_width': False, 'ambiguous_as_wide': False},
  'html': {'table_schema': False, 'border': 1, 'use_mathjax': True},
  'max_dir_items': 100},
 'prefix': 'display'}

In [269]:
# Loosen pandas' display limits (set_option accepts several option/value pairs in
# one call), then print the description of every display option.
pd.set_option(
    'display.width', None,
    'display.max_rows', 120,
    'display.max_info_rows', 120,
)
pd.describe_option('display')

display.chop_threshold : float or None
    if set to a float value, all float values smaller than the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]
display.colheader_justify : 'left'/'right'
    Controls the justification of column headers. used by DataFrameFormatter.
    [default: right] [currently: right]
display.date_dayfirst : boolean
    When True, prints and parses dates with the day first, eg 20/01/2005
    [default: False] [currently: False]
display.date_yearfirst : boolean
    When True, prints and parses dates with the year first, eg 2005/01/20
    [default: False] [currently: False]
display.encoding : str/unicode
    Defaults to the detected encoding of the console.
    Specifies the encoding to be used for strings returned by to_string,
    these are generally strings meant to be displayed on the console.
    [default: UTF-8] [currently: UTF-8]
display.expand_frame_repr : boolean
    Whether to print out the fu

In [262]:
# Show the current value of the display.large_repr option.
pd.options.display.large_repr

'truncate'

In [215]:
# NOTE(review): pandas is already imported in the first cell; this import is redundant.
import pandas as pd
# Show the five most expensive listings, indexed by zpid.
# NOTE(review): `data` must be a list of flat listing dicts here (its value depends on which
# cell last assigned it) — confirm before re-running.
pd.json_normalize(data).set_index('zpid').sort_values(by='unformattedPrice', ascending=False).head()

Unnamed: 0_level_0,rawHomeStatusCd,imgSrc,detailUrl,statusType,statusText,unformattedPrice,address,addressStreet,addressCity,addressState,beds,baths,area,flexFieldType,contentType,zestimate,daysOnZillow,lat,long,lotAreaString
zpid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
164402271,ForSale,https://photos.zillowstatic.com/fp/4334dc20295...,https://www.zillow.com/homedetails/341-Fairvie...,FOR_SALE,House for sale,7000000,"341 Fairview Rd, Pittsburgh, PA 15238",341 Fairview Rd,Pittsburgh,PA,6,8,13789.0,showcase,showcase,6262700.0,256,40.535343,-79.88384,
11629946,ForSale,https://photos.zillowstatic.com/fp/cb0de016e0f...,https://www.zillow.com/homedetails/122-Woodlan...,FOR_SALE,House for sale,5295000,"122 Woodland Rd, Pittsburgh, PA 15232",122 Woodland Rd,Pittsburgh,PA,7,8,,threeDHome,threeDHome,5155200.0,42,40.444847,-79.9219,
11518946,ForSale,https://photos.zillowstatic.com/fp/b7c530bc5cd...,https://www.zillow.com/homedetails/35-Linden-P...,FOR_SALE,House for sale,4750000,"35 Linden Pl, Sewickley, PA 15143",35 Linden Pl,Sewickley,PA,6,8,6131.0,homeInsight,homeInsight,4327200.0,99,40.54278,-80.18833,
2082292819,ForSale,https://photos.zillowstatic.com/fp/7d4dc0656e0...,https://www.zillow.com/homedetails/550-Market-...,FOR_SALE,Condo for sale,4250000,"550 Market St FLOOR 19, Pittsburgh, PA 15222",550 Market St FLOOR 19,Pittsburgh,PA,3,3,,homeInsight,homeInsight,,347,40.45041,-79.99114,
11273531,ForSale,https://photos.zillowstatic.com/fp/630fb351777...,https://www.zillow.com/homedetails/1635-Pegher...,FOR_SALE,House for sale,4200000,"1635 Pegher Ln, Sewickley, PA 15143",1635 Pegher Ln,Sewickley,PA,6,7,10133.0,threeDHome,threeDHome,3846500.0,79,40.601856,-80.1104,


In [227]:
# Reload the raw JSON saved by the scraper and flatten its pages into one list.
# NOTE(review): read() is defined in a later cell; on a fresh kernel that cell must run first.
js = json.loads(read('json_strings'))
pd.json_normalize(js).iloc[9,0]  # spot check; not displayed (not the last expression)
out = []
for j in js:
    out += j  # each j is one page (a list of listing dicts)
out

# One row per listing; nested keys become dotted column names.
df=pd.json_normalize(out)

In [229]:
# List every column name of the normalised frame.
list(df.columns)

['zpid',
 'id',
 'rawHomeStatusCd',
 'marketingStatusSimplifiedCd',
 'imgSrc',
 'hasImage',
 'detailUrl',
 'statusType',
 'statusText',
 'countryCurrency',
 'price',
 'unformattedPrice',
 'address',
 'addressStreet',
 'addressCity',
 'addressState',
 'addressZipcode',
 'isUndisclosedAddress',
 'beds',
 'baths',
 'area',
 'isZillowOwned',
 'flexFieldText',
 'flexFieldType',
 'contentType',
 'isSaved',
 'isUserClaimingOwner',
 'isUserConfirmedClaim',
 'pgapt',
 'sgapt',
 'zestimate',
 'shouldShowZestimateAsPrice',
 'has3DModel',
 'hasVideo',
 'isHomeRec',
 'hasAdditionalAttributions',
 'isFeaturedListing',
 'isShowcaseListing',
 'list',
 'relaxed',
 'info6String',
 'carouselPhotos',
 'latLong.latitude',
 'latLong.longitude',
 'hdpData.homeInfo.zpid',
 'hdpData.homeInfo.streetAddress',
 'hdpData.homeInfo.zipcode',
 'hdpData.homeInfo.city',
 'hdpData.homeInfo.state',
 'hdpData.homeInfo.latitude',
 'hdpData.homeInfo.longitude',
 'hdpData.homeInfo.price',
 'hdpData.homeInfo.bathrooms',
 'hdp

In [None]:
# Rebuild a CSV from the raw JSON saved by the scraper ('json_strings').
with open('json_strings', 'r', encoding='utf-8') as f:
    js = json.load(f)

# js is a list of pages; merge them into one list with one dict per property.
properties = []
for prop in js:
    properties += prop

all_keys = set()
for prop in properties:
    all_keys.update(prop.keys())
    try:
        # Collapse the photo list into a single '|'-separated string of URLs.
        prop['carouselPhotos'] = '|'.join([photo['url'] for photo in prop['carouselPhotos']])
    except KeyError:
        pass  # no carousel for this property
    prop['daysOnZillow'] = prop['hdpData']['homeInfo']['daysOnZillow']
    prop['hdpData'] = json.dumps(prop['hdpData'])
all_keys.update(['daysOnZillow'])  # derived above, so absent from the raw keys

# write_csv adds the 'properties-' prefix and the '.csv' suffix itself, so pass the bare
# name. sorted() turns the key set into a reproducible column order.
write_csv(properties, 'json_csv', fieldnames=sorted(all_keys))

Fieldnames: {'statusType', 'isHomeRec', 'isPropertyResultCDP', 'isZillowOwned', 'flexFieldText', 'zpid', 'latLong', 'isUserClaimingOwner', 'isFeaturedListing', 'streetViewMetadataURL', 'providerListingId', 'hasVideo', 'openHouseStartDate', 'list', 'flexFieldType', 'addressState', 'rawHomeStatusCd', 'contentType', 'area', 'has3DModel', 'info6String', 'beds', 'statusText', 'countryCurrency', 'sgapt', 'hasAdditionalAttributions', 'carouselPhotos', 'relaxed', 'price', 'id', 'shouldShowZestimateAsPrice', 'hdpData', 'unformattedPrice', 'detailUrl', 'address', 'streetViewURL', 'openHouseEndDate', 'isShowcaseListing', 'pgapt', 'zestimate', 'lotAreaString', 'addressZipcode', 'isSaved', 'imgSrc', 'isUndisclosedAddress', 'addressStreet', 'hasOpenHouse', 'builderName', 'addressCity', 'baths', 'openHouseDescription', 'daysOnZillow', 'hasImage', 'marketingStatusSimplifiedCd', 'isUserConfirmedClaim'}


In [None]:
# Alternative selection and ordering of the listing fields to export.
# NOTE: this rebinds the fields_of_interest list defined near the top of the notebook.
fields_of_interest = [
    'statusType',
    'zpid',
    'latLong',
    'flexFieldType',
    'rawHomeStatusCd',
    'contentType',
    'address',
    'addressStreet',
    'addressCity',
    'addressState',
    'addressZipcode',  # the listing key is 'addressZipcode'; 'addressZip' never matched anything
    'area',
    'beds',
    'baths',
    'statusText',
    'unformattedPrice',
    'detailUrl',
    'imgSrc',
    'zestimate',
    'lotAreaString',
    'daysOnZillow'
]

In [57]:
def read(path, inspect=False, width=100):
    """Return the text of the file at ``path``, optionally printing a window of it.

    Args:
        path: file to read (decoded as UTF-8).
        inspect: character index to inspect, or False (the default) for no inspection.
            Index 0 is a valid position.
        width: number of characters shown on each side of ``inspect``.

    Returns:
        The complete file content as a str.

    When inspecting, the total length and the slice ``[inspect - width:inspect + width]``
    (clamped to the bounds of the text) are printed; this is handy for looking at the
    position reported by a JSON decoding error.
    """
    with open(path, 'r', encoding='utf-8') as f:
        out = f.read()

    if inspect is not False and inspect is not None:
        length = len(out)
        lower_bound = max(inspect - width, 0)
        # A slice end is exclusive, so clamping to len(out) still shows the last character.
        upper_bound = min(inspect + width, length)

        print(f'Length: {length}\nInspecting [{lower_bound}:{upper_bound}]')
        print(f'{out[lower_bound:upper_bound]}')
    return out

In [62]:
# Load 'sr.json', printing the 25 characters on either side of position 391117.
js = json.loads(read('sr.json', inspect=391117, width=25))
# Number of listings in this file.
len(js['searchResults']['listResults'])

Length: 391118
Inspecting [391092:391118]
edResultsHash": "1"
    }



41

In [66]:
# Same check for 'sr2.json': load it and count its listings.
js = json.loads(read('sr2.json'))
len(js['searchResults']['listResults'])

41

In [117]:
# Experiment: update one display area in place, swapping between HTML and Markdown content.
hout = HTML('<p>HTML paragraph<b>bold</b></p>')
mout = Markdown('<p>MD pargraph <b>bold</b></p>')
# display_id=True makes display() return a handle whose update() replaces the output.
md_handle = display(mout, display_id=True)
#dh = dsp.DisplayHandle()
#display(dh)
time.sleep(2)
md_handle.update(hout)
time.sleep(1)
md_handle.update(mout)




<p>MD pargraph <b>bold</b></p>

In [136]:
import itertools

# Demo: animate one display area by stepping through a range of Unicode code points.
dh = display(display_id=True)
display(Markdown('# Display Update Fun'))
dh.display(Markdown(''))

code_points = range(0x1F5F0, 0x1F650)
# One full pass through the range. Cycling without a bound never ends and has to be
# interrupted by hand, which breaks "Run All". Raise the frame count to loop longer.
frame_count = len(code_points)
for i in itertools.islice(itertools.cycle(code_points), frame_count):
    dh.update(Markdown(f'## &#{i}; {hex(i)}'))
    time.sleep(0.5)

# Display Update Fun

## &#128503; 0x1f5f7

KeyboardInterrupt: 

In [154]:
# Experiment: render a zero-width-joiner (0x200D) emoji sequence from numeric HTML entities.
cyc = [0x1F468, 0x200D, 0x1F466, 0x200D, 0x1F466]
display(Markdown(f'# &#{0x1F468};&#{0x200d};&#{0x1F466};&#{0x200d};&#{0x1F466};'))
# Build the same entity string ('&#N;&#N;...') from the list of code points.
cc = '&#'+';&#'.join([str(c) for c in cyc])+';'
print([str(c) for c in cyc])
print(cc)
display(Markdown(cc))
# Render every code point separately for comparison.
for c in cyc:
    display(Markdown(f'# &#{c};'))

# The same kind of joined sequence written as literal characters.
display(Markdown('🐱‍👤'))

# &#128104;&#8205;&#128102;&#8205;&#128102;

['128104', '8205', '128102', '8205', '128102']
&#128104;&#8205;&#128102;&#8205;&#128102;


&#128104;&#8205;&#128102;&#8205;&#128102;

# &#128104;

# &#8205;

# &#128102;

# &#8205;

# &#128102;

🐱‍👤