In [21]:
 # Dependencies
import requests
import urllib
import random
import math
import pandas as pd
import xml.etree.ElementTree as ET
import time
from config import zws_id, gkey 
from urllib.request import urlopen

In [22]:
# function to grab the exact address based on longitude and latitude
# modified from here https://gist.github.com/bradmontgomery/5397472
# their example didn't include an API key, but I added it otherwise you'd hit the rate limit easily

def reverse_geocode(latitude, longitude):
    # Did the geocoding request comes from a device with a
    # location sensor? Must be either true or false
    sensor = 'true'

    # Hit Google's reverse geocoder directly
    # NOTE: I *think* their terms state that you're supposed to
    # use google maps if you use their api for anything.
    base = "https://maps.googleapis.com/maps/api/geocode/json?"
    params = "latlng={lat},{lon}&sensor={sen}&key={key}".format(
        lat=latitude,
        lon=longitude,
        sen=sensor,
        key=gkey
    )
    url = "{base}{params}".format(base=base, params=params)
    #print(url)
    response = requests.get(url).json()
    address = response['results'][0]['formatted_address']
    return address


In [23]:
# function to generate random lat & lng within a certain radius 
# modified from here: http://hadoopguru.blogspot.com/2014/12/python-generate-random-latitude-and.html
# changed to take in an empty initial dataframe and load in the data + return it
# this calls the reverse geocode function to grab the addresses of each randomly generated lat & lng

def generate_addresses(latitude, longitude, df):
    
    radius = 10000                         #Choose your own radius
    radiusInDegrees=radius/111300            
    r = radiusInDegrees

    counter = 0
    
    for i in range(1,200):                 #Choose number of Lat Long to be generated

        u = float(random.uniform(0.0,1.0))
        v = float(random.uniform(0.0,1.0))

        w = r * math.sqrt(u)
        t = 2 * math.pi * v
        x = w * math.cos(t) 
        y = w * math.sin(t)

        xLat  = x + latitude
        yLng = y + longitude

        df.set_value(counter, "latitude", xLat)
        df.set_value(counter, "longitude", yLng)
        
        #print(format(counter) + ": " + format(xLat) + ", " + format(yLng))
        address = reverse_geocode(xLat, yLng).split(',')
        citystatezip = address[1] + address[2]
        
        df.set_value(counter, "address", address[0])
        df.set_value(counter, "city_state_zip", citystatezip)
        
        # Add to counter
        counter = counter + 1
    
    return df

# use Los Angeles as the center of the Latitude & longitude to test this out
# if instructor allows, we should probably just manually call this 20 times to get data for each city faster
# otherwise the gigantic loop will take forever

la_df = pd.DataFrame({"latitude":'',
                   "longitude":'',
                   "address":'',
                   "city_state_zip":''}, index=[0])

generate_addresses(34.0522342,-118.2436849, la_df)

Unnamed: 0,address,city_state_zip,latitude,longitude
0,6506 Stanford Ave,Los Angeles CA 90001,33.9807,-118.262
1,3824 Wisconsin St,Los Angeles CA 90037,34.016,-118.293
2,1423 Lucile Ave,Los Angeles CA 90026,34.0901,-118.28
3,3135 Glendale Blvd,Los Angeles CA 90039,34.1171,-118.263
4,101-189 N Hope St,Los Angeles CA 90012,34.0567,-118.25
5,1055 Corporate Center Dr,Monterey Park CA 91754,34.051,-118.164
6,2400-2412 Mayberry St,Los Angeles CA 90026,34.0841,-118.265
7,5542 Barton Ave,Los Angeles CA 90038,34.0872,-118.313
8,3047 W 12th Pl,Los Angeles CA 90006,34.0485,-118.308
9,3641 Ramboz Dr,Los Angeles CA 90063,34.0535,-118.187


In [29]:
# this code calls the Zillow API's GetSearchResults and will check to see if a house exists at that address

# zillow url format
# http://www.zillow.com/webservice/GetSearchResults.htm?zws-id=<ZWSID>&address=2114+Bigelow+Ave&citystatezip=Seattle%2C+WA
        
def get_message_codes(df):

    # create new column to hold message code from zillow
    df['message_code'] = ''

    for index, row in df.iterrows():

        try:
            url = 'https://www.zillow.com/webservice/GetSearchResults.htm?zws-id='
            address = row['address']
            citystatezip =row['city_state_zip']


            query_url = url + zws_id + '&address=' + urllib.parse.quote(address) + '&citystatezip=' + urllib.parse.quote(citystatezip) 
            #print(query_url)

            root = ET.parse(urlopen(query_url)).getroot()

            for message in root.iter('message'):
                message_code = message[1].text

            print(format(index) + ": " + message_code)

            df.set_value(index, 'message_code', message_code)

            time.sleep(0.5) #necessary bc bombarding Zillow with API calls doesn't allow enough time to respond to each

        except:
            break
        
    # remove the ones with a code of 508, which means property not found
    

    
get_message_codes(la_df)

0: 508
1: 0
2: 0
3: 0
4: 508
5: 508
6: 508
7: 508
8: 0
9: 0
10: 0
11: 0
12: 508
13: 0
14: 0
15: 508
16: 508
17: 508
18: 508
19: 0
20: 0
21: 0
22: 508
23: 0
24: 0
25: 0
26: 0
27: 508
28: 508
29: 508
30: 0
31: 0
32: 508
33: 0
34: 508
35: 0
36: 508
37: 508
38: 508
39: 508
40: 0
41: 508
42: 508
43: 0
44: 508
45: 508
46: 508
47: 0
48: 508
49: 508
50: 508
51: 508
52: 508
53: 508
54: 0
55: 508
56: 508
57: 0
58: 508
59: 0
60: 508
61: 508
62: 508
63: 508
64: 0
65: 508
66: 508
67: 508
68: 0
69: 508
70: 0
71: 508
72: 0
73: 0
74: 508
75: 508
76: 508
77: 0
78: 0
79: 0
80: 508
81: 508
82: 508
83: 508
84: 0
85: 508
86: 508
87: 508
88: 508
89: 0
90: 508
91: 508
92: 508
93: 508
94: 508
95: 508
96: 508
97: 508
98: 0
99: 508
100: 508
101: 508
102: 508
103: 0
104: 508
105: 508
106: 0
107: 508
108: 508
109: 0
110: 0
111: 0
112: 508
113: 508
114: 508
115: 508
116: 508
117: 0
118: 508
119: 0
120: 508
121: 508
122: 508
123: 508
124: 508
125: 508
126: 508
127: 508
128: 0
129: 508
130: 0
131: 508
132: 508
133: 

In [31]:
# cull all the rows where houses do not exist at the address
la_df = la_df[la_df.message_code != '508']

la_df

Unnamed: 0,address,city_state_zip,latitude,longitude,message_code
1,3824 Wisconsin St,Los Angeles CA 90037,34.016,-118.293,0
2,1423 Lucile Ave,Los Angeles CA 90026,34.0901,-118.28,0
3,3135 Glendale Blvd,Los Angeles CA 90039,34.1171,-118.263,0
8,3047 W 12th Pl,Los Angeles CA 90006,34.0485,-118.308,0
9,3641 Ramboz Dr,Los Angeles CA 90063,34.0535,-118.187,0
10,921 E 51st St,Los Angeles CA 90011,33.997,-118.259,0
11,1801 S Victoria Ave,Los Angeles CA 90019,34.0414,-118.332,0
13,2615 Rodeo Rd,Los Angeles CA 90018,34.0192,-118.324,0
14,3026 Silver Lake Blvd,Los Angeles CA 90039,34.1093,-118.255,0
19,1350 W 41st St,Los Angeles CA 90037,34.0087,-118.3,0


In [55]:
## STILL WORKING ON THIS, testing with a simple example not the whole table ##

#http://www.zillow.com/webservice/GetDeepSearchResults.htm?zws-id=<ZWSID>&address=2114+Bigelow+Ave&citystatezip=Seattle%2C+WA

url = 'https://www.zillow.com/webservice/GetDeepSearchResults.htm?zws-id='
address = la_df['address'][1]
citystatezip =la_df['city_state_zip'][1]


query_url = url + zws_id + '&address=' + urllib.parse.quote(address) + '&citystatezip=' + urllib.parse.quote(citystatezip) 
print(query_url + '\n')

root = ET.parse(urlopen(query_url)).getroot()

print(address + citystatezip + '\n')

# grab the zestimate
for zestimate in root.iter('zestimate'):
    print ('zestimate (value): ' + format(zestimate[0].text)) #there are multiple zestimates w/ diff dates, what to do?
    print('last updated: ' + format(zestimate[1].text))
    print('\n')

for bedroom in root.iter('bedrooms'):
    print("bedrooms: " + bedroom.text)

for bathroom in root.iter('bathrooms'):
    print("bathrooms: " + bathroom.text)

https://www.zillow.com/webservice/GetDeepSearchResults.htm?zws-id=X1-ZWz1gba2odrvgr_6plsv&address=3824%20Wisconsin%20St&citystatezip=%20Los%20Angeles%20CA%2090037

3824 Wisconsin St Los Angeles CA 90037

zestimate (value): None
last updated: 01/01/1970


zestimate (value): None
last updated: 01/01/1970


zestimate (value): 531317
last updated: 03/30/2018


zestimate (value): 503833
last updated: 03/30/2018


zestimate (value): 544159
last updated: 03/30/2018


zestimate (value): None
last updated: 01/01/1970


bedrooms: 2
bedrooms: 2
bedrooms: 2
bedrooms: 2
bedrooms: 2
bathrooms: 1.0
bathrooms: 1.0
bathrooms: 1.0
bathrooms: 1.0
bathrooms: 1.0
