In [27]:


from lxml import html
import requests
import unicodecsv as csv
import argparse
import json
from bs4 import BeautifulSoup

In [59]:
def clean(text):
    if text:
        return ' '.join(' '.join(text).split())
    return None





def create_url(zipcode, filter):
    # Creating Zillow URL based on the filter.

    if filter == "newest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/days_sort".format(zipcode)
    elif filter == "cheapest":
        url = "https://www.zillow.com/homes/for_sale/{0}/0_singlestory/pricea_sort/".format(zipcode)
    else:
        url = "https://www.zillow.com/homes/for_sale/{0}_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy".format(zipcode)
    print(url)
    return url


def save_to_file(response):
    # saving response to `response.html`

    with open("response.html", 'w') as fp:
        fp.write(response.text)


def write_data_to_csv(data):
    # saving scraped data to csv.

    with open("properties-%s.csv" % (zipcode), 'wb') as csvfile:
        fieldnames = ['title', 'address', 'city', 'state', 'postal_code', 'price', 'facts and features', 'real estate provider', 'url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)


def get_data_from_json(raw_json_data):
    # getting data from json (type 2 of their A/B testing page)

    cleaned_data = clean(raw_json_data).replace('<!--', "").replace("-->", "")
    properties_list = []

    try:
        json_data = json.loads(cleaned_data)
        search_results = json_data.get('searchResults').get('listResults', [])

        for properties in search_results:
            address = properties.get('addressWithZip')
            property_info = properties.get('hdpData', {}).get('homeInfo')
            city = property_info.get('city')
            state = property_info.get('state')
            postal_code = property_info.get('zipcode')
            price = properties.get('price')
            bedrooms = properties.get('beds')
            bathrooms = properties.get('baths')
            area = properties.get('area')
            info = f'{bedrooms} bds, {bathrooms} ba ,{area} sqft'
            broker = properties.get('brokerName')
            property_url = properties.get('detailUrl')
            title = properties.get('statusText')

            data = {'address': address,
                    'city': city,
                    'state': state,
                    'postal_code': postal_code,
                    'price': price,
                    'facts and features': info,
                    'real estate provider': broker,
                    'url': property_url,
                    'title': title}
            properties_list.append(data)

        return properties_list

    except ValueError:
        print("Invalid json")
        return None


def parse(zipcode, filter=None):
    url = create_url(zipcode, filter)
    response = get_response(url)

    if not response:
        print("Failed to fetch the page, please check `response.html` to see the response received from zillow.com.")
        return None

    parser = html.fromstring(response.text)
    search_results = parser.xpath("//div[@id='search-results']//article")

    if not search_results:
        print("parsing from json data")
        # identified as type 2 page
        raw_json_data = parser.xpath('//script[@data-zrr-shared-data-key="mobileSearchPageStore"]//text()')
        return get_data_from_json(raw_json_data)

    print("parsing from html page")
    properties_list = []
    for properties in search_results:
        raw_address = properties.xpath(".//span[@itemprop='address']//span[@itemprop='streetAddress']//text()")
        raw_city = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressLocality']//text()")
        raw_state = properties.xpath(".//span[@itemprop='address']//span[@itemprop='addressRegion']//text()")
        raw_postal_code = properties.xpath(".//span[@itemprop='address']//span[@itemprop='postalCode']//text()")
        raw_price = properties.xpath(".//span[@class='zsg-photo-card-price']//text()")
        raw_info = properties.xpath(".//span[@class='zsg-photo-card-info']//text()")
        raw_broker_name = properties.xpath(".//span[@class='zsg-photo-card-broker-name']//text()")
        url = properties.xpath(".//a[contains(@class,'overlay-link')]/@href")
        raw_title = properties.xpath(".//h4//text()")

        address = clean(raw_address)
        city = clean(raw_city)
        state = clean(raw_state)
        postal_code = clean(raw_postal_code)
        price = clean(raw_price)
        info = clean(raw_info).replace(u"\xb7", ',')
        broker = clean(raw_broker_name)
        title = clean(raw_title)
        property_url = "https://www.zillow.com" + url[0] if url else None
        is_forsale = properties.xpath('.//span[@class="zsg-icon-for-sale"]')

        properties = {'address': address,
                      'city': city,
                      'state': state,
                      'postal_code': postal_code,
                      'price': price,
                      'facts and features': info,
                      'real estate provider': broker,
                      'url': property_url,
                      'title': title}
        if is_forsale:
            properties_list.append(properties)
    return properties_list

In [60]:
url = create_url("78133","")
print(url)

https://www.zillow.com/homes/for_sale/78133_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy
https://www.zillow.com/homes/for_sale/78133_rb/?fromHomePage=true&shouldFireSellPageImplicitClaimGA=false&fromHomePageTab=buy


In [63]:
url="https://www.trulia.com/TX/Canyon_Lake/78133/"
response = get_response(url)
print(response.text)

status code received: 200
<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><script>
            window.__uspapi = function(command, version, callback) {
              try {
                if (command === 'getUSPData') {
                  var cookies = document.cookie.split(';');
                  for (var i = 0; i < cookies.length; i++) {
                    var cookie = cookies[i];
                    var separatorIndex = cookie.indexOf('=');
                    separatorIndex = separatorIndex < 0 ? cookie.length : separatorIndex;
                    var cookie_name = decodeURIComponent(cookie.slice(0, separatorIndex).replace(/^\s+/, ''));
                    if (cookie_name === 'usprivacy') {
                      var uspString = decodeURIComponent(cookie.slice(separatorIndex + 1));
                      callback({ version: version, uspString: uspString }, true);
                      return;
                    }
                  }
                }
              } catc

In [55]:
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)

In [56]:
page.text

'<!DOCTYPE html>\r\n<html xmlns="https://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\r\n<head>\r\n    \r\n            <link rel="preconnect" href="https://coda.newjobs.com" />\r\n            <link rel="preconnect" href="https://js-seeker.newjobs.com" />\r\n            <link rel="preconnect" href="https://css-seeker.newjobs.com" />\r\n            <link rel="preconnect" href="https://securemedia.newjobs.com" />\r\n            <link rel="preconnect" href="https://logs2.jobs.com" />\r\n            <link rel="preconnect" href="https://job-openings.monster.com" />\r\n            <link rel="preconnect" href="https://apis.google.com" />\r\n            <link rel="preconnect" href="https://www.google.com" />\r\n            <link rel="preconnect" href="https://accounts.google.com" />\r\n            <link rel="preconnect" href="https://content.googleapis.com" />\r\n            <link rel="preconnect" href="https://ssl.gstatic.com" />\r\n            <link rel="preconnect" href="https://www.dropb

In [62]:
parser = html.fromstring(response.text)
search_results = parser.xpath("//div[@id='search-results']//article")
print(search_results)

AttributeError: 'bytes' object has no attribute 'fromstring'

In [18]:
if not search_results:
        print("parsing from json data")

parsing from json data


In [23]:
zipcode = "02126"
sort = "newest"
#scraped_data = parse(zipcode, sort)
#if scraped_data:
#    print ("Writing data to output file")
#    write_data_to_csv(scraped_data)

In [31]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import ast
import os
from urllib.request import Request, urlopen



In [32]:
# For ignoring SSL certificate errors

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Input from user

# url = input('Enter Zillow House Listing Url- ')
url = "https://www.zillow.com/homedetails/638-Grant-Ave-North-Baldwin-NY-11510/31220792_zpid/"
# Making the website believe that you are accessing it using a mozilla browser

req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()

# Creating a BeautifulSoup object of the html page for easy extraction of data.

soup = BeautifulSoup(webpage, 'html.parser')
html = soup.prettify('utf-8')
property_json = {}
property_json['Details_Broad'] = {}
property_json['Address'] = {}

# Extract Title of the property listing

for title in soup.findAll('title'):
    property_json['Title'] = title.text.strip()
    break

for meta in soup.findAll('meta', attrs={'name': 'description'}):
    property_json['Detail_Short'] = meta['content'].strip()

for div in soup.findAll('div', attrs={'class': 'character-count-truncated'}):
    property_json['Details_Broad']['Description'] = div.text.strip()

for (i, script) in enumerate(soup.findAll('script',
                             attrs={'type': 'application/ld+json'})):
    if i == 0:
        json_data = json.loads(script.text)
        property_json['Details_Broad']['Number of Rooms'] = json_data['numberOfRooms']
        property_json['Details_Broad']['Floor Size (in sqft)'] = json_data['floorSize']['value']
        property_json['Address']['Street'] = json_data['address']['streetAddress']
        property_json['Address']['Locality'] = json_data['address']['addressLocality']
        property_json['Address']['Region'] = json_data['address']['addressRegion']
        property_json['Address']['Postal Code'] = json_data['address']['postalCode']
    if i == 1:
        json_data = json.loads(script.text)
        property_json['Price in $'] = json_data['offers']['price']
        property_json['Image'] = json_data['image']
        break

with open('data.json', 'w') as outfile:
    json.dump(property_json, outfile, indent=4)

with open('output_file.html', 'wb') as file:
    file.write(html)

print ('----------Extraction of data is complete. Check json file.----------')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [240]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

abbrev_us_state = dict(map(reversed, us_state_abbrev.items()))

In [267]:
import geopy

def create_url(city,state,zipcode):
    # Creating Zillow URL based on the filter.

    url = "https://www.trulia.com/" + state + "/" + city + "/" + zipcode
    return url

def get_headers():
    # Creating headers.
    headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
               'accept-encoding': 'gzip, deflate, sdch, br',
               'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
               'cache-control': 'max-age=0',
               'upgrade-insecure-requests': '1',
               'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

    #headers = {'cache-control': 'no-cache'}
    return headers


def get_response(url):
    # Getting response from zillow.com.

    for i in range(5):
        response = requests.get(url, headers=get_headers())
        print("status code received:", response.status_code)
        if response.status_code != 200:
            # saving response to file for debugging purpose.
            save_to_file(response)
            continue
        else:
            save_to_file(response)
            return response
    return None


def GetCityStateZip(lat,lon):
    lat = lat/10**6
    lon = lon/10**6

    location = geolocator.reverse((lat, lon))
   

    city = location.raw['address']['city']
    state = us_state_abbrev[location.raw['address']['state']]

    zipcode = location.raw['address']['postcode'].split('-')[0]
   
    return city,state,zipcode

In [268]:
city,state,zipcode = GetCityStateZip(34144442,-118654084)
print("city=", city)
print("state=", state)
print("zipcode=",zipcode)

city= Calabasas
state= CA
zipcode= 91302


In [159]:
url = create_url(city,state,zipcode)
print(url)

https://www.trulia.com/CA/Calabasas/91302


In [175]:
response = get_response(url)
print(response.text)



status code received: 200
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Access to this page has been denied.</title>
    <style>
        @font-face {
            font-family: "PostGrotesk";
            src: url("//static.trulia-cdn.com/images/fonts/PostGroteskBook_20150623/PostGrotesk-Book.eot");
            src: url("//static.trulia-cdn.com/images/fonts/PostGroteskBook_20150623/PostGrotesk-Book.eot?#iefix") format("embedded-opentype"), url("//static.trulia-cdn.com/images/fonts/PostGroteskBook_20150623/PostGrotesk-Book.woff") format("woff"), url("//static.trulia-cdn.com/images/fonts/PostGroteskBook_20150623/PostGrotesk-Book.ttf") format("truetype"), url("//static.trulia-cdn.com/images/fonts/PostGroteskBook_20150623/PostGrotesk-Book.svg#PostGroteskRegular") format("svg");
            font-weight: 400;
            font-style: normal;
            font-display: swap;
        }

       

In [249]:
import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
#import ssl
#import json
#import ast
#import os
from urllib.request import Request, urlopen



In [250]:
# For ignoring SSL certificate errors

#ctx = ssl.create_default_context()
#ctx.check_hostname = False
#ctx.verify_mode = ssl.CERT_NONE


# Making the website believe that you are accessing it using a mozilla browser

req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()

# Creating a BeautifulSoup object of the html page for easy extraction of data.

soup = BeautifulSoup(webpage, 'html.parser')


In [231]:
html = soup.prettify('utf-8')
product_json = {}
# This code block will get you a one liner description of the listed property

for meta in soup.findAll('meta', attrs={'name': 'description'}):
    try:
        product_json['description'] = meta['content']
        break
    except:
        pass
print(product_json)


{'description': '127 Homes For Sale in Calabasas, CA 91302. Browse photos, see new properties, get open house info, and research neighborhoods on Trulia.'}


In [232]:
# This code block will get you the link of the listed property

for link in soup.findAll('link', attrs={'rel': 'canonical'}):
    try:
        product_json['link'] = link['href']
        break
    except:
        pass


print(product_json)



{'description': '127 Homes For Sale in Calabasas, CA 91302. Browse photos, see new properties, get open house info, and research neighborhoods on Trulia.', 'link': 'https://www.trulia.com/CA/Calabasas/91302/'}


In [226]:
#scripts = soup.findAll('script',attrs={'type': 'application/ld+json'})
#print(len(scripts))
#details_json = scripts[0]
#print(details_json)

In [254]:
details = {}
listings_json = {}
index = 0
for price in  soup.findAll('div',attrs={'data-testid': 'property-price'}):
    details.update({index:price.text.strip()}) 
    index = index + 1

listings_json['price'] = {}
listings_json['price']  = details
print(listings_json['price'])

{0: '$8,899,000', 1: '$3,199,000', 2: '$1,499,000', 3: '$3,599,000', 4: '$32,000,000', 5: '$1,249,000', 6: '$25,000,000', 7: '$2,299,000', 8: '$10,980,000', 9: '$300,000', 10: '$6,749,000', 11: '$1,649,000', 12: '$6,750,000', 13: '$3,940,000', 14: '$1,777,000', 15: '$299,500', 16: '$4,599,000', 17: '$22,995,000', 18: '$90,000', 19: '$3,950,000', 20: '$3,049,000', 21: '$10,850,000', 22: '$1,995,000', 23: '$1,699,000', 24: '$11,900,000', 25: '$12,850,000', 26: '$14,495,000', 27: '$799,900', 28: '$6,250,000', 29: '$2,049,000'}


In [256]:
details = {}
index = 0
for bedroom  in  soup.findAll('div',attrs={'data-testid': 'property-beds'}):
    details.update({index:bedroom.text.strip()}) 
    index = index + 1

listings_json['bedroom'] = {}
listings_json['bedroom']  = details
print(listings_json)

{'price': {0: '$8,899,000', 1: '$3,199,000', 2: '$1,499,000', 3: '$3,599,000', 4: '$32,000,000', 5: '$1,249,000', 6: '$25,000,000', 7: '$2,299,000', 8: '$10,980,000', 9: '$300,000', 10: '$6,749,000', 11: '$1,649,000', 12: '$6,750,000', 13: '$3,940,000', 14: '$1,777,000', 15: '$299,500', 16: '$4,599,000', 17: '$22,995,000', 18: '$90,000', 19: '$3,950,000', 20: '$3,049,000', 21: '$10,850,000', 22: '$1,995,000', 23: '$1,699,000', 24: '$11,900,000', 25: '$12,850,000', 26: '$14,495,000', 27: '$799,900', 28: '$6,250,000', 29: '$2,049,000'}, 'bedroom': {0: '6bd', 1: '5bd', 2: '4bd', 3: '5bd', 4: '7bd', 5: '3bd', 6: '8bd', 7: '5bd', 8: '6bd', 9: '2bd', 10: '6bd', 11: '3bd', 12: '6bd', 13: '5bd', 14: '4bd', 15: '3bd', 16: '6bd', 17: '7bd', 18: '6bd', 19: '5bd', 20: '8bd', 21: '5bd', 22: '5bd', 23: '8bd', 24: '7bd', 25: '7bd', 26: '4bd', 27: '5bd', 28: '4bd'}}


In [257]:
details = {}
index = 0
for bathroom  in  soup.findAll('div',attrs={'data-testid': 'property-baths'}):
    details.update({index:bathroom.text.strip()}) 
    index = index + 1

listings_json['bathroom'] = {}
listings_json['bathroom']  = details
print(listings_json)

{'price': {0: '$8,899,000', 1: '$3,199,000', 2: '$1,499,000', 3: '$3,599,000', 4: '$32,000,000', 5: '$1,249,000', 6: '$25,000,000', 7: '$2,299,000', 8: '$10,980,000', 9: '$300,000', 10: '$6,749,000', 11: '$1,649,000', 12: '$6,750,000', 13: '$3,940,000', 14: '$1,777,000', 15: '$299,500', 16: '$4,599,000', 17: '$22,995,000', 18: '$90,000', 19: '$3,950,000', 20: '$3,049,000', 21: '$10,850,000', 22: '$1,995,000', 23: '$1,699,000', 24: '$11,900,000', 25: '$12,850,000', 26: '$14,495,000', 27: '$799,900', 28: '$6,250,000', 29: '$2,049,000'}, 'bedroom': {0: '6bd', 1: '5bd', 2: '4bd', 3: '5bd', 4: '7bd', 5: '3bd', 6: '8bd', 7: '5bd', 8: '6bd', 9: '2bd', 10: '6bd', 11: '3bd', 12: '6bd', 13: '5bd', 14: '4bd', 15: '3bd', 16: '6bd', 17: '7bd', 18: '6bd', 19: '5bd', 20: '8bd', 21: '5bd', 22: '5bd', 23: '8bd', 24: '7bd', 25: '7bd', 26: '4bd', 27: '5bd', 28: '4bd'}, 'bathroom': {0: '9ba', 1: '6ba', 2: '3ba', 3: '6ba', 4: '10ba', 5: '2ba', 6: '15ba', 7: '5ba', 8: '9ba', 9: '2ba', 10: '9ba', 11: '4ba', 

In [258]:
details = {}
index = 0
for floorSpace  in  soup.findAll('div',attrs={'data-testid': 'property-floorSpace'}):
    details.update({index:floorSpace.text.strip()}) 
    index = index + 1

listings_json['floorSpace'] = {}
listings_json['floorSpace']  = details
print(listings_json)

{'price': {0: '$8,899,000', 1: '$3,199,000', 2: '$1,499,000', 3: '$3,599,000', 4: '$32,000,000', 5: '$1,249,000', 6: '$25,000,000', 7: '$2,299,000', 8: '$10,980,000', 9: '$300,000', 10: '$6,749,000', 11: '$1,649,000', 12: '$6,750,000', 13: '$3,940,000', 14: '$1,777,000', 15: '$299,500', 16: '$4,599,000', 17: '$22,995,000', 18: '$90,000', 19: '$3,950,000', 20: '$3,049,000', 21: '$10,850,000', 22: '$1,995,000', 23: '$1,699,000', 24: '$11,900,000', 25: '$12,850,000', 26: '$14,495,000', 27: '$799,900', 28: '$6,250,000', 29: '$2,049,000'}, 'bedroom': {0: '6bd', 1: '5bd', 2: '4bd', 3: '5bd', 4: '7bd', 5: '3bd', 6: '8bd', 7: '5bd', 8: '6bd', 9: '2bd', 10: '6bd', 11: '3bd', 12: '6bd', 13: '5bd', 14: '4bd', 15: '3bd', 16: '6bd', 17: '7bd', 18: '6bd', 19: '5bd', 20: '8bd', 21: '5bd', 22: '5bd', 23: '8bd', 24: '7bd', 25: '7bd', 26: '4bd', 27: '5bd', 28: '4bd'}, 'bathroom': {0: '9ba', 1: '6ba', 2: '3ba', 3: '6ba', 4: '10ba', 5: '2ba', 6: '15ba', 7: '5ba', 8: '9ba', 9: '2ba', 10: '9ba', 11: '4ba', 

In [259]:
details = {}
index = 0
for region  in  soup.findAll('div',attrs={'data-testid': 'property-region'}):
    details.update({index:region.text.strip()}) 
    index = index + 1

listings_json['region'] = {}
listings_json['region']  = details
print(listings_json)

{'price': {0: '$8,899,000', 1: '$3,199,000', 2: '$1,499,000', 3: '$3,599,000', 4: '$32,000,000', 5: '$1,249,000', 6: '$25,000,000', 7: '$2,299,000', 8: '$10,980,000', 9: '$300,000', 10: '$6,749,000', 11: '$1,649,000', 12: '$6,750,000', 13: '$3,940,000', 14: '$1,777,000', 15: '$299,500', 16: '$4,599,000', 17: '$22,995,000', 18: '$90,000', 19: '$3,950,000', 20: '$3,049,000', 21: '$10,850,000', 22: '$1,995,000', 23: '$1,699,000', 24: '$11,900,000', 25: '$12,850,000', 26: '$14,495,000', 27: '$799,900', 28: '$6,250,000', 29: '$2,049,000'}, 'bedroom': {0: '6bd', 1: '5bd', 2: '4bd', 3: '5bd', 4: '7bd', 5: '3bd', 6: '8bd', 7: '5bd', 8: '6bd', 9: '2bd', 10: '6bd', 11: '3bd', 12: '6bd', 13: '5bd', 14: '4bd', 15: '3bd', 16: '6bd', 17: '7bd', 18: '6bd', 19: '5bd', 20: '8bd', 21: '5bd', 22: '5bd', 23: '8bd', 24: '7bd', 25: '7bd', 26: '4bd', 27: '5bd', 28: '4bd'}, 'bathroom': {0: '9ba', 1: '6ba', 2: '3ba', 3: '6ba', 4: '10ba', 5: '2ba', 6: '15ba', 7: '5ba', 8: '9ba', 9: '2ba', 10: '9ba', 11: '4ba', 

In [261]:
with open('house_details.json', 'w') as outfile:
    json.dump(listings_json, outfile, indent=4)

In [263]:
listings_table = pd.read_json("house_details.json")

In [264]:
listings_table

Unnamed: 0,price,bedroom,bathroom,floorSpace,region
0,"$8,899,000",6bd,9ba,"10,565 sqft","The Oaks, Calabasas, CA"
1,"$3,199,000",5bd,6ba,"4,997 sqft","Calabasas, CA"
2,"$1,499,000",4bd,3ba,"3,419 sqft","Calabasas, CA"
3,"$3,599,000",5bd,6ba,"5,676 sqft","Hidden Hills, CA"
4,"$32,000,000",7bd,10ba,"16,570 sqft","The Oaks, Calabasas, CA"
5,"$1,249,000",3bd,2ba,"1,805 sqft","Calabasas, CA"
6,"$25,000,000",8bd,15ba,"24,491 sqft","Calabasas, CA"
7,"$2,299,000",5bd,5ba,"5,300 sqft","Calabasas, CA"
8,"$10,980,000",6bd,9ba,"12,953 sqft","The Oaks, Calabasas, CA"
9,"$300,000",2bd,2ba,"1,632 sqft","Calabasas, CA"


In [152]:
# This code block will get you the price and the currency of the listed property

for scripts in soup.findAll('script',attrs={'type': 'application/ld+json'}):                            
    details_json = ast.literal_eval(scripts.text.strip())
    
product_json['price'] = {}
product_json['price']['amount'] = details_json['offers']['price']
product_json['price']['currency'] = details_json['offers']['priceCurrency']

print(product_json)

SyntaxError: unexpected EOF while parsing (<unknown>, line 0)

In [155]:
divs = soup.findAll('div',attrs={'data-auto-test-id': 'home-details-overview'})
print(divs)

[]


In [154]:
# This code block will get you the detailed description of the the listed property

for paragraph in soup.findAll('p', attrs={'id': 'propertyDescription'}):
    product_json['broad-description'] = paragraph.text.strip()
product_json['overview'] = []

# This code block will get you the important points regarding the listed property

for divs in soup.findAll('div',attrs={'data-auto-test-id': 'home-details-overview'}):
    for divs_second in divs.findAll('div'):
        for uls in divs_second.findAll('ul'):
            for lis in uls.findAll('li', text=True, recursive=False):
                product_json['overview'].append(lis.text.strip())

# Creates a json file with all the information that you extracted

with open('house_details.json', 'w') as outfile:
    json.dump(product_json, outfile, indent=4)

# Creates an html file in your local with the html content of the page you parsed.

with open('output_file.html', 'wb') as file:
    file.write(html)

print ('----------Extraction of data is complete. Check json file.----------')

----------Extraction of data is complete. Check json file.----------
