In [22]:
import sys
import json
import requests
import time
import argparse
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from urllib.request import urlretrieve 
import pymongo
import pandas as pd
import numpy as np
import random

# Initiate MongoDB

In [2]:
mc = pymongo.MongoClient()  # Connect to the MongoDB server using default settings
db = mc['demo_airbnb']  # Use (or create) a database called 'election_predictions'
collection_NY = db['NewYork']  # Use (or create) a collection called 'docs'

# Parse urls for apartments

In [2]:
def request_url( url, retries=3 ):
    """
    GET a page from url with a number of retries and exponential backoff.
    """
    print("request_url:{}".format(url))
    r = None
    wait = 2
    for i in range( retries ):
        r = requests.get( url )
        if r.status_code >= 400:
            print( "HTTP error: %d %s" % (r.status_code, r.reason) )
            print( r.text )
            
            # exponential backoff
            time.sleep( wait )
            wait *= 2
        else:
            break
    if r == None:
        raise CannotOpenUrl
    else:
        return r 


In [3]:
def parse_urls_single_page(soup):
    '''
    Input: BeautifulSoup obj
    Output:list of bnb apartments webpages
    '''
    meta_info = {}
    taglist = soup.find_all('a', attrs={'class': '_15ns6vh'}) 
    for tag in taglist:
        sub_dict = {}        
        sub_dict['href'] = tag.get('href')        
        sub_dict['name'] = tag.select('div._1rths372')[0].get_text() 
        sub_dict['review_counts'] = int(tag.select('span._ulku2jm')[0].get_text())
        meta_info[tag.get('target')] = sub_dict
    return meta_info
    

In [4]:
base_url = "https://www.airbnb.com/s"
city = "New York"
State = "NY"
Country = "United States"

page_url = base_url+"/"+"city".replace(' ','-')+'--'+State+'--'+Country.replace(' ','-')



In [24]:
r = request_url(page_url)

request_url:https://www.airbnb.com/s/city--NY--United-States


In [25]:
soup = BeautifulSoup(r.content, 'html.parser')

In [26]:
url_list_1 = parse_urls_single_page(soup)

In [124]:
url_list_1

{'listing_10016353': {'href': '/rooms/10016353?location=city%2C%20NY%2C%20United%20States',
  'name': 'Cozy Orchard St. Bedroom in the Lower East Side',
  'review_counts': 112},
 'listing_14014541': {'href': '/rooms/14014541?location=city%2C%20NY%2C%20United%20States',
  'name': 'Bright Room in Bedstuy One Block To Metro',
  'review_counts': 71},
 'listing_14014629': {'href': '/rooms/14014629?location=city%2C%20NY%2C%20United%20States',
  'name': 'Cosy Room One Stop to Metro',
  'review_counts': 66},
 'listing_14293057': {'href': '/rooms/14293057?location=city%2C%20NY%2C%20United%20States',
  'name': 'Apartment with roofterrace in downtown Manhattan',
  'review_counts': 15},
 'listing_15951388': {'href': '/rooms/15951388?location=city%2C%20NY%2C%20United%20States',
  'name': 'LARGE SUNNY ROOM/ WINDOW CITY VIEW OF MANHATTAN',
  'review_counts': 151},
 'listing_15990320': {'href': '/rooms/15990320?location=city%2C%20NY%2C%20United%20States',
  'name': 'Big Blue Room in Bushwick - 20 Mins

# Parse an apartment's webpage

In [5]:
def get_bootstrap_data_for_hypernova_key( body, hypernova_key ):
    """
    Extract bootstrap JSON data from a page body.
    No idea what hypernova is, but it sure requires some useful JSON.
    """
    soup = BeautifulSoup( body, "html.parser" )
    for tag in soup.find_all( "script", attrs={ "data-hypernova-key" : hypernova_key } ):
        s = tag.string
        if "bootstrapData" in s:
            # HACK! remove html comment just by truncating string.
            # will need to change if sting in html changes
            return json.loads( s[4:-3] )

# Download Apart Basic Info

In [6]:
def parse_each_webpage(page_url):
    try:
        r = request_url(page_url)
    except CannotOpenUrl:
        raise CannotOpenUrl
    else:
        buf = r.text
        page_data = get_bootstrap_data_for_hypernova_key( buf, "spaspabundlejs" )
    
    try:
    #get photos
        photos = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                                                        ['listing']['photos']
        photo_urls = []
        for pho in photos:
            if 'large' in pho:
                photo_urls.append(pho['large'])
            else:
                photo_urls.append(pho['large_cover'])


        #get Id & Canonical url & Room_type
        Id = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']['listingId'] 
        canonical_url = page_data['bootstrapData']['canonical_url']
        room_type = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['room_type_category']
        room_capacity = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['person_capacity']

        #get location info
        localized_city = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                    ['listing']['localized_city']
        country_code = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                    ['listing']['country_code']
        coordinate = [page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                    ['listing']['lat'],page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                    ['listing']['lng']]

        #get host Info
        host_about = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']['listing']\
                ['primary_host']['about']
        host_name = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['primary_host']['host_name']
        host_id = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['primary_host']['id']
        host_member_since = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['primary_host']['member_since']
        host_member_profile = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['primary_host']['profile_path']

        #get overal rating
        overall_rating = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['star_rating']
        review_highlight = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                ['listing']['review_highlight']


        #form up returns
        info_collected = {
                            'id':Id,
                            'canonical_url':canonical_url,
                            'room_type':room_type,
                            'room_capacity':room_capacity,
                            'localized_city':localized_city,
                            'country_code':country_code,
                            'coordinate':coordinate,
                            'host_about':host_about,
                            'host_name':host_name,
                            'host_id':host_id,
                            'host_member_since':host_member_since,
                            'host_member_profile':host_member_profile,
                            'overall_rating':overall_rating,
                            'review_highlight':review_highlight, 
                            'photo_urls':photo_urls
                    }
    except:
        return [], page_data
            
    return info_collected, page_data
    

In [7]:
folder = 'Austin- Texas- United States'
file_tmp = pd.read_csv('DataSet/'+folder+'/listings.csv.gz',compression = 'gzip')
print('Total number of apartments in ' + folder +" is: {}".format(len(file_tmp)))
file_tmp.describe()

Total number of apartments inAustin- Texas- United States is: 9663


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,scrape_id,host_id,host_acceptance_rate,host_listings_count,host_total_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,zipcode,latitude,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,reviews_per_month
count,9663.0,9663.0,9663.0,0.0,9661.0,9661.0,9663.0,0.0,9551.0,9663.0,...,5943.0,5933.0,5933.0,5919.0,5933.0,5915.0,5912.0,1.0,9663.0,6007.0
mean,9655724.0,20170310000000.0,29076230.0,,8.62478,8.62478,78722.467764,,78724.126688,30.276059,...,95.848898,9.735378,9.587055,9.866363,9.872409,9.633643,9.573241,32041660000.0,3.673704,1.359161
std,5702724.0,4.090055,31110860.0,,43.569728,43.569728,20.714283,,190.369965,0.054293,...,6.554311,0.681435,0.858134,0.524057,0.503931,0.697731,0.779787,,9.636083,1.675813
min,1078.0,20170310000000.0,23.0,,0.0,0.0,78701.0,,78218.0,30.114882,...,20.0,2.0,2.0,2.0,2.0,2.0,2.0,32041660000.0,1.0,0.01
25%,4849021.0,20170310000000.0,4689866.0,,1.0,1.0,78704.0,,78704.0,30.244039,...,94.0,10.0,9.0,10.0,10.0,9.0,9.0,32041660000.0,1.0,0.24
50%,10367680.0,20170310000000.0,17383940.0,,1.0,1.0,78721.0,,78721.0,30.266359,...,98.0,10.0,10.0,10.0,10.0,10.0,10.0,32041660000.0,1.0,0.74
75%,15188280.0,20170310000000.0,44270750.0,,2.0,2.0,78745.0,,78745.0,30.298177,...,100.0,10.0,10.0,10.0,10.0,10.0,10.0,32041660000.0,2.0,1.83
max,17590940.0,20170310000000.0,119516200.0,,843.0,843.0,78759.0,,97202.0,30.506688,...,100.0,10.0,10.0,10.0,10.0,10.0,10.0,32041660000.0,73.0,12.0


In [None]:
logfile_name = 'DataSet/'+folder+'/log_file.txt'

apt_infos = {}
apt_jses = {}

In [17]:
# for i in range(len(file_tmp)):
for i in range(200,500):
    print(i)
    url = file_tmp.loc[i,'listing_url']
    apt_id = file_tmp.loc[i,'id']
    apt_name = file_tmp.loc[i,'name']
    try:
        return_parsed = parse_each_webpage(url)
    except CannotOpenUrl:
        
        with open(logfile_name,"w") as f:
            f.writelines('Cannot open '+ str(i)+ ' url.\t' + 'Id: '+ str(apt_id) + ' |  Url: '+ url)                        
        print('Failed to get access to URL, sleep for 20 secs ')
        time.sleep(20)
        return_parsed = parse_each_webpage(url)
    finally:
        scrape_info,ori_json = return_parsed[0],return_parsed[1]
        dict_apt = {'apt_name':apt_name,'url' : url, 'info': scrape_info}
        apt_jses.update({str(apt_id):ori_json})
        apt_infos.update({str(apt_id):dict_apt})
        if i % 100 == 0 and i != 0:
            filename = "DataSet/"+folder+"//ori_json_"+str(i)
            with open(filename,"w") as f:
                json.dump(apt_jses,f)
            print("Wrote  down"+str(i)+"json file")
            jses = {}
        time.sleep(random.random()+2)

200
request_url:https://www.airbnb.com/rooms/16835586
Wrote  down200json file
201
request_url:https://www.airbnb.com/rooms/17220777
202
request_url:https://www.airbnb.com/rooms/16998086
203
request_url:https://www.airbnb.com/rooms/15455621
204
request_url:https://www.airbnb.com/rooms/11719566
205
request_url:https://www.airbnb.com/rooms/11959131
206
request_url:https://www.airbnb.com/rooms/15706510
207
request_url:https://www.airbnb.com/rooms/12847393
208
request_url:https://www.airbnb.com/rooms/4794526
209
request_url:https://www.airbnb.com/rooms/967091
210
request_url:https://www.airbnb.com/rooms/1805671
211
request_url:https://www.airbnb.com/rooms/7926452
212
request_url:https://www.airbnb.com/rooms/15582726
213
request_url:https://www.airbnb.com/rooms/5473833
214
request_url:https://www.airbnb.com/rooms/6797353
215
request_url:https://www.airbnb.com/rooms/14709769
216
request_url:https://www.airbnb.com/rooms/5511448
217
request_url:https://www.airbnb.com/rooms/8828449
218
request_u

353
request_url:https://www.airbnb.com/rooms/5387815
354
request_url:https://www.airbnb.com/rooms/5256761
355
request_url:https://www.airbnb.com/rooms/11475165
356
request_url:https://www.airbnb.com/rooms/8067300
357
request_url:https://www.airbnb.com/rooms/17046317
358
request_url:https://www.airbnb.com/rooms/3513732
359
request_url:https://www.airbnb.com/rooms/13606650
360
request_url:https://www.airbnb.com/rooms/17160899
361
request_url:https://www.airbnb.com/rooms/9183548
362
request_url:https://www.airbnb.com/rooms/17120213
363
request_url:https://www.airbnb.com/rooms/10551854
364
request_url:https://www.airbnb.com/rooms/14885254
365
request_url:https://www.airbnb.com/rooms/13452319
366
request_url:https://www.airbnb.com/rooms/13185122
367
request_url:https://www.airbnb.com/rooms/5386336
368
request_url:https://www.airbnb.com/rooms/5481502
369
request_url:https://www.airbnb.com/rooms/918416
370
request_url:https://www.airbnb.com/rooms/7093624
371
request_url:https://www.airbnb.com

In [21]:
filename = "DataSet/"+folder+"//webscrapted"
folder = 'Austin- Texas- United States'
with open(filename,"w") as f:
    json.dump(apt_infos,f)
    print("writing down")

writing down


In [20]:
filename = "DataSet/"+folder+"//ori_json"
folder = 'Austin- Texas- United States'
with open(filename,"w") as f:
    json.dump(apt_jses,f)
    print("writing down")

writing down


# Stash

## Parse a single searching page and get some information 

In [93]:
base_url_aparts = "https://www.airbnb.com/"

# url_apart = 'https://www.airbnb.com/rooms/10016353?location=city%2C%20NY%2C%20United%20States'
url_apart = 'https://www.airbnb.com/rooms/10016353'

r2 = request_url(url_apart)

buf= r2.text

Get some information(all top_destinations from JSON file)

page_data = get_bootstrap_data_for_hypernova_key( buf, "spaspabundlejs" )

scrape_info = parse_each_webpage(url_apart)

scrape_info

top_destinations = page_data['bootstrapData']['all_top_destinations']

with open("data/top_destinations.json","w") as f:
    json.dump(top_destinations,f)
    print("writing")

with open("data/top_destinations.json",'r') as load_f:
    load_dict = json.load(load_f)
    print(load_dict)

with open("data/webpage.json","w") as f:
    json.dump(page_data,f)
    print("writing")

## Script for json parsing

In [21]:
photo = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
                                                    ['listing']['photos']
photo_urls = []
for pho in photo:
    if 'large' in pho:
        photo_urls.append(pho['large'])
    else:
        photo_urls.append(pho['large_cover'])


In [22]:
photo = page_data['bootstrapData']['reduxData']

In [24]:
#get Id & Canonical url & Room_type
Id = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']['listingId'] 
canonical_url = page_data['bootstrapData']['canonical_url']
room_type = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['room_type_category']
room_capacity = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['person_capacity']

In [207]:
#get location info
localized_city = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['localized_city']
country_code = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['country_code']
coordinate = [page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['lat'],page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['lng']]


In [208]:
print(localized_city)
print('------------\n')
print(country_code)
print('------------\n')
print(coordinate)
print('------------\n')

New York
------------

US
------------

[40.71834541711498, -73.99138321378771]
------------



In [198]:
#get host Info
host_about = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']['listing']\
            ['primary_host']['about']
host_name = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['primary_host']['host_name']
host_id = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['primary_host']['id']
host_member_since = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['primary_host']['member_since']
host_member_profile = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['primary_host']['profile_path']

In [199]:
print(host_about)
print('--------------\n' )
print(host_name)
print('--------------\n' )
print(host_id)
print('--------------\n' )
print(host_member_since)
print('--------------\n' )
print(host_member_profile)

Ibanker age 28 that is laid back, artistic, and open-minded. Always looking for the next opportunity, exploring new places, meeting new people, and making new friends! 

I grew up in Hawaii and Massachusetts went to College in DC and London. World traveler and frequent airbnb user. My favorite places are France, Belgium, and the UK. I go back every year! 
--------------

D
--------------

16664377
--------------

June 2014
--------------

/users/show/16664377


In [189]:
# get overall ratings

In [195]:
overall_rating = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['star_rating']
review_highlight = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['review_highlight']

## do some json travers

In [302]:
def traverse_json_file(jstr,t_str,path =[]):
    
    if not isinstance(jstr,dict):
#         print("---")
#         print(jstr)
#         print("is not a dict")        
        path.pop()
        return False
    if len(jstr) == 0:
#         print('len is zero')
        path.pop()
        return False

    if t_str in jstr:
        print('find t_str!')
#         print(path)
        print('----------\n')
#         print(jstr[t_str])
        print(path)
        return True
    set_1 = False
    for k,v in jstr.items():
        
        path.append(k)
        set_1 = set_1 or traverse_json_file(v,t_str,path) 
    if set_1:
#         print(path)
        return True
    else:
        path.pop()
        return False
            
#     if isinstance(jstr,list):
#     if isinstance(jstr,np.array):


In [305]:
z = traverse_json_file(page_data,'listing')

find t_str!
----------

['bootstrapData', 'reduxData', 'homePDP', 'all_top_destinations', 'javascript_paths', 'stylesheet_paths', 'asset_paths', 'signup_login_urls', 'has_p2_bootstrap_data', 'i18n-init', 'map_provider', 'is_mobile', 'inspectlet_data', 'p2_ethnio', 'p2_p3_show_from_price_v3_assignment', 'show_ib_filter_panel', 'p2_sidebar_max_width', 'cn_show_description_language_filter', 'show_employee_host_filter', 'launch_infants_v2', 'p2_marker_image_path', 'dls_filters', 'p2_show_webcot_listing_cards', 'p2_currency', 'request_host', 'p2_display_location', 'show_from_min_available_price', 'show_include_service_fee_but_tax_disclaimer', 'show_family_preferred', 'business_travel_welcome_modal_hash', 'p2_recently_viewed_listings', 'p2_recently_viewed_listings_force', 'has_p1_bootstrap_data', 'luxury_pre_launch', 'satori_autocomplete_query_web_force', 'has_luxury_bootstrap_data', 'luxury_pre_launch_dev', 'luxury_pre_launch_polling', 'has_itinerary_bootstrap_data', 'itinerary.phase_1.roll

# Get reviews

In [174]:
host_name = page_data['bootstrapData']['reduxData']['homePDP']['listingInfo']\
            ['listing']['primary_host']['badges']

In [176]:
Host_name

[{'count': 274,
  'id': 'reviews',
  'image_path': None,
  'image_size': None,
  'label': '274 Reviews',
  'link': '/users/show/16664377#reviews'},
 {'count': None,
  'id': 'verified',
  'image_path': 'badges/verified_badge.png',
  'image_size': '32x32',
  'label': 'Verified',
  'link': None}]

In [57]:
section = page_data["bootstrapData"]["reduxData"]["exploreTab"]["response"]["explore_tabs"][0]["sections"][0]

items_offset = page_data["bootstrapData"]["reduxData"]["exploreTab"]["response"]["explore_tabs"][0]["pagination_metadata"]["items_offset"]

KeyError: 'explore_tabs'

In [58]:
section_tmp = page_data["bootstrapData"]["reduxData"]

In [43]:
type(soup.prettify())

str

In [59]:
section_tmp

{'exploreTab': {'fetchError': None,
  'loading': False,
  'loadingMore': False,
  'response': {},
  'responseFilters': {},
  'tabSectionOffsets': {'home_tab': None}},
 'tabMetadata': {}}

In [65]:
results_df = pd.DataFrame([x for x in page_data['bootstrapData']])
results_df

Unnamed: 0,0
0,webcot
1,spa_home
2,select_data
3,spa_header
4,spa_footer
5,path
6,query
7,best_guess_screen_size
8,canonical_url
9,spa_hero


In [12]:
with open('buf.txt','w') as f:
    f.write(page_data)

TypeError: write() argument must be str, not dict

In [35]:
import pandas as pd
results_df = pd.DataFrame([x for x in section])


In [36]:
results_df

Unnamed: 0,0
0,responseFilters
1,response
2,loading
3,loadingMore
4,fetchError
5,tabSectionOffsets


In [23]:
urlretrieve(IMAGE_URL, 'img1.jpg')    

('img1.jpg', <http.client.HTTPMessage at 0x7f157405d668>)

In [21]:
IMAGE_URL = 'https://a0.muscache.com/airbnb/static/account_verification/front_of_id_context_web-identity-cfc107f13c39559d1fba3dd8fef6cc15.svg'

# Load in saved files

In [86]:
with open('DataSet/Asheville-NorthCarolina-US/webscrapted','r') as load_f:
    load_dict = json.load(load_f)
    print(load_dict)

{'3990524': {'apt_name': 'Sweet room in W. Asheville bungalow', 'url': 'https://www.airbnb.com/rooms/3990524', 'info': {'id': 3990524, 'canonical_url': 'https://www.airbnb.com/rooms/3990524', 'room_type': 'private_room', 'room_capacity': 1, 'localized_city': 'Asheville', 'country_code': 'US', 'coordinate': [35.58006310161262, -82.58663285575625], 'host_about': 'I spent my entire adult life in theatre as an actress, director and producer. Nearly 20 years of that I created original works with objects and puppets. My work has been seen at The Kennedy Center, the Henson International Festival in N.Y., UCLA, and the Walker Arts Center, to name a few.  I tired of the touring life however and am now making beautiful handmade art lamps inspired by a 2 year stay in Korea.  I love my hours in the studio which is located in Asheville\'s River Arts District!  I must still travel some to get my wares out into the public, but with a studio in the district, many travelers come my way. \r\n\r\nI love 