In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
import sys
import chromedriver_binary
import re
from datetime import datetime

In [2]:
def get_property_html(driver, link):
    driver.switch_to.window(driver.window_handles[0])
    driver.get(link)
    time.sleep(10)
    soup = BeautifulSoup(driver.page_source, "lxml")
    return soup

# Example
# soup = get_property_html(driver, 'https://www.airbnb.com/')

In [3]:
def get_attrs_1_4(div_list):
    attrs_dict = {}
    res_dict = {'guests' : re.compile(r'^\d+ guests'),
                'br' : re.compile(r'^\d+ bedrooms?'),
                'beds' : re.compile(r'^\d+ beds?'),
                'baths' : re.compile(r'^(\d+|\d+\.\d+) baths?')}
    for div in div_list:
        for key, val in res_dict.items():
            if val.match(div.text):
                attrs_dict[key] = div.text
    if not 'br' in attrs_dict:
        attrs_dict['br'] = 'studio'
    return attrs_dict

# Example
# get_attrs_1_4(soup.find_all('div', {'class' : '_czm8crp'}))

In [4]:
def get_price(span):
    
    # If span is None then the following line returns None
    # otherwise it returns the span.text
    return span and span.text

# Example
# get_price(soup.find('span', {'class' : '_doc79r'}))
# <span class="_1p0spma2">

In [5]:
def get_attrs_5_6(spans):
    attrs_dict = {}
    attrs_dict['clean_fee'] = 0
    spans_len = len(spans)
    for idx, span in enumerate(spans):
        if span.text == 'Cleaning fee' and idx + 1 <= spans_len:
            attrs_dict['clean_fee'] = spans[idx + 1].text
        elif span.text == 'Service fee' and idx + 1 <= spans_len:
            attrs_dict['serv_fee'] = spans[idx + 1].text
        else:
            pass
    return attrs_dict

# Eample
# get_attrs_5_6(soup.find_all('span', {'class' : '_1jlnvra2'}))

In [6]:
def get_amenities(buttons):
    amentities = ''
    ams = re.compile(r'^Show all \d+ amenities')
    for button in buttons:
        if ams.match(button.text):
            amenities = button.text
    return amenities

# Example
# get_amenities(soup.find_all('button', {'class' : '_b0ybw8s'}))

In [7]:
def get_lat_lon(scripts):
    lat_lon = ''
    ll = re.compile(r'ViewportInfoService.GetViewportInfo')
    parse_ll = re.compile(r';1d47\.\d{14}&amp;2d-122\.\d{14}')
    for script in scripts:
        if ll.search(script.attrs['src']):
            lat_lon = script.attrs['src']
    return lat_lon

# Example
# get_lat_lon(soup.find_all('script',  {'charset' : "UTF-8"}))

In [8]:
def get_superhost(spans):
    superhost = False
    sh = re.compile(r'^\w+ is a Superhost')
    for span in spans:
        if sh.match(span.text):
            superhost = True
    return superhost

# Example
# get_superhost(soup.find_all('span', {'class' : "_1p3joamp"}))

In [9]:
def get_narrative(s_driver, x_path):
    return s_driver.find_element_by_xpath(x_path).text

In [10]:
def get_rules(s_driver, x_path):
    return driver.find_element_by_xpath(x_path).text

In [11]:
def get_all(html_soup, s_driver, property_id):
    d1_4 = get_attrs_1_4(html_soup.find_all('div', {'class' : '_czm8crp'}))
    d5_6 = get_attrs_5_6(html_soup.find_all('span', {'class' : '_1jlnvra2'}))
    ams = get_amenities(html_soup.find_all('button', {'class' : '_b0ybw8s'}))
    sh = get_superhost(html_soup.find_all('span', {'class' : "_1p3joamp"}))
    latlon = get_lat_lon(html_soup.find_all('script',  {'charset' : "UTF-8"}))
    price = get_price(html_soup.find('span', {'class' : '_doc79r'})) or get_price(html_soup.find('span', {'class' : '_1p0spma2'}))
    narr = get_narrative(s_driver, PATHS['narrative'])
    rules = get_rules(s_driver, PATHS['rules'])
    return [property_id, d1_4['guests'], d1_4['br'], d1_4['beds'], d1_4['baths'],
            d5_6['clean_fee'], d5_6['serv_fee'], ams, sh, latlon, narr, rules, price]

In [12]:
PATHS = {'narrative' : '//*[@id="details"]/div',
         'rules' : '//*[@id="house-rules"]/div/section'}

In [13]:
lnks = pd.read_csv('../data/links_bogota_1004.csv')
lnks.head()

Unnamed: 0,link,id
0,https://www.airbnb.com/rooms/38238821?check_in...,38238821
1,https://www.airbnb.com/rooms/31941600?check_in...,31941600
2,https://www.airbnb.com/rooms/32079582?check_in...,32079582
3,https://www.airbnb.com/rooms/34259803?check_in...,34259803
4,https://www.airbnb.com/rooms/10823924?check_in...,10823924


In [14]:
# Create Selenium chrome browser driver instance
driver = webdriver.Chrome()
time.sleep(5)
window = driver.window_handles[0]

In [15]:
data = []
ouch = []
a_counter = 0
for idx, row in lnks.loc[:].iterrows():
    try:
        data.append(get_all(get_property_html(driver, row['link']), driver, row['id']))
    except Exception as ex:
        ouch.append((row['link'], ex))
        print('Count:', a_counter, '--Length of errors array:', len(ouch))
        

Count: 0 --Length of errors array: 1
Count: 0 --Length of errors array: 2
Count: 0 --Length of errors array: 3
Count: 0 --Length of errors array: 4
Count: 0 --Length of errors array: 5
Count: 0 --Length of errors array: 6
Count: 0 --Length of errors array: 7
Count: 0 --Length of errors array: 8
Count: 0 --Length of errors array: 9
Count: 0 --Length of errors array: 10
Count: 0 --Length of errors array: 11
Count: 0 --Length of errors array: 12
Count: 0 --Length of errors array: 13
Count: 0 --Length of errors array: 14
Count: 0 --Length of errors array: 15
Count: 0 --Length of errors array: 16
Count: 0 --Length of errors array: 17
Count: 0 --Length of errors array: 18
Count: 0 --Length of errors array: 19
Count: 0 --Length of errors array: 20
Count: 0 --Length of errors array: 21
Count: 0 --Length of errors array: 22
Count: 0 --Length of errors array: 23
Count: 0 --Length of errors array: 24
Count: 0 --Length of errors array: 25
Count: 0 --Length of errors array: 26
Count: 0 --Length of 

In [None]:
ouch

In [None]:
get_all(get_property_html(driver, 'https://www.airbnb.com/rooms/23547665?check_in=2019-12-22&check_out=2019-12-28&previous_page_section_name=1000'), driver, row['id'])

In [None]:
#'//*[@id="house-rules"]/div/section'
stew = get_property_html(driver, 'https://www.airbnb.com/rooms/23547665?check_in=2019-12-22&check_out=2019-12-28&previous_page_section_name=1000')

In [None]:
print(stew.find_all('span', {'class' : '_1jlnvra2'}))
spans = stew.find_all('span', {'class' : '_1jlnvra2'})
attrs_dict = {}
attrs_dict['clean_fee'] = 0
spans_len = len(spans)
for idx, span in enumerate(spans):
    if span.text == 'Cleaning fee' and idx + 1 <= spans_len:
        attrs_dict['clean_fee'] = spans[idx + 1].text
    elif span.text == 'Service fee' and idx + 1 <= spans_len:
        attrs_dict['serv_fee'] = spans[idx + 1].text
    else:
        pass
attrs_dict

In [None]:
driver.find_element_by_xpath('//*[@id="house-rules"]/div/section').text

In [16]:
len(data)

869

In [17]:
data[0]

[38238821,
 '3 guests',
 '1 bedroom',
 '1 bed',
 '2 baths',
 0,
 '$212',
 'Show all 18 amenities',
 True,
 'https://maps.googleapis.com/maps/api/js/ViewportInfoService.GetViewportInfo?1m6&1m2&1d4.649672404335915&2d-74.23008880920821&2m2&1d4.74248621279336&2d-74.07440113050541&2u14&4sen&5e0&6sm%40487000000&7b0&8e0&callback=_xdc_._x80rev&key=AIzaSyAytC_TusuhG7kpNQ19hMrCzXDIUjd307o&token=56152',
 'Translate this description to English\nEs un apartamento con muy buena distribución y con todo lo necesario Para que su alojamiento sea lo más cómodo y agradable ,con una habitación y su baño privado,en la sala ,un sofácama y dos asientos con sus mesas de reserva y el comedor para tres o cuatro puestos,la cocina con todos sus elementos para preparar todos sus alimentos como si estuvieran en su propia cocina,en cuanto a la limpieza y el orden ,se facilita por la clase de piso y material con que está construido\nRead more about the space\nContact host',
 'House rules\nCheck-in time is 3PM - 12AM (

In [19]:
lnk_data = pd.DataFrame(data, columns=['id', 'guests', 'bedrooms', 'beds', 'baths', 'cleaning_fee', 'service_fee', 'amenities', 'superhost', 'lat_lon', 'narrative', 'rules', 'price'])

In [20]:
lnk_data.to_csv('../data/initial_bogota_data_scrape-{}.csv'.format(datetime.strftime(datetime.now(), '%m-%d-%y--%H-%M-%S'), index=False))

In [None]:
datetime.strftime(datetime.now(), '%m-%d-%y--%H-%M-%S')

In [None]:
lnk_data.head()

In [None]:
get_all(ht, driver, '1234')

In [None]:
soup = get_property_html(driver, 'https://www.airbnb.com/rooms/34259803?check_in=2019-12-22&check_out=2019-12-28&source_impression_id=p3_1570219189_6Bgm%2BKK1fr1REjBE&s=ugSl5Kgp')
# <span class="_czm8crp"><span>A perfect place to rest after a long day of work or visit. The apartment is idealy located, close to the historical center, at 15 mins by car of the airport, and even closer to Downtown Bogota, at 15 mins walking of the American Embassy and at 300m of Corferias (Bogota's fair center).</span><br><span>Another plus of the apartement is the private 15 hectares park you will enjoy during your stay. It's one of the biggest green sanctuary of the city. Trees, birds, grass etc..., it's something truly unique.</span></span>


In [None]:
soup.find_all('span', {'class' : '_czm8crp'})

In [None]:
driver.find_element_by_xpath('//*[@id="details"]/div').text

In [None]:
driver.find_element_by_xpath('//*[@id="house-rules"]/div/section/div[3]/div[1]/div/div/div[2]/div[2]/div/p/span').text

In [None]:
get_rules(driver, '//*[@id="house-rules"]/div/section/div[3]/div[1]/div/div/div[2]/div[2]/div/p/span')