In [1]:
# Imports
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
import sys
import chromedriver_binary
import re
from datetime import datetime

In [2]:
def get_property_html(driver, link):
    """Retrieves a webpage, creates a Beatifulsoup object from the page source 
    and returns the beautifulsouop object."""
    driver.switch_to.window(driver.window_handles[0])
    driver.get(link)
    time.sleep(10)
    soup = BeautifulSoup(driver.page_source, "lxml")
    return soup

# Example
# soup = get_property_html(driver, 'https://www.airbnb.com/')

In [3]:
def get_attrs_1_4(div_list):
    """Pulls the number of guests, bedrooms, beds, and bathrooms out of a list
    of beautiful soup objects. Returns a dictionary of the results."""
    attrs_dict = {}
    res_dict = {'guests' : re.compile(r'^\d+ guests'),
                'br' : re.compile(r'^\d+ bedrooms?'),
                'beds' : re.compile(r'^\d+ beds?'),
                'baths' : re.compile(r'^(\d+|\d+\.\d+) baths?')}
    for div in div_list:
        for key, val in res_dict.items():
            if val.match(div.text):
                attrs_dict[key] = div.text
    if not 'br' in attrs_dict:
        attrs_dict['br'] = 'studio'
    return attrs_dict

# Example
# get_attrs_1_4(soup.find_all('div', {'class' : '_czm8crp'}))

In [4]:
def get_price(span):
    """Returns the price of a rental."""
    return span and span.text

# Example
# get_price(soup.find('span', {'class' : '_doc79r'}))
# <span class="_1p0spma2">

In [5]:
def get_attrs_5_6(spans):
    """Returns the cleaning fees, and servie fee from the list of span tags
    passed in."""
    attrs_dict = {}
    attrs_dict['clean_fee'] = 0
    spans_len = len(spans)
    for idx, span in enumerate(spans):
        if span.text == 'Cleaning fee' and idx + 1 <= spans_len:
            attrs_dict['clean_fee'] = spans[idx + 1].text
        elif span.text == 'Service fee' and idx + 1 <= spans_len:
            attrs_dict['serv_fee'] = spans[idx + 1].text
        else:
            pass
    return attrs_dict

# Eample
# get_attrs_5_6(soup.find_all('span', {'class' : '_1jlnvra2'}))

In [6]:
def get_amenities(buttons):
    """Caputes the number of amenities that a listing has, and returns it."""
    amentities = ''
    ams = re.compile(r'^Show all \d+ amenities')
    for button in buttons:
        if ams.match(button.text):
            amenities = button.text
    return amenities

# Example
# get_amenities(soup.find_all('button', {'class' : '_b0ybw8s'}))

In [7]:
def get_lat_lon(scripts):
    """Captures the altitude and longitude data from the list of script objects passed ot it."""
    lat_lon = ''
    ll = re.compile(r'ViewportInfoService.GetViewportInfo')
    parse_ll = re.compile(r';1d47\.\d{14}&amp;2d-122\.\d{14}')
    for script in scripts:
        if ll.search(script.attrs['src']):
            lat_lon = script.attrs['src']
    return lat_lon

# Example
# get_lat_lon(soup.find_all('script',  {'charset' : "UTF-8"}))

In [8]:
def get_superhost(spans):
    """Returns the superhost status of the listing owner."""
    superhost = False
    sh = re.compile(r'^\w+ is a Superhost')
    for span in spans:
        if sh.match(span.text):
            superhost = True
    return superhost

# Example
# get_superhost(soup.find_all('span', {'class' : "_1p3joamp"}))

In [9]:
def get_narrative(s_driver, x_path):
    """Returns the text from the listing narrative."""
    return s_driver.find_element_by_xpath(x_path).text

In [10]:
def get_rules(s_driver, x_path):
    """Returns the text of the house rules for a listing."""
    return driver.find_element_by_xpath(x_path).text

In [11]:
def get_all(html_soup, s_driver, property_id):
    """Returns a list off all data fields by calling the functions that gather the
    individual data elements."""
    d1_4 = get_attrs_1_4(html_soup.find_all('div', {'class' : '_czm8crp'}))
    d5_6 = get_attrs_5_6(html_soup.find_all('span', {'class' : '_1jlnvra2'}))
    ams = get_amenities(html_soup.find_all('button', {'class' : '_b0ybw8s'}))
    sh = get_superhost(html_soup.find_all('span', {'class' : "_1p3joamp"}))
    latlon = get_lat_lon(html_soup.find_all('script',  {'charset' : "UTF-8"}))
    price = get_price(html_soup.find('span', {'class' : '_doc79r'})) or get_price(html_soup.find('span', {'class' : '_1p0spma2'}))
    narr = get_narrative(s_driver, PATHS['narrative'])
    rules = get_rules(s_driver, PATHS['rules'])
    return [property_id, d1_4['guests'], d1_4['br'], d1_4['beds'], d1_4['baths'],
            d5_6['clean_fee'], d5_6['serv_fee'], ams, sh, latlon, narr, rules, price]

In [12]:
# Set some xpath paths
PATHS = {'narrative' : '//*[@id="details"]/div',
         'rules' : '//*[@id="house-rules"]/div/section'}

In [13]:
# Read in the list of previously gathered links.
lnks = pd.read_csv('../data/links_bogota_1004.csv')
lnks.head()

Unnamed: 0,link,id
0,https://www.airbnb.com/rooms/38238821?check_in...,38238821
1,https://www.airbnb.com/rooms/31941600?check_in...,31941600
2,https://www.airbnb.com/rooms/32079582?check_in...,32079582
3,https://www.airbnb.com/rooms/34259803?check_in...,34259803
4,https://www.airbnb.com/rooms/10823924?check_in...,10823924


In [14]:
# Create Selenium chrome browser driver instance
driver = webdriver.Chrome()
time.sleep(5)
window = driver.window_handles[0]

In [15]:
data = []
ouch = []
a_counter = 0
for idx, row in lnks.loc[:].iterrows():
    try:
        data.append(get_all(get_property_html(driver, row['link']), driver, row['id']))
    except Exception as ex:
        ouch.append((row['link'], ex))
        print('Count:', a_counter, '--Length of errors array:', len(ouch))
        

Count: 0 --Length of errors array: 1
Count: 0 --Length of errors array: 2
Count: 0 --Length of errors array: 3
Count: 0 --Length of errors array: 4
Count: 0 --Length of errors array: 5
Count: 0 --Length of errors array: 6
Count: 0 --Length of errors array: 7
Count: 0 --Length of errors array: 8
Count: 0 --Length of errors array: 9
Count: 0 --Length of errors array: 10
Count: 0 --Length of errors array: 11
Count: 0 --Length of errors array: 12
Count: 0 --Length of errors array: 13
Count: 0 --Length of errors array: 14
Count: 0 --Length of errors array: 15
Count: 0 --Length of errors array: 16
Count: 0 --Length of errors array: 17
Count: 0 --Length of errors array: 18
Count: 0 --Length of errors array: 19
Count: 0 --Length of errors array: 20
Count: 0 --Length of errors array: 21
Count: 0 --Length of errors array: 22
Count: 0 --Length of errors array: 23
Count: 0 --Length of errors array: 24
Count: 0 --Length of errors array: 25
Count: 0 --Length of errors array: 26
Count: 0 --Length of 

In [19]:
# Make a dataframe of the listing data
lnk_data = pd.DataFrame(data, columns=['id', 'guests', 'bedrooms', 'beds', 'baths', 'cleaning_fee', 'service_fee', 'amenities', 'superhost', 'lat_lon', 'narrative', 'rules', 'price'])

In [20]:
# Write the listing data to a csv.
lnk_data.to_csv('../data/initial_bogota_data_scrape-{}.csv'.format(datetime.strftime(datetime.now(), '%m-%d-%y--%H-%M-%S'), index=False))