In [1]:
# Imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
import sys
import chromedriver_binary
import re
from collections import defaultdict
from glob import glob
import pickle
import pandas as pd
import numpy as np
import json

In [2]:
def build_prop(info_dict, price_list):
    """Builds a property listing from the dictionaries passed into it.
    
    Inputs: info_dict, a dictionary containing property information. Every key
                named in listing_keys below must be present.
            price_list, a list of dictionaries with pricing information. Each
                entry needs a 'localized_title'; the 'Cleaning fee' entry also
                needs item['total']['amount'].
    
    Outputs: A defaultdict(str) holding the copied listing fields, plus 'price'
             (the raw '$N x M nights' title text) and 'cleaning_fee' when the
             matching price items exist. Returns None when the inputs are
             incomplete or not shaped as described above."""
    listing_keys = ['bathrooms', 'bedrooms', 'beds', 'id', 'is_superhost', 'lat', 'lng',
             'person_capacity', 'picture_count', 'preview_amenity_names']
    prop = defaultdict(str)
    try:
        for key in listing_keys:
            prop[key] = info_dict[key]
        for item in price_list:
            title = item['localized_title']
            if re.match(r'\$\d+ x \d+ nights', title):
                prop['price'] = title
            elif re.match(r'Cleaning fee', title):
                prop['cleaning_fee'] = item['total']['amount']
    except (LookupError, TypeError):
        # Missing key / wrong type means an unusable listing: tell the caller to
        # skip it. Deliberately narrower than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and genuine programming errors are not swallowed.
        return None
    return prop

In [3]:
def fmt_url(url_string, items_per_page, page_nbr):
    """Return url_string with its '{}' placeholder filled by a page's item offset.

    Pages are 1-based: page 1 gives offset 0, page 2 gives items_per_page, and
    so on. Stepping the offset is what walks through successive Airbnb pages."""
    pages_to_skip = page_nbr - 1
    item_offset = items_per_page * pages_to_skip
    return url_string.format(item_offset)

In [4]:
def get_property_info(driver, link):
    """Open `link` in the driver's first window and parse the rendered page.

    The driver is switched to its first window handle, the link is loaded, and
    after a fixed 10 second pause the page source is parsed with the lxml
    parser. Returns the resulting BeautifulSoup object."""
    first_window = driver.window_handles[0]
    driver.switch_to.window(first_window)
    driver.get(link)
    # Fixed pause before reading page_source (presumably to let the page render).
    time.sleep(10)
    return BeautifulSoup(driver.page_source, "lxml")

# Example
# soup = get_property_info(driver, 'https://www.airbnb.com/')

In [5]:
def _extract_listings(page_source):
    """Pull the listing records out of a results page's embedded JSON state.

    Returns the list of listings, or an empty list when the 'data-state' script
    tag or the expected JSON path is missing or malformed."""
    # Parser named explicitly, matching get_property_info.
    bs = BeautifulSoup(page_source, "lxml")
    json_obj = bs.find_all('script', {'id' : "data-state", 'data-state' : "true",
                                      'type' :"application/json"})
    try:
        jso = json.loads(json_obj[0].text)
        return jso['bootstrapData']['reduxData']['exploreTab']['response']['explore_tabs'][0]['sections'][0]['listings']
    except (LookupError, TypeError, ValueError):
        # No script tag (IndexError), unexpected structure (KeyError/TypeError)
        # or invalid JSON (ValueError): treat the page as having no listings.
        return []


def get_properties(s_driver, input_url, items_per_page=18, max_pages=17, page_wait=5):
    """Iterates through the result pages of an Airbnb query and collects the listings.

    Inputs: s_driver, the selenium webdriver used to load each results page.
            input_url, a search URL with one '{}' placeholder that receives the
                items offset (see fmt_url).
            items_per_page, listings per results page, used to compute offsets.
            max_pages, number of result pages to visit, starting at page 1.
            page_wait, seconds to sleep after each page load before parsing.

    Outputs: A list of pandas DataFrames, one per listing that build_prop could
             assemble. Pages without a usable payload and listings without
             pricing data are skipped rather than aborting the whole scrape."""
    properties = []
    for page_nbr in range(1, max_pages + 1):
        url = fmt_url(input_url, items_per_page, page_nbr)
        # Use the driver's own first window instead of a notebook-level global,
        # so the function works with whichever driver it is handed.
        s_driver.switch_to.window(s_driver.window_handles[0])
        s_driver.get(url)
        time.sleep(page_wait)
        listings = _extract_listings(s_driver.page_source)
        if not listings:
            print('No listings found on page {} - skipping: {}'.format(page_nbr, url),
                  file=sys.stderr)
            continue
        for listing in listings:
            try:
                info = listing['listing']
                price_items = listing['pricing_quote']['price']['price_items']
            except (LookupError, TypeError):
                # Listing without the expected pricing structure: skip it.
                continue
            prop_dict = build_prop(info, price_items)
            if prop_dict:
                # The amenity list makes pandas broadcast the scalar fields,
                # giving one row per preview amenity for this listing.
                properties.append(pd.DataFrame(prop_dict))
    return properties

In [6]:
# Search URLs we will pass in to page through listings. All five share the same
# query and differ only in the nightly price band (despite the _BR suffix), so
# they are built from a single template. The doubled braces keep a literal '{}'
# in each URL for fmt_url to fill with the items offset.
SEARCH_URL_TEMPLATE = (
    'https://www.airbnb.com/s/Bogot%C3%A1-~-Bogota--Colombia/homes'
    '?refinement_paths%5B%5D=%2Fhomes'
    '&current_tab_id=home_tab&selected_tab_id=home_tab'
    '&place_id=ChIJKcumLf2bP44RFDmjIFVjnSM'
    '&source=mc_search_bar&search_type=pagination&screen_size=large'
    '&hide_dates_and_guests_filters=true'
    '&checkin=2019-12-22&checkout=2019-12-28'
    '&price_min={price_min}&price_max={price_max}'
    '&room_types%5B%5D=Entire%20home%2Fapt'
    '&s_tag=FX0YwQiV&section_offset=4'
    '&items_offset={{}}'
    '&last_search_session_id=b86d3d59-81d2-4d20-be92-c69deac30a4b'
)
# (price_min, price_max) for each search, in the order of URL_0_BR .. URL_4_BR.
PRICE_BANDS = [(0, 20), (21, 40), (41, 60), (61, 80), (81, 120)]
url_list = [SEARCH_URL_TEMPLATE.format(price_min=low, price_max=high)
            for low, high in PRICE_BANDS]
URL_0_BR, URL_1_BR, URL_2_BR, URL_3_BR, URL_4_BR = url_list

In [7]:
# Create the Selenium Chrome browser driver instance and remember the handle of
# its first window. (No list is created in this cell.)
driver = webdriver.Chrome()
# Fixed 5 second pause after launch — presumably to let the browser finish starting.
time.sleep(5)
window = driver.window_handles[0]

In [8]:
# Scrape every search URL in turn and pool the results. Despite its name,
# list_of_links collects the per-listing DataFrames returned by get_properties.
list_of_links = []
for url in url_list:
    list_of_links.extend(get_properties(driver, url))

In [9]:
# Stack the per-listing DataFrames into one table; sort=True puts the columns
# in sorted order when the individual frames do not share identical columns.
stays = pd.concat(objs=list_of_links, sort=True)

In [10]:
# Preview the combined table: a listing appears once per preview amenity, and
# the row index restarts for each listing because it was not reset on concat.
stays.head()

Unnamed: 0,bathrooms,bedrooms,beds,cleaning_fee,id,is_superhost,lat,lng,person_capacity,picture_count,preview_amenity_names,price
0,1.0,1,1,12.0,37721845,False,4.63778,-74.06339,2,1,Wifi,$12 x 6 nights
1,1.0,1,1,12.0,37721845,False,4.63778,-74.06339,2,1,Kitchen,$12 x 6 nights
2,1.0,1,1,12.0,37721845,False,4.63778,-74.06339,2,1,Washer,$12 x 6 nights
0,1.0,1,1,,21240481,False,4.59605,-74.0695,2,12,Wifi,$15 x 6 nights
1,1.0,1,1,,21240481,False,4.59605,-74.0695,2,12,Kitchen,$15 x 6 nights


In [11]:
# One-hot encode 'preview_amenity_names', then collapse to a single row per
# listing id; taking the max keeps an amenity flag set if any row had it.
amenity_dummies = pd.get_dummies(stays, columns=['preview_amenity_names'])
stays2 = amenity_dummies.groupby('id').max()

In [12]:
# Save the aggregated dataframe so data isn't lost.
# groupby('id') leaves the listing id in the index, so index=False would
# silently drop it from the file. Write the index whenever it is a named one;
# a plain unnamed RangeIndex (e.g. after reset_index) is still left out.
stays2.to_csv('../data/bogota_stays2.csv', index=stays2.index.name is not None)

In [13]:
# Also write the original, un-aggregated dataframe to disk as a backup
# (its repeating row index carries no information, so it is left out).
stays.to_csv(
    path_or_buf='../data/bogota_stays.csv',
    index=False,
)

In [14]:
# Move the listing id from the groupby index back into a regular column.
# Guarded (and not in-place) so that re-running this cell is a no-op instead of
# inserting a junk 'index' column on the second run.
if stays2.index.name == 'id':
    stays2 = stays2.reset_index()
stays2.head()

Unnamed: 0,id,bathrooms,bedrooms,beds,cleaning_fee,is_superhost,lat,lng,person_capacity,picture_count,price,preview_amenity_names_Free parking,preview_amenity_names_Kitchen,preview_amenity_names_Washer,preview_amenity_names_Wifi
0,4454536,1.5,0,1,25.0,False,4.64272,-74.06424,1,43,$14 x 6 nights,0,1,1,1
1,15874696,1.5,1,1,5.0,False,4.65144,-74.07452,2,10,$15 x 6 nights,0,1,0,1
2,21240481,1.0,1,1,,False,4.59605,-74.0695,2,12,$15 x 6 nights,0,1,1,1
3,23634117,1.0,1,3,6.0,False,4.63359,-74.094322,5,7,$13 x 6 nights,1,0,0,1
4,24918902,1.0,0,1,,True,4.61267,-74.07119,2,18,$15 x 6 nights,0,1,0,1


In [15]:
# Reference point for the distance feature — La Candelaria: lat 4.591722, lng -74.07413

In [16]:
# Extract the nightly price as a float. The night count is matched with \d+
# (the same shape build_prop accepts) rather than a hard-coded 6, so stays of
# any length parse instead of turning into NaN. expand=False returns a Series.
stays2['price_numeric'] = (
    stays2['price']
    .str.extract(r'\$(\d+) x \d+ nights', expand=False)
    .astype(float)
)

In [17]:
# Listings without a cleaning fee come through as NaN; treat those as a fee of 0.
stays2 = stays2.assign(cleaning_fee=stays2['cleaning_fee'].fillna(0))

In [22]:
# Create a basic distance measure (in degrees) from each listing to the
# La Candelaria reference point, using the distance formula with the longitude
# difference scaled by cos(latitude).
LA_CANDELARIA_LAT = 4.591722
LA_CANDELARIA_LNG = -74.0741
lat_offset = stays2['lat'] - LA_CANDELARIA_LAT
# np.cos expects radians, but lat/lng are in degrees: convert before taking the
# cosine (cos of 4.6 rad is about -0.11, cos of 4.6 degrees is about 0.997).
lng_offset = (stays2['lng'] - LA_CANDELARIA_LNG) * np.cos(np.radians(stays2['lat']))
stays2['linear_distance'] = np.sqrt(lat_offset ** 2 + lng_offset ** 2)
# Encode the superhost flag as 0/1.
stays2['is_superhost'] = stays2['is_superhost'].astype(int)

In [19]:
# Preview the engineered columns (price_numeric, linear_distance, 0/1 superhost).
# NOTE(review): this cell's execution count (19) is lower than the distance
# cell's (22), so the linear_distance values displayed may predate that cell's
# latest version — re-run the notebook top to bottom to confirm.
stays2.head()

Unnamed: 0,id,bathrooms,bedrooms,beds,cleaning_fee,is_superhost,lat,lng,person_capacity,picture_count,price,preview_amenity_names_Free parking,preview_amenity_names_Kitchen,preview_amenity_names_Washer,preview_amenity_names_Wifi,price_numeric,linear_distance
0,4454536,1.5,0,1,25.0,0,4.64272,-74.06424,1,43,$14 x 6 nights,0,1,1,1,14.0,0.051942
1,15874696,1.5,1,1,5.0,0,4.65144,-74.07452,2,10,$15 x 6 nights,0,1,0,1,15.0,0.059719
2,21240481,1.0,1,1,0.0,0,4.59605,-74.0695,2,12,$15 x 6 nights,0,1,1,1,15.0,0.006316
3,23634117,1.0,1,3,6.0,0,4.63359,-74.094322,5,7,$13 x 6 nights,1,0,0,1,13.0,0.046496
4,24918902,1.0,0,1,0.0,1,4.61267,-74.07119,2,18,$15 x 6 nights,0,1,0,1,15.0,0.021149


In [20]:
# Build the final table by selecting columns from stays2 in a fixed order; the
# raw 'price' text column is not carried over (price_numeric is used instead).
CITY = 'bogota'
model_columns = [
    'id',
    'bathrooms',
    'bedrooms',
    'beds',
    'cleaning_fee',
    'is_superhost',
    'lat',
    'lng',
    'person_capacity',
    'picture_count',
    'preview_amenity_names_Free parking',
    'preview_amenity_names_Kitchen',
    'preview_amenity_names_Washer',
    'preview_amenity_names_Wifi',
    'price_numeric',
    'linear_distance',
]
final = stays2[model_columns]

In [21]:
# Write the final table to CSV (without the row index) as the input file for
# the LinearRegression step.
final.to_csv(
    path_or_buf='../data/bogota_with_amenities_1010_sample.csv',
    index=False,
)