In [1]:
# Imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
import sys
import chromedriver_binary
import re
from collections import defaultdict
from glob import glob
import pickle
import pandas as pd

In [2]:
def fmt_url(url_string, items_per_page, page_nbr):
    """Fill the placeholder in `url_string` with a result offset.

    The offset for a 1-based page number is
    ``items_per_page * (page_nbr - 1)``, so page 1 maps to offset 0.
    Stepping `page_nbr` therefore walks through successive pages of
    Airbnb listings.
    """
    pages_before = page_nbr - 1
    return url_string.format(items_per_page * pages_before)

In [3]:
def get_property_info(driver, link):
    """Open `link` in the driver's first window and parse the resulting page.

    Switches the selenium driver to its first window handle, navigates to
    `link`, sleeps for ten seconds, and returns a BeautifulSoup object built
    from the driver's page source using the "lxml" parser.
    """
    first_window = driver.window_handles[0]
    driver.switch_to.window(first_window)
    driver.get(link)
    # Fixed pause before reading the source (presumably to let the page
    # finish rendering -- confirm whether an explicit wait would suit better).
    time.sleep(10)
    return BeautifulSoup(driver.page_source, "lxml")

# Example
# soup = get_property_info(driver, 'https://www.airbnb.com/')

In [10]:
def get_links(driver, input_url, n_pages=17, items_per_page=18, wait_seconds=10):
    """Iterate through pages of search results and gather links to listings.

    For each page the offset is inserted into `input_url` via `fmt_url`, the
    page is loaded in the driver's first window, and every anchor whose href
    looks like ``https://www.airbnb.com/rooms/<digits>?...`` is collected.

    Args:
        driver: Selenium WebDriver instance used to load the pages.
        input_url: Search URL containing one ``{}`` placeholder for the
            items offset.
        n_pages: Number of result pages to visit, starting at page 1. The
            default of 17 matches the pages the original ``range(1, 18)``
            loop visited (the old docstring said 18).
        items_per_page: Number of results per page, used to compute the
            offset of each page.
        wait_seconds: Seconds to sleep after each page load before reading
            its anchors.

    Returns:
        A list of listing URLs. Links are de-duplicated within a page (in
        first-seen order) but the same link may appear again for another page.
    """
    listing_url = re.compile(r'^https://www\.airbnb\.com/rooms/\d+\?')
    links = []
    for page_nbr in range(1, n_pages + 1):
        url = fmt_url(input_url, items_per_page, page_nbr)
        # Ask the driver for its first window instead of relying on a
        # module-level `window` variable that may not exist yet; this is the
        # same approach get_property_info takes.
        driver.switch_to.window(driver.window_handles[0])
        driver.get(url)
        time.sleep(wait_seconds)
        # Dict keys act as an insertion-ordered set of this page's links.
        page_links = {}
        # 'tag name' is the locator string behind By.TAG_NAME; the generic
        # find_elements(by, value) call exists in both Selenium 3 and 4,
        # whereas find_elements_by_tag_name was removed in Selenium 4.
        for anchor in driver.find_elements('tag name', 'a'):
            href = anchor.get_attribute('href')
            if href and listing_url.match(href):
                page_links[href] = None
        links.extend(page_links)
    return links

# Example
# get_links(driver, URL_0_BR)

In [13]:
# URLs we will pass in to get links to individual properties.
# Every search URL is identical except for the `min_bedrooms` value, so the
# shared text is kept once and the bedroom count is spliced in between.
# The `{}` after `items_offset=` is left unfilled here; it is the placeholder
# that receives the page offset later.
_SEARCH_URL_HEAD = (
    'https://www.airbnb.com/s/Bogot%C3%A1-~-Bogota--Colombia/homes'
    '?refinement_paths%5B%5D=%2Fhomes'
    '&current_tab_id=home_tab'
    '&selected_tab_id=home_tab'
    '&place_id=ChIJKcumLf2bP44RFDmjIFVjnSM'
    '&source=mc_search_bar'
    '&search_type=pagination'
    '&screen_size=large'
    '&hide_dates_and_guests_filters=false'
    '&checkin=2019-12-22'
    '&checkout=2019-12-28'
    '&room_types%5B%5D=Entire%20home%2Fapt'
    '&min_bedrooms='
)
_SEARCH_URL_TAIL = (
    '&s_tag=ugSl5Kgp'
    '&section_offset=4'
    '&items_offset={}'
    '&last_search_session_id=eadc2168-062d-44fe-a0fe-c6c5ece684f1'
)


def _bedroom_search_url(min_bedrooms):
    """Return the search URL template for the given `min_bedrooms` value."""
    # Plain concatenation (not str.format or %) so the literal `{}` and the
    # percent-encoded characters in the URL pass through untouched.
    return _SEARCH_URL_HEAD + str(min_bedrooms) + _SEARCH_URL_TAIL


URL_0_BR = _bedroom_search_url(0)
URL_1_BR = _bedroom_search_url(1)
URL_2_BR = _bedroom_search_url(2)
URL_3_BR = _bedroom_search_url(3)
URL_4_BR = _bedroom_search_url(4)
url_list = [URL_0_BR, URL_1_BR, URL_2_BR, URL_3_BR, URL_4_BR]

In [14]:
# Create the Selenium Chrome browser driver instance and keep the handle of
# its first window.
driver = webdriver.Chrome()
time.sleep(5)  # pause after launching the browser before querying its windows
window = driver.window_handles[0]  # handle of the first browser window

In [15]:
# Iterate through the above list of links to gather links to rentals
list_of_links = []
for url in url_list[:]:
    list_of_links += get_links(driver, url)
    print('Number of links so far:', len(list_of_links))

Number of links so far: 304
Number of links so far: 608
Number of links so far: 912
Number of links so far: 1218
Number of links so far: 1470


In [16]:
# See how many results were captured
len(list_of_links)

1470

In [17]:
# Save the gathered list of links to disk as a pickle file
with open('../data/bogota_links.pkl', mode='wb') as f:
    pickle.dump(list_of_links, f)

In [18]:
# Load the links into a single-column dataframe named 'link'
lnk_df = pd.DataFrame(data=list_of_links, columns=['link'])

In [19]:
# Preview the first rows of the links dataframe
lnk_df.head()

Unnamed: 0,link
0,https://www.airbnb.com/rooms/38238821?check_in...
1,https://www.airbnb.com/rooms/31941600?check_in...
2,https://www.airbnb.com/rooms/32079582?check_in...
3,https://www.airbnb.com/rooms/34259803?check_in...
4,https://www.airbnb.com/rooms/10823924?check_in...


In [20]:
# Summarize the dataframe: index range, column dtypes, and non-null counts
lnk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 1 columns):
link    1470 non-null object
dtypes: object(1)
memory usage: 11.6+ KB


In [21]:
# Add an 'id' column holding the digits that follow /rooms/ in each link.
# expand=False makes str.extract return a Series for the single capture group.
lnk_df['id'] = lnk_df['link'].str.extract(
    r"^https://www\.airbnb\.com/rooms/(\d+)", expand=False
)

In [22]:
# Drop any duplicates: keep only the first row seen for each property id
lnk_df = lnk_df.drop_duplicates(subset='id', keep='first')

In [23]:
# Check the (rows, columns) size of the dataframe
lnk_df.shape

(927, 2)

In [25]:
# Save the dataframe as a csv file, leaving out the index column.
lnk_df.to_csv(path_or_buf='../data/links_bogota_1004.csv', index=False)