In [1]:
# Imports
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
import sys
import chromedriver_binary
import re
from collections import defaultdict
from glob import glob
import pickle
import pandas as pd

In [2]:
def fmt_url(url_string, items_per_page, page_nbr):
    """Fill a paginated URL template with the item offset of a given page.

    url_string     -- template with one ``{}`` placeholder for the offset
    items_per_page -- number of results shown on each page
    page_nbr       -- 1-based page number (page 1 gives offset 0)

    Returns the formatted URL string.
    """
    pages_to_skip = page_nbr - 1
    return url_string.format(pages_to_skip * items_per_page)

In [3]:
def get_property_info(driver, link):
    """Load ``link`` in the driver's first window and return the parsed page.

    driver -- Selenium WebDriver instance
    link   -- URL to open

    Returns a BeautifulSoup object built from the page source with the
    "lxml" parser.
    """
    first_window = driver.window_handles[0]
    driver.switch_to.window(first_window)
    driver.get(link)
    # Fixed pause before reading the page source -- presumably to let the
    # page finish rendering; TODO confirm.
    time.sleep(10)
    return BeautifulSoup(driver.page_source, "lxml")

# Example
# soup = get_property_info(driver, 'https://www.airbnb.com/')

In [4]:
def get_attrs_1_4(div_list):
    """Collect the guests / bedrooms / beds / baths strings from ``div_list``.

    div_list -- iterable of elements exposing a ``.text`` attribute

    Returns a dict that may contain the keys 'guests', 'br', 'beds' and
    'baths', each mapped to the full text of the matching element. A key is
    absent when nothing matched; if several elements match the same key,
    the last one wins.
    """
    attrs_dict = {}
    res_dict = {
        # Singular allowed: "1 guest" as well as "4 guests".
        'guests': re.compile(r'^\d+ guests?'),
        'br': re.compile(r'^\d+ bedrooms?'),
        # The trailing \b stops "2 bedrooms" from also counting as beds.
        'beds': re.compile(r'^\d+ beds?\b'),
        # Optional decimal part so half baths such as "1.5 baths" match.
        'baths': re.compile(r'^\d+(?:\.\d+)? baths?'),
    }
    for div in div_list:
        text = div.text
        for key, pattern in res_dict.items():
            if pattern.match(text):
                attrs_dict[key] = text
    return attrs_dict

# Example
# get_attrs_1_4(soup.find_all('div', {'class' : '_czm8crp'}))

In [5]:
def get_price(span):
    """Return the text content of the price element ``span``."""
    price_text = span.text
    return price_text

# Example
# get_price(soup.find('span', {'class' : '_doc79r'}))

In [6]:
def get_attrs_5_6(spans):
    """Extract the cleaning and service fee strings from a sequence of spans.

    spans -- sequence of elements exposing ``.text``; a span whose text is
             'Cleaning fee' or 'Service fee' is treated as a label whose
             value is the text of the span immediately after it

    Returns a dict with optional keys 'clean_fee' and 'serv_fee'. A label
    that is the last element has no following value and is skipped.
    """
    attrs_dict = {}
    label_to_key = {'Cleaning fee': 'clean_fee', 'Service fee': 'serv_fee'}
    # Pair each span with its successor. zip stops at the second-to-last
    # span, so a trailing label can never index past the end of ``spans``.
    for label_span, value_span in zip(spans, spans[1:]):
        key = label_to_key.get(label_span.text)
        if key is not None:
            attrs_dict[key] = value_span.text
    return attrs_dict

# Example
# get_attrs_5_6(soup.find_all('span', {'class' : '_1jlnvra2'}))

In [7]:
def get_amenities(buttons):
    """Return the text of the "Show all N amenities" button, if present.

    buttons -- iterable of elements exposing a ``.text`` attribute

    Returns the text of the last matching button, or '' when none matches.
    """
    # The default must use the same name that is returned; otherwise a
    # no-match call raises UnboundLocalError.
    amenities = ''
    ams = re.compile(r'^Show all \d+ amenities')
    for button in buttons:
        if ams.match(button.text):
            amenities = button.text
    return amenities

# Example
# get_amenities(soup.find_all('button', {'class' : '_b0ybw8s'}))

In [8]:
def get_lat_lon(scripts):
    """Return the ``src`` URL of the map-viewport script, if present.

    scripts -- iterable of elements exposing an ``attrs`` dict

    Returns the ``src`` of the last script whose URL contains
    'ViewportInfoService.GetViewportInfo', or '' when none does. The raw URL
    is returned and the coordinates are not parsed out here.
    """
    lat_lon = ''
    ll = re.compile(r'ViewportInfoService\.GetViewportInfo')
    # NOTE: this function used to compile, but never apply, a pattern of the
    # form ';1d47.<14 digits>&amp;2d-122.<14 digits>', which suggests the
    # coordinates sit in the URL in that shape. Extraction is left to the caller.
    for script in scripts:
        # Script tags without a src attribute are skipped instead of raising KeyError.
        src = script.attrs.get('src')
        if src and ll.search(src):
            lat_lon = src
    return lat_lon

# Example
# get_lat_lon(soup.find_all('script',  {'charset' : "UTF-8"}))

In [9]:
def get_superhost(spans):
    """Tell whether any span's text starts with "<name> is a Superhost".

    spans -- iterable of elements exposing a ``.text`` attribute

    Returns True if at least one span matches, otherwise False.
    """
    superhost_re = re.compile(r'^\w+ is a Superhost')
    return any(superhost_re.match(span.text) is not None for span in spans)

# Example
# get_superhost(soup.find_all('span', {'class' : "_1p3joamp"}))

In [10]:
def get_links(driver, input_url, n_pages=17, items_per_page=18, wait_seconds=10):
    """Collect property-page links from consecutive search-result pages.

    driver         -- Selenium WebDriver instance
    input_url      -- search URL template with a ``{}`` offset placeholder,
                      filled in by ``fmt_url``
    n_pages        -- number of result pages to visit, starting at page 1
    items_per_page -- page size used to compute each page's offset
    wait_seconds   -- fixed pause after each page load before anchors are read

    Returns a list of hrefs matching 'https://www.airbnb.com/rooms/<id>?'.
    Links are de-duplicated within a page (first-seen order kept) but not
    across pages. The defaults reproduce the previously hard-coded values.
    """
    links = []
    prop = re.compile(r'^https://www\.airbnb\.com/rooms/\d+\?')
    # Take the window handle from the driver that was passed in rather than
    # from a notebook-level global, so the function works with any driver.
    window = driver.window_handles[0]
    for page_nbr in range(1, n_pages + 1):
        url = fmt_url(input_url, items_per_page, page_nbr)
        driver.switch_to.window(window)
        driver.get(url)
        time.sleep(wait_seconds)
        # A dict serves as an insertion-ordered set for the links on this page.
        page_links = {}
        # 'tag name' is the locator string behind By.TAG_NAME; this generic
        # find_elements form does not rely on find_elements_by_tag_name,
        # which was removed in Selenium 4.
        for element in driver.find_elements('tag name', 'a'):
            link = element.get_attribute('href')
            if link and prop.match(link):
                page_links[link] = None
        links.extend(page_links)
    return links

# Example
# get_links(driver, URL_1_GUEST)

In [11]:
# Search-result URL templates passed to get_links to collect links to individual properties.
# All four query Seattle, WA with room type "Entire home/apt", check-in 2019-12-22 and
# check-out 2019-12-28; they differ in the `adults` count (1-4) and in the
# `s_tag` / `last_search_session_id` values.
# The `{}` after `items_offset=` is the pagination placeholder filled in via str.format.
URL_1_GUEST = 'https://www.airbnb.com/s/Seattle--WA--United-States/homes?refinement_paths%5B%5D=%2Fhomes&current_tab_id=home_tab&selected_tab_id=home_tab&place_id=ChIJVTPokywQkFQRmtVEaUZlJRA&search_type=pagination&screen_size=large&hide_dates_and_guests_filters=false&checkin=2019-12-22&checkout=2019-12-28&adults=1&room_types%5B%5D=Entire%20home%2Fapt&s_tag=hw8cp5Pu&section_offset=5&items_offset={}&last_search_session_id=788955f0-95fc-4576-8543-ed1dcfe94ed3'
URL_2_GUEST = 'https://www.airbnb.com/s/Seattle--WA--United-States/homes?refinement_paths%5B%5D=%2Fhomes&current_tab_id=home_tab&selected_tab_id=home_tab&place_id=ChIJVTPokywQkFQRmtVEaUZlJRA&search_type=pagination&screen_size=large&hide_dates_and_guests_filters=false&checkin=2019-12-22&checkout=2019-12-28&adults=2&room_types%5B%5D=Entire%20home%2Fapt&s_tag=M4qG3z8c&section_offset=5&items_offset={}&last_search_session_id=40094e9c-6a7b-4bf5-a60d-ad757e06c4bf'
URL_3_GUEST = 'https://www.airbnb.com/s/Seattle--WA--United-States/homes?refinement_paths%5B%5D=%2Fhomes&current_tab_id=home_tab&selected_tab_id=home_tab&place_id=ChIJVTPokywQkFQRmtVEaUZlJRA&search_type=pagination&screen_size=large&hide_dates_and_guests_filters=false&checkin=2019-12-22&checkout=2019-12-28&adults=3&room_types%5B%5D=Entire%20home%2Fapt&s_tag=TNcnvTMZ&section_offset=5&items_offset={}&last_search_session_id=161c0850-d38d-459a-a90d-91cab7b2dff7'
URL_4_GUEST = 'https://www.airbnb.com/s/Seattle--WA--United-States/homes?refinement_paths%5B%5D=%2Fhomes&current_tab_id=home_tab&selected_tab_id=home_tab&place_id=ChIJVTPokywQkFQRmtVEaUZlJRA&search_type=pagination&screen_size=large&hide_dates_and_guests_filters=false&checkin=2019-12-22&checkout=2019-12-28&adults=4&room_types%5B%5D=Entire%20home%2Fapt&s_tag=NCdHnnOK&section_offset=5&items_offset={}&last_search_session_id=45e6e542-db96-4fc2-8eba-7d58709bcb1e'
url_list = [URL_1_GUEST, URL_2_GUEST, URL_3_GUEST, URL_4_GUEST]

In [12]:
# Create the Selenium Chrome browser driver instance and keep the handle of its
# first window for later window switching.

driver = webdriver.Chrome()
# Fixed pause after launch -- presumably lets the browser finish starting; TODO confirm.
time.sleep(5)
window = driver.window_handles[0]

In [13]:
# Scrape every search template in turn and report the running link total.
list_of_links = []
for url in url_list:
    list_of_links.extend(get_links(driver, url))
    print('Number of links so far:', len(list_of_links))

Number of links so far: 298
Number of links so far: 594
Number of links so far: 890
Number of links so far: 1185


In [14]:
len(list_of_links)

1185

In [15]:
# Write the collected links to disk as a pickle file.
with open('../data/long_list_of_links.pkl', mode='wb') as f:
    pickle.dump(list_of_links, f)

In [16]:
# Wrap the scraped links in a one-column DataFrame ('link'), one row per link.
lnk_df = pd.DataFrame(list_of_links, columns=['link'])

In [17]:
lnk_df.head()

Unnamed: 0,link
0,https://www.airbnb.com/rooms/33079505?location...
1,https://www.airbnb.com/rooms/34127978?location...
2,https://www.airbnb.com/rooms/22750242?location...
3,https://www.airbnb.com/rooms/29736429?location...
4,https://www.airbnb.com/rooms/32822782?location...


In [18]:
lnk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1185 entries, 0 to 1184
Data columns (total 1 columns):
link    1185 non-null object
dtypes: object(1)
memory usage: 9.4+ KB


In [19]:
# Pull the numeric listing id out of each link; the query string varies
# between searches, so the id is what identifies a property.
# expand=False makes str.extract return a Series for the single capture group.
lnk_df['id'] = lnk_df['link'].str.extract(
    r"^https://www\.airbnb\.com/rooms/(\d+)",
    expand=False,
)

In [20]:
lnk_df.head(20)

Unnamed: 0,link,id
0,https://www.airbnb.com/rooms/33079505?location...,33079505
1,https://www.airbnb.com/rooms/34127978?location...,34127978
2,https://www.airbnb.com/rooms/22750242?location...,22750242
3,https://www.airbnb.com/rooms/29736429?location...,29736429
4,https://www.airbnb.com/rooms/32822782?location...,32822782
5,https://www.airbnb.com/rooms/30405281?location...,30405281
6,https://www.airbnb.com/rooms/2373176?location=...,2373176
7,https://www.airbnb.com/rooms/38217141?location...,38217141
8,https://www.airbnb.com/rooms/33735026?location...,33735026
9,https://www.airbnb.com/rooms/25098816?location...,25098816


In [21]:
# Keep one row per listing id. The index is not reset, so the surviving rows
# keep their original labels (gaps are expected).
lnk_df = lnk_df.drop_duplicates(subset=['id'])

In [22]:
lnk_df.shape

(547, 2)

In [23]:
lnk_df

Unnamed: 0,link,id
0,https://www.airbnb.com/rooms/33079505?location...,33079505
1,https://www.airbnb.com/rooms/34127978?location...,34127978
2,https://www.airbnb.com/rooms/22750242?location...,22750242
3,https://www.airbnb.com/rooms/29736429?location...,29736429
4,https://www.airbnb.com/rooms/32822782?location...,32822782
...,...,...
1178,https://www.airbnb.com/rooms/24129466?location...,24129466
1179,https://www.airbnb.com/rooms/37853356?location...,37853356
1180,https://www.airbnb.com/rooms/34572158?location...,34572158
1182,https://www.airbnb.com/rooms/10475901?location...,10475901
