In [1]:
# Imports
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import json
import time
import re
from collections import defaultdict
from airbnb_lib import number_of_stays_page_bottom

In [2]:
def setup_webdriver(width=800, height=600):
    """Start a Chrome session through Selenium and resize its current window.

    Inputs:
        width: Width handed to set_window_size.
        height: Height handed to set_window_size.
    Outputs:
        The Chrome webdriver object that controls the new window.
    """
    browser = webdriver.Chrome()
    browser.set_window_size(width, height, windowHandle='current')
    # Pause for one second after the resize before handing the browser back.
    time.sleep(1)
    return browser

**This function should do the following:**
1. Find the number of rooms returned for the given min/max price values.
2. If the number of rooms is 300 or more, reduce the max price until fewer than 300 rooms are returned.
3. Maximize the result: choose the max price that returns the largest number of rooms that is still less than 300.

In [3]:
def find_room_count_for_range(url, driver, min_price=0, max_price=10, retries=5, wait_seconds=5):
    """Load one Airbnb search page and return the number of stays it reports.

    The url template is filled with min_price, max_price and an items offset of 0. The count is
    read from the first word of the first 'div._1snxcqc' element of the loaded page.

    Occasionally Airbnb will not return the room count data and the BeautifulSoup selector will
    return an empty list, so the page is reloaded until a count can be read or the attempts run out.

    Inputs:
        url: Search url template with three '{}' placeholders (min price, max price, offset).
        driver: Selenium webdriver used to load the page.
        min_price: The minimum price to request.
        max_price: The maximum price to request.
        retries: How many page loads to attempt before giving up.
        wait_seconds: Seconds to sleep after each page load before reading the page source.
    Outputs:
        The stay count as an int. Any '+' in the count text is removed before conversion.
    Raises:
        RuntimeError: If no attempt produced a count that could be read and converted.
    """
    offset = 0
    last_error = None

    for _ in range(retries):
        try:
            driver.get(url.format(min_price, max_price, offset))
            time.sleep(wait_seconds)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            count_text = soup.select('div._1snxcqc')[0].string.split()[0]
            # Return only a fully converted int so a half-parsed string can never leak out.
            return int(count_text.replace('+', ''))
        except Exception as e:
            # Best-effort retry: report the problem and reload the page.
            last_error = e
            print(e)

    raise RuntimeError(
        'Could not read the room count for prices {}-{} after {} attempts'.format(
            min_price, max_price, retries)
    ) from last_error
# driver = setup_webdriver(1730, 1020)
# url = 'https://www.airbnb.com/s/Bogota--Colombia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJKcumLf2bP44RFDmjIFVjnSM&source=structured_search_input_header&search_type=pagination&federated_search_session_id=43bfeb76-6669-4ea0-9fca-ad6e6789fb2a&query=Bogota%2C%20Colombia&checkin=2020-07-12&checkout=2020-07-15&price_min={}&price_max={}&room_types%5B%5D=Entire%20home%2Fapt&section_offset=4&items_offset={}'
# print(find_room_count_for_range(url, driver, max_price=16))
# # time.sleep(7)
# driver.close()

In [4]:
!pwd

/home/scott/projects/mp2/src


In [5]:
def find_optimum_range(url, driver, min_price=0, max_price=20, room_limit=300, search_span=500):
    """Finds the optimum price range to get less than room_limit stays.

    The lower bound stays fixed at min_price and the upper bound is bisected:
    1. Get the number of rooms at the given price range.
    2. If room_limit or more rooms are returned, move max_price halfway down towards the highest
       price known to be below the limit.
    3. If fewer rooms are returned, move max_price halfway up towards the lowest price known to
       reach the limit (initially the starting max_price + search_span).
    4. Steps 2 & 3 are repeated until the two bounds are adjacent.

    The search also stops as soon as a request reports 0 rooms, and when a single price point
    (max_price can no longer be lowered) still reports room_limit or more rooms. In the latter
    case the returned count is at or above the limit because the range cannot be split further
    by price.

    Inputs:
        url: Search url template passed on to find_room_count_for_range.
        driver: Selenium webdriver passed on to find_room_count_for_range.
        min_price: The fixed lower end of the price range.
        max_price: The first upper end to try.
        room_limit: The room count the result should stay below.
        search_span: How far above the starting max_price the search may go.
    Outputs:
        (room_count, min_price, max_price) tuple for the range that was settled on.
    """
    top = max_price + search_span
    bottom = min_price
    room_count = find_room_count_for_range(url, driver, min_price, max_price)

    while room_count >= room_limit or (top > max_price + 1 and room_count > 0):
        if room_count >= room_limit:
            if max_price <= bottom:
                # Bisecting would leave max_price unchanged and request the same page forever.
                break
            top = max_price
            max_price = bottom + (max_price - bottom) // 2
        else:
            bottom = max_price
            max_price = max_price + (top - max_price) // 2
        room_count = find_room_count_for_range(url, driver, min_price, max_price)

    return (room_count, min_price, max_price)
            

# driver = setup_webdriver(1730, 1020)
# url = 'https://www.airbnb.com/s/Bogota--Colombia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJKcumLf2bP44RFDmjIFVjnSM&source=structured_search_input_header&search_type=pagination&federated_search_session_id=43bfeb76-6669-4ea0-9fca-ad6e6789fb2a&query=Bogota%2C%20Colombia&checkin=2020-07-12&checkout=2020-07-15&price_min={}&price_max={}&room_types%5B%5D=Entire%20home%2Fapt&section_offset=4&items_offset={}'
# print(find_optimum_range(url, driver, 0, 100))
# driver.close()

In [6]:
def get_price_ranges(url, driver, min_price=0, max_price=0):
    """Collect consecutive price ranges by calling find_optimum_range repeatedly.

    Each result with a positive room count is kept, and the next search starts one unit above
    that result's max price. Collection stops at the first result whose room count is not positive.

    Inputs:
        url: Search url template passed on to find_optimum_range.
        driver: Selenium webdriver passed on to find_optimum_range.
        min_price: Lower price of the first search.
        max_price: Upper price of the first search.
    Outputs:
        List of the tuples returned by find_optimum_range, in the order they were found.
    """
    collected = []
    while True:
        found = find_optimum_range(url, driver, min_price, max_price)
        if not found[0] > 0:
            break
        collected.append(found)
        # The next range starts right after the upper price of the one just found.
        next_start = found[2] + 1
        min_price = next_start
        max_price = next_start
    return collected
# driver = setup_webdriver(1730, 1020)
# url = 'https://www.airbnb.com/s/Bogota--Colombia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJKcumLf2bP44RFDmjIFVjnSM&source=structured_search_input_header&search_type=pagination&federated_search_session_id=43bfeb76-6669-4ea0-9fca-ad6e6789fb2a&query=Bogota%2C%20Colombia&checkin=2020-07-12&checkout=2020-07-15&price_min={}&price_max={}&room_types%5B%5D=Entire%20home%2Fapt&section_offset=4&items_offset={}'
# print(get_price_ranges(url, driver, 0, 100))
# driver.close()

In [7]:
def get_cleaning_fee(price_item_list):
    """Return the cleaning fee found in a list of pricing items, or 0 when there is none.

    Inputs:
        price_item_list: Iterable of pricing item dicts, each read through its 'localizedTitle' key.
    Outputs:
        The ['total']['amount'] value of the first item titled 'Cleaning fee', or 0 when no
        item carries that title.
    """
    matching_items = [
        entry for entry in price_item_list
        if entry['localizedTitle'] == 'Cleaning fee'
    ]
    if not matching_items:
        return 0
    return matching_items[0]['total']['amount']
                             
# stays[0]['pricingQuote']['price']['priceItems']))[0]['total']['amount']

In [8]:
def format_url(url, price_range):
    """Fill the two price placeholders of a url template and keep the offset placeholder open.

    Inputs:
        url: Template with three '{}' placeholders (min price, max price, offset).
        price_range: Sequence whose items at index 1 and 2 are the min and max price.
    Outputs:
        The url with both prices filled in and a literal '{}' left where the offset goes, so
        the offset can be formatted later while paging through the Airbnb listings.
    """
    low_price = price_range[1]
    high_price = price_range[2]
    offset_placeholder = '{}'
    return url.format(low_price, high_price, offset_placeholder)
format_url('one_{}_two_{}_three_{}', (0, 1, 2, 3))

'one_1_two_2_three_{}'

In [24]:
def get_listings(driver, url, price_ranges, page_size=20, wait_seconds=6):
    """Page through the Airbnb search results of every price range and collect the stays.

    For each price range the search pages are requested with an increasing items offset. Every
    stay is read from the JSON held in the page's 'script#data-state' element. A stay whose
    listing id was already seen is stored with the duplicates instead of the main result.

    Paging of a price range stops when the page reports that its last stay has reached the total
    (page_nbrs[1] >= page_nbrs[2]), or when a page contains no stays at all, so an empty page or
    a total that changes between requests cannot keep the loop running forever.

    Inputs:
        driver: Selenium webdriver; its first window is used.
        url: Search url template with '{}' placeholders for min price, max price and offset.
        price_ranges: Iterable of tuples accepted by format_url (prices at index 1 and 2).
        page_size: Amount added to the items offset after each page.
        wait_seconds: Seconds to sleep after each page load before reading the page source.
    Outputs:
        (stay_dict, dup_dict) tuple of defaultdict(list). Both map the keys 'sid', 'guests',
        'bedrooms', 'beds', 'bathrooms', 'amenities', 'superhost', 'starRating', 'amount' and
        'cleaning_fee' to lists holding one value per recorded stay.
    """
    driver.switch_to.window(driver.window_handles[0])
    seen_ids = set()
    stay_dict = defaultdict(list)
    dup_dict = defaultdict(list)
    for pr in price_ranges:
        # The prices are fixed for the whole range; only the offset changes per page.
        page_url = format_url(url, pr)
        offset = 0
        while True:
            driver.get(page_url.format(offset))
            time.sleep(wait_seconds)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            listings = json.loads(soup.select('script#data-state')[0].string)
            # Judging by the printed values this looks like [first, last, total] for the
            # page - confirm against airbnb_lib.number_of_stays_page_bottom.
            page_nbrs = number_of_stays_page_bottom(soup)
            print(page_nbrs)
            stays = listings['niobeClientData']['__niobe_denormalized']['queries'][0][1]['dora']['exploreV3']['sections'][0]['items']
            for stay in stays:
                current_id = stay['listing']['id']
                if current_id in seen_ids:
                    _record_stay(dup_dict, stay)
                else:
                    seen_ids.add(current_id)
                    _record_stay(stay_dict, stay)
            if not stays or page_nbrs[1] >= page_nbrs[2]:
                break
            offset += page_size
    return (stay_dict, dup_dict)


def _record_stay(target, stay):
    """Append the fields kept for one stay to the per-column lists of target."""
    listing = stay['listing']
    price = stay['pricingQuote']['price']
    # Read every field first so a missing key cannot leave the columns with unequal lengths.
    row = {
        'sid': listing['id'],
        'guests': listing['personCapacity'],
        'bedrooms': listing['bedrooms'],
        'beds': listing['beds'],
        'bathrooms': listing['bathrooms'],
        'amenities': listing['amenityIds'],
        'superhost': listing['isSuperhost'],
        'starRating': listing['avgRating'],
        'amount': price['total']['amount'],
        'cleaning_fee': get_cleaning_fee(price['priceItems']),
    }
    for column, value in row.items():
        target[column].append(value)

In [25]:
# Price ranges to scrape, in the (room_count, min_price, max_price) layout returned by
# find_optimum_range; format_url reads the prices from index 1 and 2.
prs = [(237, 0, 16), (255, 17, 20)]
prs

[(237, 0, 16), (255, 17, 20)]

In [26]:
driver = setup_webdriver(1730, 1020)
url = 'https://www.airbnb.com/s/Bogota--Colombia/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&place_id=ChIJKcumLf2bP44RFDmjIFVjnSM&source=structured_search_input_header&search_type=pagination&federated_search_session_id=43bfeb76-6669-4ea0-9fca-ad6e6789fb2a&query=Bogota%2C%20Colombia&checkin=2020-07-12&checkout=2020-07-15&price_min={}&price_max={}&room_types%5B%5D=Entire%20home%2Fapt&section_offset=4&items_offset={}'
# Always shut the browser down, even when scraping fails part-way, so no browser session is
# left behind. quit() ends the whole webdriver session instead of closing a single window.
try:
    stays, dups = get_listings(driver, url, prs)
finally:
    driver.quit()

[1, 20, 237]
[21, 40, 237]
[41, 60, 237]
[61, 80, 237]
[81, 100, 237]
[101, 120, 237]
[121, 140, 237]
[141, 160, 237]
[161, 180, 237]
[181, 200, 237]
[201, 220, 237]
[221, 237, 237]
[1, 20, 254]
[21, 40, 254]
[41, 60, 254]
[61, 80, 254]
[81, 100, 254]
[101, 120, 254]
[121, 140, 254]
[141, 160, 254]
[161, 180, 254]
[181, 200, 254]
[201, 220, 254]
[221, 240, 254]
[241, 254, 254]


In [28]:
# Turn the collected stays (a dict of equally long column lists, duplicates excluded) into a
# DataFrame and preview the first rows.
df = pd.DataFrame(stays)
df.head()

Unnamed: 0,sid,guests,bedrooms,beds,bathrooms,amenities,superhost,starRating,amount,cleaning_fee
0,18669058,2,1,1.0,1.0,"[1, 4, 8, 10, 77, 21, 89, 90, 91, 92, 93, 96, ...",False,4.7,37,5
1,39965150,2,1,1.0,1.0,"[1, 2, 4, 8, 10, 77, 21, 86, 89, 90, 91, 93, 9...",True,4.84,65,12
2,39032523,4,2,2.0,1.0,"[96, 1, 2, 4, 103, 40, 8, 10, 44, 45, 77, 89, ...",False,4.4,62,18
3,29097089,3,2,2.0,1.0,"[96, 1, 129, 2, 4, 103, 40, 8, 104, 41, 73, 44...",True,4.86,43,4
4,29375215,4,1,1.0,1.0,"[64, 1, 129, 4, 71, 8, 10, 12, 77, 85, 86, 23,...",False,4.54,45,8
