In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
import re
import os
import time
from chromedriver_py import binary_path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [19]:
# --- Scraper configuration -------------------------------------------------
# Input: text file read line by line; each stripped line is used as a location.
location_file = "airbnb_input.txt"

# Work queues: experience ids still to scrape / ids already written out.
todo_file = "airbnb_todo.txt"
done_file = "airbnb_done.txt"

# Output: CSV file name the scraped rows are appended to.
output_file = "airbnb_corpus.csv"

# Request headers passed to requests.get when downloading an experience page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) "
        "AppleWebKit/601.3.9 (KHTML, like Gecko) "
        "Version/9.0.2 Safari/601.3.9"
    ),
}

In [16]:
def setupDriver(url, waiting_time = 2.5):
    ''' Initializes the driver of selenium

    Starts a Chrome session using the chromedriver binary at `binary_path`,
    navigates to `url`, then sleeps for `waiting_time` seconds before
    handing the driver back.

    Args:
        url: address to open in the new browser session.
        waiting_time: seconds to sleep after navigation (default 2.5).

    Returns:
        The selenium Chrome driver, positioned on `url`. The caller is
        responsible for shutting it down.

    Raises:
        Whatever `driver.get` raises; the browser is shut down first so a
        failed navigation does not leave a Chrome instance behind.
    '''
    # NOTE(review): `executable_path` is the older selenium constructor
    # argument; kept as-is to match the selenium version this notebook runs on.
    driver = webdriver.Chrome(executable_path=binary_path)
    try:
        driver.get(url)
    except Exception:
        # Navigation failed: release the browser before propagating.
        driver.quit()
        raise
    time.sleep(waiting_time)
    return driver

def get_content(soup):
    '''Get What you'll do section

    Reads the `content` attribute of the page's `og:description` meta tag
    and drops everything up to and including the first "-" plus the one
    character that follows it (presumably a "<title> - <description>"
    layout -- confirm against a live page).

    Args:
        soup: parsed page exposing `find(name, attrs)`.

    Returns:
        The description text. If the content holds no "-" at all, the
        content is returned unchanged.

    Raises:
        TypeError: if the page has no `og:description` meta tag (`find`
            returned None).
    '''
    meta = soup.find("meta", {"property": "og:description"})
    text = meta["content"]
    _, dash, remainder = text.partition("-")
    if not dash:
        # No separator: str.find would give -1 and the old slice silently
        # chopped off the first character. Keep the text intact instead.
        return text
    # Skip the single character right after the dash (the original i+2 offset).
    return remainder[1:]

def get_location(soup):
    '''Return the text of the first span carrying class `_q9bblh`.

    The notebook treats that first span as the location of the experience.
    Raises IndexError when the page has no such span.
    '''
    tagged_spans = soup.find_all("span", {"class": "_q9bblh"})
    return tagged_spans[0].get_text()

def get_experience_type(soup):
    '''Return the text of the second span carrying class `_q9bblh`.

    The notebook treats that second span as the type of the experience.
    Raises IndexError when the page has fewer than two such spans.
    '''
    tagged_spans = soup.find_all("span", {"class": "_q9bblh"})
    return tagged_spans[1].get_text()

def _extract_experience_ids(links):
    ''' Pulls the experience id out of each listing link, skipping links that do not match '''
    pattern = re.compile(r'https://www\.airbnb\.com/experiences/(.*)\?')
    ids = set()
    for link in links:
        if not link:
            # get_attribute('href') yields None for anchors without an href.
            continue
        match = pattern.search(link)
        if match:
            ids.add(match.group(1))
    return ids


def get_todo_experiences(url, max_pages = 17):
    ''' Extracts list of experiences from the webpage

    Opens `url` in a selenium-driven browser, collects the href of every
    `a._sqvp1j` anchor on the current result page, then clicks the
    `button._m095vcq` button to move on, for at most `max_pages` pages.

    Args:
        url: search-results page to start from.
        max_pages: upper bound on the number of result pages visited
            (default 17, the limit the notebook originally hard-coded).

    Returns:
        A set of experience ids: the part of each collected link between
        "https://www.airbnb.com/experiences/" and the "?" of the query
        string. Links that do not have that shape are ignored.
    '''
    # Imported here so the block is self-contained; selenium is already a
    # dependency of this notebook.
    from selenium.common.exceptions import TimeoutException

    driver = setupDriver(url)
    exp_all_links = []

    try:
        for _ in range(max_pages):
            try:
                # Fixed pause before reading the page (same 6 s as before).
                time.sleep(6)
                exp_list = driver.find_elements(By.XPATH, '//a[@class="_sqvp1j"]')
                exp_links = [exp.get_attribute('href') for exp in exp_list]
                exp_all_links.extend(exp_links)
                wait_button = WebDriverWait(driver, 10)
                next_button = wait_button.until(
                    EC.element_to_be_clickable((By.XPATH, '//button[@class="_m095vcq"]')))
                next_button.click()
            except TimeoutException:
                # The button never became clickable within 10 s; presumably
                # this is the last page, so stop instead of re-scraping it.
                break
            except Exception as exc:
                # Best effort: report the problem and try the next round.
                # KeyboardInterrupt is no longer swallowed.
                print("page scrape failed:", repr(exc))
    finally:
        # quit() ends the whole browser session and the chromedriver
        # process; close() only closes the current window.
        driver.quit()

    return _extract_experience_ids(exp_all_links)

In [6]:
# Locations still to be searched, one per non-blank line of `location_file`.
todo_locations = set()
if os.path.exists(location_file):
    with open(location_file, encoding="utf-8") as f:
        # Strip each line and drop blank ones: an empty location would be
        # turned into an invalid search URL by the crawl cell.
        todo_locations.update(name for name in map(str.strip, f) if name)
todo_locations

{'Australia',
 'Canada',
 'France',
 'India',
 'Italy',
 'Japan',
 'Spain',
 'Thailand',
 'USA',
 'Vietnam'}

In [7]:
# Drain the location queue: search each location and append the experience
# ids found to `todo_file`, one id per line.
while todo_locations:
    current_location = todo_locations.pop()
    print(current_location)
    url = r"https://www.airbnb.com/s/" + current_location + "/experiences"
    experiences = get_todo_experiences(url)

    # Context manager so the file is closed even if a write raises; ids are
    # written sorted so repeated runs produce the same line order.
    with open(todo_file, "a", encoding="utf-8") as fout:
        fout.writelines(item + "\n" for item in sorted(experiences))

Thailand
USA
Vietnam
France
India
Spain
Australia
Japan
Italy
Canada


In [28]:
# Work queues for the scraping cell.
todo_experiences = set()   # ids read from `todo_file`
done_experiences = set()   # ids read from `done_file`
fail_expericenes = set()   # starts empty; name kept as-is for other cells

if os.path.exists(todo_file):
    with open(todo_file, encoding="utf-8") as f:
        # Skip blank lines so no empty id ends up in the queue.
        todo_experiences.update(exp_id for exp_id in map(str.strip, f) if exp_id)

if os.path.exists(done_file):
    with open(done_file, encoding="utf-8") as f:
        done_experiences.update(exp_id for exp_id in map(str.strip, f) if exp_id)

In [29]:
# Scrape every queued experience page and append one CSV row per success.
MAX_ATTEMPTS = 10        # tries per experience before it is given up on
REQUEST_TIMEOUT = 30     # seconds; without a timeout requests.get can block forever
csv_path = "../../data/" + output_file

while todo_experiences:
    current_exp = todo_experiences.pop()

    if current_exp in done_experiences:
        continue

    url = r"https://www.airbnb.com/experiences/" + current_exp
    success = False

    for count in range(MAX_ATTEMPTS):
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Surface HTTP errors directly instead of as a parse failure.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "lxml")
            content = get_content(soup)
            location = get_location(soup)
            exp_type = get_experience_type(soup)
            # Decide on the header from the file itself (missing or empty),
            # evaluated before the append-mode open creates the file.
            write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
            with open(csv_path, 'a', newline='', encoding="utf-8") as csv_file:
                csvWriter = csv.writer(csv_file)
                if write_header:
                    csvWriter.writerow(["id", "text", "location", "type"])
                # NOTE(review): the "id" column receives the full URL, not
                # the bare experience id -- kept to preserve the CSV schema.
                csvWriter.writerow([url, content, location, exp_type])
            success = True
            break
        except Exception as exc:
            # `Exception` (not a bare except) so the cell can be interrupted;
            # the reason is printed so failures can be diagnosed.
            print(count, " fail!", current_exp, repr(exc))
            time.sleep(1)

    if not success:
        # Remember what could not be scraped so it can be inspected later.
        fail_expericenes.add(current_exp)
        continue

    done_experiences.add(current_exp)
    with open(done_file, "a", encoding="utf-8") as fout:
        fout.write(current_exp + "\n")
    time.sleep(1)

0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1  fail!
2  fail!
3  fail!
4  fail!
5  fail!
6  fail!
7  fail!
8  fail!
9  fail!
0  fail!
1