In [80]:
import pandas as pd

import re

#webscraping
import time, os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent 



In [81]:
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [82]:
def get_state_df(state):
    '''
    A fucntion for scraping TripAdvisor to get all info for top 30 attractions for one state

    Parameters
    ----------
    state as a string

    Returns
    -------
    df containing all top attractions and images for a state
    '''
    URL = "https://www.tripadvisor.com/Attractions/"
    driver = webdriver.Chrome(chromedriver)
    driver.get(URL)
    time.sleep(2)  #pause to be sure page has loaded
    
    #find state with search on main page
    search = driver.find_element_by_name('q')
    search.send_keys(state + ", United States"  )
    time.sleep(2)
    search.send_keys(Keys.DOWN)
    if state == 'Washington DC':
        search.send_keys(Keys.DOWN)
    search.send_keys(Keys.RETURN)

    # click Things to Do button
    things_to_do_button = driver.find_elements_by_xpath('//*[contains(text() , "Things to Do")]')[0]
    things_to_do_button.click()
    time.sleep(2)
    
    error = 0
    try:
        # click second See All button
        see_all_button = driver.find_elements_by_xpath('//*[contains(text() , "See all")]')[1]
        see_all_button.click()
        time.sleep(2)
    except:
        error = error + 1
    
    #check if second window appears, if yes switch back to main window and click see more button
    try: 
        driver.switch_to.window(driver.window_handles[1])

        driver.switch_to.window(driver.window_handles[0])

        #click See more button
        attraction_button = driver.find_elements_by_xpath('//*[contains(text() , "See more")]')[0]
        attraction_button.click()
        
        #initalize BeautifulSoup
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "lxml")
    
    except:
        #initalize BeautifulSoup
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "lxml")
        
    url = "https://www.tripadvisor.com"
    links = {}
    body = soup.find_all('body')[0]
    link_list = body.find_all( 'a') #find all links on page
    
    #loop through range to get numbers 1-30, we want top 30 attractions of each state, then for each number
    #loop through links, add attraction name and link to links dictionary if number match outside loop
    for x in range(1,31):
        find = str(x) + "[.] "
        for l in link_list:
            if re.search(find,l.text):
                name = l.text
                name = re.sub(r'^.*? ', '', name) #clean name
                links[name] = url + l['href']
    
    driver.quit() #close browser

    return get_state_attraction_info(links, state)

In [91]:
def get_state_attraction_info(link_dict, state):
    '''
    A fucntion for scraping TripAdvisor to get locations and image links for top 30 attractions for one state

    Parameters
    ----------
    dictionary, key is attraction name and value is link

    Returns
    -------
    df containing all top attractions, locations, and images for a state
    '''
    df = pd.DataFrame(columns = ['name', 'location', 'img_num'])
    idx = 0
    img_link_dict = {}
    
    ua = UserAgent()
    headers = {'user-agent': ua.random}
    
    #loop through dictionary append info to df
    for key, value in link_dict.items():

        response = requests.get(value, headers = headers)
        page= response.text
        soup = BeautifulSoup(page, "lxml")
        time.sleep(1)  #pause to be sure page has loaded
        
        #try to find attraction location, return state name if none
        try:
            body = soup.find('div', {'data-tab' :'TABS_LOCATION'})
            divs = body.find_all('span')
            address = divs[3].text
        except:
            address = state
            
        img_link_list = get_photo_links(value)
        img_link_dict[key] = img_link_list
        
        df = df.append(pd.DataFrame({'name':key, 'location': address, 'img_num' : len(img_link_list)}, 
                                    index=[idx]), ignore_index=True)
        
        idx = idx + 1
        
        #change user agent, chosen randomly every 3 loops
        if idx%3 == 0:
            headers = {'user-agent': ua.random}
    
    return (df,img_link_dict)

In [92]:
def get_photo_links(URL):
    '''
    A fucntion for scraping TripAdvisor to get all image links in a gallery for a specified attraction

    Parameters
    ----------
    url of attraction

    Returns
    -------
    list containing all image links for an attraction
    '''
    driver = webdriver.Chrome(chromedriver)
    driver.get(URL)
    time.sleep(2)  #pause to be sure page has loaded

    try:
        #click All photos button
        all_photos_button = driver.find_elements_by_xpath('//*[contains(text() , "All photos")]')[0]
        all_photos_button.click()
        time.sleep(2)
    except:
        return []
    
    error = 0
    try:
        #click first photo
        all_photos_button = driver.find_elements_by_class_name("photoGridImg")[0]
        all_photos_button.click()
        time.sleep(1)
    except:
        error = error +1
    
    #initalize BeautifulSoup
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "lxml")
    time.sleep(1)
    
    images = soup.find_all('img') #find all imgs
    
    #loop through imgs, check for 2 image types, add to list if it is a TripAdvisor user uploaded photo
    img_links = []
    errors = 0
    for i in images[:-1]:
        try:
            try:
                link = i.attrs['data-lazyurl']
                if re.search('photo',link):
                    img_links.append(link)
            except:
                link = i.attrs['src']
                if re.search('photo',link):
                    img_links.append(link)
        except:
            errors = errors + 1

    driver.quit()

    return img_links


### Get top 30 attarctions for each state in continental US with location and 50+ images

In [93]:
# loc_df = pd.DataFrame(columns = ['name', 'location', 'img_num'])
# img_dict = {}


In [94]:
states = [
          "Kansas","Kentucky", "Louisiana",
          "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana",
          "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
          "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
          "Tennessee","Texas","Utah","Virginia","Vermont","Washington","Wisconsin","West Virginia", "Wyoming"]
# "Alabama","Arkansas", "Arizona","California","Colorado","Connecticut", "Washington DC", "Delaware",
# "Florida", "Georgia", "Iowa", "Idaho", "Illinois", "Indiana", 

for s in states:
    df, images = get_state_df(s)
#     print(images)
#     print(df)
    loc_df = pd.concat([loc_df, df], axis=0, ignore_index=True)
    img_dict.update(images)

In [95]:
loc_df


Unnamed: 0,name,location,img_num
0,Barber Vintage Motorsports Museum,"6030 Barber Motorsports, Birmingham, AL 35094-...",58
1,Rosa Parks Library and Museum,"251 Montgomery St 251 Montgomery Street, 36104...",58
2,Orange Beach Welcome Center,"23685 Perdido Beach Blvd, Orange Beach, AL 365...",23
3,U.S. Space and Rocket Center,"1 Tranquility Base, Huntsville, AL 35805-3371",58
4,Adventure Island,"24559 Perdido Beach Blvd, Orange Beach, AL 365...",58
...,...,...,...
1464,Antelope Flats,"Jackson, WY",58
1465,Medicine Bow National Forest,"2468 Jackson St, Laramie, WY 82070-6535",58
1466,Chief Joseph Scenic Highway,WY,58
1467,Grand Teton National Park,Wyoming,58


In [100]:
len(img_dict)*30

42690

In [101]:
loc_df.to_pickle('../Data/attractions_loc_df.pkl')

FileNotFoundError: [Errno 2] No such file or directory: '../Data/attractions_loc_df.pkl'