In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import json

In [2]:
# initiating the webdriver. Parameter includes the path of the webdriver.
driver = webdriver.Chrome('./chromedriver') 

In [3]:
class City:
    def __init__(self, cityName, cityHref):
        self.cityName = cityName
        self.cityHref = cityHref
    
    def __eq__(self, other):
        return self.cityHref == other.cityHref
    
    def __hash__(self):
        return hash(('city', self.cityName, 'city_href', self.cityHref))

In [4]:
def goToPageAndGetSoup(URL):
    driver.get(URL)
    
    # this is just to ensure that the page is loaded
    time.sleep(15) 
    
    html = driver.page_source
    
    soup = BeautifulSoup(html, 'html.parser')
    return soup

In [5]:
# URL = 'http://api.scraperapi.com/?api_key=c59648ab998ca23833e502f4638bfc49&url=https://www.grubhub.com/food/the_ice_cream_shop&render=true&country_code=us'
# page = requests.get(URL)
URL = "https://www.grubhub.com/food/the_ice_cream_shop"

In [6]:
main_soup = goToPageAndGetSoup(URL)

In [7]:
def getAllCities(soup):
    all_s_row_elems = soup.find_all('div', class_='s-row', recursive=True)
    print(len(all_s_row_elems))
    state_elems = all_s_row_elems[10:78]
    print(len(state_elems)) # should be 68
    
    cities = []
    cityHrefs = []

    for i, state_elem in enumerate(state_elems):
        city_elems = state_elem.find_all('a', recursive=True)
        for j, city_elem in enumerate(city_elems):
            cities.append(city_elem.text)
            cityHrefs.append("https://www.grubhub.com{}".format(city_elem['href']))

    print(len(cities)) # Should be 5550
    print(len(cityHrefs)) #^
    
    # Converting into objects
    cityObjs = []
    for i, city in enumerate(cities):
        cityObjs.append(City(city, cityHrefs[i]))
        
    uniqueCities = list(set(cityObjs))
    print(len(uniqueCities))
    return uniqueCities

In [8]:
uniqueCities = getAllCities(main_soup)
print(len(uniqueCities))

84
68
5550
5550
2775
2775


In [9]:
def getAllStoresInCitySoup(city):
    URL = city.cityHref
    soup = goToPageAndGetSoup(URL)
    
    all_s_row_elems = soup.find_all('div', class_='s-row', recursive=True)
    all_shops = list(filter(lambda x: "See menu" in x.text, all_s_row_elems))
    all_shops = list(filter(lambda x: len(x['class']) == 1, all_shops))
    
    # See if there are more
    while True:
        next_btn = soup.find("li", class_="page-item s-pagination-next", recursive=True)
        if next_btn is None or "disabled" in next_btn["class"]:
            break
        else:
            nextURL = next_btn.find("a")["href"]
            nextURL = "https:{}".format(nextURL)
            
            soup = goToPageAndGetSoup(nextURL)
            all_s_row_elems = soup.find_all('div', class_='s-row', recursive=True)
            all_next_shops = list(filter(lambda x: "See menu" in x.text, all_s_row_elems))
            all_shops += list(filter(lambda x: len(x['class']) == 1, all_next_shops))
    
    return all_shops

In [10]:
class Store:
    def __init__(self, city, cityHref, name, storeHref, address, number):
        self.city = city
        self.cityHref = cityHref
        self.name = name
        self.storeHref = storeHref
        self.address = address
        self.number = number
        
    def __str__(self):
        return "{}\n{}\n{}\n{}\n{}\n{}".format(self.city, self.cityHref, self.name, self.storeHref, self.address, self.number)

In [11]:
def getStoreNumber(shop_url):
    soup = goToPageAndGetSoup(shop_url)
    all_buttons = soup.find_all('button', class_='u-noWrap s-btn s-btn-tertiary u-padding-cancel restaurant-phone-button', recursive=True)
    return all_buttons[0].text

In [12]:
def getInfoAllStores(allStoresSoup, city):
    results = []
    for i, storeSoup in enumerate(allStoresSoup):
        
        # Get name
        name = ""
        for i, child in enumerate(storeSoup.find("h5").children):
            if i == 1:
                name = child
        
        # Get address
        all_child_divs = storeSoup.find_all("div", recursive=True)
        address = all_child_divs[6].text
        
        # Get href
        shop_url = storeSoup.find_all("a", recursive=True)[1]['href']
        shop_url = "https://www.grubhub.com{}".format(shop_url)
        
        # Get phone number
        phone = getStoreNumber(shop_url)
        
        newStore = Store(city.cityName, city.cityHref, name, shop_url, address, phone)
        results.append(newStore)
    
    return results

In [13]:
def exploreAllCities(uniqueCities):
    allStores = []
    for i, city in enumerate(uniqueCities):
        allStoresSoup = getAllStoresInCitySoup(city)
        allStores += getInfoAllStores(allStoresSoup, city)
    return allStores

In [None]:
allStores = []
for i, city in enumerate(uniqueCities):
    tempStores = exploreAllCities([city])
    allStores += tempStores
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41


In [19]:
print(len(allStores))

14
Greene
https://www.grubhub.com/food/the_ice_cream_shop/me-greene
The Ice Cream Shop
https://www.grubhub.com/restaurant/the-ice-cream-shop-145-college-st-lewiston/2403267
145 College St, Lewiston, ME, 04240
(207) 786-0785


In [15]:
# Use to find the store for debugging
for i, city in enumerate(uniqueCities):
    if "ca-los_angeles" in city.cityHref:
        print(i)
        break

2487
