### Solutions for Web Scraping Homework

In [1]:
# imports
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [42]:
# connect to homework
# Fetch one page of Yelp search results; `i` is the `start` offset in the query string.
i = 30
url = f'https://www.yelp.com/search?find_desc=Restaurants&find_loc=London%2C+United+Kingdom&ns=1&start={i}'
# Bound the wait so a stalled connection cannot hang the notebook.
req = requests.get(url, timeout=30).text
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed, so the parse tree can differ between machines.
scraper = BeautifulSoup(req, 'html.parser')
# Use the container class directly (the same value the full-site loop uses)
# instead of `classes[1]`, which is only defined in a later cell and therefore
# fails on a fresh kernel.
restaurants = scraper.find_all('div', {'class': 'container__373c0__3HMKB'})

In [7]:
# CSS class tokens, one per entry; index 1 is the one used to look up restaurant divs.
classes = [
    'lemon--div__373c0__1mboc',
    'container__373c0__3HMKB',
    'hoverable__373c0__VqkG7',
    'margin-t3__373c0__1l90z',
    'margin-b3__373c0__q1DuY',
    'padding-t3__373c0__1gw9E',
    'padding-r3__373c0__57InZ',
    'padding-b3__373c0__342DA',
    'padding-l3__373c0__1scQ0',
    'border--top__373c0__3gXLy',
    'border--right__373c0__1n3Iv',
    'border--bottom__373c0__3qNtD',
    'border--left__373c0__d1B7K',
    'border-color--default__373c0__3-ifU',
]

In [43]:
# sanity check: how many restaurant divs were matched on this page
len(restaurants)

30

### Scraping Titles, Categories

In [23]:
# One group of styled anchors per restaurant div.
restaurant_links = [
    card.find_all('a', {'class': 'link-color--inherit__373c0__3dzpk'})
    for card in restaurants
]
# Flatten the groups, keeping the text of every anchor whose href contains 'biz'.
titles = [
    anchor.text
    for anchors in restaurant_links
    for anchor in anchors
    if 'biz' in anchor['href']
]

In [37]:
# no list comprehension
# Same result as the comprehension version, written as explicit loops.
titles = []
for anchors in restaurant_links:
    for anchor in anchors:
        # Skip anchors that are not restaurant ('biz') links.
        if 'biz' not in anchor['href']:
            continue
        titles.append(anchor.text)

In [54]:
# restaurant categories
# A category link is any anchor whose href contains 'find_desc'; keep one list
# of category names per restaurant.
list_of_cats = []
for link_groups in restaurant_links:
    restaurant_cats = []
    for link in link_groups:
        if 'find_desc' in link['href']:
            restaurant_cats.append(link.text)
    list_of_cats.append(restaurant_cats)


def _category_column(position):
    """Return the category at `position` for every restaurant, None where a restaurant has fewer."""
    return [cats[position] if position < len(cats) else None for cats in list_of_cats]


# Fill the per-position columns that the DataFrame cell reads. These lists were
# previously created empty and never populated, which leaves the DataFrame
# columns with mismatched lengths on a fresh kernel.
category_one = _category_column(0)
category_two = _category_column(1)
category_three = _category_column(2)
category_four = _category_column(3)

# master_list collects every category column, stopping at the first column in
# which no restaurant has a value.
master_list = []
column_index = 0  # own name, so the pagination offset `i` is not overwritten
there_are_lists = True
while there_are_lists:
    cat_list = _category_column(column_index)
    there_are_lists = any(cat_list)
    if there_are_lists:
        master_list.append(cat_list)
        column_index += 1

In [56]:
# third category of every restaurant; None where a restaurant has fewer than three
master_list[2]

[None,
 None,
 None,
 None,
 None,
 'Cajun/Creole',
 None,
 None,
 'Gastropubs',
 None,
 None,
 None,
 None,
 None,
 'British',
 None,
 None,
 None,
 None,
 None,
 None,
 'Cocktail Bars',
 None,
 'Noodles',
 'Cocktail Bars',
 None,
 None,
 None,
 None,
 None]

### Scraping Phone Numbers, Neighborhoods, Reviews

In [410]:
# paragraphs
# Every <p> tag inside each restaurant div, grouped per restaurant.
restaurant_paragraphs = []
for card in restaurants:
    restaurant_paragraphs.append(card.find_all('p'))

In [62]:
# get the phone numbers
# The first paragraph is kept when it is all digits once spaces are removed;
# otherwise the restaurant gets None.
phone_numbers = [
    paragraphs[0].text if paragraphs[0].text.replace(' ', '').isdigit() else None
    for paragraphs in restaurant_paragraphs
]

In [67]:
# without a comprehension
phone_numbers = []
for paragraphs in restaurant_paragraphs:
    first_text = paragraphs[0].text
    # Digits-only after stripping spaces means the first paragraph is a phone number.
    is_phone = first_text.replace(' ', '').isdigit()
    phone_numbers.append(first_text if is_phone else None)

In [420]:
# finding out neighborhoods
# The neighborhood is read from the end of each paragraph group. A review
# (recognised by its '\xa0more' suffix) is sometimes missing, so only step past
# the last paragraph when it really is a review -- the same handling the
# full-site loop further down uses.
neighborhoods = []
for paragraph_group in restaurant_paragraphs:
    position = -2 if '\xa0more' in paragraph_group[-1].text else -1
    # When a 'Delivery' paragraph occupies that slot, the neighborhood is one earlier.
    if paragraph_group[position].text == 'Delivery':
        position -= 1
    neighborhoods.append(paragraph_group[position].text)

In [242]:
# reviews -- sometimes they are missing, hence the check
# A review is the last paragraph when it ends with the '\xa0more' marker.
# Every restaurant is handled the same way: the former special case for the
# last restaurant indexed a single <p> Tag with an integer, which bs4 treats
# as an attribute lookup and therefore raises KeyError.
reviews = []
for paragraph_group in restaurant_paragraphs:
    last_text = paragraph_group[-1].text
    if '\xa0more' in last_text:
        reviews.append(last_text.replace('\xa0more', ''))
    else:
        reviews.append(None)

### Scraping Price Ranges, Addresses

In [230]:
# Every <span> tag inside each restaurant div, grouped per restaurant.
restaurant_spans = []
for card in restaurants:
    restaurant_spans.append(card.find_all('span'))

In [286]:
# number of reviews
# Read from the fifth span (index 4); None when its text is not purely digits.
num_reviews = []
for spans in restaurant_spans:
    count_text = spans[4].text
    num_reviews.append(int(count_text) if count_text.isdigit() else None)

In [300]:
# price ranges -- sometimes they have blanks:
price_ranges = []
for spans in restaurant_spans:
    # Try span 5, then span 6; take the first whose text contains a pound sign,
    # or None when neither does. The generator is lazy, so span 6 is only
    # touched when span 5 does not match.
    price = next(
        (spans[position].text for position in (5, 6) if '\xA3' in spans[position].text),
        None,
    )
    price_ranges.append(price)

In [321]:
# for addresses

# helper function to check if a given piece of text is an address or not
def is_address(text):
    """Return True when `text` looks like a street address.

    The text qualifies when its first whitespace-separated word is numeric
    (hyphens and slashes are ignored, so '34-66' and '4/6' count) and its
    last word is purely alphabetic.

    Args:
        text: the string to inspect.

    Returns:
        bool: True for address-like text, False otherwise (including empty
        or whitespace-only text).
    """
    words = text.split()
    # Explicit empty check instead of a bare `except:` around the indexing.
    if not words:
        return False
    # accommodates for values like 34-66, 4/6, etc
    house_number = words[0].replace('-', '').replace('/', '')
    return house_number.isdigit() and words[-1].isalpha()

addresses = []
for spans in restaurant_spans:
    # Collect every span text that passes is_address; when several match, the
    # last one is kept, and a restaurant with no match gets None.
    candidates = [span.text for span in spans if is_address(span.text)]
    addresses.append(candidates[-1] if candidates else None)

### Scraping Average Rating

In [345]:
# get all the divs
# Only divs carrying role="img" are collected, grouped per restaurant.
restaurant_divs = []
for card in restaurants:
    restaurant_divs.append(card.find_all('div', {'role': 'img'}))

In [358]:
# The rating is parsed from the markup of each restaurant's first role="img"
# div: take the first double-quoted value, then its first whitespace-separated
# token, and convert that to a float.
restaurant_ratings = []
for rating_divs in restaurant_divs:
    markup = str(rating_divs[0])
    first_quoted_value = markup.split('"')[1]
    restaurant_ratings.append(float(first_quoted_value.split()[0]))

### Final DataFrame

In [138]:
# neighborhoods
# Every restaurant is handled the same way. The former special case for the
# last restaurant indexed a single <p> Tag with an integer, which bs4 treats as
# an attribute lookup and therefore raises KeyError. The lookup also accounts
# for a missing review and for a 'Delivery' paragraph, matching the full-site
# loop further down.
neighborhoods = []
for paragraph_group in restaurant_paragraphs:
    # Step past the review paragraph (marked by '\xa0more') only when present.
    position = -2 if '\xa0more' in paragraph_group[-1].text else -1
    # When a 'Delivery' paragraph occupies that slot, the neighborhood is one earlier.
    if paragraph_group[position].text == 'Delivery':
        position -= 1
    neighborhoods.append(paragraph_group[position].text)

In [363]:
# Pair each output column name with the list scraped for it, in display order.
column_names = [
    'Name',
    'PhoneNumber',
    'Address',
    'Neighborhood',
    'PriceRange',
    'AvgRating',
    'NumRatings',
    'CategoryOne',
    'CategoryTwo',
    'CategoryThree',
    'Review',
]
column_values = [
    titles,
    phone_numbers,
    addresses,
    neighborhoods,
    price_ranges,
    restaurant_ratings,
    num_reviews,
    category_one,
    category_two,
    category_three,
    reviews,
]
df_dict = dict(zip(column_names, column_values))

df = pd.DataFrame(data=df_dict)
df

Unnamed: 0,Name,PhoneNumber,Address,Neighborhood,PriceRange,AvgRating,NumRatings,CategoryOne,CategoryTwo,CategoryThree,Review
0,Dinner by Heston Blumenthal,020 7201 3833,66 Knightsbridge,Hyde Park,££££,4.5,289,British,,,“Still our favourite restaurant in London! Din...
1,Kazan,020 7233 7100,93-94 Wilton Road,Victoria,££,4.5,112,Turkish,,,"“Absolutely delicious food, best Turkish food ..."
2,Wright Brothers - South Kensington,020 7581 0131,56 Old Brompton Road,South Kensington,££,4.5,22,Seafood,,,“Dinner at Wright Bros was a great way to wrap...
3,Barrafina,020 7440 1456,43 Drury Lane,Covent Garden,££,4.5,61,Spanish,Tapas Bars,,"“A stone's throw from our hotel, we walked ove..."
4,Belgo Centraal,020 7813 2233,50 Earlham Street,Covent Garden,££,4.0,319,Belgian,,,“Don't let the entrance fool you! When you wal...
5,Lanzhou Noodle Bar,020 7467 4546,33 Cranbourne Street,Covent Garden,£,4.0,351,Chinese,Noodles,,“We got to our hotel somewhere in between earl...
6,Laughing Halibut,020 7799 2844,38 Strutton Ground,Westminster,££,4.0,243,Fish & Chips,,,“Just plain YES! Go here if you want to enjoy ...
7,Duck & Waffle Local,020 3900 4444,52 Haymarket,Leicester Square,££,4.5,142,Bars,Waffles,,“My first review of 2020! I was in Europe for ...
8,Yasmeen Restaurant,020 7624 2921,1 Blenheim Terrace,St John's Wood,££,5.0,14,Lebanese,Mediterranean,,“The only sad part of my review is that we onl...
9,Barrafina,020 7440 1456,26 Dean Street,Soho,£££,4.0,37,Tapas/Small Plates,Spanish,,“A lovely spot with authentic tapas and a dive...


### Final Answer: Looping Through the Entire Website

In [362]:
# build and display a DataFrame from the current df_dict
pd.DataFrame(df_dict)

Unnamed: 0,Name,PhoneNumber,Address,Neighborhood,PriceRange,AvgRating,NumRatings,CategoryOne,CategoryTwo,CategoryThree,Review
0,Dinner by Heston Blumenthal,020 7201 3833,66 Knightsbridge,Hyde Park,££££,4.5,289,British,,,“Still our favourite restaurant in London! Din...
1,Kazan,020 7233 7100,93-94 Wilton Road,Victoria,££,4.5,112,Turkish,,,"“Absolutely delicious food, best Turkish food ..."
2,Wright Brothers - South Kensington,020 7581 0131,56 Old Brompton Road,South Kensington,££,4.5,22,Seafood,,,“Dinner at Wright Bros was a great way to wrap...
3,Barrafina,020 7440 1456,43 Drury Lane,Covent Garden,££,4.5,61,Spanish,Tapas Bars,,"“A stone's throw from our hotel, we walked ove..."
4,Belgo Centraal,020 7813 2233,50 Earlham Street,Covent Garden,££,4.0,319,Belgian,,,“Don't let the entrance fool you! When you wal...
5,Lanzhou Noodle Bar,020 7467 4546,33 Cranbourne Street,Covent Garden,£,4.0,351,Chinese,Noodles,,“We got to our hotel somewhere in between earl...
6,Laughing Halibut,020 7799 2844,38 Strutton Ground,Westminster,££,4.0,243,Fish & Chips,,,“Just plain YES! Go here if you want to enjoy ...
7,Duck & Waffle Local,020 3900 4444,52 Haymarket,Leicester Square,££,4.5,142,Bars,Waffles,,“My first review of 2020! I was in Europe for ...
8,Yasmeen Restaurant,020 7624 2921,1 Blenheim Terrace,St John's Wood,££,5.0,14,Lebanese,Mediterranean,,“The only sad part of my review is that we onl...
9,Barrafina,020 7440 1456,26 Dean Street,Soho,£££,4.0,37,Tapas/Small Plates,Spanish,,“A lovely spot with authentic tapas and a dive...


In [29]:
# Pagination state: `i` is the `start` offset placed in the search URL and
# `num_round` counts pages for the progress message.
i, num_round = 0, 1

# One independent accumulator list per output column; each page's values are
# appended to these.
(
    total_titles,
    total_phone_numbers,
    total_addresses,
    total_neighborhoods,
    total_price_ranges,
    total_restaurant_ratings,
    total_num_reviews,
    total_category_one,
    total_category_two,
    total_category_three,
    total_reviews,
) = [[] for _ in range(11)]

looping = True

# helper function to check if a given piece of text is an address or not
def is_address(text):
    """Return True when `text` looks like a street address.

    The text qualifies when its first whitespace-separated word is numeric
    (hyphens and slashes are ignored, so '34-66' and '4/6' count) and its
    last word is purely alphabetic.

    Args:
        text: the string to inspect.

    Returns:
        bool: True for address-like text, False otherwise (including empty
        or whitespace-only text).
    """
    words = text.split()
    # Explicit empty check instead of a bare `except:` around the indexing.
    if not words:
        return False
    # accommodates for values like 34-66, 4/6, etc
    house_number = words[0].replace('-', '').replace('/', '')
    return house_number.isdigit() and words[-1].isalpha()

# Walk the search results page by page, appending each page's columns to the
# total_* accumulators, until a page yields no restaurant divs.
while looping:
    url = f'https://www.yelp.com/search?find_desc=Restaurants&find_loc=London%2C+United+Kingdom&ns=1&start={i}'
    # Bound the wait so a stalled connection cannot hang the whole crawl.
    req = requests.get(url, timeout=30)
    # Name the parser explicitly: without it bs4 picks whichever parser happens
    # to be installed, so the parse tree can differ between machines.
    scraper = BeautifulSoup(req.text, 'html.parser')
    restaurants = scraper.find_all('div', {'class': 'container__373c0__3HMKB'})
    if len(restaurants) == 0:
        print("No more restaurants found.  Breaking the loop.")
        break

    #### Getting All Links, Code for Titles and Categories
    restaurant_links = [restaurant.find_all('a', {'class': 'link-color--inherit__373c0__3dzpk'}) for restaurant in restaurants]
    # titles: text of every link whose href contains 'biz'
    titles = [link.text for link_group in restaurant_links for link in link_group if 'biz' in link['href']]
    total_titles.extend(titles)

    # restaurant categories: text of every link whose href contains 'find_desc'
    list_of_cats = []
    for link_groups in restaurant_links:
        restaurant_cats = []
        for link in link_groups:
            if 'find_desc' in link['href']:
                restaurant_cats.append(link.text)
        list_of_cats.append(restaurant_cats)
    # First three categories per restaurant, None where a restaurant has fewer.
    # Explicit length checks replace the former bare `except:` blocks.
    category_one = [cat[0] if len(cat) > 0 else None for cat in list_of_cats]
    category_two = [cat[1] if len(cat) > 1 else None for cat in list_of_cats]
    category_three = [cat[2] if len(cat) > 2 else None for cat in list_of_cats]
    total_category_one.extend(category_one)
    total_category_two.extend(category_two)
    total_category_three.extend(category_three)

    # getting all the paragraphs within each restaurant
    restaurant_paragraphs = [restaurant.find_all('p') for restaurant in restaurants]

    # get the phone numbers: first paragraph, when it is digits-only once spaces are removed
    phone_numbers = [paragraph_group[0].text if paragraph_group[0].text.replace(' ', '').isdigit() else None for paragraph_group in restaurant_paragraphs]
    total_phone_numbers.extend(phone_numbers)

    # get the neighborhoods
    neighborhoods = []
    for paragraph_group in restaurant_paragraphs:
        # A trailing review paragraph is marked by '\xa0more'; when present the
        # neighborhood lookup starts one paragraph earlier.
        if '\xa0more' in paragraph_group[-1].text:
            if paragraph_group[-2].text != 'Delivery':
                neighborhoods.append(paragraph_group[-2].text)
            else:
                neighborhoods.append(paragraph_group[-3].text)
        else:
            if paragraph_group[-1].text != 'Delivery':
                neighborhoods.append(paragraph_group[-1].text)
            else:
                neighborhoods.append(paragraph_group[-2].text)

    total_neighborhoods.extend(neighborhoods)

    # get the reviews
    reviews = []
    for paragraph_group in restaurant_paragraphs:
        if '\xa0more' in paragraph_group[-1].text:
            reviews.append(paragraph_group[-1].text.replace('\xa0more', ''))
        else:
            reviews.append(None)

    total_reviews.extend(reviews)

    # get all the spans in a given page
    restaurant_spans = [restaurant.find_all('span') for restaurant in restaurants]

    # get the number of reviews
    num_reviews = [int(span_group[4].text) if span_group[4].text.isdigit() else None for span_group in restaurant_spans]
    total_num_reviews.extend(num_reviews)

    # get the price ranges: span 5, else span 6, whichever contains a pound sign
    price_ranges = []
    for span_group in restaurant_spans:
        if '\xA3' in span_group[5].text:
            price_ranges.append(span_group[5].text)
        elif '\xA3' in span_group[6].text:
            price_ranges.append(span_group[6].text)
        else:
            price_ranges.append(None)
    total_price_ranges.extend(price_ranges)

    # get the addresses: the last span whose text passes is_address, else None
    addresses = []
    for span_group in restaurant_spans:
        has_address = False
        for group in span_group:
            if is_address(group.text):
                has_address = True
                address = group.text
        if has_address:
            addresses.append(address)
        else:
            addresses.append(None)
    total_addresses.extend(addresses)

    # get all the rating divs
    restaurant_divs = [restaurant.find_all('div', {'role': 'img'}) for restaurant in restaurants]

    # and get the restaurant ratings: first token of the first quoted value in the div's markup
    restaurant_ratings = [float(str(restaurant[0]).split('"')[1].split()[0]) for restaurant in restaurant_divs]
    total_restaurant_ratings.extend(restaurant_ratings)

    print(f'Finished Round {num_round}, value of i: {i}')
    # advance the `start` offset to the next page of results
    i += 30
    num_round += 1
    
# Assemble the accumulated per-column lists into the final table, in display order.
column_pairs = [
    ('Name', total_titles),
    ('PhoneNumber', total_phone_numbers),
    ('Address', total_addresses),
    ('Neighborhood', total_neighborhoods),
    ('PriceRange', total_price_ranges),
    ('AvgRating', total_restaurant_ratings),
    ('NumRatings', total_num_reviews),
    ('CategoryOne', total_category_one),
    ('CategoryTwo', total_category_two),
    ('CategoryThree', total_category_three),
    ('Review', total_reviews),
]
df_dict = dict(column_pairs)

df = pd.DataFrame(data=df_dict)
df

Finished Round 1, value of i: 0
No more restaurants found.  Breaking the loop.


Unnamed: 0,Name,PhoneNumber,Address,Neighborhood,PriceRange,AvgRating,NumRatings,CategoryOne,CategoryTwo,CategoryThree,Review
0,Dishoom,020 7420 9320,12 Upper Saint Martin's Lane,Covent Garden,££,4.5,1842,Indian,,,“Kicking off our first international trip with...
1,The Mayfair Chippy,020 7741 2233,14 North Audley Street,Mayfair,££,4.5,276,Fish & Chips,,,“I had the Mayfair Classic during my visit to ...
2,Ffiona’s Restaurant,020 7937 4152,51 Kensington Church Street,Kensington,££,4.5,270,British,,,“Ffiona's is easily my favorite restaurant in ...
3,Flat Iron,,17 Beak Street,Soho,££,4.5,377,Steakhouses,,,“Ok guys...here is how it is. We were in Londo...
4,Restaurant Gordon Ramsay,020 7352 4441,68 Royal Hospital Road,Chelsea,££££,4.5,204,French,British,,“Compared to Michelin 3-star restaurants in Ca...
5,The Fat Bear - Temp. CLOSED,020 7236 2498,61 Carter Lane,Blackfriars,££,4.5,122,American (Traditional),Soul Food,Cajun/Creole,“A most enjoyable experience. I had personally...
6,Dishoom,020 7420 9322,22 Kingly Street,Soho,££,4.5,544,Indian,,,“I visited Dishoom during my recent London tri...
7,Mother Mash,020 7494 9644,26 Ganton Street,Soho,££,4.0,468,British,,,“Soho is full of culture and amazing places to...
8,The Queens Arms,020 7834 3313,11 Warwick Way,Victoria,££,4.5,118,British,Pubs,Gastropubs,“The staff at The Queens Arms were amazing ton...
9,The Grazing Goat,020 7724 7243,6 New Quebec Street,Marylebone,££,4.0,240,Gastropubs,British,,“Sometimes it's just the little things. It wa...
