### Import the Libraries

In [3]:
from splinter import Browser
from bs4 import BeautifulSoup
import os
import pandas as pd
from datetime import datetime
import platform
import matplotlib.pyplot as plt

### Create five functions to scrap data 
- def init_browser(): Open a new Chrome window (For Window user, you need to download a file named 'chromedriver.exe')

- def get_html(browser, url): Get the html of url through new Chrome window
- get_data(html): Scrap the data in html (Find the \<tag> divs or spans, a, p, address with class names) and store them into dictionary
- def scrape(browser, url): Call two functions get_html(browser, url) and get_data(html)

- def main(): Call two functions init_browser() and scrape(browser, url) and return the dictionary that contain all dataset from url

In [230]:
data = {} 

def init_browser():
    if platform.system().lower() == 'windows'.lower():
        executable_path = {
            'executable_path': 
            os.path.join(os.getcwd(), 'chromedriver.exe')}
        return Browser('chrome', **executable_path, headless=False)
    else:
        return Browser('chrome')
    
def get_html(browser, url):
    browser.visit(url)
    html = browser.html
    return html

def get_data(html):   
    soup = BeautifulSoup(html, "html.parser")
    ol_lists = soup.find('div', class_='lemon--div__373c0__1mboc mapColumnTransition__373c0__10KHB arrange-unit__373c0__1piwO arrange-unit-fill__373c0__17z0h border-color--default__373c0__2oFDT').find('div', class_='lemon--div__373c0__1mboc border-color--default__373c0__2oFDT').find('ul', class_='lemon--ul__373c0__1_cxs undefined list__373c0__2G8oH')
    
    ranks = ol_lists.find_all('p', class_='lemon--p__373c0__3Qnnj text__373c0__2pB8f text-color--black-regular__373c0__38bRH text-align--left__373c0__2pnx_ text-size--inherit__373c0__2gFQ3')
    title = ol_lists.find_all('a', class_='lemon--a__373c0__IEZFH link__373c0__29943 link-color--blue-dark__373c0__1mhJo link-size--inherit__373c0__2JXk5')
    target_bf_rating = ol_lists.find_all('div', class_='lemon--div__373c0__1mboc attribute__373c0__1hPI_ display--inline-block__373c0__2de_K u-space-r1 border-color--default__373c0__2oFDT')
    reviews = ol_lists.find_all('span', class_='lemon--span__373c0__3997G text__373c0__2pB8f reviewCount__373c0__2r4xT text-color--mid__373c0__3G312 text-align--left__373c0__2pnx_') 
    infos = ol_lists.find_all('div', class_='lemon--div__373c0__1mboc mainAttributes__373c0__1r0QA arrange-unit__373c0__1piwO arrange-unit-fill__373c0__17z0h border-color--default__373c0__2oFDT')
    addresses = ol_lists.find_all('address', class_='lemon--address__373c0__2sPac')
    time = datetime.now()

    if len(ranks) > 30 or len(ranks) < 33:
        diff = len(ranks) - 30
        ranks = ranks[diff:]
        target_bf_rating = target_bf_rating[diff:]
        infos = infos[diff:]
    if len(ranks) == 33:
        ranks = ranks[2:-1]
        target_bf_rating = target_bf_rating[2:-1]
        infos = infos[2:-1]       
        
    for i in range(len(ranks)):
        rank = [p.text.split('.')[0] for p in ranks] 
        title = [a.a.text for a in ranks]
        rating = [a.span.div for a in target_bf_rating if a.span]
        num_review = [a.text.split()[0]for a in reviews]
        price = [a.div.div.find_next_sibling('div').find_next_sibling('div') for a in infos]
        types = [b.find_all('a', class_='lemon--a__373c0__IEZFH link__373c0__29943 link-color--inherit__373c0__15ymx link-size--default__373c0__1skgq') for b in price]

        address = [a.div.div.p.span.text for a in addresses]
        neighbourhood = [a.find_next_sibling('div') for a in addresses]
        
        data[rank[i]] = {'title': title[i], 
                         'rating': rating[i]['aria-label'].split(' ')[0], 
                         'num_review': num_review[i],
                         'price': ''.join([a for a in price[i].div.div.span.span.text if a is '$' or a is '$$' or a is '$$$' or a is '$$$$']),
                         'types': [b.text for b in types[i]],
                         'address': address[i],
                         'neighbourhood': ''.join([a.div.div.p.text for a in neighbourhood[i]]),
                         'time': time}        
    return data

def scrape(browser, url):
    html = get_html(browser, url)
    datas = get_data(html)
    return len(datas), datas

def main():
    browser = init_browser()
    url = "https://www.yelp.com/search?cflt=restaurants&find_loc=New+York%2C+NY"
#     url = "https://www.yelp.com/search?cflt=restaurants&find_loc=San+Francisco%2C+CA"
#     url = "https://www.yelp.com/search?cflt=restaurants&find_loc=Chicago%2C+IL"
    datas = scrape(browser, url)
    return datas

main()
df = pd.DataFrame(data).T
df.head()

Unnamed: 0,title,rating,num_review,price,types,address,neighbourhood,time
1,Amélie,4.5,1,$$,"[French, Wine Bars]",22 W 8th St,Greenwich Village,2019-10-25 21:20:23.057085
2,Upstate,4.5,2593,$$,"[Seafood, Wine Bars, Beer Bar]",95 1st Ave,East Village,2019-10-25 21:20:23.057085
3,LoveMama,4.5,1796,$$,"[Thai, Malaysian, Vietnamese]",174 2nd Ave,East Village,2019-10-25 21:20:23.057085
4,Burger & Lobster,4.0,4647,$$,"[Seafood, Burgers, American (New)]",39 W 19th St,Flatiron,2019-10-25 21:20:23.057085
5,Thai Villa,4.5,5298,$$,"[Thai, Asian Fusion]",5 E 19th St,Flatiron,2019-10-25 21:20:23.057085


### Make the multiple columns with string value from a column with list value
- [types] -> 'type_1', 'type_2', 'type_3'
- Drop the column [types]
- Make a ranking column from index

In [231]:
df2 = pd.DataFrame(df['types'].values.tolist())
df = df.assign(**{'type_1': df2[0].values, 'type_2': df2[1].values, 'type_3': df2[2].values})
df = df.drop(['types'], axis=1)
df = df.reset_index()
df = df.rename(columns = {'index':'ranking'})
df.head()

Unnamed: 0,ranking,title,rating,num_review,price,address,neighbourhood,time,type_1,type_2,type_3
0,1,Amélie,4.5,1,$$,22 W 8th St,Greenwich Village,2019-10-25 21:20:23.057085,French,Wine Bars,
1,2,Upstate,4.5,2593,$$,95 1st Ave,East Village,2019-10-25 21:20:23.057085,Seafood,Wine Bars,Beer Bar
2,3,LoveMama,4.5,1796,$$,174 2nd Ave,East Village,2019-10-25 21:20:23.057085,Thai,Malaysian,Vietnamese
3,4,Burger & Lobster,4.0,4647,$$,39 W 19th St,Flatiron,2019-10-25 21:20:23.057085,Seafood,Burgers,American (New)
4,5,Thai Villa,4.5,5298,$$,5 E 19th St,Flatiron,2019-10-25 21:20:23.057085,Thai,Asian Fusion,


### Check data types of the data frame

In [232]:
df.dtypes

ranking                  object
title                    object
rating                   object
num_review               object
price                    object
address                  object
neighbourhood            object
time             datetime64[ns]
type_1                   object
type_2                   object
type_3                   object
dtype: object

### Change numuric values to be a int or float

In [233]:
df['ranking'] = df['ranking'].astype(int)
df['rating'] = df['rating'].astype(float)
df['num_review'] = df['num_review'].astype(int)

### Change the price values to be a numeric value

In [234]:
for i in range(len(df['price'])):
    if df['price'][i] == '$':
        df['price'][i] = 5
    elif df['price'][i] == '$$':
        df['price'][i] = 55
    elif df['price'][i] == '$$$':
        df['price'][i] = 550
    elif df['price'][i] == '$$$$':
        df['price'][i] = 5500
    elif df['price'][i] == '':
        df['price'][i] = 0

df['price'] = df['price'].astype(int)
print(df)

    ranking                       title  rating  num_review  price  \
0         1                      Amélie     4.5           1     55   
1         2                     Upstate     4.5        2593     55   
2         3                    LoveMama     4.5        1796     55   
3         4            Burger & Lobster     4.0        4647     55   
4         5                  Thai Villa     4.5        5298     55   
5         6          Ipanema Restaurant     4.0        2947     55   
6         7                 Aunt Jake’s     4.0        1307     55   
7         8                Uglyduckling     4.0        1371     55   
8         9  Paesano of Mulberry Street     4.0         463     55   
9        10                  Chama Mama     4.5         805     55   
10       11             Salt + Charcoal     4.0         159     55   
11       12             Juliana’s Pizza     4.5         421     55   
12       13      Boucherie West Village     4.5        2099    550   
13       14         

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Check data types of the data frame

In [235]:
df.dtypes

ranking                   int64
title                    object
rating                  float64
num_review                int64
price                     int64
address                  object
neighbourhood            object
time             datetime64[ns]
type_1                   object
type_2                   object
type_3                   object
dtype: object

### Store three different dataset to CSV files

In [236]:
 export_csv = df.to_csv ('Dataset/Yelp_Ranking_NY.csv', index = None, header=True)
# export_csv = df.to_csv ('Dataset/Yelp_Ranking_SF.csv', index = None, header=True)
# export_csv = df.to_csv ('Dataset/Yelp_Ranking_CHI.csv', index = None, header=True)
export_csv

### Check the data frame group by neighbourhood adding counts

In [237]:
neighbour_df = df.groupby(['neighbourhood']).agg({'neighbourhood': {'neighbourhood_name':'first', 'neighbourhood_count':'count'}, 'ranking':{'ranking':'mean'}, 'rating':{'rating':'mean'},'num_review':{'num_review':'mean'}, 'price':{'price':'mean'}})
neighbour_df.columns = [col[1] for col in neighbour_df.columns]
neighbour_df.head()

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,neighbourhood_name,neighbourhood_count,ranking,rating,num_review,price
neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chelsea,Chelsea,1,10.0,4.5,805.0,55
Chinatown,Chinatown,1,18.0,4.0,1563.0,55
Clinton Hill,Clinton Hill,1,28.0,4.0,729.0,55
Cobble Hill,Cobble Hill,1,8.0,4.0,1371.0,55
DUMBO,DUMBO,2,14.0,4.0,1097.5,55
