<a href="https://colab.research.google.com/github/prasetyofmalik/google-colab/blob/main/WebScraping-BeautifulSoup/Project_3_Advanced_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse

### Putting it all together

In [3]:
# Scrape the first three Yellow Pages result pages for "restaurant" in New York, NY,
# follow each listing to its detail page, and collect the contact fields into
# `df_restaurant` (one row per listing, missing fields recorded as 'n/a').

# column order of the resulting dataframe
COLUMNS = ['Restaurant Name', 'Address', 'Phone', 'Email', 'Website', 'Info']

# base used to turn the relative listing links into absolute urls
BASE_URL = 'https://www.yellowpages.com/'

# search url without the page number, which is appended per iteration
SEARCH_URL = 'https://www.yellowpages.com/search?search_terms=restaurant&geo_location_terms=New%20York%2C%20NY&page='

# seconds to wait for a response; without a timeout a stalled connection hangs the cell forever
REQUEST_TIMEOUT = 30


def extract_or_na(getter):
  """Return the value produced by `getter()`, or 'n/a' when the field is missing.

  `soup.find(...)` returns None when the element is absent (AttributeError on the
  following method call) and `split(...)[1]` fails with IndexError when the
  expected prefix is not in the text. Only those two cases are treated as
  "field not available"; anything else is allowed to surface.
  """
  try:
    return getter()
  except (AttributeError, IndexError):
    return 'n/a'


# One dict per restaurant. The dataframe is built once at the end because
# DataFrame.append was removed in pandas 2.0 and appending row by row is quadratic.
restaurant_rows = []

# sequence from 1 to 3, 4 is not included
for page_number in range(1, 4):
  response = requests.get(SEARCH_URL + str(page_number), timeout=REQUEST_TIMEOUT)

  # create soup object for the search result page
  soup = BeautifulSoup(response.content, 'html.parser')

  # result container
  result_container = soup.find_all('div', {'class':'result'})

  # absolute url of every business link found in the results
  detail_urls = [
    urllib.parse.urljoin(BASE_URL, link.get('href'))
    for item in result_container
    for link in item.find_all('a', {'class':'business-name'})
  ]

  # loop through all detail pages
  for detail_url in detail_urls:
    response = requests.get(detail_url, timeout=REQUEST_TIMEOUT)

    # create soup object for the detail page
    soup = BeautifulSoup(response.content, 'html.parser')

    # name
    name = extract_or_na(lambda: soup.find('h1').get_text())

    # address
    address = extract_or_na(lambda: soup.find('span', {'class':'address'}).get_text())

    # phone (text after the 'Phone:  ' label)
    phone = extract_or_na(lambda: soup.find('p', {'class':'phone'}).get_text().split('Phone:  ')[1])

    # email (href without the 'mailto:' scheme)
    email = extract_or_na(lambda: soup.find('a', {'class':'email-business'}).get('href').split('mailto:')[1])

    # website
    website = extract_or_na(lambda: soup.find('a',{'class':'website-link'}).get('href'))

    # general info
    info = extract_or_na(lambda: soup.find('dd', {'class':'general-info'}).get_text())

    restaurant_rows.append({'Restaurant Name': name, 'Address': address, 'Phone': phone,
                            'Email': email, 'Website': website, 'Info': info})

# pandas dataframe; passing `columns` keeps the header even when nothing was scraped
df_restaurant = pd.DataFrame(restaurant_rows, columns=COLUMNS)


### Output Pandas Dataframe

In [6]:
# Display the scraped restaurants; as the cell's last expression the dataframe is rendered as a table.
df_restaurant

Unnamed: 0,Restaurant Name,Address,Phone,Email,Website,Info
0,Mr. K's,"570 Lexington AveNew York, NY 10022",(212) 583-1668,info@movingmaninc.com,http://www.mrksny.com,We offer exclusive seating to you and your par...
1,Seven's Turkish Grill,"158 W 72nd StNew York, NY 10023",(212) 724-4700,,,Here at Seven's Mediterranean Turkish Grill we...
2,Todaro Brothers,"555 2nd AveNew York, NY 10016",(212) 532-0633,eat@todarobros.com,http://www.todarobros.com,NYC's Neighborhood Market since 1917. Todaro ...
3,Babbo,"110 Waverly PlNew York, NY 10011",(212) 777-0303,hospitality@babbonyc.com,http://www.babbonyc.com,Babbo Ristorante e Enoteca is an exuberant cel...
4,Sparks Steak House,"210 E 46th StNew York, NY 10017",(212) 687-4855,office@sparkssteakhouse.com,http://www.sparkssteakhouse.com,"Established in 1966, Sparks Steak House featur..."
...,...,...,...,...,...,...
85,Rafele,"29 7th Ave SNew York, NY 10014",(212) 242-1999,,http://www.rafele.com,
86,TGI Fridays,"34 Union Sq ENew York, NY 10003",(646) 556-6381,,http://www.tgifridays.com,
87,Cosi,"60 E 56th StNew York, NY 10022",(212) 588-1225,contactus@getcosi.com,http://www.getcosi.com,
88,Barfly Sports Bar & Restaurant,"244 3rd AveNew York, NY 10010",(212) 473-9660,,http://www.barflyny.com,


### Store in Excel

In [5]:
# Write the scraped restaurants to an Excel workbook in the working directory,
# leaving out the numeric row index.
df_restaurant.to_excel(
  excel_writer='results_multiple_pages.xlsx',
  index=False,
)