## Extract 
#### Information extracted from yellowpages.com to include:
- Restaurant name <br>
- Restaurant phone number <br>
- Restaurant website <br>
- Restaurant type (category) <br>

In [13]:
#import dependencies
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import requests
import json
from requests.utils import requote_uri
from sqlalchemy import create_engine
import re

In [14]:
# initialize browser 
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389






[WDM] - Driver [/Users/shondeenhechter/.wdm/drivers/chromedriver/mac64/89.0.4389.23/chromedriver] found in cache


In [15]:
search_url = 'https://www.yellowpages.com/search?search_terms=restaurants&geo_location_terms=San+Antonio%2C+TX'
browser.visit(search_url)

In [16]:
# Scrape page into soup
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

In [17]:
# Getting the restaurant info from website and create lists to append information to
rest_info = soup.find_all('div', class_='info')
restaurant = []
phone =[]
website =[]
category =[]
for info in rest_info:
    try:
        restaurant.append(info.find('a', class_="business-name").text)
        phone.append(info.find('div', class_="phone").text)    
        website.append(info.find('a', class_="track-visit-website")['href'])
        category.append(info.find('div', class_="categories").text)       
    except TypeError:
        website.append(float('NAN'))
        category.append(float('NAN'))

In [18]:
# Load information into dataframe
restaurant_df  = pd.DataFrame ( { 'Restaurant_name': restaurant,
                               'Phone_number': phone,
                               'Website': website,
                                'Restaurant_type': category
                             })
restaurant_df

Unnamed: 0,Restaurant_name,Phone_number,Website,Restaurant_type
0,Jacala Mexican Restaurant,(210) 591-2039,,
1,Marie Callender's,(210) 680-4257,http://www.mariecallenders.com,RestaurantsAmerican RestaurantsBakeries
2,Fujiya Japanese Garden,(210) 966-1930,http://www.fujiyajapanesegardens.com,RestaurantsTake Out RestaurantsJapanese Restau...
3,Golden Wok,(210) 615-8282,,
4,Scuzzi's Italian Restaurant,(210) 742-8677,http://scuzzisitalianrestaurant.com,RestaurantsBarsCocktail Lounges
5,Golden Wok,(210) 674-2577,,
6,India Taj Palace Indian Restaurant,(210) 497-4800,http://www.indiatajpalace.com,RestaurantsTake Out RestaurantsIndian Restaurants
7,Jingu House at The Japenese Tea Garden,(210) 735-4647,http://jinguhousesa.com,RestaurantsParty PlanningMexican Restaurants
8,Chama Gaucha Brazilian Steakhouse,(210) 564-9400,,
9,Pappadeaux Seafood Kitchen,(210) 340-7143,https://pappadeaux.com,RestaurantsSeafood Restaurants


## Transform
#### Data is cleaned to remove:
- "Restaurants" from the Restaurant type column <br>
- "NaN" values <br>
- Duplicates <br>
- _"id" column added and set as index_

In [19]:
# Remove "Restaurants" tag from Restaurant_type tags
restaurant_df['Restaurant_type'] = restaurant_df.Restaurant_type.str.replace('Restaurants',"")
restaurant_df

Unnamed: 0,Restaurant_name,Phone_number,Website,Restaurant_type
0,Jacala Mexican Restaurant,(210) 591-2039,,
1,Marie Callender's,(210) 680-4257,http://www.mariecallenders.com,American Bakeries
2,Fujiya Japanese Garden,(210) 966-1930,http://www.fujiyajapanesegardens.com,Take Out Japanese
3,Golden Wok,(210) 615-8282,,
4,Scuzzi's Italian Restaurant,(210) 742-8677,http://scuzzisitalianrestaurant.com,BarsCocktail Lounges
5,Golden Wok,(210) 674-2577,,
6,India Taj Palace Indian Restaurant,(210) 497-4800,http://www.indiatajpalace.com,Take Out Indian
7,Jingu House at The Japenese Tea Garden,(210) 735-4647,http://jinguhousesa.com,Party PlanningMexican
8,Chama Gaucha Brazilian Steakhouse,(210) 564-9400,,
9,Pappadeaux Seafood Kitchen,(210) 340-7143,https://pappadeaux.com,Seafood


In [20]:
# Remove NaN values
restaurant_clean = restaurant_df.dropna(how = 'any')

In [21]:
# Dropping any duplicates
restaurant_clean = restaurant_clean.drop_duplicates(ignore_index=True)

In [24]:
# Load all clean information into dataframe, add "id" column and rearrange columns
restaurant_clean_df = restaurant_clean.copy()
restaurant_clean_df['id'] = restaurant_clean.index
restaurant_info = restaurant_clean_df [['id', 'Restaurant_name', 'Website', 'Phone_number', 'Restaurant_type']]
restaurant_info

Unnamed: 0,id,Restaurant_name,Website,Phone_number,Restaurant_type
0,0,Marie Callender's,http://www.mariecallenders.com,(210) 680-4257,American Bakeries
1,1,Fujiya Japanese Garden,http://www.fujiyajapanesegardens.com,(210) 966-1930,Take Out Japanese
2,2,Scuzzi's Italian Restaurant,http://scuzzisitalianrestaurant.com,(210) 742-8677,BarsCocktail Lounges
3,3,India Taj Palace Indian Restaurant,http://www.indiatajpalace.com,(210) 497-4800,Take Out Indian
4,4,Jingu House at The Japenese Tea Garden,http://jinguhousesa.com,(210) 735-4647,Party PlanningMexican
5,5,Pappadeaux Seafood Kitchen,https://pappadeaux.com,(210) 340-7143,Seafood
6,6,Bourbon Street Seafood Kitchen,http://www.bourbonstreetseafoodkitchen.com,(210) 545-0666,Seafood American
7,7,Texas Roadhouse,http://www.texasroadhouse.com,(210) 521-2988,Barbecue American
8,8,Willie's Grill & Icehouse,http://www.williesrestaurants.com,(210) 490-9220,American Bar & Grills
9,9,The Cove,http://www.thecove.us,(210) 227-2683,American Sandwich Shops


## Extraction #2
#### Information extracted from 2nd data source, google apis, to include:
- Address of restaurant <br>
- Restaurant latitude <br>
- Restaurant longitude <br>

In [25]:
# Import dependencies
import requests
import json
from requests.utils import requote_uri
from api_keys import g_key

In [13]:
# Extract location using Restaurant names extracted from Yellow Pages
restaurant_list = restaurant_clean_df['Restaurant_name'].tolist()
len(restaurant_list)

26

In [14]:
# Create lists
lat =[]
lng= []
address =[]


# Build URL using the Google Maps API
base_url = "https://maps.googleapis.com/maps/api/geocode/json"

In [15]:
# Create loop to iterate through restaurant list and get lat,lng and address info
for restaurant in restaurant_list:
    rest = f"{restaurant}, San Antonio, TX"
    params = {"address": rest, "key": g_key}
# Run request   
    response = requests.get(base_url, params=params)
    rest_go = response.json()
    lat.append(rest_go["results"][0]["geometry"]["location"]["lat"])
    lng.append(rest_go["results"][0]["geometry"]["location"]["lng"])
    address.append(rest_go["results"][0]["formatted_address"])
print(len(lat) )
print(len(lng) )
print(len(address) )

26
26
26


In [48]:
# Load information into dataframe
location_df = pd.DataFrame ({ 'Lat': lat,
                              'Lng': lng,
                               'Address': address
                            
                        })
location_df

Unnamed: 0,Lat,Lng,Address
0,29.488611,-98.5775,"4788 Northwest Loop 410, San Antonio, TX 78229..."
1,29.525875,-98.566506,"9030 Wurzbach Rd, San Antonio, TX 78240, USA"
2,29.666083,-98.633976,"24165 I-10 W, Unit 433, San Antonio, TX 78257,..."
3,29.627662,-98.493681,"20323 Huebner Rd, San Antonio, TX 78258, USA"
4,29.460833,-98.476944,"3853 N St Mary's St, San Antonio, TX 78212, USA"
5,29.519624,-98.488148,"76 NE Interstate 410 Loop, San Antonio, TX 782..."
6,29.603759,-98.444055,"2815 N Loop 1604 E #106, San Antonio, TX 78232..."
7,29.463339,-98.6203,"2893 Cinema Ridge, San Antonio, TX 78238, USA"
8,29.586405,-98.475878,"15801 San Pedro Ave, San Antonio, TX 78232, USA"
9,29.440464,-98.500532,"606 W Cypress St, San Antonio, TX 78212, USA"


## Transform #2
#### Data is cleaned to:
- Add "id" column <br>
- Rearrange columns 

In [56]:
# 'id' column added and set as index 
location_df['id'] = location_df.index

restaurant_location = location_df[['id','Address', 'Lat', 'Lng']]
restaurant_location

Unnamed: 0,id,Address,Lat,Lng
0,0,"4788 Northwest Loop 410, San Antonio, TX 78229...",29.488611,-98.5775
1,1,"9030 Wurzbach Rd, San Antonio, TX 78240, USA",29.525875,-98.566506
2,2,"24165 I-10 W, Unit 433, San Antonio, TX 78257,...",29.666083,-98.633976
3,3,"20323 Huebner Rd, San Antonio, TX 78258, USA",29.627662,-98.493681
4,4,"3853 N St Mary's St, San Antonio, TX 78212, USA",29.460833,-98.476944
5,5,"76 NE Interstate 410 Loop, San Antonio, TX 782...",29.519624,-98.488148
6,6,"2815 N Loop 1604 E #106, San Antonio, TX 78232...",29.603759,-98.444055
7,7,"2893 Cinema Ridge, San Antonio, TX 78238, USA",29.463339,-98.6203
8,8,"15801 San Pedro Ave, San Antonio, TX 78232, USA",29.586405,-98.475878
9,9,"606 W Cypress St, San Antonio, TX 78212, USA",29.440464,-98.500532


In [63]:
# Data Load
rds_connection_string = "postgres:postgres@localhost:5432/restaurants_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [64]:
# Confirm tables
engine.table_names()

['restaurant_info', 'restaurant_location']

In [65]:
# Dataframe loaded into database
restaurant_info.to_sql(name='restaurant_info', con=engine, if_exists='replace', index=False)

In [66]:
# Dataframe loaded into database
restaurant_location.to_sql(name='restaurant_location', con=engine, if_exists='replace', index=False)