## Scrape Yellow Pages to extract information about pizzerias in San Francisco

### Imports

In [None]:
import json
import re
import time
from urllib.parse import quote

import pymongo
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

### Generic Functions

In [None]:
def getdriver():
    """Start a Chrome WebDriver session with this notebook's timeout settings.

    The driver binary is looked up under the relative path
    ``chromedriver.exe``. The session is configured with an implicit wait
    of 10, a script timeout of 120 and a page-load timeout of 30 (seconds,
    per the Selenium API).

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome = webdriver.Chrome(executable_path=r'chromedriver.exe')
    chrome.implicitly_wait(10)
    chrome.set_script_timeout(120)
    chrome.set_page_load_timeout(30)
    return chrome

In [None]:
def writePage(fname, content):
    """Write ``str(content)`` to ``fname`` as UTF-8 text.

    Args:
        fname: Path of the file to create; an existing file is overwritten.
        content: Any object; its ``str()`` form is what gets written.
    """
    with open(fname, mode="w", encoding="utf-8") as out:
        out.write(str(content))

In [None]:
def read_file (name):
    """Parse a saved HTML file into a BeautifulSoup tree.

    Args:
        name: Path of the HTML file to read.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Read raw bytes and let BeautifulSoup handle decoding; the context
    # manager guarantees the handle is closed even if reading fails.
    with open(name, "rb") as html_file:
        htmlfiledata = html_file.read()
    return BeautifulSoup(htmlfiledata, 'lxml')

In [None]:
def connect_mongodb(db_name, collection_name):
    """Return a handle to ``collection_name`` inside database ``db_name``.

    A new ``pymongo.MongoClient`` for ``mongodb://localhost:27017`` is
    created on every call.

    Args:
        db_name: Name of the MongoDB database.
        collection_name: Name of the collection within that database.

    Returns:
        The collection object obtained from the client.
    """
    mongo = pymongo.MongoClient("mongodb://localhost:27017")
    return mongo[db_name][collection_name]

In [None]:
def loadWebsiteData (url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object (``lxml`` parser) for the response text,
        or ``None`` when the request itself fails. The HTTP status code is
        not checked, so error pages are parsed and returned as well.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Only the network call is guarded: a parser problem is not a
    # connection error and should surface as its own exception.
    try:
        page = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        print("Error connecting to website")
        return None
    # Create a beautifulsoup object
    return BeautifulSoup(page.text, 'lxml')

In [None]:
def updateData(sr, new_val):
    """Apply ``new_val`` to every shop document with the given search rank.

    Args:
        sr: Value matched against each document's ``"search rank"`` field.
        new_val: Dict of fields to write; it is passed as the ``$set``
            operand.
    """
    shops = connect_mongodb("pizzeria", "sf_pizzerias")
    shops.update_many({"search rank": sr}, {"$set": new_val})

### Task 1 
Search on yellowpages.com for the top 30 “Pizzeria” in San Francisco. Save the search result page to disk, “sf_pizzeria_search_page.htm”.

In [None]:
def q4():
    """Fetch the Yellow Pages pizzeria search for San Francisco and save it.

    The parsed page is written to ``sf_pizzeria_search_page.htm`` in the
    current working directory.
    """
    search_url = "https://www.yellowpages.com/search?search_terms=pizzeria&geo_location_terms=San%20Francisco%2C%20CA"
    writePage("sf_pizzeria_search_page.htm", loadWebsiteData(search_url))

In [None]:
# Download the search result page and save it to disk.
q4()

### Task 2
Open the search result page saved in Task 1 and parse out all shop information (search rank, name, linked URL [this store’s YP URL], star rating if it exists (IIE), number of reviews IIE, TripAdvisor rating IIE, number of TA reviews IIE, “$” signs IIE, years in business IIE, review IIE, and amenities IIE).  Be sure to skip all “Ad” results.

In [None]:
def q5():
    """Parse the saved search page into one dict per organic search result.

    Reads ``sf_pizzeria_search_page.htm`` and visits every ``div.result``
    that is a direct child of ``div.search-results.organic``. Each resulting
    dict is printed as it is built.

    Returns:
        list[dict]: One dict per result with the keys ``"search rank"``
        (digits as a string), ``"name"``, ``"url"``, ``"star rating"`` and
        ``"ta rating"`` (each a dict with ``"rating"`` and
        ``"review count"``), ``"price"``, ``"years in business"``,
        ``"reviews"`` and ``"amenities"`` (a list). Optional values that are
        absent from the page are ``None`` (or an empty list for amenities).

    Raises:
        IndexError: If a result has no ``h2.n`` heading or heading link.
        AttributeError: If a result's heading text contains no digits.
    """
    search_soup = read_file("sf_pizzeria_search_page.htm")
    results = search_soup.select("div.search-results.organic > div.result")
    base_url = "https://www.yellowpages.com"

    pizzeria_docs = []

    for result in results:
        # The rank is kept as a string of digits taken from the heading text.
        heading = result.find_all("h2", {"class": "n"})[0]
        search_rank = re.search(r"\d+", heading.text).group()

        link = result.select("h2.n > a")[0]
        name = link.text
        url = base_url + link['href']

        # NOTE(review): a multi-word class string matches only when the
        # class attribute is exactly "rating hasExtraRating"; results whose
        # link carries a different class list are presumably left without a
        # star rating -- confirm against the saved page.
        star = result.find("a", {"class": "rating hasExtraRating"})
        star_rating = None
        star_review = None
        if star is not None:
            # Second CSS class of the first <div> inside the link;
            # presumably the rating word -- confirm against the markup.
            star_rating = star.find("div")['class'][1]
            star_review = star.find("span").text

        ta_rating = None
        ta_reviews = None
        ratings = result.find("div", {"class": "ratings"})
        if ratings is not None:
            # A missing attribute (KeyError) or a value that is not valid
            # JSON (ValueError/TypeError) simply means no TripAdvisor data.
            try:
                ta = json.loads(ratings["data-tripadvisor"])
            except (KeyError, TypeError, ValueError):
                ta = None
            if ta is not None:
                ta_rating = ta.get('rating')
                ta_reviews = ta.get('count')

        price_div = result.find("div", {"class": "price-range"})
        price = price_div.text if price_div is not None else None

        years = None
        years_in_business = result.find("div", {"class": "years-in-business"})
        if years_in_business is not None:
            years = years_in_business.find_all("div", {"class": "number"})[0].text

        review_html = result.find("p", {"class": "body with-avatar"})
        review = review_html.text if review_html is not None else None

        amenities = result.find("div", {"class": "amenities-info"})
        amenities_list = []
        if amenities is not None:
            amenities_list = [span.text for span in amenities.find_all("span")]

        pizzeria = {"search rank": search_rank,
                    "name": name,
                    "url": url,
                    "star rating": {"rating": star_rating, "review count": star_review},
                    "ta rating": {"rating": ta_rating, "review count": ta_reviews},
                    "price": price,
                    "years in business": years,
                    "reviews": review,
                    "amenities": amenities_list}
        pizzeria_docs.append(pizzeria)
        print(pizzeria)
        print()
    return pizzeria_docs

In [None]:
# Parse the saved search page; later cells read this module-level list.
pizzeria_docs = q5()

### Task 3
Create a MongoDB collection called “sf_pizzerias” that stores all the extracted shop information, one document for each shop.

In [None]:
# Store one document per parsed shop in the pizzeria.sf_pizzerias collection.
# NOTE(review): nothing here clears the collection first, so re-running this
# cell presumably duplicates or rejects the documents -- confirm.
sf_pizzerias = connect_mongodb('pizzeria','sf_pizzerias')
sf_pizzerias.insert_many(pizzeria_docs)

### Task 4
Read all URLs stored in “sf_pizzerias” and download each shop page.  Store the page to disk, “sf_pizzerias_[SR].htm” (replace [SR] with the search rank).

In [None]:
def q7():
    """Download every shop page and save it as ``sf_pizzerias_[SR].htm``.

    Iterates the module-level ``pizzeria_docs`` list (not the MongoDB
    collection), pausing two seconds before each download and two seconds
    before each write.
    """
    for shop in pizzeria_docs:
        page_url, rank = shop['url'], shop['search rank']
        time.sleep(2)
        page = loadWebsiteData(page_url)
        target = f"sf_pizzerias_{rank}.htm"
        time.sleep(2)
        writePage(target, page)

In [None]:
# Download and save the individual shop pages.
q7()

### Task 5
Read the 30 shop pages saved in Task 4 and parse each shop’s address, phone number, and website.
Use the positionstack API (https://positionstack.com/) to query each shop address’s geolocation (longitude, latitude) and update each shop document in the MongoDB collection “sf_pizzerias” to contain the shop’s address, phone number, website, and geolocation.

In [None]:
def q9(address):
    """Look up the coordinates of ``address`` via the positionstack API.

    Args:
        address: Free-form address text; it is percent-encoded before being
            placed in the query string.

    Returns:
        dict: ``{"latitude": ..., "longitude": ...}`` taken from the first
        entry of the response's ``data`` list. Either value is the string
        ``"Not found"`` when the request fails, the response cannot be
        decoded, or no usable first entry is present.
    """
    not_found = {"latitude": "Not found", "longitude": "Not found"}

    # NOTE(review): the access key is hard-coded in the source; it should be
    # loaded from configuration/environment and rotated.
    # Percent-encode the address so characters such as '#', '&' or '+'
    # cannot truncate or alter the query string.
    url = ("http://api.positionstack.com/v1/forward?access_key=b16e55d0611eb7e98ed44d6e8331f40b&query="
           + quote(address, safe="") + "&output=json")

    data = loadWebsiteData(url)
    if data is None:
        # loadWebsiteData returns None when the request fails.
        return not_found

    # The JSON body is read back from the first <p> of the parsed document
    # (presumably where the lxml parser places bare text -- confirm).
    paragraphs = data.select("p")
    if not paragraphs:
        return not_found
    try:
        datajson = json.loads(paragraphs[0].text)
    except ValueError:
        return not_found

    matches = datajson.get('data') if isinstance(datajson, dict) else None
    # An empty result list or a non-dict first entry means no geocoding hit.
    if not isinstance(matches, list) or not matches or not isinstance(matches[0], dict):
        return not_found

    best = matches[0]
    return {"latitude": best.get('latitude', "Not found"),
            "longitude": best.get('longitude', "Not found")}

In [None]:
def q8and9():
    """Parse the saved shop pages 1-30 and update each MongoDB document.

    For each ``sf_pizzerias_<rank>.htm`` file this extracts the address,
    phone number and website link, geocodes the address with ``q9`` when one
    is present, prints the result, and stores it through ``updateData``
    using the rank as a string. Anything absent from the page is recorded as
    ``"Info Missing"``.
    """
    missing = "Info Missing"
    for rank in range(1, 31):
        fname = "sf_pizzerias_" + str(rank) + ".htm"
        soup = read_file(fname)
        print(fname)

        address = missing
        geolocation = missing
        address_spans = soup.select("span.address")
        if address_spans:
            address = address_spans[0].text
            geolocation = q9(address)

        phone_tags = soup.select("a.phone.dockable > strong")
        ph_num = phone_tags[0].text if phone_tags else missing

        site_links = soup.select("a.website-link.dockable")
        website = site_links[0]['href'] if site_links else missing

        extra_info = {
            "address": address,
            "geolocation": geolocation,
            "phone_number": ph_num,
            "website": website,
        }
        print(extra_info)
        print()
        updateData(str(rank), extra_info)

In [None]:
# Parse the saved shop pages, geocode the addresses and update MongoDB.
q8and9()