# LA Restaurant Market and Health Data

In [713]:
# Dependencies, grouped: standard library, data stack, scraping, MongoDB.

import os
from urllib.parse import urlencode

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from splinter import Browser

import pymongo
from pymongo import MongoClient

# Evaluate the working directory; the CSV files in this notebook are opened
# with relative paths.
os.getcwd()

# Client built with pymongo's defaults. `client` is reassigned with an explicit
# connection string in the "Load Into Mongo DB" section.
client = MongoClient()


'/Users/sruthi/data-science-bootcamp/GTATL201902DATA3/ETL Project'

In [460]:
# Read the two CSV exports from the working directory: the inspection records
# and the violation records that are later matched to them on `serial_number`.
inspections_data = pd.read_csv(
    filepath_or_buffer='restaurant-and-market-health-inspections.csv'
)
violations_data = pd.read_csv(
    filepath_or_buffer='restaurant-and-market-health-violations.csv'
)

In [745]:
# function to embed violations data into inspections data

def transform_inspection(inspection, violations_df, facilities_df):
    """Build one Mongo-ready document for a single inspection.

    Parameters
    ----------
    inspection : pandas.Series
        One row of the inspections data. Must provide 'serial_number',
        'facility_name' and 'facility_address'.
    violations_df : pandas.DataFrame
        All violations; rows are matched to the inspection on 'serial_number'.
    facilities_df : pandas.DataFrame
        One row per facility with 'facility_name', 'facility_address' and
        'rating' columns.

    Returns
    -------
    dict
        The inspection's own fields plus:
        - 'violations': list of dicts (one per matching violation row),
        - 'violations_count': length of that list,
        - 'rating': only present when the facility is found in
          ``facilities_df`` and its rating is not missing.
    """
    VIOLATIONS_COLUMNS = ['row_id', 'violation_code', 'violation_description', 'violation_status', 'points']

    serial_number = inspection['serial_number']
    violations = violations_df.loc[
        violations_df['serial_number'] == serial_number, VIOLATIONS_COLUMNS
    ].to_dict(orient='records')

    result = inspection.to_dict()
    result['violations'] = violations
    result['violations_count'] = len(violations)

    # Look the rating up for THIS inspection's facility (not for the first row
    # of a global frame), matching on both name and address.
    facility_mask = (
        (facilities_df['facility_name'] == inspection['facility_name'])
        & (facilities_df['facility_address'] == inspection['facility_address'])
    )
    ratings = facilities_df.loc[facility_mask, 'rating']

    # Positional access: the filtered rows keep their original index labels,
    # so a label lookup of 0 would fail for every facility except the first.
    if not ratings.empty:
        rating = ratings.iloc[0]
        # pd.isna also handles None, which np.isnan rejects with a TypeError.
        if not pd.isna(rating):
            result['rating'] = rating
    return result
  

# Scrape Yelp for Ratings

In [None]:
# Shell out to print where chromedriver is installed; the scraping cell below
# hard-codes a chromedriver path for splinter.
!which chromedriver

In [703]:
# Open a visible (headless=False) Chrome session through splinter, driven by
# the chromedriver binary at the path below.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

def get_url(restaurant_name):
    """Return the Yelp search URL for `restaurant_name` in Los Angeles, CA.

    The name and the fixed location are URL-encoded into the `find_desc` and
    `find_loc` query parameters.
    """
    query = urlencode({'find_desc': restaurant_name, 'find_loc': 'Los Angeles,CA'})
    return 'https://www.yelp.com/search?' + query

def is_matching_restaurant(name, address, scraped_name, scraped_address):
    """Decide whether a scraped listing refers to the given facility.

    Comparison is case-insensitive and ignores leading/trailing whitespace.
    A pair matches when either string is a prefix of the other; both the name
    pair and the address pair must match.

    Parameters
    ----------
    name, address : str
        Facility name and address from the inspections data.
    scraped_name, scraped_address : str
        Name and address text taken from a search-result entry.

    Returns
    -------
    bool
        True only when all four values are non-empty and both pairs match.
    """
    name = name.strip().lower()
    address = address.strip().lower()
    scraped_name = scraped_name.strip().lower()
    scraped_address = scraped_address.strip().lower()

    # Every string starts with '', so an empty value on either side would
    # otherwise count as a match against anything.
    if not (name and address and scraped_name and scraped_address):
        return False

    is_name_match = name.startswith(scraped_name) or scraped_name.startswith(name)
    is_address_match = address.startswith(scraped_address) or scraped_address.startswith(address)
    return is_name_match and is_address_match
    
def scrape_rating(html, name, address):
    """Extract the star rating of the matching restaurant from a results page.

    Every <li> in `html` is inspected. For the first one whose
    `h3 > a[name]` and `address > span` text match `name` and `address`
    (per `is_matching_restaurant`), the number in the `aria-label` of its
    `div[role="img"]` is returned.

    Parameters
    ----------
    html : str
        Page source of the search results.
    name, address : str
        Facility name and address to look for.

    Returns
    -------
    float or None
        The parsed rating, or None when no <li> yields a matching, parseable
        entry.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for restaurant_li in soup.find_all('li'):
        try:
            scraped_name = restaurant_li.h3.a['name']
            scraped_address = restaurant_li.address.span.text
            if is_matching_restaurant(name, address, scraped_name, scraped_address):
                rating_label = restaurant_li.find('div', role="img")['aria-label']
                return float(rating_label.replace(' star rating', ''))
        except (AttributeError, KeyError, TypeError, ValueError):
            # Best-effort scan: an <li> lacking the expected tags/attributes
            # (missing child -> None access, absent attribute, unparseable
            # label) is simply not a usable listing, so move on to the next.
            continue
    return None
            
# One row per unique (facility_name, facility_address) pair. The explicit
# .copy() makes the frame independent of inspections_data before a column is
# added to it.
facilities_df = inspections_data[['facility_name', 'facility_address']].drop_duplicates().copy()
# Float column pre-filled with NaN, so facilities without a rating hold NaN
# rather than None (np.isnan cannot handle None).
facilities_df['rating'] = np.nan

# Count loop positions rather than index labels: after drop_duplicates() the
# labels are sparse, so `label % 25` would log at irregular intervals.
for position, (index, facility) in enumerate(facilities_df.iterrows()):
    if position % 25 == 0:
        print('processing record #' + str(position))

    name = facility['facility_name']
    address = facility['facility_address']
    # visit the search page for this facility
    browser.visit(get_url(name))
    # scrape_rating returns None when nothing matches; store that as NaN
    rating = scrape_rating(browser.html, name, address)
    facilities_df.loc[index, 'rating'] = np.nan if rating is None else rating

facilities_df.to_csv('yelp_rating.csv', sep=',')



# Load Into MongoDB

In [704]:
# Connect to the MongoDB server named by the connection string; this rebinds
# the `client` created in the imports cell.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [705]:
# Handles for the target database and the collection that receives one
# document per inspection.
db = client['restaurant_market_health_db']
collection = db['inspection_summary']

In [747]:
# Convert each inspection row into a document (with its violations embedded
# by transform_inspection) and insert it into the collection, one at a time.
for _, inspection_row in inspections_data.iterrows():
    document = transform_inspection(inspection_row, violations_data, facilities_df)
    collection.insert_one(document)

In [695]:
# NOTE(review): repeats the browser setup from the scraping section and
# rebinds `browser` to a new Chrome session. Its execution count (695) is
# lower than the scraping cell's (703), so it looks like a leftover cell;
# confirm whether it is still needed.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [749]:
# Write the facility ratings to disk. The scraping cell ends with the same
# call, so this overwrites that file with the current contents of facilities_df.
facilities_df.to_csv(path_or_buf='yelp_rating.csv', sep=',')