In [26]:
#Scrape data using the yelp api
#!pip install yelpapi

from yelpapi import YelpAPI
import pandas as pd
import os 
import pickle
import time

In [33]:
def Yelp_ScrapeISP(api_key, city_names, business_data, business_reviews):
    """
	====================================================================
	Version: 1.0.0
	Date: Tue 24 Nov 2020
	Purposes: Search and save yelp data about internet service providers
	within a region. 
	Input:
		Required:
			api_key = Api key assigned by yelp fusion
			city_names = List of locations for internet service providers
		Optional:
			business_data = .csv file containing data from previous searches
			business_reviews = .csv file containing reviews of businesses from
								previous searches
	Output:
		'businesses.csv' containing information about internet 
			service providers
		'businesses_reviews.csv' containing reviews of internet service
			providers in the businesses.csv file
		'cities_list.csv' list of previous cities that have been searched.
			Data will only be extracted for cities that have not been
			previously searched.
	
	Example: 
	cities = ['Santa Barbara, CA','Los Angeles, CA','New York, NY']
	api_key = XIXIXIXLXJO
	Yelp_ScrapeISP(api_key,cities)
	Author: Jordan Garrett
	jordangarrett@ucsb.edu
	====================================================================
	"""
    data_dir =  os.path.join(os.getcwd(),'Yelp_Data/')

    
    #check to see if any cities in the list have previously been searched
    if os.path.exists(data_dir+'cities_list.csv'):
    	prev_cities = pickle.load(open(data_dir+'cities_list.csv','rb'))
    	city_names = [city for city in city_names if city not in prev_cities]

    if city_names:
    	print(f'Searching Cities: {city_names}')
    else:
    	print('All cities have already been searched')
    	return


    yelp_api = YelpAPI(api_key)
    
    all_business_df = pd.DataFrame()
    all_reviews_df = pd.DataFrame()

    for iCity in city_names:
        
        # we can play around with the limit and offset parameters 
        # to control the number of results and what item to start the pull on
        
        search_results = yelp_api.search_query(term = 'Internet Service Provides',
                                               location = iCity, limit = 50)
        time.sleep(3)

        business_df = pd.DataFrame.from_dict(search_results['businesses'])


        # drop the phone, display_phone, transactions, is_closed, and image_url columns
        # we shouldn't need them
        unecessary_cols = ['phone', 'display_phone', 'transactions', 'is_closed','image_url']


        business_df2 = business_df.drop(unecessary_cols,1)

        #loop through businesses
        reviews = dict()

        reviews_df = pd.DataFrame()
        for iBiz, biz_id in enumerate(business_df2.loc[:,'id']):
            business_name = business_df2['name'][iBiz]

            #can only get 3 reviews through yelp api
            #BUT...we have the url...which means it should be easy to "not legally" scrape
            reviews[business_name] = yelp_api.reviews_query(biz_id)
            
            time.sleep(3)

            # temporary data frame we can use that will be appended to a master one later
            temp_df = pd.DataFrame.from_dict(reviews[business_name]['reviews'])

            temp_df = temp_df.drop('user',1)

            # add column for ISP provider
            temp_df.insert(0,'ISP_name',business_name)

            # add column for business id
            temp_df.insert(1,'business_id',biz_id)

            # add column for business location
            temp_df.insert(6,'location',
                           str(business_df2[business_df2['id'] == biz_id]['location'].item()['display_address']))


            temp_df = temp_df.rename(columns = {"id":"rev_id"})

            reviews_df = reviews_df.append(temp_df,ignore_index = True)


        all_business_df = all_business_df.append(business_df2, ignore_index=True)
        all_reviews_df = all_reviews_df.append(reviews_df,ignore_index=True)

    # Save data
    # if no previous files, just save the data. if previous files, append    
    if business_data == None and business_reviews == None:
        all_business_df.to_csv(data_dir+'businesses.csv', index=False)
        all_reviews_df.to_csv(data_dir+'businesses_reviews.csv', index = False)

    else: #append data to previous loaded files
        prev_business_df = pd.read_csv(data_dir+business_data)

        prev_reviews_df = pd.read_csv(data_dir+business_reviews)

        new_business_df = prev_business_df.append(all_business_df, ignore_index = True)
        new_reviews_df = prev_reviews_df.append(all_reviews_df, ignore_index = True)

        new_business_df.to_csv(data_dir+'businesses.csv', index=False)
        new_reviews_df.to_csv(data_dir+'businesses_reviews.csv', index = False)


    # Save previous cities to ensure that we aren't looking at cities previously searched
    pickle.dump(city_names, open(data_dir+"cities_list.csv", "wb"))


#if __name__ == '__main__':
#	Yelp_ScrapeISP(api_key,cities,
#		business_dataFile,business_reviewsFile)

In [34]:
#get the api key by creating an account on yelp and then clicking on Create App. Fill out
#form and it will generate a key for you.
api_key = "FCt6wjm5sZeio_0A4Z_4NdIqYGxWRHg5QJDmmGQhKeNskVa5HVCUi31uJ0oNjsZPUMhlnucbYh_8w7KMw_nB6eiDavzyAwhq8k0dfVWotmjFt_yLSTC6uEbPROa-X3Yx"

#cities = ['Los Angeles, CA','Santa Barbara, CA','Walnut, CA',
#'Santa Clarita, CA','Pomona,CA','Fresno,CA']

cities = pd.read_csv("cal_cities_lat_long.csv")
cities["Name"] = cities["Name"] + ", CA"

first_half_cities = cities[0:round(len(cities)/2)]

second_half_cities = cities[round(len(cities)/2):len(cities)]

In [45]:
# Need to chunk it
# does not include number on right side, can just change right side too since it will search all terms
next_chunk = second_half_cities.Name.to_list()[10:15] 
next_chunk

#len(second_half_cities)

['Marysville, CA',
 'Maywood, CA',
 'McFarland, CA',
 'Mendota, CA',
 'Menifee, CA']

In [43]:
#Yelp_ScrapeISP(api_key, second_half_cities.Name.to_list()[0:2], None, None)
Yelp_ScrapeISP(api_key, next_chunk, "businesses.csv", "businesses_reviews.csv")

Searching Cities: ['Maricopa, CA', 'Marina, CA', 'Martinez, CA']


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [37]:
#data_dir =  os.path.join(os.getcwd(),'Yelp_Data/')
#prev_cities = pickle.load(open(data_dir+'cities_list.csv','rb'))

#city_names = second_half_cities.Name.to_list()[2:4]
#city_names = [city for city in city_names if city not in prev_cities]
#city_names