In [2]:
import csv
import os
import time

import requests

In [425]:
# Yelp Fusion API key, read from the YELP_API_KEY environment variable so the
# secret is never stored in (or shared with) the notebook itself.
API_KEY = os.environ.get('YELP_API_KEY', '')
if not API_KEY:
    print("Warning: YELP_API_KEY is not set; requests to Yelp will not be authorized.")

# Search radius in meters (4000 m = 4 km).
# NOTE(review): Yelp documents 40000 m as the maximum radius -- confirm before raising it.
radius = 4000

# Upper bound on the total number of entries to scrape
total_scraped_limit = 200

# Page size for a single request (the script treats 50 as the per-request maximum)
incremental_scraped_limit = 50

# (latitude, longitude) pairs to search around; add more Ontario cities here
ontario_locations = [
    (43.73, -79.63), # Vaughan
]


In [427]:
def get_plumber_data(latitude, longitude, radius, current_total, incremental_limit,
                     term='Phone Repair', timeout=10):
    """Fetch Yelp business-search results around a coordinate, page by page.

    The function name is historical; the businesses returned are whatever
    matches ``term``.

    Args:
        latitude: Latitude of the search centre.
        longitude: Longitude of the search centre.
        radius: Search radius in meters.
        current_total: Offset of the first result to fetch. It also counts
            against the module-level ``total_scraped_limit``, so at most
            ``total_scraped_limit - current_total`` entries are returned.
        incremental_limit: Maximum number of results requested per page.
        term: Search term sent to Yelp (defaults to the original 'Phone Repair').
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of business dicts as returned by the API. The list is partial
        (possibly empty) if a request fails or the results run out.
    """
    headers = {
        'Authorization': f'Bearer {API_KEY}',
    }

    # Yelp API endpoint for business search
    url = 'https://api.yelp.com/v3/businesses/search'

    # How many entries this call is still allowed to return
    remaining = max(total_scraped_limit - current_total, 0)
    all_entries = []

    while len(all_entries) < remaining:
        # Never ask for more than what is still needed
        page_size = min(incremental_limit, remaining - len(all_entries))
        if page_size <= 0:
            break  # A non-positive page size could never make progress

        params = {
            'term': term,
            'latitude': latitude,
            'longitude': longitude,
            'radius': radius,
            'limit': page_size,
            'offset': current_total + len(all_entries),
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Connection problems and timeouts: keep what was fetched so far
            print(f"Error fetching data: {exc}")
            break

        if response.status_code != 200:
            # The error body is not guaranteed to be JSON
            try:
                error_detail = response.json()
            except ValueError:
                error_detail = response.text
            print(f"Error fetching data: {error_detail}")
            break

        businesses = response.json().get('businesses') or []
        all_entries.extend(businesses)

        # A short page means there are no more results available
        if len(businesses) < page_size:
            break

        # Pause between consecutive requests to avoid rate limiting
        if len(all_entries) < remaining:
            time.sleep(1)

    return all_entries[:remaining]


In [429]:
def _value_or_na(value):
    """Return ``value`` unless it is missing (None or empty string), then 'N/A'."""
    if value is None or value == '':
        return 'N/A'
    return value


def _format_opening_hours(entry):
    """Render an entry's 'hours' blocks as 'day: start - end; ...', or 'N/A' if none."""
    slots = []
    for block in entry.get('hours') or []:
        for slot in (block or {}).get('open') or []:
            slots.append(f"{slot.get('day', 'N/A')}: {slot.get('start', 'N/A')} - {slot.get('end', 'N/A')}")
    return "; ".join(slots) if slots else 'N/A'


def save_to_csv(entries, filename):
    """Append business entries to a CSV file, writing the header if the file is empty.

    Any field that is absent, null or empty in an entry is written as 'N/A'.

    Args:
        entries: Iterable of business dicts (the shape returned by the Yelp
            business search, as consumed below).
        filename: Path of the CSV file; it is opened in append mode.
    """
    fieldnames = ['Name', 'Rating', 'Number of Reviews', 'Opening Hours', 'Phone Number',
                  'Address', 'Website', 'Response Time', 'Response Rate', 'Services Provided',
                  'Profile Link', 'Location', 'Service Tag']

    with open(filename, mode='a', newline='', encoding='utf-8') as csv_file:  # Append mode
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        # Position 0 in append mode means the file is empty, so it needs a header
        if csv_file.tell() == 0:
            writer.writeheader()

        for entry in entries:
            # `or {}` / `or []` also covers keys that are present but null
            location = entry.get('location') or {}
            address_parts = location.get('display_address') or []
            address = _value_or_na(", ".join(str(part) for part in address_parts if part))

            attributes = entry.get('attributes') or {}

            categories = entry.get('categories') or []
            services_provided = _value_or_na(", ".join(
                category['title'] for category in categories
                if category and category.get('title')
            ))

            # NOTE(review): 'Website' and 'Profile Link' are both filled from
            # the entry's 'url' field -- confirm whether a separate website
            # value is available.
            profile_link = _value_or_na(entry.get('url'))

            writer.writerow({
                'Name': _value_or_na(entry.get('name')),
                'Rating': _value_or_na(entry.get('rating')),
                'Number of Reviews': _value_or_na(entry.get('review_count')),
                'Opening Hours': _format_opening_hours(entry),
                'Phone Number': _value_or_na(entry.get('display_phone')),
                'Address': address,
                'Website': profile_link,
                'Response Time': _value_or_na(attributes.get('response_time')),
                'Response Rate': _value_or_na(attributes.get('response_rate')),
                'Services Provided': services_provided,
                'Profile Link': profile_link,
                'Location': address,  # Same value as 'Address'
                'Service Tag': _value_or_na(entry.get('alias')),  # The entry's alias
            })

    print(f"New data has been appended to '{filename}'")


In [431]:
if __name__ == "__main__":
    # Driver: search every configured location and collect the results in one
    # CSV file, stopping once total_scraped_limit entries have been written.
    output_filename = 'plumbers_data_yelp.csv'
    total_entries_scraped = 0

    # Start each run from a fresh file that contains only the header row.
    # The column list must stay identical to the one used by save_to_csv.
    with open(output_filename, mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = ['Name', 'Rating', 'Number of Reviews', 'Opening Hours', 'Phone Number',
                      'Address', 'Website', 'Response Time', 'Response Rate', 'Services Provided',
                      'Profile Link', 'Location', 'Service Tag']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()  # Write header to CSV

    for latitude, longitude in ontario_locations:
        remaining_budget = total_scraped_limit - total_entries_scraped
        if remaining_budget <= 0:
            break  # Global cap reached; skip the remaining locations

        # The offset indexes into this location's own result list, so every
        # location is searched from 0 rather than from the running total.
        # get_plumber_data already paginates, so one call per location suffices.
        location_entries = get_plumber_data(latitude, longitude, radius, 0, incremental_scraped_limit)

        # Keep only as many entries as the global cap still allows
        new_plumber_entries = location_entries[:remaining_budget]
        new_entries_count = len(new_plumber_entries)
        total_entries_scraped += new_entries_count

        print(f"Total new entries retrieved from ({latitude}, {longitude}): {new_entries_count}")

        # Save new data to CSV
        if new_entries_count > 0:  # Only save if new entries exist
            save_to_csv(new_plumber_entries, output_filename)

        # Fewer results than a full fetch means this location has nothing more to offer
        if len(location_entries) < total_scraped_limit:
            print(f"No more new entries available for location ({latitude}, {longitude}).")

    print(f"Total entries scraped: {total_entries_scraped}")


Total new entries retrieved from (43.73, -79.63): 12
New data has been appended to 'plumbers_data_yelp.csv'
No more new entries available for location (43.73, -79.63).
Total entries scraped: 12
