In [36]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv
import time
import re

In [37]:
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                    'Chrome/101.0.4951.54 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'https://google.com',
        'DNT': '1',
        'Accept-Language': 'en-GB,en;q=0.5'
    }

In [38]:
base_url = "https://www.yellowpages.com"
current_page = '/search?search_terms=restaurants&geo_location_terms=11364'

In [39]:
field_names = ['Restaurant ID','Name','Type of Restaurant','Trip Advisor Rating','TA Review Count','Yellow Pages Rating','YP Review Count','Phone Number','Street Address','Locality','Zipcode','Dollar Costs','Years in Business','Website','Menu URL','Secondary Info','Order Online Status']

In [40]:
restaurants_data = []

In [41]:
price_range_mapping = {
    '$': 'Low',
    '$$': 'Average',
    '$$$': 'High',
    '$$$$': 'Very High',
    '$$$$$': 'Astronomical'
}

In [42]:
while current_page:
    response = requests.get(base_url+current_page, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text,'html.parser')
        for result in soup.find_all('div', class_='info'):
            #Extract YellowPages ID
            restaurant_id_text = result.find('h2', class_='n').text.strip().replace(".","")
            restaurant_id = restaurant_id_text.split(' ')[0]
            #print(restaurant_id)
            
            # Extract name
            has_name = result.find('a', class_='business-name').find('span')
            if has_name:
                name_text = has_name.text.strip()
            else:
                name_text = None
            print(name_text)
        
            # Extract categories
            category_div = result.find('div',class_='categories')
            if category_div:
                category_elements = category_div.find_all('a')
                categories = [category.text.strip() for category in category_elements]
            else:
                categories = None
            #print("Categories:", categories)
        
            # Extract Tripadvisor Ratings
            #ta_rating_data = result.find('div', class_='ratings')['data-tripadvisor']
            #if ta_rating_data:
            #    ta_rating = float(ta_rating_data.split('"rating":"')[1].split('"')[0])
            #    ta_rating_count = int(ta_rating_data.split('"count":"')[1].split('"')[0])
            #else:
            #    ta_rating = None
            #    ta_rating_count = None
            #print("Trip Advisor Rating:", ta_rating)
            #print("Trip Advisor Count:", ta_rating_count)
            
            # Extract Tripadvisor Ratings
            ta_rating_data = result.find('div', class_='ratings')
            try:
                ta_rating_data = ta_rating_data['data-tripadvisor']
                ta_rating = float(ta_rating_data.split('"rating":"')[1].split('"')[0])
                ta_rating_count = int(ta_rating_data.split('"count":"')[1].split('"')[0])
            except (TypeError, KeyError, IndexError, ValueError):
                ta_rating = None
                ta_rating_count = None
            #print("Trip Advisor Rating:", ta_rating)
            #print("Trip Advisor Count:", ta_rating_count)
            
             # Extract Yellowpages Ratings
            has_extra_rating = result.find('a', class_='hasExtraRating')
            if has_extra_rating:
                yp_rating_class = has_extra_rating.find('div', class_='result-rating')['class'][1:]
                yp_rating_count = int(has_extra_rating.find('span', class_='count').text.strip('()'))
            else:
                yp_rating_class = None
                yp_rating_count=None
            #print("Yellow Pages Rating:", yp_rating_class)
            #print("Yellow Pages Count:", yp_rating_count)


            # Extract phone
            has_phone_number = result.find('div', class_='phones')
            if has_phone_number:
                phone_text = has_phone_number.text.strip()
            else:
                phone_text = None
            #print("Phone Number:",phone_text)
            

            # Extract street address
            has_street_address = result.find('div', class_='street-address')
            if has_street_address:
                street_address_text = has_street_address.text.strip()
                street_address_text_cleaned = re.sub(' +', ' ', street_address_text)
            else:
                street_address_text_cleaned = None
            #print("Street Address",street_address_text_cleaned)
            
            # Extract Locality
            has_locality = result.find('div', class_='locality')
            if has_locality:
                locality_text = has_locality.text.strip()
                zipcode_match = re.search(r'\b\d{5}\b', locality_text)
                
                if zipcode_match:
                    zipcode = zipcode_match.group()
                else:
                    zipcode = None
            else:
                locality_text = None
            #print("Locality:",locality_text)
            #print("ZIP Code:", zipcode)
            
            # Extract Price Range
            has_price_range = result.find('div', class_='price-range')
            if has_price_range:
                price_range_symbol = has_price_range.text.strip()
                price_range_dollars = price_range_mapping.get(price_range_symbol, 'unknown')
            else:
                price_range_dollars = None
            #print("Price Range:",price_range_dollars)
            
            # Extract Years In Business
            has_years_in_business = result.find('div', class_='years-in-business')
            if has_years_in_business:
                years_text = has_years_in_business.find('div', class_='count').text.strip()
                years_numeric = int(re.search(r'\d+', years_text).group())
            else:
                years_numeric = None
            #print ("Years in Business:", years_numeric)
            
            # Extract Website
            has_website = result.find('a', class_='track-visit-website')
            if has_website:
                website_link = has_website['href']
            else:
                website_link = None
            #print("Website Link:", website_link)
            
            # Extract Menu URL
            has_menu = result.find('a', class_='menu')
            if has_menu:
                menu_link = base_url + has_menu['href']
            else:
                menu_link = None
            #print("Menu Link:", menu_link)
            
            #Extract Secondary Info
            has_snippet = result.find('div',class_='snippet')
            if has_snippet:
                snippet_body = has_snippet.find('p',class_='body').text.strip()
            else:
                snippet_body = None
            #print("Secondary Info:", snippet_body)
            
            #Extract Order Online Status
            has_order_online = result.find('div',class_='listing-ctas').find('a',class_='order-online')
            if has_order_online:
                order_online = 1
            else:
                order_online = 0
            #print("Can Order Online?", order_online)
            
            restaurants_data.append([restaurant_id, name_text, categories, ta_rating, \
                                    ta_rating_count, yp_rating_class, yp_rating_count, phone_text, \
                                    street_address_text_cleaned, locality_text, zipcode, \
                                    price_range_dollars, years_numeric, website_link, \
                                    menu_link, snippet_body, order_online])
        
        #Go to the next page
        next_page_link = soup.find('a', class_='next ajax-page')
        if next_page_link:
            current_page = next_page_link['href']
            print('Scraping the Page', current_page)
        else:
            current_page = None
            
df = pd.DataFrame(restaurants_data, columns=field_names)
df.to_csv('yellowpages_newyork_11364.csv',index=False)

      

Villa Rustica
Imperial Taste
Anthony's
Vito's
Gino's Pizza
Blue Bay Diner
River Japanese Cuisine
New Saigon Restaurant Inc
China Grill
L'italiano Catering Inc
Empire Garden
Patoug Foods Incorporated
Little Dumpling, Bayside
ChilSung Korean BBQ
Chil Sung Garden
Duck Village Chinese Restaurant
Kokio Chicken and Beer
Master Panda Group Inc
Bell Diner
Shi Ba Hotpot
Nan bei ho 388 corp
The Shack Nyc
Niafa Inc
Blue Sky Chinese Restaurant
Central Water Filtration Systems
New Golden Star
Pizza One
Sapore
Osaka Japanese Cuisine Inc
Staunton's Bar & Restaurant
None


AttributeError: 'NoneType' object has no attribute 'find'