In [1]:
#! pip install webdriver-manager

In [2]:
import re
import time
import warnings
from random import randint
from urllib.parse import unquote

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')



This project scrapes search results from the https://www.yellowpages.ca/ website. To run the program, the Chrome driver must be compatible with the installed version of the selenium package. For this reason, the driver set-up code at the start of the scraper (the `ChromeDriverManager` lines) downloads and installs a matching driver automatically. The code asks the user to enter two keywords to generate search results on the Yellow Pages site:

(1) A keyword for what you want to search for on the Yellow Pages website (variable name: what). You can browse the Yellow Pages website first to decide which keywords you want to search for. The search is not case-sensitive; please do not include special characters in the keyword.
(2) A keyword for where you want to search on the Yellow Pages website (variable name: where). This variable restricts the search results to a specific location (e.g. a province or city).

Note: This script goes through all pages of the search results and scrapes the information listed below from them. Finally, the results are saved as a ".csv" file.
1- Category
2- Postcode
3- Province
4- City (if any)
5- Street
6- Name
7- Phone
8- Site (if any)
9- Review (if any)

In [3]:
class yellow_page_ca_scraper:
    """Scrape business listings from https://www.yellowpages.ca/ search results.

    Workflow:
      1. ``open_yellow_page_ca`` submits the search in headless Chrome and
         reads the page count, the result-URL template, the category and the
         province from the first result page.
      2. ``get_page_results_info`` downloads every result page with
         ``requests`` and parses one record per listing.
      3. ``save_csv`` writes the records to a CSV file.

    Each method may be called without arguments; it then reads its inputs
    from the module-level (notebook) variables ``what``, ``where``,
    ``number_of_pages``, ``base_url1``, ``base_url2``, ``category``,
    ``province`` and ``lists``.  Passing the values explicitly is preferred
    because it does not depend on hidden notebook state.
    """

    # Entry page that hosts the search form.
    HOME_URL = 'https://www.yellowpages.ca/'
    # Seconds to wait for a single result page before giving up on it.
    REQUEST_TIMEOUT = 30
    # Number of leading characters dropped from the phone cell text.
    # NOTE(review): presumably the length of a "Phone Number" label that
    # precedes the digits -- confirm against the live markup.
    PHONE_LABEL_LENGTH = 12
    # Errors raised when an optional element or attribute is absent:
    # ``None.text`` -> AttributeError, ``None[...]`` -> TypeError,
    # missing tag attribute -> KeyError.
    _MISSING_ERRORS = (AttributeError, TypeError, KeyError)

    @staticmethod
    def _parse_page_count(page_count_text):
        """Return the total-page part of a "<current> / <total>" label."""
        match = re.match(r'\s*\d+\s*/\s*', page_count_text)
        if match is None:
            raise ValueError(
                "Unexpected page count text: {!r}".format(page_count_text))
        return page_count_text[match.end():]

    @staticmethod
    def _split_result_url(url):
        """Split a result URL around its first integer (the page number)."""
        match = re.search(r'\d+', url)
        if match is None:
            raise ValueError(
                "No page number found in result URL: {!r}".format(url))
        return url[:match.start()], url[match.end():]

    @staticmethod
    def _extract_province(heading_text):
        """Return the first run of 2+ capitals, or the text itself if none."""
        match = re.search(r'\b[A-Z]{2,}', heading_text)
        return match.group() if match else heading_text

    @staticmethod
    def _redirect_target(href):
        """Return the percent-decoded value following ``redirect=`` in href.

        An href without a ``redirect=`` marker is returned unchanged instead
        of being sliced at a bogus offset.
        """
        marker = "redirect="
        position = href.find(marker)
        if position == -1:
            return href
        return unquote(href[position + len(marker):])

    def open_yellow_page_ca(self, what_keyword=None, where_keyword=None):
        """Run the search in headless Chrome and collect result metadata.

        Parameters
        ----------
        what_keyword : str, optional
            Search term; defaults to the module-level ``what``.
        where_keyword : str, optional
            Location term; defaults to the module-level ``where``.

        Returns
        -------
        tuple
            ``(driver, number_of_pages, base_url1, base_url2, category,
            province)`` where ``number_of_pages`` is a string and
            ``base_url1``/``base_url2`` are the parts of the result URL
            before/after the page number.  The caller owns ``driver`` and
            should call ``driver.quit()`` when done with it.

        Raises
        ------
        ValueError
            If the page count label or the result URL has an unexpected form.
        Exception
            Any Selenium error is re-raised after the browser is closed.
        """
        if what_keyword is None:
            what_keyword = what  # notebook global (backward compatibility)
        if where_keyword is None:
            where_keyword = where  # notebook global (backward compatibility)

        options = webdriver.ChromeOptions()
        # Argument form works on every Selenium 4 release; the ``headless``
        # property was deprecated and later removed.
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument("--disable-setuid-sandbox")
        options.add_argument('--disable-dev-shm-usage')
        # Selenium 4 takes the driver path through a Service object.
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=options)
        try:
            driver.get(self.HOME_URL)
            driver.find_element(By.ID, "whatwho").send_keys(what_keyword)
            driver.find_element(By.ID, "where").send_keys(where_keyword)
            search = driver.find_element("xpath","//*[@id='inputForm']/div[2]/div[2]/div/button/span[1]")
            search.submit()
            # Randomised pause while the result page loads.
            time.sleep(randint(7,11))
            page_count_text = driver.find_element(By.CLASS_NAME,"pageCount").text
            number_of_pages = self._parse_page_count(page_count_text)
            base_url1, base_url2 = self._split_result_url(driver.current_url)
            time.sleep(randint(3,4))
            category = driver.find_element(By.XPATH,"//*[@id='jsListingMerchantCards']/div[1]/h1/strong[1]").text
            province_text = driver.find_element(By.XPATH,"//*[@id='jsListingMerchantCards']/div[1]/h1/strong[2]").text
            province = self._extract_province(province_text)
        except Exception:
            # Do not leave an orphaned browser process behind on failure.
            driver.quit()
            raise
        return driver, number_of_pages, base_url1, base_url2, category, province

    def _parse_listing(self, card, category_name, province_code):
        """Build one result record from a single listing ``div``."""
        missing = self._MISSING_ERRORS
        name = card.find('a', class_ = 'listing__name--link listing__link jsListingName').text.strip()
        try:
            address = card.find('span', class_ ='listing__address--full').text.strip()
        except missing:
            address = 'There is no address information'
        try:
            postcode = card.find("span", {"itemprop" : "postalCode"}).text
        except missing:
            postcode = 'There is no postcode information'
        try:
            city = card.find("span", {"itemprop" : "addressLocality"}).text
        except missing:
            city = 'There is no city information'
        try:
            href = card.find("li", "mlr__item mlr__item--website").a["href"]
            website = self._redirect_target(href)
        except missing:
            website = 'There is no website'
        try:
            phone_text = card.find("li", "mlr__item mlr__item--more mlr__item--phone jsMapBubblePhone").text.strip().replace('\n', '')
            tel = str(phone_text)[self.PHONE_LABEL_LENGTH:]
        except missing:
            tel = 'There is no telephone number'
        try:
            rating = card.find('span', class_ = 'ypStars jsReviewsChart')["title"]
        except missing:
            rating = 'There is no rating value'
        return {
            'Category':category_name,
            'Postcode':postcode,
            'Province':province_code,
            'City':city,
            'Street':address,
            'Name':name,
            'Phone':tel,
            'Site':website,
            'Review':rating
        }

    def get_page_results_info(self, page_count=None, url_prefix=None,
                              url_suffix=None, category_name=None,
                              province_code=None):
        """Download every result page and parse its listings.

        Parameters
        ----------
        page_count : int or str, optional
            Number of result pages; defaults to module-level ``number_of_pages``.
        url_prefix, url_suffix : str, optional
            Result URL parts before/after the page number; default to the
            module-level ``base_url1`` / ``base_url2``.
        category_name, province_code : str, optional
            Values copied into every record; default to the module-level
            ``category`` / ``province``.

        Returns
        -------
        list of dict
            One record per listing with the keys Category, Postcode,
            Province, City, Street, Name, Phone, Site and Review.  Missing
            optional fields hold a "There is no ..." placeholder string.

        Pages that cannot be downloaded are reported and skipped so that one
        network error does not discard the results already collected.
        """
        if page_count is None:
            page_count = number_of_pages
        if url_prefix is None:
            url_prefix = base_url1
        if url_suffix is None:
            url_suffix = base_url2
        if category_name is None:
            category_name = category
        if province_code is None:
            province_code = province

        results = []
        bar = tqdm(range(1,int(page_count)+1))
        for page in bar:
            bar.set_description("Processing of page {}".format(page))
            page_url = "{}{}{}".format(url_prefix, page, url_suffix)
            try:
                response = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as error:
                tqdm.write("Skipping page {} ({}): {}".format(page, page_url, error))
                continue
            if not response.ok:
                # Still parsed below; an error page simply yields no listings.
                tqdm.write("Page {} returned HTTP status {}".format(page, response.status_code))
            parsed_html = bs(response.text, "html.parser")
            cards = parsed_html.find_all('div', class_ = 'listing__content__wrap--flexed jsGoToMp')
            for card in cards:
                results.append(self._parse_listing(card, category_name, province_code))
        return results

    def save_csv(self, results=None, path='yellow_page_ca_results.csv'):
        """Write the records to a CSV file and return them as a DataFrame.

        Parameters
        ----------
        results : list of dict, optional
            Records to save; defaults to the module-level ``lists``.
        path : str, optional
            Output file path; defaults to ``yellow_page_ca_results.csv``.
        """
        if results is None:
            results = lists  # notebook global (backward compatibility)
        df = pd.DataFrame(results)
        df.to_csv(path, index=False)
        return df

In [4]:
if __name__ == '__main__':
    # The scraper methods read `what`, `where` and the values unpacked below
    # from these module-level names, so the names must not be changed here.
    scraper = yellow_page_ca_scraper()
    what = input('Enter a keyword what you want to search on Yellow Pages:')
    where = input('Enter a keyword where you want to search on Yellow Pages:')
    driver, number_of_pages, base_url1, base_url2, category, province = scraper.open_yellow_page_ca()
    # The browser is only needed to run the search and read the first result
    # page; the remaining pages are fetched with `requests`. Close it now so
    # no headless Chrome process is left running.
    driver.quit()
    lists = scraper.get_page_results_info()
    scraper.save_csv()

Enter a keyword what you want to search on Yellow Pages:restaurant
Enter a keyword where you want to search on Yellow Pages:toronto


Processing of page 354: 100%|████████████████████████████████████████████████████████| 354/354 [12:49<00:00,  2.18s/it]
