In [30]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Root URL of the target site.
BASE_URL = "https://www.yellowpages.com"
# Directory category slug; extract_company_data() turns it into the
# title-cased "category" value of every record (hyphens become spaces).
CATEGORY = "digital-marketing-agencies"
# Last page number to request; scrape_directory() iterates pages 1..p_limit.
p_limit = 1 
# Path of the CSV file the __main__ block writes the scraped records to.
new_file = "DM.csv"


# HTTP headers sent with every page request in scrape_page(); only a
# desktop-browser User-Agent string is set.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


def _text_or_none(tag):
    """Return the stripped text of *tag*, or None when the tag was not found."""
    return tag.text.strip() if tag is not None else None


def extract_company_data(soup):
    """Build one record per business listing found in a parsed results page.

    Args:
        soup: Parsed HTML (BeautifulSoup document or tag) to search for
            ``<div class="result">`` listing elements.

    Returns:
        A list of dicts, one per listing, with the keys ``name``,
        ``website``, ``phone``, ``address``, ``category``, ``description``
        and ``email`` (inserted in that order). A field whose element is
        missing from the listing is None; ``email`` is always ``"N/A"``
        because this function does not look for an e-mail address.
    """
    # The category is the same for every listing, so derive it once.
    category = CATEGORY.replace("-", " ").title()

    companies = []
    for listing in soup.find_all("div", class_="result"):
        website_link = listing.find("a", class_="track-visit-website")
        # Tag.get() yields None for a link without an href attribute, where
        # indexing with ["href"] would raise an uncaught KeyError.
        website = website_link.get("href") if website_link is not None else None

        companies.append({
            "name": _text_or_none(listing.find("a", class_="business-name")),
            "website": website,
            # A multi-word class_ string matches only that exact class
            # attribute value (same classes, same order).
            "phone": _text_or_none(
                listing.find("div", class_="phones phone primary")
            ),
            "address": _text_or_none(listing.find("p", class_="adr")),
            "category": category,
            "description": _text_or_none(listing.find("div", class_="snippet")),
            "email": "N/A",
        })
    return companies

def scrape_page(url, timeout=10):
    """Fetch one results page and return the company records found on it.

    Args:
        url: Full URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The list produced by ``extract_company_data`` for the page, or an
        empty list when the request fails or the response status is not 200.
        Failures are reported with ``print`` rather than raised, so one bad
        page does not abort a multi-page run.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URLs, etc.
        print(f"Failed to retrieve page: {url} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page: {url} (Status Code: {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    return extract_company_data(soup)

def scrape_directory():
    """Scrape search-result pages 1..p_limit and return all records found.

    Returns:
        A single list containing the records returned by ``scrape_page`` for
        every page, in page order. Pages that fail contribute nothing.
    """
    all_companies = []
    for page in range(1, p_limit + 1):
        # Build the URL from the module constants BASE_URL / CATEGORY; the
        # previously used names `p_url` and `category` are not defined in
        # this file and raised NameError.
        url = f"{BASE_URL}/search?search_terms={CATEGORY}&page={page}"
        print(f"Scraping page {page}: {url}")
        all_companies.extend(scrape_page(url))
        # Pause between requests; no need to wait after the last page.
        if page < p_limit:
            time.sleep(2)
    return all_companies
def save_to_csv(data, filename):
    """Write a list of record dicts to a UTF-8 CSV file with a header row.

    Args:
        data: List of dicts to write. The keys of the first record define
            the column order.
        filename: Path of the file to create or overwrite.

    When *data* is empty only the header row is written, using the same
    lowercase column names that populated records carry, so the file layout
    does not depend on whether any rows were scraped.
    """
    default_fields = [
        "name",
        "website",
        "phone",
        "address",
        "category",
        "description",
        "email",
    ]
    fieldnames = list(data[0].keys()) if data else default_fields
    # newline="" lets the csv module control line endings itself.
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

# Script entry point: scrape the directory and write the records to new_file
# when at least one record was collected.
if __name__ == "__main__":
    print("Starting to scrape data")
    companies_data = scrape_directory()
    if not companies_data:
        print("No data was scraped.")
    else:
        save_to_csv(companies_data, new_file)
        print(f"Scraping completed. Data saved to '{new_file}'.")


Starting to scrape data
Scraping page 1: https://www.yellowpages.com/search?search_terms=digital-marketing-agencies&page=1
Scraping completed. Data saved to 'DM.csv'.
