# Project Overview

1. Go to the Better Business Bureau website
2. Locate all the listings for roofing contractors in Macon, Georgia
3. Build a crawler in Python (use any framework/library you choose) that will extract the following
data points for each roofing contractor, while also filtering out all “waterproof” and
“waterproofing” contractors:
● Company name
● Phone number
● Address with street, city, state, and zip code
● Company Website (if available)
● Email Address (if available)
● BBB Rating
● Accredited Date (if available)
● Profile page URL
● (Any other information you think is good to have/relevant)
4. Export results in .csv format
5. Make a new column in your results file. In that column, please rank the results based on which
businesses you think would have the highest close rate for our sales department
6. Write a 3-5 sentence explanation on your thought process and why you ranked the results the
way you did
7. Write a 3-5 sentence explanation for the following question: If you came across a website where
there were no phone numbers on the company profiles, what alternative methods would you
employ to find them?
8. Email your recruiter your ranked results csv file, the crawler code you wrote to extract the data,
and your 2 responses. The team will review your project and reach out about next steps.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor
import re

### Getting Business Links

In [None]:
# Path to the chromedriver binary, relative to the working directory.
# NOTE(review): the file name suggests a Windows build for Chrome 118 - confirm it matches the installed browser.
CHROME_DRIVER_PATH = './chromedriver_v_118.exe'

# Driver service built from the path above; passed to webdriver.Chrome by the scraping functions.
SERVICE = Service(CHROME_DRIVER_PATH)

# Browser options passed to webdriver.Chrome by the scraping functions.
CHROME_OPTIONS = Options()
CHROME_OPTIONS.add_argument("--headless=new")
# NOTE(review): the user-agent value is empty here - presumably redacted; confirm a real UA string is supplied before running.
CHROME_OPTIONS.add_argument('user-agent=')


In [None]:
def wait_for_content(url, driver, expected_content='Better Business Bureau', max_retries=None):
    '''
    Load ``url`` in ``driver`` and keep reloading it until ``expected_content`` appears in the page source.

    This forces a reload in case there are any gateway errors host side.
    In the case of future errors with IP blocking, this is where we could change the way we are getting page content.

    Parameters:
        url: address to load.
        driver: object exposing ``get(url)`` and ``page_source`` (a Selenium WebDriver).
        expected_content: text that must be present in the page source for the load to count as successful.
        max_retries: maximum number of reloads after the first load. ``None`` (the default) retries
            forever, which is the historical behaviour; pass an integer to avoid hanging on a page
            that never serves the expected content.

    Raises:
        TimeoutError: only when ``max_retries`` is set and the content is still missing after that many reloads.
    '''
    driver.get(url)
    time.sleep(1)

    retries = 0
    while expected_content not in driver.page_source:
        if max_retries is not None and retries >= max_retries:
            raise TimeoutError(
                f"'{expected_content}' not found at {url} after {retries} reloads"
            )
        retries += 1
        driver.get(url)
        # Back off a little longer on reloads than on the first load
        time.sleep(2)


def get_business_links(search_phrase: str, accredation=False, city:str='Macon', state:str='GA'):
    '''
    Collect the links to business profile pages returned by a search on bbb.org.

    Parameters:
        search_phrase: free-text search term; it is percent-encoded before being put in the URL.
        accredation: when False, an extra option in the first-visit prompt is clicked before the
            prompt is submitted; when True the prompt is submitted as-is.
            NOTE(review): presumably this toggles "accredited only" results - confirm against the live dialog.
        city, state: location used for the ``find_loc`` query parameter.

    Returns:
        set of profile URLs (strings) gathered from every result page that could be reached.
    '''
    # Local import so this block does not depend on the file's import cell
    from urllib.parse import quote

    # Percent-encode the whole value: replacing only spaces breaks on terms such as "& roofing",
    # where the raw "&" would terminate the find_text parameter.
    location = quote(f'{city}, {state}', safe='')
    query = quote(search_phrase, safe='')
    url = f'https://www.bbb.org/search?find_loc={location}&find_text={query}&page='
    start_url = url + '1'

    driver = webdriver.Chrome(service=SERVICE,options=CHROME_OPTIONS)

    # Declaring set to store links to businesses
    business_links = set()

    try:
        wait_for_content(start_url,driver)

        # Getting past accredited prompt window, which only happens when first loading the site.
        # Best effort on purpose: if the prompt is not shown there is nothing to dismiss.
        try:
            if not accredation:
                driver.find_element(By.XPATH,'//*[@id="root"]/dialog[2]/form/fieldset/div[3]/div/label/span').click()
            driver.find_element(By.XPATH,'//*[@id="root"]/dialog[2]/form/div/button').click()
        except Exception:
            pass

        # Iterating over the listings to get all the business links
        next_page_url = ''
        while True:
            if next_page_url:
                wait_for_content(next_page_url,driver)

            # Identifying business cards
            business_cards = driver.find_elements(By.XPATH,'//a[starts-with(@class, "text-blue-medium") and @href]')

            # Adding all data (skipping anchors whose href cannot be read)
            for card in business_cards:
                href = card.get_attribute('href')
                if href:
                    business_links.add(href)

            # Loading next page; no "next" link within 5 seconds means this was the last page
            try:
                next_button = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//a[@rel="next"]')))
                next_page_url = next_button.get_attribute("href")
            except Exception:
                break

            # A "next" link without a usable href would otherwise re-scan the same page forever
            if not next_page_url:
                break
    finally:
        # quit() ends the browser and the chromedriver process; close() only closes the window
        driver.quit()

    return business_links

Problem: The site seems to return at most 15 pages of 15 businesses each (about 225 results) for any single search.

I am going to create a master set of business links, and search for varying keywords to try and beef up the results as much as possible. 

This probably isn't the best approach but for now it will do better than just taking the first search results. In the future, some ideas for other approaches could be applying different filters, sorts, or parameters to the search to get more results out of each search keyword.

In [None]:
def scrape_for_links_roofing():
    '''
    Run every roofing-related search term through get_business_links (once with the default
    prompt handling and once with accredation=True) and append any links not already stored to links.csv.

    Side effects:
        Prints the running number of distinct links after each term.
        Appends one link per line to "links.csv" in the working directory (created if missing).
    '''

    # Search terms which the scraper will periodically search for
    search_terms = [
        "roofing contractors",
        "roofing companies",
        "roofing services",
        "roofers",
        "roof inc",
        "roofing inc",
        "roofing co",
        "roofing co.",
        # The comma after this entry matters: without it Python concatenates the two
        # adjacent literals into the single term "roofing llc's roofing".
        "roofing llc",
        "'s roofing",
        "& roofing",
        # Single-letter prefixes "a roofing" ... "w roofing" to surface businesses by initial
        *[f"{letter} roofing" for letter in "abcdefghijklmnopqrstuvw"],
        "roofing construction",
        "roof repair",
        "roofing specialists",
        "roof installation",
        "roof maintenance",
        "roofing experts",
        "roofing professionals",
        "roofing solutions",
        "commercial roofing",
        "residential roofing",
        "roofing repair",
        "roofing pro",
        "trust roofing",
    ]

    link_set = set()

    # Iterating over the search terms to scrape for links
    for term in search_terms:
        curr_search = get_business_links(term)
        acc_search = get_business_links(term,True)
        link_set = link_set.union(curr_search).union(acc_search)
        print(f'Current Num of Links: {len(link_set)}')

    # Writes to CSV so that we can save the links over time
    existing_links = set()
    try:
        with open("links.csv", "r") as file:
            for line in file:
                existing_links.add(line.strip())
    except FileNotFoundError:
        pass

    # Empty/None links cannot be written and are useless to the scraper, so drop them here
    new_links_to_add = {link for link in link_set if link} - existing_links

    with open("links.csv", "a") as file:
        # Sorted so repeated runs append in a reproducible order
        file.writelines(link + "\n" for link in sorted(new_links_to_add))

Note that this method is not computationally optimal. With more time, threading could be used to run searches in parallel across multiple Chrome drivers and speed up the data collection process. More search terms could also be added to gather more data.

### Scraping Business Data

Data points to collect:

1. Company name
2. Phone number
3. Address (Street, City, State, Zip)
4. Company Website
5. Email Address - Not available (on bbb site there is a button to compose email in browser, but it does not give the actual email)
6. BBB Rating
7. Accredited Date
8. Profile url (already have)

Nice to haves, which will be used to calculate saleability score:

9. Category
10. Years in business
11. Complaints last 3 years
12. Complaints last 12 months

In [None]:
def _split_address_lines(address_lines):
    '''Split the text lines of the address element into (street, city, state, zipcode).'''
    lines = list(address_lines)

    # Case if there is a suite number: a third line that is not the "Get Directions"
    # link means line 2 belongs to the street address, so fold it into line 1.
    if len(lines) > 2 and lines[2] != 'Get Directions':
        lines[0] += ', ' + lines[1]
        lines.pop(1)

    street_address = None
    if len(lines) > 1:
        street_address = lines[0]
        city_state_zip = lines[1]
    else:
        city_state_zip = lines[0]

    # Split the city/state/zip into separate components ("City, ST 12345")
    city, state_zipcode = city_state_zip.split(",", 1)
    state, zipcode = state_zipcode.strip().split(" ", 1)
    return street_address, city, state, zipcode


def get_business_data(url):
    '''
    Scrape one BBB business profile page.

    Every field is looked up independently and on a best-effort basis: a field that cannot be
    found on the page simply keeps its default (None, or 0 for the complaint counts).

    Parameters:
        url: profile page URL; also stored in the result as "bbb_url" and used to derive "category".

    Returns:
        dict with the keys name, street, city, state, zipcode, phone, website, bbb_url, category,
        accredited, bbb_rating, accredited_date, years_in_business, one_year_complaints and
        three_year_complaints - or None if the browser could not be started or the page could not
        be loaded (the error is printed).
    '''
    # Bound before the try so the finally clause never hits an unbound name
    driver = None
    try:
        driver = webdriver.Chrome(service=SERVICE, options=CHROME_OPTIONS)
        wait_for_content(url, driver)

        # Initialize default values
        data = {
            "name": None,
            "street": None,
            "city": None,
            "state": None,
            "zipcode": None,
            "phone": None,
            "website": None,
            "bbb_url": url,
            "category": None,
            "accredited": None,
            "bbb_rating": None,
            "accredited_date": None,
            "years_in_business": None,
            "one_year_complaints": 0,
            "three_year_complaints": 0
        }

        # Company Name
        try:
            data["name"] = driver.find_element(By.XPATH, '//*[@id="content"]/div[1]/div/header/div/div/h1/span[3]').text
        except Exception:
            pass

        # Phone Number
        try:
            data["phone"] = driver.find_element(By.XPATH,'//a[contains(@href, "tel:")][1]').text
        except Exception:
            pass

        # Address
        try:
            contact_card = driver.find_element(By.XPATH, '//div[starts-with(@class, "card stack dtm-contact")]')
            # ".//" keeps the lookup inside the contact card; a leading "//" would
            # search the whole document again and ignore the card entirely.
            address_text = contact_card.find_element(By.XPATH, './/div[starts-with(@class, "dtm-address")]').text
            street_address, city, state, zipcode = _split_address_lines(address_text.split('\n'))

            data['street'] = street_address
            data['city'] = city
            data['state'] = state
            data['zipcode'] = zipcode
        except Exception:
            pass

        # Website
        try:
            data["website"] = driver.find_element(By.CSS_SELECTOR, '.card.dtm-contact .dtm-url').get_attribute('href')
        except Exception:
            pass

        # BBB rating
        try:
            data["bbb_rating"] = driver.find_element(By.XPATH, '//*[starts-with(@class, "dtm-rating")]').text.split('\n')[0]
        except Exception:
            pass

        # Accredation status: the business counts as not accredited only when the status link
        # is present and its text mentions "BBB"; a missing link is treated as accredited.
        # NOTE(review): a layout change that moves this link would mark every business accredited - confirm the XPath periodically.
        accredited = True
        try:
            acc_status = driver.find_element(By.XPATH, '//*[@id="content"]/div[2]/div[2]/div[2]/div[2]/div/div[1]/a').text
            if 'BBB' in acc_status:
                accredited = False
        except Exception:
            pass
        # Always record the status so it agrees with the flag used for the date lookup below
        data["accredited"] = 'Accredited' if accredited else 'Not Accredited'

        # Accredited date
        if accredited:
            try:
                data["accredited_date"] = driver.find_element(By.XPATH,'//p[contains(., "Accredited Since")]').text.split(': ')[1]
            except Exception:
                pass

        # Category: taken from the 8th "/"-separated piece of the profile URL, e.g. "roofing-contractors"
        try:
            data['category'] = url.split('/')[7].replace("-", " ").title()
        except Exception:
            pass

        # Years in business
        try:
            data["years_in_business"] = int(driver.find_element(By.XPATH, '//p[contains(., "Years in Business")]').text.split(': ')[1])
        except Exception:
            pass

        # Complaints: first number is the 3-year count, second the 12-month count
        try:
            complaints = driver.find_element(By.XPATH, '//h3[contains(text(), "Customer Complaints")]/following::div[1]').text
            if 'closed' in complaints:
                numbers = re.findall(r'(\d+)\s+complaints', complaints)

                # Stored as ints so they have the same type as the 0 defaults
                data["three_year_complaints"] = int(numbers[0])
                data["one_year_complaints"] = int(numbers[1])
        except Exception:
            pass

        return data

    except Exception as e:
        print("An error occurred:", str(e))
        return None

    finally:
        if driver is not None:
            # quit() ends the browser and the chromedriver process; close() only closes the window
            driver.quit()

In [None]:
def get_dataframe(csv_file):
    '''
    Scrape every profile URL listed in ``csv_file`` and write the collected rows to "results.csv".

    Parameters:
        csv_file: path to a text file with one profile URL per line; blank lines are ignored.

    Returns:
        None. If ``csv_file`` does not exist a message is printed and nothing is written.

    Side effects:
        Runs get_business_data on up to 6 links at a time, prints progress roughly every 25%,
        and writes "results.csv" in the working directory.
    '''
    # Read the links from the CSV file
    try:
        with open(csv_file, 'r') as file:
            links = [line.strip() for line in file if line.strip()]

    except FileNotFoundError:
        print(f"File '{csv_file}' not found.")
        return

    total_links = len(links)
    # Progress threshold for notification (25% intervals); at least 1 so that
    # fewer than 4 links cannot produce a modulo-by-zero.
    progress_threshold = max(1, total_links // 4)

    def scrape_data(link):
        # Worker boundary: one failing link must not abort the whole run
        try:
            return get_business_data(link)
        except Exception as e:
            print(f"Failed to scrape '{link}':", str(e))
            return None

    results = []
    with ThreadPoolExecutor(max_workers=6) as executor:
        # map() yields results in input order, so rows and progress are handled
        # in this thread and the output order is reproducible.
        for processed, curr_data in enumerate(executor.map(scrape_data, links), start=1):
            if curr_data:
                results.append(curr_data)

            if processed % progress_threshold == 0 or processed == total_links:
                progress = processed / total_links * 100
                print(f"Progress: {progress:.2f}% ({processed} links processed)")

    pd.DataFrame(results).to_csv('results.csv')

In [None]:
# Scrape every profile URL stored in links.csv; get_dataframe writes the rows to results.csv
get_dataframe('links.csv')

In [None]:
def calculate_saleability_score(row):
    '''
    Calculate the saleability score of one scraped business. Note that these values are estimates,
    and a more comprehensive model could be later developed to refine the weighting of different factors.

    Parameters:
        row: mapping (DataFrame row or dict) with the keys bbb_rating, accredited, accredited_date,
            years_in_business, one_year_complaints, three_year_complaints, website, category,
            name, street, city, state, zipcode and phone.

    Returns:
        The score rounded to 2 decimals. The max score is 1.0 (100%).

    bbb rating is worth 25%.
    This rating varies from NR, F to A+ and is based on the better business bureau's own rating.
    The thought process is that a higher rated business has longer track record of fewer complaints and more transparency.
    These businesses likely care about their customers, and can be sold on reaching more.
    https://www.bbb.org/overview-of-bbb-ratings

    accredited date is worth 10%.
    The bbb rating is a good indicator, but not if it was given out yesterday.
    For every year short of 5 years since the accreditation was received, the business loses 2% score.
    Accredited businesses whose date is missing or unreadable get no points here.

    years in business is worth 10%.
    Same as accredited date, longer standing businesses will likely be a good indicator of overall
    business quality; the full 10% is reached at 10 years.

    contact info completeness is worth 15%.
    In order to sell and close, we need contact info, and the more the better.
    Six fields are counted (name, street, city, state, zipcode, phone), so each missing one costs 2.5%.

    website is worth 10%.
    If no website, it is harder to prepare for a sales call, making it harder to close.

    category is worth 15%.
    The category we are interested in is 'Roofing Contractors'; any other category gets no points.

    There is also 0.10 available for not having any complaints in the last year, and 0.05 for the last three years.
    All businesses automatically receive these points if they have no complaints.
    '''

    weights = {
        "bbb_rating": 0.25,
        "accredited_date": 0.10,
        "years_in_business": 0.10,
        "contact_info_completeness": 0.15,
        "website": 0.10,
        "category": 0.15,
        "one_year_complaints": 0.10,
        "three_year_complaints": 0.05,
    }

    # BBB Rating (unknown or missing ratings score 0)
    bbb_mapping = {
        'A+': 1.0,
        'A': 0.94,
        'A-': 0.90,
        'B+': 0.87,
        'B': 0.84,
        'B-': 0.80,
        'C+': 0.77,
        'C': 0.74,
        'C-': 0.70,
        'D+': 0.67,
        'D': 0.64,
        'D-': 0.60,
        'F': 0.0,
        'NR': 0.0
    }
    bbb_score = bbb_mapping.get(row['bbb_rating'],0) * weights['bbb_rating']

    # Time since accreditation
    accredited_score = 0
    if row['accredited']=='Accredited':
        # errors='coerce' turns a missing/unparseable date into NaT/None instead of raising;
        # without the isna guard a NaT would propagate and make the whole score NaN.
        accredited_on = pd.to_datetime(row['accredited_date'], errors='coerce')
        if not pd.isna(accredited_on):
            years_ago = (pd.Timestamp.now() - accredited_on).days // 365
            # Clamp to [0, 5] so a date in the future cannot produce a negative score
            accredited_score = min(max(years_ago, 0),5)/5*weights['accredited_date']

    # years in business
    time_in_business = 0
    if not pd.isna(row['years_in_business']):
        time_in_business = min(row['years_in_business'],10)/10*weights['years_in_business']

    def no_complaints(count):
        # Accept numeric strings such as "0" as well as numbers; missing values earn nothing
        try:
            return float(count) == 0
        except (TypeError, ValueError):
            return False

    # complaints
    three_year = weights['three_year_complaints'] if no_complaints(row['three_year_complaints']) else 0
    one_year = weights['one_year_complaints'] if no_complaints(row['one_year_complaints']) else 0

    # Website
    website_score = 0
    if not pd.isna(row['website']):
        website_score = weights['website']

    category = 0
    if row['category']=='Roofing Contractors':
        category = weights['category']

    # Contact info completeness
    factors = [row['name'],row['street'],row['city'],row['state'],row['zipcode'],row['phone']]
    total_present = sum(1 for factor in factors if not pd.isna(factor))
    completeness = total_present/len(factors)*weights['contact_info_completeness']

    # final score
    score = completeness + category + website_score + one_year + three_year + time_in_business + accredited_score + bbb_score

    return round(score,2)

In [None]:
# Load the scraped rows; the first CSV column is used as the DataFrame index
df = pd.read_csv('results.csv',index_col=0)

# Preview the first 5 rows (notebook display)
df.head(5)

In [None]:
# Score every business; this column is what the results are ranked on
df["saleability_score"] = df.apply(calculate_saleability_score, axis=1)

# Dropping waterproofing businesses.
# Only the "waterproof"/"waterproofing" keywords from the brief are used: a bare "water"
# would also drop unrelated names such as "Clearwater Roofing". The category is checked too,
# because a waterproofing contractor does not necessarily have the word in its name.
drop_keywords = ['waterproof','waterproofing']
drop_pattern = '|'.join(re.escape(keyword) for keyword in drop_keywords)
is_waterproofing = (
    # astype(str) keeps .str usable even when a column holds only missing values
    df['name'].astype(str).str.contains(drop_pattern, case=False, na=False)
    | df['category'].astype(str).str.contains(drop_pattern, case=False, na=False)
)
df = df[~is_waterproofing]

# Sorting by saleability (best prospects first) and renumbering the rows from 0
df = df.sort_values(by="saleability_score", ascending=False).reset_index(drop=True)

df.head(10)

In [None]:
# Export the scored DataFrame; with no index argument, to_csv also writes the index as the first column
df.to_csv('scored_result.csv')