# Part-1 CODE-BLOCK
- Do not make any changes in the 'CODE-BLOCK'
- Execute all cells in sequence
- Test-Blocks are disabled but for safety do not try to execute any cell marked as Test-Block

In [153]:
# import necessary libraries
import time
import requests
import numpy as np
import pandas as pd
from contextlib import suppress
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Install webdriver manager to automatically detect driver location
service = ChromeService(executable_path=ChromeDriverManager().install())

# Set up ChromeDriver options
chrome_options = Options()
chrome_options.add_argument('--headless') # Run in headless mode to avoid opening a new browser window


[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.78M/6.78M [00:00<00:00, 11.0MB/s]


In [111]:
def extract_hrefs(url, n=5, sleep = 15):
    """
    Extracts all hrefs from a paginated web url as a list of lists
    Args:
        n = number of paginated web urls from which hrefs are to be extracted, default = 5
        sleep = time in seconds to sleep before quitting the driver, default = 15 sec
        url = paginated url from which data is to be extracted
    Returns:
        (list of lists) where each list contains the hrefs of each page 
    
    """
    
        # Initialize empty list to store results
    all_hrefs = []

    n = 2
    # Loop over n list pages
    for i in range(1, n+1):
        # Build URL for current list pages having links to company profiles
        # Don't uncomment following line of code
#         url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=100"
        

        # Launch ChromeDriver and load websites
        # In order to disable browser windows opening for websites amend the code of the following line:
        # driver = webdriver.Chrome(service=service, options = chrome_options)
        driver = webdriver.Chrome(service=service)

        # Don't uncomment the code on the following line, it is for testing
        #   driver.implicitly_wait(20) # seconds
        driver.get(url)

        # Wait for all anchor elements to be present
        # WebDriverWait(driver, 180).until(EC.visiblity_of_all_elements_located((By.CSS_SELECTOR, "a")))
        # Due to intermittent functionality of WebDriverWait, we introduce sleep() function to get all anchors
        # This value is presently set at 15 secs but you may change it depending upon internet speed
        time.sleep(sleep)
        # Extract hrefs from all anchor elements on current website
        hrefs = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, "a")]

        # Add hrefs to the list of all hrefs
        all_hrefs.extend(hrefs) # extend flattens the list so that it is no longer a list of lists

        # Close the current browser window
        driver.quit()
    return all_hrefs


In [113]:
# ### Test Block ! DO NOT EXECUTE
# url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=100"
# all_hrefs_list = extract_hrefs(n=2, sleep = 15, 
#                          url = url)
                          
# # Check the length of all_hrefs list
# # Each list link contains between 400-500 hrefs
# print(len(all_hrefs_list))
# # Check for duplicates
# print(len(set(all_hrefs_list)))

# # Test for function extract_hrefs passed

932
437


In [114]:
def unique_urls(urls_list):
    """
    Removes repeated entries of urls in a list which might occur due to use of regex or multiple occurences on a page
    Args:
        urls_list: list containing urls which might be repetitive entries
    Returns: list containing unique urls
    """
    # convert to set
    urls_set = set(urls_list)
    # convert back to list
    unique_urls_list = list(urls_set)
    
    return unique_urls_list


In [115]:
# ## Test-Block !DO NOT EXECUTE
# unique_hrefs = unique_urls(all_hrefs_list)
# print(len(unique_hrefs))

# # Test for function unique_urls passed

437


In [119]:
def regex_pattern_urls(pattern, hrefs_list): 
    """
    Uses a regex pattern to extract urls of interest out of a list of urls
    pattern: regex pattern used for extracting urls from a list of urls
    hrefs_list: list of hrefs required to be filtered according to the regex pattern
    returns:urls_of_interest a list of urls filtered according to regex pattern provided
        
    """
    # Filter the list 'all_hrefs' to keep only the company profile page urls

    #import regex library
    import re
    # regex pattern for company profile pages
    # Do not uncomment following line of code
#     pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'

    urls_of_interest = []

    # loop over all_hrefs list extracted above with Selenium to extract company profile urls
    for href in hrefs_list:
        match = re.search(pattern, str(href))
        if match:
            url = (match.group())
            urls_of_interest.append(url)
    return urls_of_interest        

In [123]:
# ## Test-Block !DO NOT EXECUTE

# pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'
# company_profile_urls =  regex_pattern_urls(pattern, unique_hrefs)
# print(len(company_profile_urls))
# unique_company_profile_urls = unique_urls(company_profile_urls)
# print(len(unique_company_profile_urls)) # should be 200 (slight variation of up to 5% is acceptable due to variation in internet speed)
# print(unique_company_profile_urls[:10]) #must contain the string "info" in them

# # Test for regex_pattern_urls passed

200
200
['https://wirmarket.wir.ch/de/companyProfile/profile/43456310DBC03AE1E054A0369F14B95F/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF42C42A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/770D4DB828D3D240E05400144FF855A7/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/5E75A6EDD2059DB6E05400144FF855A7/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/9B985D81E1CA0D98E05400144FFAFA27/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/4ED1A916F9256A45E054A0369F14B95F/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF25A02A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF66252A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF57572A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/pr

In [139]:
def extract_info_of_interest(url_list, batch_size = 1000, batch_start_index = 0, sleep=10):
    """
    Extracts all the relevant information from the given unique pages using selenium web driver finders and locators
    Args:
    url_list: list of the pages from where info is to be extracted
    batch_size (int): Number of pages to be processed at a time. It is recommended to process large number of pages in batches of 1000
    batch_start_index(int): index of the url_list from where parsing should start
    sleep(int): seconds to wait for page to load before info can be parsed. It overrides WebDriverWait() function
    Returns:
    list of list of extracted data where each list contains data from a single page
    """
    # Dont't uncomment following line of code, it is for troubleshooting
#   from contextlib import suppress

    # prepare the urls batch to be processed
    batch = url_list[batch_start_index:batch_start_index+batch_size+1]

    company_data = [] # initialize a list to contain lists of all companies' info




    for url in batch:
        company_info = [] # initialize a list to contain one company's info


        # driver control block
        driver = webdriver.Chrome(service=service)
        driver.get(url)
        time.sleep(sleep) # We are using sleep to avoid errors of WebDriverWait though it is timewise costly


    # Using suppress context to ignore known exceptions
    # Do not uncomment following line of code as it may lead to execution errors it is for testing
    #     with suppress(Exception): 

        # Get company names

        try:
    #         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "71-lnk")))

            h3_headings = driver.find_elements(By.CLASS_NAME, "inline-block") # find all elements with class = "inline-block"
            h3_list = []
            for heading in h3_headings: #loop through the list obtained
                h3_list.append(heading.text) # get text of all h3 headings
            company_name = h3_list[0] # company name is the first item in the list
            company_info.append(company_name)   
        except:
            company_info.append('NA')
        finally:
                pass


        # Get company industry

        try:

            uls = driver.find_elements(By.CLASS_NAME, "default")
            list_items = driver.find_elements(By.TAG_NAME,"li")   
            li_list = []
            for li in list_items:
                li_list.append(li.text)
            industry_name = li_list[20] # industry name is the twenty first item in the list
            company_info.append(industry_name)   
        except:
            company_info.append('NA')
        finally:
                pass

        # Get company website if present         
        try:
    #       WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "68-lnk")))
            anchor = driver.find_element(By.ID, "68-lnk")
            website = anchor.get_attribute('href')
            company_info.append(website)
        except:
                company_info.append('NA')
        finally:
                pass

        # Get company telephoone if available
        try:
    #         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "66-lnk")))
            anchor = driver.find_element(By.ID, "66-lnk")
            company_telephone = anchor.get_attribute('href')
            company_info.append(company_telephone)
        except:
            company_info.append('NA')
        finally:
                pass
        # Get company fax if available
        try:
    #         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "fax-lnk")))
            anchor = driver.find_element(By.ID, "fax-lnk")
            company_fax_no = anchor.get_attribute('href')
            company_info.append(company_fax_no)
        except:
            company_info.append('NA')
        finally: 
                pass

       # Get company address and address google map link
        try:
    #       WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "6-lnk")))
            anchor = driver.find_element(By.ID, "6-lnk")
            address = anchor.text
            address_map_link = anchor.get_attribute('href')
            company_info.append(address)
            company_info.append(address_map_link)

        except:
                company_info.append('NA')
        finally:
                pass


        company_data.append(company_info)
        company_info = [] # reinitialize company_info

        driver.quit()
    return company_data


In [140]:
# ## Test-Block !DO NOT Execute
# company_data_lists = extract_info_of_interest(url_list = unique_company_profile_urls, batch_size = 200, batch_start_index = 0, sleep=10)

24


In [142]:
# # Test-Block !Do Not Execute
# print(len(company_data_lists)) # should be equal to batch size i.e. 200
# # test for extract_info_of_interest passed

200


In [151]:
# # Test-Block !DO NOT ExECUTE
# print(len(company_data_lists[-1]))
# print(company_data_lists[:5]) 
# # Should be a list of lists
# # Each list should have seven items
# # First item in each list should be 'company name', second: 'industry', third: website, 
# # fourth: 'tel', fifth 'fax', sixth: 'address', seventh: 'address maplink'
# # data not available should be represented by appropriate remarks

# # test for extract_info_of_interest_passed

8
[['Keiser + Schmid GmbH', 'Sonstiger spezialisierter Hoch- und Tiefbau a. n. g.', 'NA', 'tel:062 296 42 82', 'fax:062 296 42 35', 'Langhagstrasse 11\n4600 Olten', 'https://www.google.com/maps/place/Langhagstrasse%2011+4600+Olten', 'website does not exist'], ['Bodenmarkt by GSoG GmbH', 'Aktivitäten der Generalunternehmen im Baugewerbe', 'http://www.bodenmarkt.ch/', 'tel:041 612 25 00', 'fax:041 612 25 01', 'Allwegmatte 3\n6372 Ennetmoos', 'https://www.google.com/maps/place/Allwegmatte%20%203+6372+Ennetmoos', 'email not found on website'], ['MeLandin Bau GmbH', 'Allgemeiner Hoch- und Tiefbau ohne ausgeprägten Schwerpunkt', 'NA', 'NA', 'NA', 'Immenbachstrasse 24\n4125 Riehen', 'https://www.google.com/maps/place/Immenbachstrasse%2024+4125+Riehen', 'website does not exist'], ['Bertani Baugerüste AG', 'Gerüstbau', 'http://www.bertani.ch/', 'tel:044 744 59 99', 'fax:044 744 59 98', 'Poststrasse 27b\n8953 Dietikon', 'https://www.google.com/maps/place/Poststrasse%2027b+8953+Dietikon', 'b.pina

In [145]:
# Extract emails from websites

def extract_emails_from_websites(info_list, pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', index=2):
    """Extracts emails from a list of lists containing a referred website address using a regex pattern
        Args:
        info_list: list of lists containing extracted info including website address
        pattern: regex pattern to cover all possible formats of email addresses
        index(int): index number in the lists where website address is located, default=2
        Returns: info_list_with_emails a list with email addresses appended to info_list
    """
    
    import requests
    import re
#     pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

    info_list_with_emails = info_list.copy() # make a copy to avoid overwriting original list
    for page in info_list_with_emails:
        try:
            response = requests.get(str(page[index]))

            if response.status_code == 200:
                emails = re.findall(pattern, response.text)
                if emails:
                    unique_emails = set(emails)
                    unique_emails_list = list(unique_emails)
                    page.append(unique_emails_list[0])
                else:
                    page.append('email not found on website')
            else:
                page.append('bad/broken link to website')
        except:
                page.append('website does not exist')
    return info_list_with_emails



In [146]:
# ## Test_Bloack !DO NOT EXECUTE
# company_data_lists_with_emails = extract_emails_from_websites(company_data_lists)

In [152]:
# ## Test_Block !DO NOT EXECUTE
# print(len(company_data_lists_with_emails)) # should be same as company_data_lists i.e. 200
# print(len(company_data_lists_with_emails[-1])) # should be 8
# print(company_data_lists_with_emails[-5:]) #should have 8 items in each list with emails or remarks added

# # Test for extract_emails_from_websites passed

200
8
[['Kurt Ammann Transport AG', 'Güterbeförderung im Strassenverkehr', 'NA', 'tel:071 657 15 75', 'fax:071 657 16 76', 'Bahnhofstrasse 22a\n8560 Märstetten', 'https://www.google.com/maps/place/Bahnhofstrasse%2022a+8560+M%C3%A4rstetten', 'website does not exist'], ['Stutzer + Flüeler AG', 'Fleischverarbeitung', 'http://www.metzgereistutzer.ch/', 'tel:041 660 15 68', 'fax:041 660 91 28', 'Untergasse 5\n6064 Kerns', 'https://www.google.com/maps/place/Untergasse%205+6064+Kerns', 'stans@metzgereistutzer.ch'], ['Scheiweiler Garagen AG', 'lnstandhaltung und Reparatur von Automobilen', 'http://www.scheiweiler.ch/', 'tel:044 787 44 00', 'fax:044 784 94 92', 'Allenwindenstrasse 10\n8832 Wollerau', 'https://www.google.com/maps/place/Allenwindenstrasse%2010+8832+Wollerau', 'info@scheiweiler.ch'], ['BSL-Ticketprint AG', 'Offsetdruck', 'http://www.ticketprint.ch/', 'tel:041 248 41 61', 'fax:041 240 16 50', 'Sagenmattstrasse 7\n6003 Luzern', 'https://www.google.com/maps/place/Sagenmattstrasse%207

In [171]:
# Convert to Dataframes
def convert_to_dataframe(data_list, columns):
    """
    Converts a list of lists containing relevant data to pandas dataframe
    Args:
        data_list: list of lists to be converted to dataframe
        columns: Names of columns in the dataframe
    """
    df = pd.DataFrame(data_list, columns, axis = 1)
    return df

In [165]:
# ## Test-Block !DO NOT EXECUTE
# # Convert to Dataframe first 100 items
# columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']

# df1 = pd.DataFrame(company_data_lists_with_emails[:100], columns=columns)
 

In [166]:
# ## Test-Block !DO NOT EXECUTE
# df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     100 non-null    object
 1   industry         100 non-null    object
 2   website          100 non-null    object
 3   telephone        100 non-null    object
 4   fax              100 non-null    object
 5   address          100 non-null    object
 6   address_maplink  100 non-null    object
 7   email            95 non-null     object
dtypes: object(8)
memory usage: 6.4+ KB


In [167]:
# ## Test_Block !DO NOT EXECUTE
# df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Keiser + Schmid GmbH,Sonstiger spezialisierter Hoch- und Tiefbau a....,,tel:062 296 42 82,fax:062 296 42 35,Langhagstrasse 11\n4600 Olten,https://www.google.com/maps/place/Langhagstras...,website does not exist
1,Bodenmarkt by GSoG GmbH,Aktivitäten der Generalunternehmen im Baugewerbe,http://www.bodenmarkt.ch/,tel:041 612 25 00,fax:041 612 25 01,Allwegmatte 3\n6372 Ennetmoos,https://www.google.com/maps/place/Allwegmatte%...,email not found on website
2,MeLandin Bau GmbH,Allgemeiner Hoch- und Tiefbau ohne ausgeprägte...,,,,Immenbachstrasse 24\n4125 Riehen,https://www.google.com/maps/place/Immenbachstr...,website does not exist
3,Bertani Baugerüste AG,Gerüstbau,http://www.bertani.ch/,tel:044 744 59 99,fax:044 744 59 98,Poststrasse 27b\n8953 Dietikon,https://www.google.com/maps/place/Poststrasse%...,b.pinalli@bertani.ch
4,Giuliani Bauausführungen,Sonstiger spezialisierter Hoch- und Tiefbau a....,http://www.paologiuliani-bau.ch/,tel:081 353 84 05,,Dammweg 138\n7000 Chur,https://www.google.com/maps/place/Dammweg%2013...,info@paologiuliani-bau.ch


In [181]:
# ## Test-Block !DO NOT EXECUTE
# # Convert to DataFrame next 100 items
# columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']

# df2 = pd.DataFrame(company_data_lists_with_emails[101:200], columns=columns)


In [182]:
## Test-Block !DO NOT EXECUTE
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     99 non-null     object
 1   industry         99 non-null     object
 2   website          99 non-null     object
 3   telephone        99 non-null     object
 4   fax              99 non-null     object
 5   address          99 non-null     object
 6   address_maplink  99 non-null     object
 7   email            95 non-null     object
dtypes: object(8)
memory usage: 6.3+ KB


In [183]:
# ## Test_Block !DO NOT EXECUTE
# df2.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Raphael Müller Print My Shirt,Veredlung von Textilien und Bekleidung,,,,Dorfstrasse 23B\n3661 Uetendorf,https://www.google.com/maps/place/Dorfstrasse%...,website does not exist
1,BUDAK TEXTILES Schweiz AG,Grosshandel mit Textilien,http://www.budak-textiles.ch/,tel:041 500 67 07,,Bürgenstrasse 4\n6005 Luzern,https://www.google.com/maps/place/B%C3%BCrgens...,email not found on website
2,Franco Baeriswil,Garten- und Landschaftsbau sowie Erbringung vo...,http://www.baeriswil.com/,tel:041 370 63 76,fax:041 370 63 73,Winkelbüelhof 2\n6043 Adligenswil,https://www.google.com/maps/place/Winkelb%C3%B...,website does not exist
3,Printpark GmbH,Werbeagenturen,http://www.printpark-gmbh.ch/,,,Postweg 2\n5034 Suhr,https://www.google.com/maps/place/Postweg%202+...,samuel.niederer@printparkgmbh.ch
4,Master Zahner,Allgemeiner Hoch- und Tiefbau ohne ausgeprägte...,,tel:079 264 38 27,,Roosstrasse 53\n8832 Wollerau,https://www.google.com/maps/place/Roosstrasse%...,website does not exist


In [184]:
# ## Test_Block !DO NOT EXECUTE
# df2.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
94,Kurt Ammann Transport AG,Güterbeförderung im Strassenverkehr,,tel:071 657 15 75,fax:071 657 16 76,Bahnhofstrasse 22a\n8560 Märstetten,https://www.google.com/maps/place/Bahnhofstras...,website does not exist
95,Stutzer + Flüeler AG,Fleischverarbeitung,http://www.metzgereistutzer.ch/,tel:041 660 15 68,fax:041 660 91 28,Untergasse 5\n6064 Kerns,https://www.google.com/maps/place/Untergasse%2...,stans@metzgereistutzer.ch
96,Scheiweiler Garagen AG,lnstandhaltung und Reparatur von Automobilen,http://www.scheiweiler.ch/,tel:044 787 44 00,fax:044 784 94 92,Allenwindenstrasse 10\n8832 Wollerau,https://www.google.com/maps/place/Allenwindens...,info@scheiweiler.ch
97,BSL-Ticketprint AG,Offsetdruck,http://www.ticketprint.ch/,tel:041 248 41 61,fax:041 240 16 50,Sagenmattstrasse 7\n6003 Luzern,https://www.google.com/maps/place/Sagenmattstr...,florian.baechler@bsl.ch
98,Staeger AG Thalwil,Detailhandel mit Geräten der Unterhaltungselek...,http://www.staegerag.ch/,tel:044 720 13 62,fax:044 722 13 62,Alte Landstrasse 160\n8800 Thalwil,https://www.google.com/maps/place/Alte%20Lands...,email not found on website


In [185]:
# # Test-Block !DO nOT EXECUTE
# # Join both Dataframes
# df = pd.concat([df1, df2])

In [178]:
## Test-Block DO NOT EXECUTE
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 98
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     199 non-null    object
 1   industry         199 non-null    object
 2   website          199 non-null    object
 3   telephone        199 non-null    object
 4   fax              199 non-null    object
 5   address          199 non-null    object
 6   address_maplink  199 non-null    object
 7   email            190 non-null    object
dtypes: object(8)
memory usage: 14.0+ KB


In [186]:
# ## Test_Block !DO NOT EXECUTE
# df.head()==df1.head() # should be same as df1.head()
# # test passed

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [188]:
# ## Test_Block !DO NOT EXECUTE
# df.tail()==df2.tail() # should be same as df2.tail()
# # test passed

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
94,True,True,True,True,True,True,True,True
95,True,True,True,True,True,True,True,True
96,True,True,True,True,True,True,True,True
97,True,True,True,True,True,True,True,True
98,True,True,True,True,True,True,True,True


### xxxxxxxxxxxxxxxxxxxxxx END OF CODE BLOCKXXXXXXXXXXXXXXXXXXXXXXXX

## XXXXXXXXXXXXX START OF EXECUTION BLOCXXXXXXXXXXXXXX

### Part-I Extract links to all company profile pages