# Part-1 CODE-BLOCK
- Do not make any changes in the 'CODE-BLOCK'
- Execute all cells in sequence
- Test-Blocks are disabled but for safety do not try to execute any cell marked as Test-Block

In [28]:
# import necessary libraries
import time
import requests
import re
import numpy as np
import pandas as pd
from contextlib import suppress
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Install webdriver manager to automatically detect driver location
service = ChromeService(executable_path=ChromeDriverManager().install())

# Set up ChromeDriver options
chrome_options = Options()
chrome_options.add_argument('--headless') # Run in headless mode to avoid opening a new browser window


In [29]:
def extract_hrefs(n=2, sleep = 15):
    """
    Extracts all hrefs from a paginated web url as a list 
    Args:
        n = number of paginated web urls from which hrefs are to be extracted, default = 5
        sleep = time in seconds to sleep before quitting the driver, default = 15 sec
        url = paginated url from which data is to be extracted
    Returns:
        (list) containining the hrefs found on each page 
    
    """
    
        # Initialize empty list to store results
    all_hrefs = []

    
    # Loop over n list pages
    for i in range(1, n+1):
        # Build URL for current list pages having links to company profiles
        # Don't uncomment following line of code
        url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=100"
        

        # Launch ChromeDriver and load websites
        # In order to disable browser windows opening for websites amend the code of the following line:
        # driver = webdriver.Chrome(service=service, options = chrome_options)
        driver = webdriver.Chrome(service=service)

        # Don't uncomment the code on the following line, it is for testing
        #   driver.implicitly_wait(20) # seconds
        driver.get(url)

        # Wait for all anchor elements to be present
        # WebDriverWait(driver, 180).until(EC.visiblity_of_all_elements_located((By.CSS_SELECTOR, "a")))
        # Due to intermittent functionality of WebDriverWait, we introduce sleep() function to get all anchors
        # This value is presently set at 15 secs but you may change it depending upon internet speed
        time.sleep(sleep)
        # Extract hrefs from all anchor elements on current website
        hrefs = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, "a")]

        # Add hrefs to the list of all hrefs
        all_hrefs.extend(hrefs) # extend flattens the list so that it is no longer a list of lists

        # Close the current browser window
        driver.quit()
    return all_hrefs


In [33]:
### Test Block ! DO NOT EXECUTE
# url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=100"
# all_hrefs_list = extract_hrefs(n=2, sleep = 15, 
#                          url = url)
                          
# # Check the length of all_hrefs list
# # Each list link contains between 400-500 hrefs
# print(len(all_hrefs_list))
# # Check for duplicates
# print(len(set(all_hrefs_list)))

# # Test for function extract_hrefs passed

In [30]:
def unique_urls(urls_list):
    """
    Removes repeated entries of urls in a list which might occur due to use of regex or multiple occurences on a page
    Args:
        urls_list: list containing urls which might be repetitive entries
    Returns: list containing unique urls
    """
    # convert to set
    urls_set = set(urls_list)
    # convert back to list
    unique_urls_list = list(urls_set)
    
    return unique_urls_list


In [None]:
# ## Test-Block !DO NOT EXECUTE
# unique_hrefs = unique_urls(all_hrefs_list)
# print(len(unique_hrefs))

# # Test for function unique_urls passed

In [31]:
def regex_pattern_urls(pattern, hrefs_list): 
    """
    Uses a regex pattern to extract urls of interest out of a list of urls
    pattern: regex pattern used for extracting urls from a list of urls
    hrefs_list: list of hrefs required to be filtered according to the regex pattern
    returns:urls_of_interest a list of urls filtered according to regex pattern provided
        
    """
    # Filter the list 'all_hrefs' to keep only the company profile page urls

    #import regex library
    import re
    # regex pattern for company profile pages
    # Do not uncomment following line of code
#     pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'

    urls_of_interest = []

    # loop over all_hrefs list extracted above with Selenium to extract company profile urls
    for href in hrefs_list:
        match = re.search(pattern, str(href))
        if match:
            url = (match.group())
            urls_of_interest.append(url)
    return urls_of_interest        

In [None]:
# ## Test-Block !DO NOT EXECUTE

# pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'
# company_profile_urls =  regex_pattern_urls(pattern, unique_hrefs)
# print(len(company_profile_urls))
# unique_company_profile_urls = unique_urls(company_profile_urls)
# print(len(unique_company_profile_urls)) # should be 200 (slight variation of up to 5% is acceptable due to variation in internet speed)
# print(unique_company_profile_urls[:10]) #must contain the string "info" in them

# # Test for regex_pattern_urls passed

In [32]:
def extract_info_of_interest(url_list, batch_size = 1000, batch_start_index = 0, sleep=10):
    """
    Extracts all the relevant information from the given unique pages using selenium web driver finders and locators
    Args:
    url_list: list of the pages from where info is to be extracted
    batch_size (int): Number of pages to be processed at a time. It is recommended to process large number of pages in batches of 1000
    batch_start_index(int): index of the url_list from where parsing should start
    sleep(int): seconds to wait for page to load before info can be parsed. It overrides WebDriverWait() function
    Returns:
    list of list of extracted data where each list contains data from a single page
    """
    # Dont't uncomment following line of code, it is for troubleshooting
#   from contextlib import suppress

    # prepare the urls batch to be processed
    batch = url_list[batch_start_index:batch_start_index+batch_size+1]

    company_data = [] # initialize a list to contain lists of all companies' info


    count = 0

    for url in batch:
        company_info = [] # initialize a list to contain one company's info


        # driver control block
        driver = webdriver.Chrome(service=service)
        driver.get(url)
        time.sleep(sleep) # We are using sleep to avoid errors of WebDriverWait though it is timewise costly


    # Using suppress context to ignore known exceptions
    # Do not uncomment following line of code as it may lead to execution errors it is for testing
    #     with suppress(Exception): 

        # Get company names

        try:
    #         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "71-lnk")))

            h3_headings = driver.find_elements(By.CLASS_NAME, "inline-block") # find all elements with class = "inline-block"
            h3_list = []
            for heading in h3_headings: #loop through the list obtained
                h3_list.append(heading.text) # get text of all h3 headings
            company_name = h3_list[0] # company name is the first item in the list
            company_info.append(company_name)   
        except:
            company_info.append('NA')
        finally:
                pass


        # Get company industry

        try:

            uls = driver.find_elements(By.CLASS_NAME, "default")
            list_items = driver.find_elements(By.TAG_NAME,"li")   
            li_list = []
            for li in list_items:
                li_list.append(li.text)
            industry_name = li_list[20] # industry name is the twenty first item in the list
            company_info.append(industry_name)   
        except:
            company_info.append('NA')
        finally:
                pass

        # Get company website if present         
        try:
    #       WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "68-lnk")))
            anchor = driver.find_element(By.ID, "68-lnk")
            website = anchor.get_attribute('href')
            company_info.append(website)
        except:
                company_info.append('NA')
        finally:
                pass

        # Get company telephoone if available
        try:
    #         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "66-lnk")))
            anchor = driver.find_element(By.ID, "66-lnk")
            company_telephone = anchor.get_attribute('href')
            company_info.append(company_telephone)
        except:
            company_info.append('NA')
        finally:
                pass
        # Get company fax if available
        try:
    #         WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "fax-lnk")))
            anchor = driver.find_element(By.ID, "fax-lnk")
            company_fax_no = anchor.get_attribute('href')
            company_info.append(company_fax_no)
        except:
            company_info.append('NA')
        finally: 
                pass

       # Get company address and address google map link
        try:
    #       WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "6-lnk")))
            anchor = driver.find_element(By.ID, "6-lnk")
            address = anchor.text
            address_map_link = anchor.get_attribute('href')
            company_info.append(address)
            company_info.append(address_map_link)

        except:
                company_info.append('NA')
        finally:
                pass


        company_data.append(company_info)
        company_info = [] # reinitialize company_info
        
        count+=1
        print("count", count)

        driver.quit()
    return company_data


In [None]:
# ## Test-Block !DO NOT Execute
# company_data_lists = extract_info_of_interest(url_list = unique_company_profile_urls, batch_size = 200, batch_start_index = 0, sleep=10)

In [None]:
# # Test-Block !Do Not Execute
# print(len(company_data_lists)) # should be equal to batch size i.e. 200
# # test for extract_info_of_interest passed

In [None]:
# # Test-Block !DO NOT ExECUTE
# print(len(company_data_lists[-1]))
# print(company_data_lists[:5]) 
# # Should be a list of lists
# # Each list should have seven items
# # First item in each list should be 'company name', second: 'industry', third: website, 
# # fourth: 'tel', fifth 'fax', sixth: 'address', seventh: 'address maplink'
# # data not available should be represented by appropriate remarks

# # test for extract_info_of_interest_passed

In [37]:
# # Extract emails from websites

# def extract_emails_from_websites(info_list, pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', index=2):
#     """Extracts emails from a list of lists containing a referred website address using a regex pattern
#         Args:
#         info_list: list of lists containing extracted info including website address
#         pattern: regex pattern to cover all possible formats of email addresses
#         index(int): index number in the lists where website address is located, default=2
#         Returns: info_list_with_emails a list with email addresses appended to info_list
#     """
    
#     import requests
#     import re
# #     pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

#     info_list_with_emails = info_list.copy() # make a copy to avoid overwriting original list
#     for page in info_list_with_emails:
#         try:
#             response = requests.get(str(page[index]))

#             if response.status_code == 200:
#                 emails = re.findall(pattern, response.text)
#                 if emails:
#                     unique_emails = set(emails)
#                     unique_emails_list = list(unique_emails)
#                     page.append(unique_emails_list[0])
#                 else:
#                     page.append('email not found on website')
#             else:
#                 page.append('bad/broken link to website')
#         except:
#                 page.append('website does not exist')
#     return info_list_with_emails



In [33]:
# Extract emails from websites

def extract_emails_from_websites(info_list, pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', index=2):
    """Extracts emails from a list of lists containing a referred website address using a regex pattern
        Args:
        info_list: list of lists containing extracted info including website address
        pattern: regex pattern to cover all possible formats of email addresses
        index(int): index number in the lists where website address is located, default=2
        Returns: info_list_with_emails a list with email addresses appended to info_list
    """
    
    import requests
    import re
    from bs4 import BeautifulSoup

    info_list_with_emails = info_list.copy() # make a copy to avoid overwriting original list
    all_links = []
    for page in info_list_with_emails:
        try:
            home_page = str(page[index])
            response = requests.get(home_page)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                links = [link.get('href') for link in soup.find_all('a')] # get all links on the homepage
                

                # search for emails on all links found
                email_found = False
                for link in links:
                    if link and 'http' not in link:
                        link = home_page[:-1]+link
#                         print(link)
#                         all_links.append(link)
                        link_response = requests.get(link)
                        if link_response.status_code == 200:
                            emails = re.findall(pattern, link_response.text)
#                             all_links.append(emails[0])
                            
                            if emails:
                                unique_emails = set(emails)
                                unique_emails_list = list(unique_emails)
                                page.append(unique_emails_list[0])
                                all_links.append(unique_emails_list[0])
                                email_found = True
                                break

                if not email_found:
                    page.append('email not found on website')

            else:
                page.append('bad/broken link to website')
        except:
            page.append('website not found')
#     print(info_list_with_emails)
    print('No of emails found: ', len(all_links))
    return info_list_with_emails


In [None]:
# ## Test_Bloack !DO NOT EXECUTE
# company_data_lists_with_emails = extract_emails_from_websites(company_data_lists)

In [None]:
# ## Test_Block !DO NOT EXECUTE
# print(len(company_data_lists_with_emails)) # should be same as company_data_lists i.e. 200
# print(len(company_data_lists_with_emails[-1])) # should be 8
# print(company_data_lists_with_emails[-5:]) #should have 8 items in each list with emails or remarks added

# # Test for extract_emails_from_websites passed

In [None]:
# ## Test-Block !DO NOT EXECUTE
# # Convert to Dataframe first 100 items
# columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']

# df1 = pd.DataFrame(company_data_lists_with_emails[:100], columns=columns)
 

In [None]:
# ## Test-Block !DO NOT EXECUTE
# df1.info()

In [None]:
# ## Test_Block !DO NOT EXECUTE
# df1.head()

In [None]:
# ## Test-Block !DO NOT EXECUTE
# # Convert to DataFrame next 100 items
# columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']

# df2 = pd.DataFrame(company_data_lists_with_emails[101:200], columns=columns)


In [None]:
# ## Test-Block !DO NOT EXECUTE
# df2.info()

In [None]:
# ## Test_Block !DO NOT EXECUTE
# df2.head()

In [None]:
# ## Test_Block !DO NOT EXECUTE
# df2.tail()

In [None]:
# # Test-Block !DO nOT EXECUTE
# # Join both Dataframes
# df = pd.concat([df1, df2])

In [None]:
## Test-Block DO NOT EXECUTE
# df.info()

In [None]:
# ## Test_Block !DO NOT EXECUTE
# df.head()==df1.head() # should be same as df1.head()
# # test passed

In [None]:
# ## Test_Block !DO NOT EXECUTE
# df.tail()==df2.tail() # should be same as df2.tail()
# # test passed

### xxxxxxxxxxxxxxxxxxxxxx END OF CODE BLOCKXXXXXXXXXXXXXXXXXXXXXXXX

## XXXXXXXXXXXXX START OF EXECUTION BLOCXXXXXXXXXXXXXX

### Part-I Extract links to all company profile pages

In [7]:
# extract all_hrefs from 224 list pages on the site 
start_time = time.time()
# url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=25"
all_hrefs = extract_hrefs( n=224, sleep = 17)

end_time = time.time()
execution_time = end_time - start_time
print("execution_time: ", execution_time)


execution_time:  7432.664510250092


In [8]:
print(len(all_hrefs))

104806


In [61]:
all_hrefs_dict = {"hrefs": all_hrefs}
all_hrefs_df = pd.DataFrame(all_hrefs)
all_hrefs_df.to_csv("all_hrefs.csv", index = False)

In [9]:
# Remove duplicate or repeated entries from the all_hrefs
all_hrefs_unique = unique_urls(all_hrefs) 

In [10]:
print(len(all_hrefs_unique))

29124


In [11]:
# Extract company profile page urls from all unique hrefs
pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'
company_profile_urls = regex_pattern_urls(pattern = pattern, hrefs_list = all_hrefs_unique)

In [12]:
print(len(company_profile_urls))

14227


In [13]:
# Remove duplicate or repeated entries from company_profile_urls
unique_company_profile_urls = unique_urls(company_profile_urls)

#### Checkpoint-1

In [14]:
print(len(unique_company_profile_urls)) # should be 14227 (slight variation of up to 5% is acceptable due to variation in internet speed)
print(unique_company_profile_urls[:10])
print(unique_company_profile_urls[-10:])

14227
['https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF14ED2A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF4B042A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF240E2A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF3B6B2A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF41082A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF12A92A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF1B3B2A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF54932A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF31EB2A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/prof

In [62]:
# Save as csv
company_profile_url_dict = {"urls": unique_company_profile_urls}
company_profile_url_df = pd.DataFrame(company_profile_url_dict)
company_profile_url_df.to_csv('profile_urls.csv', index = False)

In [10]:
# import as dataframe
unique_company_profiles_df = pd.read_csv("profile_urls.csv")

In [13]:
unique_company_profiles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14227 entries, 0 to 14226
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   urls    14227 non-null  object
dtypes: object(1)
memory usage: 111.3+ KB


In [19]:
unique_company_profile_urls = unique_company_profiles_df['urls'].tolist()


In [20]:
print(len(unique_company_profile_urls))

14227


In [21]:
print(unique_company_profile_urls[-5:])

['https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF26282A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF3C152A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/58B85543DFA09858E05400144FF95A47/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF27E62A65E0540010E0244DC9/info/?promo=false', 'https://wirmarket.wir.ch/de/companyProfile/profile/3E429FAF55272A65E0540010E0244DC9/info/?promo=false']


### xxxxxxxxxxxxxxxxxxx END OF PART-1 xxxxxxxxxxxxxxxxxxxxxxxxxxx

## Part-2 Batch Processing

### Batch-1, batch_size = 1000, batch_start_index = 0

In [17]:
# Extract info of interst
start_time = time.time()
batch1_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 0, sleep=10)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  18019.821753025055


### checkpoint-1

In [20]:
print(len(batch1_initial)) # Should be equal to batch_size + 1
print(len(batch1_initial[-1])) # Should be equal to 7

1001
7


In [21]:
print(batch1_initial[:3])

[['Ringler + Strahm Storenbau AG', 'Sonstige Bauinstallation', 'http://www.ringler-strahm.ch/', 'tel:033 345 22 55', 'fax:033 345 55 19', 'Uetendorfstrasse 20\n3634 Thierachern', 'https://www.google.com/maps/place/Uetendorfstrasse%2020+3634+Thierachern'], ['Grob & Partner Architektur AG', 'Architekturbüros', 'http://www.grobarchitektur.ch/', 'tel:081 720 02 00', 'fax:081 720 02 05', 'Bahnhofstrasse 3\n7320 Sargans', 'https://www.google.com/maps/place/Bahnhofstrasse%20%203+7320+Sargans'], ['Lauclair AG', 'Schreinerarbeiten im Innenausbau', 'http://www.lauclair.ch/', 'tel:031 879 01 69', 'fax:031 879 20 69', 'Lyssstrasse 27\n3054 Schüpfen', 'https://www.google.com/maps/place/Lyssstrasse%2027+3054+Sch%C3%BCpfen']]


### End of checkpoint-1

In [22]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch1 = extract_emails_from_websites(batch1_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  953.6410942077637


### checkpoint-2

In [23]:
print(len(batch1)) # should be same as batch_size i.e. 1001
print(len(batch1[-1])) # should be 8
print(batch1[-5:]) #should have 8 items in each list with emails or remarks added


1001
8
[['STB Holzbau GmbH', 'Einbau von Fenster, Türen und Innenausbau, Einbauküchen, Einbaumöbel', 'NA', 'tel:055 444 31 68', 'fax:055 444 31 68', 'Haslenstrasse 28b\n8862 Schübelbach', 'https://www.google.com/maps/place/Haslenstrasse%2028b+8862+Sch%C3%BCbelbach', 'website does not exist'], ['Beauty Solar Sonnenland GmbH', 'Saunas, Solarien', 'NA', 'tel:041 280 15 15', 'fax:041 631 08 07', 'Schulhausstrasse 4\n6052 Hergiswil NW', 'https://www.google.com/maps/place/Schulhausstrasse%204+6052+Hergiswil%20NW', 'website does not exist'], ['Elektro-Brizzi AG', 'Elektroinstallation', 'NA', 'tel:052 386 22 22', 'fax:052 386 22 93', 'Heinrich Gujer-Strasse 5\n8494 Bauma', 'https://www.google.com/maps/place/Heinrich%20Gujer-Strasse%205+8494+Bauma', 'website does not exist'], ['Keller Wärme & Wasser AG', 'Sanitär- und Heizungsinstallation', 'NA', 'tel:032 373 51 53', 'fax:032 373 27 71', 'Erlenstrasse 15\n2555 Brügg BE', 'https://www.google.com/maps/place/Erlenstrasse%2015+2555+Br%C3%BCgg%20BE'

### End of checkpoint-2

In [24]:
# Columns to be used for all batches
columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']


In [25]:
# # Convert to Dataframe batch1
df1 = pd.DataFrame(batch1, columns=columns)


### Checkpoint-3

In [26]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            1001 non-null   object
dtypes: object(8)
memory usage: 62.7+ KB


In [27]:
df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [19]:
### End of Checkpoint-3

In [28]:
# Save as csv file
df1.to_csv('batch1.csv', index = False)

### xxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-2, batch_size = 1000, batch_start_index = 1001

In [41]:
# Extract info of interst
start_time = time.time()
batch2_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 1001, sleep=10)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  19495.211315393448


### checkpoint-1

In [42]:
print(len(batch2_initial))
print(len(batch2_initial[-1]))

1001
7


In [43]:
print(batch2_initial[:3])

[['Modehaus Peter', 'Detailhandel mit Herrenbekleidung', 'NA', 'tel:041 970 17 57', 'fax:041 970 07 57', 'Hauptgasse 36\n6130 Willisau', 'https://www.google.com/maps/place/Hauptgasse%2036+6130+Willisau'], ['VB ORGANISATION Sàrl', 'Offsetdruck', 'http://www.imprimerievb.ch/', 'tel:079 205 44 53', 'NA', 'chemin St-Hubert 32\n1950 Sion', 'https://www.google.com/maps/place/chemin%20St-Hubert%2032+1950+Sion'], ['Heinz Stucki', 'Anbau von Getreide (ohne Reis), Hülsenfrüchten und Ölsaaten', 'NA', 'tel:031 781 05 03', 'fax:031 781 05 03', 'Hubmatt\n3116 Noflen BE', 'https://www.google.com/maps/place/Hubmatt+3116+Noflen%20BE']]


### End of Checkpoint-1

In [44]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch2 = extract_emails_from_websites(batch2_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  1249.9318661689758


### Checkpoint-2

In [45]:
print(len(batch2)) # should be same as batch_size i.e. 1000
print(len(batch2[-1])) # should be 8
print(batch2[-5:]) #should have 8 items in each list with emails or remarks added

1001
8
[['MBA AG', 'Grosshandel mit Bergwerks-, Bau- und Baustoffmaschinen', 'http://www.mba-baumaschinen.ch/', 'tel:044 838 61 11', 'NA', 'Zürichstrasse 50\n8303 Bassersdorf', 'https://www.google.com/maps/place/Z%C3%BCrichstrasse%2050+8303+Bassersdorf', 'info@mba-maschinen.com'], ['Mächler GU AG', 'Garten- und Landschaftsbau sowie Erbringung von sonstigen gärtnerischen Dienstleistungen', 'http://www.maechler-gu.ch/', 'tel:055 451 11 31', 'fax:055 451 11 34', 'St.Gallerstrasse 58\n8853 Lachen SZ', 'https://www.google.com/maps/place/St.Gallerstrasse%2058+8853+Lachen%20SZ', 'info@maechler-gu.ch'], ['Garasch 106 AG', 'lnstandhaltung und Reparatur von Automobilen', 'https://garasch106.ch/', 'tel:041 624 40 24', 'NA', 'Stanserstrasse 106\n6373 Ennetbürgen', 'https://www.google.com/maps/place/Stanserstrasse%20106+6373+Ennetb%C3%BCrgen', 'auto@garasch106.ch'], ['Bumbachsäge AG', 'Sägewerke', 'http://www.bumbachsaege.ch/', 'tel:034 493 35 57', 'fax:034 493 31 84', 'Bumbachsäge 172\n6197 Schang

### End of Checkpoint-2

In [46]:
# # Convert to Dataframe batch1
df2 = pd.DataFrame(batch2, columns=columns)


### Checkpoint 3

In [47]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            987 non-null    object
dtypes: object(8)
memory usage: 62.7+ KB


In [48]:
df2.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Modehaus Peter,Detailhandel mit Herrenbekleidung,,tel:041 970 17 57,fax:041 970 07 57,Hauptgasse 36\n6130 Willisau,https://www.google.com/maps/place/Hauptgasse%2...,website does not exist
1,VB ORGANISATION Sàrl,Offsetdruck,http://www.imprimerievb.ch/,tel:079 205 44 53,,chemin St-Hubert 32\n1950 Sion,https://www.google.com/maps/place/chemin%20St-...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Heinz Stucki,"Anbau von Getreide (ohne Reis), Hülsenfrüchten...",,tel:031 781 05 03,fax:031 781 05 03,Hubmatt\n3116 Noflen BE,https://www.google.com/maps/place/Hubmatt+3116...,website does not exist
3,Heizplan HPA AG,Sanitär- und Heizungsinstallation,http://www.heizplan.ch/,tel:081 750 34 50,fax:081 750 34 59,Karmaad 36\n9473 Gams,https://www.google.com/maps/place/Karmaad%2036...,kontakt@heizplan.ch
4,Hoppler Tiefbohrungen GmbH,Test- und Suchbohrung,http://www.hoppler-gmbh.ch/,tel:056 634 40 46,fax:056 634 40 75,Allmendstrasse 1\n5621 Zufikon,https://www.google.com/maps/place/Allmendstras...,info@hoppler-gmbh.ch


### End of Checkpoint-3

In [152]:
# save as csv
df2.to_csv('batch2.csv', index = False)

In [51]:
# Merge the two dataframes
df = pd.concat([df1, df2])

### Checkpoint-4

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2002 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     2002 non-null   object
 1   industry         2002 non-null   object
 2   website          2002 non-null   object
 3   telephone        2002 non-null   object
 4   fax              2002 non-null   object
 5   address          2002 non-null   object
 6   address_maplink  2002 non-null   object
 7   email            1988 non-null   object
dtypes: object(8)
memory usage: 140.8+ KB


In [53]:
df.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [54]:
df.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,MBA AG,"Grosshandel mit Bergwerks-, Bau- und Baustoffm...",http://www.mba-baumaschinen.ch/,tel:044 838 61 11,,Zürichstrasse 50\n8303 Bassersdorf,https://www.google.com/maps/place/Z%C3%BCrichs...,info@mba-maschinen.com
997,Mächler GU AG,Garten- und Landschaftsbau sowie Erbringung vo...,http://www.maechler-gu.ch/,tel:055 451 11 31,fax:055 451 11 34,St.Gallerstrasse 58\n8853 Lachen SZ,https://www.google.com/maps/place/St.Gallerstr...,info@maechler-gu.ch
998,Garasch 106 AG,lnstandhaltung und Reparatur von Automobilen,https://garasch106.ch/,tel:041 624 40 24,,Stanserstrasse 106\n6373 Ennetbürgen,https://www.google.com/maps/place/Stanserstras...,auto@garasch106.ch
999,Bumbachsäge AG,Sägewerke,http://www.bumbachsaege.ch/,tel:034 493 35 57,fax:034 493 31 84,Bumbachsäge 172\n6197 Schangnau,https://www.google.com/maps/place/Bumbachs%C3%...,info@bumbachsaege.ch
1000,MEUBLES DESCARTES SA,Inserate,http://www.decarte.ch/,tel:027 743 43 43,fax:027 743 43 44,Route du Léman 33\n1907 Saxon,https://www.google.com/maps/place/Route%20du%2...,email not found on website


In [55]:
df.head() == df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [56]:
df.tail() == df2.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True
999,True,True,True,True,True,True,True,True
1000,True,True,True,True,True,True,True,True


### End of Checkpoint-4

### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-3, batch_size = 1000, batch_start_index = 2001


In [58]:
# Extract info of interst
start_time = time.time()
batch3_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 2001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  14860.21987771988


### checkpoint-1

In [63]:
print(len(batch3_initial))
print(len(batch3_initial[-1]))

1001
7


In [64]:
print(batch3_initial[:3])

[['MEUBLES DESCARTES SA', 'Inserate', 'http://www.decarte.ch/', 'tel:027 743 43 43', 'fax:027 743 43 44', 'Route du Léman 33\n1907 Saxon', 'https://www.google.com/maps/place/Route%20du%20L%C3%A9man%2033+1907+Saxon'], ['Eticolle Schoechli SA', 'Sonstiges Drucken a. n. g.', 'http://www.eticolle.ch/', 'tel:027 452 25 26', 'NA', 'Techno-pôle 2\n3960 Sierre', 'https://www.google.com/maps/place/Techno-p%C3%B4le%202+3960+Sierre'], ['Ettima AG', 'Grosshandel mit Werkzeugmaschinen', 'http://www.ettima.ch/', 'tel:031 819 56 26', 'fax:031 819 56 63', 'Bernstrasse 25\n3125 Toffen', 'https://www.google.com/maps/place/Bernstrasse%2025+3125+Toffen']]


### End of Checkpoint-1

In [65]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch3 = extract_emails_from_websites(batch3_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  915.4399597644806


### Checkpoint-2

In [66]:
print(len(batch3)) # should be same as batch_size i.e. 1000
print(len(batch3[-1])) # should be 8
print(batch3[-5:]) #should have 8 items in each list with emails or remarks added

1001
8
[['Hirt Schreinerei GmbH', 'Schreinerarbeiten im Innenausbau', 'http://www.hirt-schreinerei.ch/', 'tel:061 901 55 88', 'fax:061 901 83 04', 'Bächliackerstrasse 4\n4402 Frenkendorf', 'https://www.google.com/maps/place/B%C3%A4chliackerstrasse%204+4402+Frenkendorf', 'bad/broken link to website'], ['René Bührer AG', 'Detailhandel mit Vorhängen, Teppichen, Fussbodenbelägen und Tapeten', 'NA', 'tel:052 672 16 30', 'fax:052 672 16 40', 'Rheinfallstrasse 7\n8212 Neuhausen am Rheinfall', 'https://www.google.com/maps/place/Rheinfallstrasse%207+8212+Neuhausen%20am%20Rheinfall', 'website does not exist'], ['EFOS Flugschule GmbH', 'Fahr- und Flugschulen', 'http://www.efos.ch/', 'tel:044 861 07 04', 'NA', 'Flughafenstrasse 14\n8302 Kloten', 'https://www.google.com/maps/place/Flughafenstrasse%2014+8302+Kloten', '20info@efos.ch'], ['Hess Druck AG', 'Offsetdruck', 'http://www.hessdruck.ch/', 'tel:071 658 61 80', 'NA', 'Schützlerweg 3\n8560 Märstetten', 'https://www.google.com/maps/place/Sch%C3%B

### End of Checkpoint-2

In [67]:
# # Convert to Dataframe batch3
df3 = pd.DataFrame(batch3, columns=columns)


### Checkpoint 3

In [68]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            986 non-null    object
dtypes: object(8)
memory usage: 62.7+ KB


In [69]:
df3.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,MEUBLES DESCARTES SA,Inserate,http://www.decarte.ch/,tel:027 743 43 43,fax:027 743 43 44,Route du Léman 33\n1907 Saxon,https://www.google.com/maps/place/Route%20du%2...,email not found on website
1,Eticolle Schoechli SA,Sonstiges Drucken a. n. g.,http://www.eticolle.ch/,tel:027 452 25 26,,Techno-pôle 2\n3960 Sierre,https://www.google.com/maps/place/Techno-p%C3%...,info@eticolle.ch
2,Ettima AG,Grosshandel mit Werkzeugmaschinen,http://www.ettima.ch/,tel:031 819 56 26,fax:031 819 56 63,Bernstrasse 25\n3125 Toffen,https://www.google.com/maps/place/Bernstrasse%...,email not found on website
3,Alushi AG,Spezielle Reinigung von Gebäuden und Reinigung...,http://www.alushi.ch/,tel:052 376 16 80,fax:052 376 16 87,Frauenfelderstrasse 66\n9548 Matzingen,https://www.google.com/maps/place/Frauenfelder...,email not found on website
4,Studer Optik,Detailhandel mit Brillen und anderen Sehhilfen,http://www.studeroptik.ch/,tel:032 392 25 15,fax:032 392 28 89,Stadtplatz 60\n3270 Aarberg,https://www.google.com/maps/place/Stadtplatz%2...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...


### End of Checkpoint-3

In [153]:
#save as csv
df3.to_csv('batch3.csv', index = False)

In [72]:
# Merge the two dataframes
df = pd.concat([df, df3])

### Checkpoint-4

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3003 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     3003 non-null   object
 1   industry         3003 non-null   object
 2   website          3003 non-null   object
 3   telephone        3003 non-null   object
 4   fax              3003 non-null   object
 5   address          3003 non-null   object
 6   address_maplink  3003 non-null   object
 7   email            2974 non-null   object
dtypes: object(8)
memory usage: 211.1+ KB


In [74]:
df.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [75]:
df.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,Hirt Schreinerei GmbH,Schreinerarbeiten im Innenausbau,http://www.hirt-schreinerei.ch/,tel:061 901 55 88,fax:061 901 83 04,Bächliackerstrasse 4\n4402 Frenkendorf,https://www.google.com/maps/place/B%C3%A4chlia...,bad/broken link to website
997,René Bührer AG,"Detailhandel mit Vorhängen, Teppichen, Fussbod...",,tel:052 672 16 30,fax:052 672 16 40,Rheinfallstrasse 7\n8212 Neuhausen am Rheinfall,https://www.google.com/maps/place/Rheinfallstr...,website does not exist
998,EFOS Flugschule GmbH,Fahr- und Flugschulen,http://www.efos.ch/,tel:044 861 07 04,,Flughafenstrasse 14\n8302 Kloten,https://www.google.com/maps/place/Flughafenstr...,20info@efos.ch
999,Hess Druck AG,Offsetdruck,http://www.hessdruck.ch/,tel:071 658 61 80,,Schützlerweg 3\n8560 Märstetten,https://www.google.com/maps/place/Sch%C3%BCtzl...,satz@hessdruck.ch
1000,Rhomberg Schmuck AG,Detailhandel mit Uhren und Schmuck,http://www.schmuck.ch/,tel:071 310 15 80,,Zürcher-Strasse\n9015 St. Gallen,https://www.google.com/maps/place/Z%C3%BCrcher...,email not found on website


In [76]:
df.head() == df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [77]:
df.tail() == df3.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True
999,True,True,True,True,True,True,True,True
1000,True,True,True,True,True,True,True,True


### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-4, batch_size = 1000, batch_start_index = 3001


In [81]:
# Extract info of interst
start_time = time.time()
batch4_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 3001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


### checkpoint-1

In [82]:
print(len(batch4_initial))
print(len(batch4_initial[-1]))

1001
7


In [83]:
print(batch4_initial[:3])

[['Rhomberg Schmuck AG', 'Detailhandel mit Uhren und Schmuck', 'http://www.schmuck.ch/', 'tel:071 310 15 80', 'NA', 'Zürcher-Strasse\n9015 St. Gallen', 'https://www.google.com/maps/place/Z%C3%BCrcher-Strasse+9015+St.%20Gallen'], ['GanzImmo AG', 'Verwaltung von Grundstücken, Gebäuden und Wohnungen für Dritte', 'http://www.ganzimmo.ch/', 'tel:052 213 56 65', 'fax:052 213 31 74', 'Schaffhauserstrasse 79\n8401 Winterthur', 'https://www.google.com/maps/place/Schaffhauserstrasse%2079+8401+Winterthur'], ['Coiffure Daniel Moll', 'Artikel im Shop', 'http://www.coiffeur-kriens.com/', 'tel:041 310 05 41', 'NA', 'Amlehnstrasse 54\n6010 Kriens', 'https://www.google.com/maps/place/Amlehnstrasse%2054+6010+Kriens']]


In [84]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch4 = extract_emails_from_websites(batch4_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  994.2478754520416


### Checkpoint-2

In [85]:
print(len(batch4)) # should be same as batch_size i.e. 1000
print(len(batch4[-1])) # should be 8
print(batch4[-5:]) #should have 8 items in each list with emails or remarks added

1001
8
[['Martin Stoller Transporte', 'Güterbeförderung im Strassenverkehr', 'NA', 'tel:031 741 24 15', 'NA', 'Weidstrasse 2\n3184 Wünnewil', 'https://www.google.com/maps/place/Weidstrasse%202+3184+W%C3%BCnnewil', 'website does not exist'], ['Christoph Hasler Schreinerei', 'Schreinerarbeiten im Innenausbau', 'http://www.hasler-schreinerei.ch/', 'tel:071 917 11 55', 'fax:071 917 18 48', 'Käsereistrasse 1\n9555 Tobel', 'https://www.google.com/maps/place/K%C3%A4sereistrasse%201+9555+Tobel', 'info@hasler-schreinerei.ch'], ['MoRailSo AG', 'Grosshandel mit sonstigen Maschinen und Ausrüstungen', 'https://www.ramorail.com/', 'tel:079 463 97 50', 'NA', 'Rothmatte 7\n6022 Grosswangen', 'https://www.google.com/maps/place/Rothmatte%207+6022+Grosswangen', 'bad/broken link to website'], ['Elektro Schuler AG', 'Elektroinstallation', 'NA', 'tel:041 631 03 31', 'fax:041 631 03 32', 'Schulhausstrasse 4\n6052 Hergiswil NW', 'https://www.google.com/maps/place/Schulhausstrasse%204+6052+Hergiswil%20NW', 'we

### End of Checkpoint-2

In [86]:
# # Convert to Dataframe batch1
df4 = pd.DataFrame(batch4, columns=columns)


### Checkpoint 3

In [87]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            1001 non-null   object
dtypes: object(8)
memory usage: 62.7+ KB


In [88]:
df4.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Rhomberg Schmuck AG,Detailhandel mit Uhren und Schmuck,http://www.schmuck.ch/,tel:071 310 15 80,,Zürcher-Strasse\n9015 St. Gallen,https://www.google.com/maps/place/Z%C3%BCrcher...,email not found on website
1,GanzImmo AG,"Verwaltung von Grundstücken, Gebäuden und Wohn...",http://www.ganzimmo.ch/,tel:052 213 56 65,fax:052 213 31 74,Schaffhauserstrasse 79\n8401 Winterthur,https://www.google.com/maps/place/Schaffhauser...,email not found on website
2,Coiffure Daniel Moll,Artikel im Shop,http://www.coiffeur-kriens.com/,tel:041 310 05 41,,Amlehnstrasse 54\n6010 Kriens,https://www.google.com/maps/place/Amlehnstrass...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
3,Rondell AG,"Kauf und Verkauf von eigenen Grundstücken, Geb...",,tel:041 320 53 73,,Weinhalde 12\n6010 Kriens,https://www.google.com/maps/place/Weinhalde%20...,website does not exist
4,Rémy Bühler Malergeschäft Rothrist,Malerei,http://www.maler-buehler.ch/,tel:062 794 44 64,,Pfaffernweg 7\n4852 Rothrist,https://www.google.com/maps/place/Pfaffernweg%...,cid_image001_png@01D443CD.png


### End of Checkpoint-3

In [154]:
#save as csv
df4.to_csv('batch4.csv', index = False)

In [91]:
# Merge the two dataframes
df = pd.concat([df, df4])

### Checkpoint-4

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4004 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     4004 non-null   object
 1   industry         4004 non-null   object
 2   website          4004 non-null   object
 3   telephone        4004 non-null   object
 4   fax              4004 non-null   object
 5   address          4004 non-null   object
 6   address_maplink  4004 non-null   object
 7   email            3975 non-null   object
dtypes: object(8)
memory usage: 281.5+ KB


In [93]:
df.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [94]:
df.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,Martin Stoller Transporte,Güterbeförderung im Strassenverkehr,,tel:031 741 24 15,,Weidstrasse 2\n3184 Wünnewil,https://www.google.com/maps/place/Weidstrasse%...,website does not exist
997,Christoph Hasler Schreinerei,Schreinerarbeiten im Innenausbau,http://www.hasler-schreinerei.ch/,tel:071 917 11 55,fax:071 917 18 48,Käsereistrasse 1\n9555 Tobel,https://www.google.com/maps/place/K%C3%A4serei...,info@hasler-schreinerei.ch
998,MoRailSo AG,Grosshandel mit sonstigen Maschinen und Ausrüs...,https://www.ramorail.com/,tel:079 463 97 50,,Rothmatte 7\n6022 Grosswangen,https://www.google.com/maps/place/Rothmatte%20...,bad/broken link to website
999,Elektro Schuler AG,Elektroinstallation,,tel:041 631 03 31,fax:041 631 03 32,Schulhausstrasse 4\n6052 Hergiswil NW,https://www.google.com/maps/place/Schulhausstr...,website does not exist
1000,Camping Lac des Brenets,Inserate,http://www.camping-brenets.ch/,tel:032 932 16 18,fax:032 932 16 39,2416 Les Brenets,https://www.google.com/maps/place/undefined+24...,info@camping-brenets.ch


In [97]:
df.head() == df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [98]:
df.tail() == df4.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True
999,True,True,True,True,True,True,True,True
1000,True,True,True,True,True,True,True,True


### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-5, batch_size = 1000, batch_start_index = 4001


In [99]:
# Extract info of interst
start_time = time.time()
batch5_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 4001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


### checkpoint-1

In [100]:
print(len(batch5_initial))
print(len(batch5_initial[-1]))

1001
7


In [101]:
print(batch5_initial[:3])

[['Camping Lac des Brenets', 'Inserate', 'http://www.camping-brenets.ch/', 'tel:032 932 16 18', 'fax:032 932 16 39', '2416 Les Brenets', 'https://www.google.com/maps/place/undefined+2416+Les%20Brenets'], ['Elektro Capaul AG', 'Elektroinstallation', 'NA', 'tel:081 511 20 22', 'fax:081 511 20 21', 'Promenada 3\n7018 Flims Waldhaus', 'https://www.google.com/maps/place/Promenada%203+7018+Flims%20Waldhaus'], ['Parkhotel Bellevue & Spa', 'Hotels, Gasthöfe und Pensionen mit Restaurant', 'http://www.parkhotel-bellevue.ch/', 'tel:033 673 80 00', 'fax:033 673 80 01', 'Bellevuestrasse 15\n3715 Adelboden', 'https://www.google.com/maps/place/Bellevuestrasse%2015+3715+Adelboden']]


### End of Checkpoint-1

In [102]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch5 = extract_emails_from_websites(batch5_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

execution time:  981.4949326515198


### Checkpoint-2

In [103]:
print(len(batch5)) # should be same as batch_size i.e. 1000
print(len(batch5[-1])) # should be 8
print(batch5[-5:]) #should have 8 items in each list with emails or remarks added

1001
8
[['Vinolatino GmbH', 'Grosshandel mit Wein und Spirituosen', 'NA', 'NA', 'NA', 'Hauptstrasse 33\n6280 Hochdorf', 'https://www.google.com/maps/place/Hauptstrasse%2033+6280+Hochdorf', 'website does not exist'], ['Cometal SA', 'Herstellung von Metallkonstruktionen', 'http://www.cometal.ch/', 'tel:091 859 10 22', 'fax:091 859 27 03', 'via Cantonale\n6595 Riazzino', 'https://www.google.com/maps/place/via%20Cantonale+6595+Riazzino', 'email not found on website'], ['HCDM Beratungen GmbH', 'Unternehmensberatung', 'NA', 'tel:079 445 69 40', 'NA', 'Mühleweg 7\n7304 Maienfeld', 'https://www.google.com/maps/place/M%C3%BChleweg%207+7304+Maienfeld', 'website does not exist'], ['Eigenheer Elektro AG', 'Elektroinstallation', 'http://www.eigenheer-elektro.ch/', 'tel:052 317 13 79', 'NA', 'Burgstrasse 5\n8450 Andelfingen', 'https://www.google.com/maps/place/Burgstrasse%205+8450+Andelfingen', 'info@eigenheer-elektro.ch'], ['Papeterie Zumstein AG', 'Detailhandel mit Schreibwaren und Bürobedarf', 'N

### End of Checkpoint-2

In [104]:
# # Convert to Dataframe batch1
df5 = pd.DataFrame(batch5, columns=columns)


In [105]:
# # Convert to Dataframe batch1
df5 = pd.DataFrame(batch5, columns=columns)


### Checkpoint 3

In [106]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            1001 non-null   object
dtypes: object(8)
memory usage: 62.7+ KB


In [107]:
df5.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Camping Lac des Brenets,Inserate,http://www.camping-brenets.ch/,tel:032 932 16 18,fax:032 932 16 39,2416 Les Brenets,https://www.google.com/maps/place/undefined+24...,info@camping-brenets.ch
1,Elektro Capaul AG,Elektroinstallation,,tel:081 511 20 22,fax:081 511 20 21,Promenada 3\n7018 Flims Waldhaus,https://www.google.com/maps/place/Promenada%20...,website does not exist
2,Parkhotel Bellevue & Spa,"Hotels, Gasthöfe und Pensionen mit Restaurant",http://www.parkhotel-bellevue.ch/,tel:033 673 80 00,fax:033 673 80 01,Bellevuestrasse 15\n3715 Adelboden,https://www.google.com/maps/place/Bellevuestra...,info@bellevue-parkhotel.ch
3,Supersaxo Gerold,Malerei und Gipserei ohne ausgeprägten Schwerp...,,tel:g.supersaxo@valaiscom.ch,fax:027 957 45 16,Haus Rio\n3906 Saas-Fee,https://www.google.com/maps/place/Haus%20Rio+3...,website does not exist
4,Jenni Baumaschinen AG,"Grosshandel mit Bergwerks-, Bau- und Baustoffm...",http://www.rammax.ch/Joomla/,tel:041 920 36 62,fax:041 920 35 34,Haldenmattstrasse 2\n6210 Sursee,https://www.google.com/maps/place/Haldenmattst...,bad/broken link to website


### End of Checkpoint-3

In [155]:
# save as csv
df5.to_csv('batch5.csv', index = False)

In [110]:
#### Merge the two dataframes
df = pd.concat([df, df5])

### Checkpoint-4

In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5005 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     5005 non-null   object
 1   industry         5005 non-null   object
 2   website          5005 non-null   object
 3   telephone        5005 non-null   object
 4   fax              5005 non-null   object
 5   address          5005 non-null   object
 6   address_maplink  5005 non-null   object
 7   email            4976 non-null   object
dtypes: object(8)
memory usage: 351.9+ KB


In [112]:
df.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [113]:
df.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,Vinolatino GmbH,Grosshandel mit Wein und Spirituosen,,,,Hauptstrasse 33\n6280 Hochdorf,https://www.google.com/maps/place/Hauptstrasse...,website does not exist
997,Cometal SA,Herstellung von Metallkonstruktionen,http://www.cometal.ch/,tel:091 859 10 22,fax:091 859 27 03,via Cantonale\n6595 Riazzino,https://www.google.com/maps/place/via%20Canton...,email not found on website
998,HCDM Beratungen GmbH,Unternehmensberatung,,tel:079 445 69 40,,Mühleweg 7\n7304 Maienfeld,https://www.google.com/maps/place/M%C3%BChlewe...,website does not exist
999,Eigenheer Elektro AG,Elektroinstallation,http://www.eigenheer-elektro.ch/,tel:052 317 13 79,,Burgstrasse 5\n8450 Andelfingen,https://www.google.com/maps/place/Burgstrasse%...,info@eigenheer-elektro.ch
1000,Papeterie Zumstein AG,Detailhandel mit Schreibwaren und Bürobedarf,,tel:044 211 77 70,,Rennweg 19\n8001 Zürich,https://www.google.com/maps/place/Rennweg%2019...,website does not exist


In [114]:
df.head() == df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [115]:
df.tail() == df5.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True
999,True,True,True,True,True,True,True,True
1000,True,True,True,True,True,True,True,True


### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-6, batch_size = 1000, batch_start_index = 5001


In [116]:
# Extract info of interst
start_time = time.time()
batch6_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 5001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


### checkpoint-1

In [118]:
print(len(batch6_initial))
print(len(batch6_initial[-1]))

1001
7


In [119]:
print(batch6_initial[:3])

[['Papeterie Zumstein AG', 'Detailhandel mit Schreibwaren und Bürobedarf', 'NA', 'tel:044 211 77 70', 'NA', 'Rennweg 19\n8001 Zürich', 'https://www.google.com/maps/place/Rennweg%2019+8001+Z%C3%BCrich'], ['Elektro Schuler AG', 'Elektroinstallation', 'NA', 'tel:041 612 06 33', 'NA', 'Forellenhof/Stanserstr.\n6362 Stansstad', 'https://www.google.com/maps/place/Forellenhof/Stanserstr.+6362+Stansstad'], ['Rüedi Büromaschinen', 'Detailhandel mit Telekommunikationsgeräten', 'http://www.rueedi-bueromaschinen.ch/', 'tel:034 415 19 23', 'fax:034 415 24 80', 'Gässli 2\n3473 Alchenstorf', 'https://www.google.com/maps/place/G%C3%A4ssli%202+3473+Alchenstorf']]


### End of Checkpoint-1

In [120]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch6 = extract_emails_from_websites(batch6_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)



['info@max-mueller.ch', 'info@buerodesign.ch', 'info@kohlerundpartner.ch', 'info@immo-collection.ch', 'r.blanc@blanc-architekten.ch', 'info@bruggers.ch', 'info@uhrenstucky.ch', 'moveandstic-spielgereate-die-idee_01BzC68SWB0J4xW_1280x1280@2x.jpg', 'info@straessle.com', 'info@gtl-sa.com', 'info@allodmathis.ch', 'mail@kuhn-gruppe.ch', 'info@vrp-machines.com', 'info@schoetz.portas.ch', 'fg@digitalundprint.ch', 'info@willihaustechnik.ch', 'reto.schleiss@axa.ch', 'ge_du@bluewin.ch', 'info@crehoba.ch', 'info@stolzholz.ch', 'info@ib-automobile.ch', 'hotel@hotel-falken.ch', 'weingut@hedinger.ch', 'support@delea.ch', 'PARI9192@2x.webp', '20180509_165215-a02c8494-b926dc3f@428w2x.jpg', 'info@maler-grimm.ch', 'richard.thommen@garagebuser.ch', 'info@chromag.ch', 'info@schwarzer-loewen.ch', 'melanie.varela@galledia.ch', 'info@gruber-polybau.ch', 'info@dinohaustechnik.ch', 'info@vonholzen-immobilien.ch', 'info@tangemann-metallbau.ch', 'info@wirthlin-haustechnik.ch', 'info@hotel-schweizerhof.com', 'inf

### Checkpoint-2

In [123]:
print(len(batch6)) # should be same as batch_size i.e. 1000
print(len(batch6[-1])) # should be 8
print(batch6[-5:]) #should have 8 items in each list with emails or remarks added

1001
8
[['Max Auer', 'Vermietung und Verpachtung von eigenen oder geleasten Grundstücken', 'NA', 'tel:071 911 21 42', 'fax:071 911 85 27', 'Agathafeld 22\n9512 Rossrüti', 'https://www.google.com/maps/place/Agathafeld%2022+9512+Rossr%C3%BCti', 'website not found'], ['Rino Weder AG', 'Mechanische Werkstätten', 'http://www.rinoweder.ch/', 'tel:071 763 60 50', 'fax:071 763 60 51', 'Roggenweg 1\n9463 Oberriet SG', 'https://www.google.com/maps/place/Roggenweg%201+9463+Oberriet%20SG', 'verkauf@rinoweder.ch'], ['Garage Andermatt AG Baar', 'lnstandhaltung und Reparatur von Automobilen', 'http://www.garage-andermatt.ch/', 'tel:041 760 46 46', 'NA', 'Ruessenstrasse 22\n6340 Baar', 'https://www.google.com/maps/place/Ruessenstrasse%2022+6340+Baar', 'info@garage-andermatt.ch'], ['Pizzeria Mare Monte, Ekrem Sinani', 'Restaurants, Imbissstuben, Tea-Rooms und Gelaterias', 'NA', 'tel:071 845 60 06', 'fax:071 845 60 07', 'Blumenstrasse 8\n9403 Goldach', 'https://www.google.com/maps/place/Blumenstrasse%20

### End of Checkpoint-2

In [124]:
# # Convert to Dataframe batch1
df6 = pd.DataFrame(batch6, columns=columns)


In [125]:
# # Convert to Dataframe batch1
df6 = pd.DataFrame(batch6, columns=columns)


### Checkpoint-3

In [126]:
df6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            999 non-null    object
dtypes: object(8)
memory usage: 62.7+ KB


In [127]:
# save as csv
df6.to_csv('batch6.csv', index = False)

### End of Checkpoint-3

In [128]:
#### Merge the two dataframes
df = pd.concat([df, df6])

### Checkpoint-4

In [129]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6006 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     6006 non-null   object
 1   industry         6006 non-null   object
 2   website          6006 non-null   object
 3   telephone        6006 non-null   object
 4   fax              6006 non-null   object
 5   address          6006 non-null   object
 6   address_maplink  6006 non-null   object
 7   email            5975 non-null   object
dtypes: object(8)
memory usage: 422.3+ KB


In [130]:
df.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [131]:
df.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,Max Auer,Vermietung und Verpachtung von eigenen oder ge...,,tel:071 911 21 42,fax:071 911 85 27,Agathafeld 22\n9512 Rossrüti,https://www.google.com/maps/place/Agathafeld%2...,website not found
997,Rino Weder AG,Mechanische Werkstätten,http://www.rinoweder.ch/,tel:071 763 60 50,fax:071 763 60 51,Roggenweg 1\n9463 Oberriet SG,https://www.google.com/maps/place/Roggenweg%20...,verkauf@rinoweder.ch
998,Garage Andermatt AG Baar,lnstandhaltung und Reparatur von Automobilen,http://www.garage-andermatt.ch/,tel:041 760 46 46,,Ruessenstrasse 22\n6340 Baar,https://www.google.com/maps/place/Ruessenstras...,info@garage-andermatt.ch
999,"Pizzeria Mare Monte, Ekrem Sinani","Restaurants, Imbissstuben, Tea-Rooms und Gelat...",,tel:071 845 60 06,fax:071 845 60 07,Blumenstrasse 8\n9403 Goldach,https://www.google.com/maps/place/Blumenstrass...,website not found
1000,TICINOCOLOR SA,Bewertungen,http://www.ticinocolor.ch/,tel:091 967 79 79,fax:091 967 79 81,via San Gottardo 146\n6942 Savosa,https://www.google.com/maps/place/via%20San%20...,email not found on website


In [132]:
df.head()==df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [133]:
df.tail()==df6.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True
999,True,True,True,True,True,True,True,True
1000,True,True,True,True,True,True,True,True


### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-7, batch_size = 1000, batch_start_index = 6001


In [134]:
# Extract info of interst
start_time = time.time()
batch7_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 6001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


### Checkpoint-1

In [135]:
print(len(batch7_initial))
print(len(batch7_initial[-1]))

1001
7


In [136]:
print(batch7_initial[:3])

[['TICINOCOLOR SA', 'Bewertungen', 'http://www.ticinocolor.ch/', 'tel:091 967 79 79', 'fax:091 967 79 81', 'via San Gottardo 146\n6942 Savosa', 'https://www.google.com/maps/place/via%20San%20Gottardo%20146+6942+Savosa'], ['Amrein Reinigungen', 'Allgemeine Gebäudereinigung', 'NA', 'tel:062 771 88 70', 'fax:062 771 85 65', 'Alzbachstrasse 17\n5734 Reinach AG', 'https://www.google.com/maps/place/Alzbachstrasse%2017+5734+Reinach%20AG'], ['Duplirex Sensler Papeterie', 'Detailhandel mit Schreibwaren und Bürobedarf', 'NA', 'tel:026 493 33 55', 'NA', 'Hauptstrasse 17\n3186 Düdingen', 'https://www.google.com/maps/place/Hauptstrasse%2017+3186+D%C3%BCdingen']]


### End of Checkpoint-1

In [137]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch7 = extract_emails_from_websites(batch7_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)



No of emails found:  303
execution time:  3580.3506712913513


### Checkpoint-2

In [141]:
print(len(batch7)) # should be same as batch_size i.e. 1000
print(len(batch7[-1])) # should be 8
print(batch7[:15]) #should have 8 items in each list with emails or remarks added

1001
8
[['TICINOCOLOR SA', 'Bewertungen', 'http://www.ticinocolor.ch/', 'tel:091 967 79 79', 'fax:091 967 79 81', 'via San Gottardo 146\n6942 Savosa', 'https://www.google.com/maps/place/via%20San%20Gottardo%20146+6942+Savosa', 'email not found on website'], ['Amrein Reinigungen', 'Allgemeine Gebäudereinigung', 'NA', 'tel:062 771 88 70', 'fax:062 771 85 65', 'Alzbachstrasse 17\n5734 Reinach AG', 'https://www.google.com/maps/place/Alzbachstrasse%2017+5734+Reinach%20AG', 'website not found'], ['Duplirex Sensler Papeterie', 'Detailhandel mit Schreibwaren und Bürobedarf', 'NA', 'tel:026 493 33 55', 'NA', 'Hauptstrasse 17\n3186 Düdingen', 'https://www.google.com/maps/place/Hauptstrasse%2017+3186+D%C3%BCdingen', 'website not found'], ['Z-Audio Animatec AG', 'Grosshandel mit Geräten der Unterhaltungselektronik', 'http://www.z-audio.ch/', 'tel:044 370 20 40', 'fax:044 370 20 41', 'Schickmattweg 7\n8332 Russikon', 'https://www.google.com/maps/place/Schickmattweg%207+8332+Russikon', 'email not fo

### End of Checkpoint-2

In [142]:
# # Convert to Dataframe batch7
df7 = pd.DataFrame(batch7, columns=columns)


### Checkpoint-3

In [143]:
df7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     1001 non-null   object
 1   industry         1001 non-null   object
 2   website          1001 non-null   object
 3   telephone        1001 non-null   object
 4   fax              1001 non-null   object
 5   address          1001 non-null   object
 6   address_maplink  1001 non-null   object
 7   email            1000 non-null   object
dtypes: object(8)
memory usage: 62.7+ KB


In [145]:
# save as csv
df7.to_csv('batch7.csv', index = False)

### End of Checkpoint-3

In [146]:
#### Merge the two dataframes
df = pd.concat([df, df7])

### Checkpoint-4

In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7007 entries, 0 to 1000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   company_name     7007 non-null   object
 1   industry         7007 non-null   object
 2   website          7007 non-null   object
 3   telephone        7007 non-null   object
 4   fax              7007 non-null   object
 5   address          7007 non-null   object
 6   address_maplink  7007 non-null   object
 7   email            6975 non-null   object
dtypes: object(8)
memory usage: 492.7+ KB


In [148]:
df.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,Ringler + Strahm Storenbau AG,Sonstige Bauinstallation,http://www.ringler-strahm.ch/,tel:033 345 22 55,fax:033 345 55 19,Uetendorfstrasse 20\n3634 Thierachern,https://www.google.com/maps/place/Uetendorfstr...,info@ringler-strahm.ch
1,Grob & Partner Architektur AG,Architekturbüros,http://www.grobarchitektur.ch/,tel:081 720 02 00,fax:081 720 02 05,Bahnhofstrasse 3\n7320 Sargans,https://www.google.com/maps/place/Bahnhofstras...,605a7baede844d278b89dc95ae0a9123@sentry-next.w...
2,Lauclair AG,Schreinerarbeiten im Innenausbau,http://www.lauclair.ch/,tel:031 879 01 69,fax:031 879 20 69,Lyssstrasse 27\n3054 Schüpfen,https://www.google.com/maps/place/Lyssstrasse%...,a95fca1f1eb9fe8g2c9ead0cd9931e2a@2x.jpg
3,Club Goldwand,"Kauf und Verkauf von eigenen Grundstücken, Geb...",http://www.clubgoldwand.ch/,tel:056 282 30 50,,Landstrasse 6\n5415 Nussbaumen AG,https://www.google.com/maps/place/Landstrasse%...,info@clubgoldwand.ch
4,NICOLE DIEM Horgen,Detailhandel mit Brillen und anderen Sehhilfen,http://www.nicolediem.ch/,tel:044 770 10 40,,Dorfplatz 3\n8810 Horgen,https://www.google.com/maps/place/Dorfplatz%20...,johndoe@domain.com


In [149]:
df.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,Garage du Lac F. Dougoud SA,Detailhandel mit Motorenkraftstoffen (Tankstel...,,,,Rte de la Gruyère 115\n1644 Avry-devant-Pont,https://www.google.com/maps/place/Rte%20de%20l...,website not found
997,Bau GmbH André Bucher,"Anbringen von Stuckaturen, Gipserei und Verput...",,,,Grossweid 28\n6026 Rain,https://www.google.com/maps/place/Grossweid%20...,website not found
998,"Stefan Thalmann AG, Baugeschäft,",Allgemeiner Hoch- und Tiefbau ohne ausgeprägte...,http://www.thalmann-bau.ch/,tel:041 660 43 68,fax:041 660 99 81,Dörfli 9\n6060 Ramersberg,https://www.google.com/maps/place/D%C3%B6rfli%...,mail@thalmann-bau.ch
999,Print-Fix Drucktechnik AG,Grosshandel mit Datenverarbeitungsgeräten und ...,,tel:041 930 00 91,,Krümmigasse 15\n6221 Rickenbach LU,https://www.google.com/maps/place/Kr%C3%BCmmig...,website not found
1000,Spenglerei Schmid GmbH,Bauspenglerei,http://www.spenglerei-schmid.ch/,,,Kesselstrasse 12\n8200 Schaffhausen,https://www.google.com/maps/place/Kesselstrass...,bad/broken link to website


In [150]:
df.head()==df1.head()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True


In [151]:
df.tail()==df7.tail()

Unnamed: 0,company_name,industry,website,telephone,fax,address,address_maplink,email
996,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True
999,True,True,True,True,True,True,True,True
1000,True,True,True,True,True,True,True,True


### xxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxx

### Batch-8, batch_size = 1000, batch_start_index = 7001


In [156]:
# Extract info of interst
start_time = time.time()
batch8_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 7001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


In [25]:
df8_initial = pd.DataFrame(batch10_initial)
df8_initial.to_csv('batch8_initial.csv', index = False)

### Checkpoint-1

In [None]:
print(len(batch8_initial))
print(len(batch8_initial[-1]))

In [None]:
print(batch8_initial[:3])

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch8 = extract_emails_from_websites(batch8_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch8)) # should be same as batch_size i.e. 1000
print(len(batch8[-1])) # should be 8
print(batch8[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch8
df8 = pd.DataFrame(batch8, columns=columns)


### Checkpoint-3

In [None]:
df8.info()

In [None]:
# save as csv
pd.to_csv('batch8', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df8])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df8.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-9, batch_size = 1000, batch_start_index = 8001


In [157]:
# Extract info of interst
start_time = time.time()
batch9_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 8001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


In [24]:
df9_initial = pd.DataFrame(batch10_initial)
df9_initial.to_csv('batch9_initial.csv', index = False)


### Checkpoint-1

In [None]:
print(len(batch9_initial))
print(len(batch9_initial[-1]))

In [None]:
print(batch9_initial[:3])

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch9 = extract_emails_from_websites(batch9_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch9)) # should be same as batch_size i.e. 1000
print(len(batch9[-1])) # should be 8
print(batch9[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch9
df9 = pd.DataFrame(batch9, columns=columns)


### Checkpoint-3

In [None]:
df9.info()

In [None]:
# save as csv
pd.to_csv('batch9', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df9])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df9.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-10, batch_size = 1000, batch_start_index = 9001


In [22]:
# Extract info of interst
start_time = time.time()
batch10_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 9001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


In [23]:
df10_initial = pd.DataFrame(batch10_initial)
df10_initial.to_csv('batch10_initial.csv', index = False)


### Checkpoint-1

In [None]:
print(len(batch10_initial))
print(len(batch10_initial[-1]))

In [None]:
print(batch10_initial[:3])

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch10 = extract_emails_from_websites(batch10_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch10)) # should be same as batch_size i.e. 1000
print(len(batch10[-1])) # should be 8
print(batch10[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch10
df10 = pd.DataFrame(batch10, columns=columns)


### Checkpoint-3

In [None]:
df10.info()

In [None]:
# save as csv
pd.to_csv('batch10', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df10])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df10.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxx

### Batch-11, batch_size = 1000, batch_start_index = 10001


In [34]:
# Extract info of interst
start_time = time.time()
batch11_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 10001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


### Checkpoint-1

In [35]:
print(len(batch11_initial))
print(len(batch11_initial[-1]))

1001
7


In [36]:
print(batch11_initial[:3])

[['c+j immo AG', 'Kauf und Verkauf von eigenen Grundstücken, Gebäuden und Wohnungen', 'NA', 'NA', 'NA', 'Industriestrasse 21\n6055 Alpnach Dorf', 'https://www.google.com/maps/place/Industriestrasse%2021+6055+Alpnach%20Dorf'], ['Dr.iur. Marco Ettisberger', 'Advokatur-, Notariatsbüros', 'http://www.eplaw.ch/', 'tel:081 254 38 00', 'fax:081 254 38 09', 'Hinterm Bach 40\n7002 Chur', 'https://www.google.com/maps/place/Hinterm%20Bach%2040+7002+Chur'], ['Techno AG', 'Grosshandel mit Automobilteilen und -zubehör', 'NA', 'tel:061 717 90 00', 'fax:061 711 38 58', 'Butthollenring 31\n4147 Aesch BL', 'https://www.google.com/maps/place/Butthollenring%2031+4147+Aesch%20BL']]


In [37]:
df11_initial = pd.DataFrame(batch11_initial)
df11_initial.to_csv('batch11_initial.csv', index = False)

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch11 = extract_emails_from_websites(batch11_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch11)) # should be same as batch_size i.e. 1000
print(len(batch11[-1])) # should be 8
print(batch11[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch11
df11 = pd.DataFrame(batch11, columns=columns)


### Checkpoint-3

In [None]:
df11.info()

In [None]:
# save as csv
pd.to_csv('batch11', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df11])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df11.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-12, batch_size = 1000, batch_start_index = 11001


In [38]:
# Extract info of interst
start_time = time.time()
batch12_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 11001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

count 1
count 2
count 3
count 4
count 5
count 6
count 7
count 8
count 9
count 10
count 11
count 12
count 13
count 14
count 15
count 16
count 17
count 18
count 19
count 20
count 21
count 22
count 23
count 24
count 25
count 26
count 27
count 28
count 29
count 30
count 31
count 32
count 33
count 34
count 35
count 36
count 37
count 38
count 39
count 40
count 41
count 42
count 43
count 44
count 45
count 46
count 47
count 48
count 49
count 50
count 51
count 52
count 53
count 54
count 55
count 56
count 57
count 58
count 59
count 60
count 61
count 62
count 63
count 64
count 65
count 66
count 67
count 68
count 69
count 70
count 71
count 72
count 73
count 74
count 75
count 76
count 77
count 78
count 79
count 80
count 81
count 82
count 83
count 84
count 85
count 86
count 87
count 88
count 89
count 90
count 91
count 92
count 93
count 94
count 95
count 96
count 97
count 98
count 99
count 100
count 101
count 102
count 103
count 104
count 105
count 106
count 107
count 108
count 109
count 110
count 11

count 832
count 833
count 834
count 835
count 836
count 837
count 838
count 839
count 840
count 841
count 842
count 843
count 844
count 845
count 846
count 847
count 848
count 849
count 850
count 851
count 852
count 853
count 854
count 855
count 856
count 857
count 858
count 859
count 860
count 861
count 862
count 863
count 864
count 865
count 866
count 867
count 868
count 869
count 870
count 871
count 872
count 873
count 874
count 875
count 876
count 877
count 878
count 879
count 880
count 881
count 882
count 883
count 884
count 885
count 886
count 887
count 888
count 889
count 890
count 891
count 892
count 893
count 894
count 895
count 896
count 897
count 898
count 899
count 900
count 901
count 902
count 903
count 904
count 905
count 906
count 907
count 908
count 909
count 910
count 911
count 912
count 913
count 914
count 915
count 916
count 917
count 918
count 919
count 920
count 921
count 922
count 923
count 924
count 925
count 926
count 927
count 928
count 929
count 930
count 931


### Checkpoint-1

In [39]:
print(len(batch12_initial))
print(len(batch12_initial[-1]))

1001
7


In [40]:
print(batch12_initial[:3])

[['Sajade AG', 'Sonstiger Detailhandel mit Metallwaren, Anstrichmitteln, Bau- und Heimwerkerbedarf', 'http://www.sajade.ch/', 'tel:061 692 28 19', 'NA', 'Lehenmattstr. 137\n4052 Basel', 'https://www.google.com/maps/place/Lehenmattstr.%20137+4052+Basel'], ['Gasthof zum Schütz', 'Bewertungen', 'NA', 'tel:031 781 01 17', 'NA', 'Bernstrasse 5\n3629 Oppligen', 'https://www.google.com/maps/place/Bernstrasse%205+3629+Oppligen'], ['HW-Handel Inh. Wirthlin', 'Sonstiger Detailhandel mit Metallwaren, Anstrichmitteln, Bau- und Heimwerkerbedarf', 'NA', 'NA', 'NA', 'Gewerbepark Bata\n4313 Möhlin', 'https://www.google.com/maps/place/Gewerbepark%20Bata+4313+M%C3%B6hlin']]


In [41]:
df12_initial = pd.DataFrame(batch12_initial)
df12_initial.to_csv('batch12_initial.csv', index = False)

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch12 = extract_emails_from_websites(batch12_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch12)) # should be same as batch_size i.e. 1000
print(len(batch12[-1])) # should be 8
print(batch12[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch11
df12 = pd.DataFrame(batch12, columns=columns)


### Checkpoint-3

In [None]:
df12.info()

In [None]:
# save as csv
pd.to_csv('batch12', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df12])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df12.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxx

### Batch-13, batch_size = 1000, batch_start_index = 12001


In [None]:
# Extract info of interst
start_time = time.time()
batch13_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 12001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
print(len(batch13_initial))
print(len(batch13_initial[-1]))

In [None]:
print(batch13_initial[:3])

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch13 = extract_emails_from_websites(batch13_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch13)) # should be same as batch_size i.e. 1000
print(len(batch13[-1])) # should be 8
print(batch13[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch11
df13 = pd.DataFrame(batch13, columns=columns)


### Checkpoint-3

In [None]:
df13.info()

In [None]:
# save as csv
pd.to_csv('batch13', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df13])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df13.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxx

### Batch-14, batch_size = 1227, batch_start_index = 13001


In [None]:
# Extract info of interst
start_time = time.time()
batch14_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1227, batch_start_index = 13001, sleep=5)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
print(len(batch14_initial))
print(len(batch14_initial[-1]))

In [None]:
print(batch14_initial[:3])

### End of Checkpoint-1

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch14 = extract_emails_from_websites(batch14_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch14)) # should be same as batch_size i.e. 1000
print(len(batch14[-1])) # should be 8
print(batch14[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch11
df14 = pd.DataFrame(batch14, columns=columns)


### Checkpoint-3

In [None]:
df14.info()

In [None]:
# save as csv
pd.to_csv('batch14', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df14])

### Checkpoing-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df14.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx