# Part-1 CODE-BLOCK
- Do not make any changes in the 'CODE-BLOCK'
- Execute all cells in sequence
- Test-Blocks are disabled but for safety do not try to execute any cell marked as Test-Block

In [1]:
# import necessary libraries
import time
import requests
import re
import numpy as np
import pandas as pd
from contextlib import suppress
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Let webdriver_manager supply the chromedriver executable path and wrap it in a
# Selenium Service; every webdriver.Chrome(...) call in this notebook reuses `service`.
service = ChromeService(executable_path=ChromeDriverManager().install())

# Set up ChromeDriver options
# NOTE: these options only take effect when passed as `options=chrome_options` to
# webdriver.Chrome(...); the scraping functions in this notebook call
# webdriver.Chrome(service=service) without them, so browser windows are opened.
chrome_options = Options()
chrome_options.add_argument('--headless') # Run in headless mode to avoid opening a new browser window


In [2]:
def extract_hrefs(n=2, sleep = 15):
    """
    Collects the href of every anchor found on the first ``n`` member-list pages.

    Each list page
    (``https://wirmarket.wir.ch/de/members/list/?page=<i>&resultAmount=100``)
    is opened in its own Chrome session, given ``sleep`` seconds to render and
    then scanned for ``<a>`` elements.

    Args:
        n (int): number of paginated list pages to visit, starting at page 1, default = 2
        sleep (int | float): seconds to pause after loading a page before its
            anchors are read, default = 15 sec. Increase it on slow connections.
    Returns:
        (list) flat list with the href value of every anchor, in page order.
        Duplicates are kept, and entries are whatever ``get_attribute('href')``
        returned for the anchor (callers should expect non-string entries).
    Raises:
        Any exception raised by Selenium while opening or reading a page. The
        Chrome session of the failing page is still closed before it propagates.
    """
    # Accumulates the hrefs of all visited pages
    all_hrefs = []

    # Loop over n list pages
    for i in range(1, n+1):
        # URL of the current list page holding links to company profiles
        url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=100"

        # One fresh browser per page. To suppress the browser window use:
        # driver = webdriver.Chrome(service=service, options = chrome_options)
        driver = webdriver.Chrome(service=service)
        try:
            driver.get(url)

            # WebDriverWait proved intermittent here, so a fixed pause is used
            # to let the page finish rendering its anchors.
            time.sleep(sleep)

            # Read the hrefs into a list first so that a failure while reading
            # never leaves a half-added page in all_hrefs.
            hrefs = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, "a")]
            all_hrefs.extend(hrefs) # extend keeps the result flat (no list of lists)
        finally:
            # Always release the browser, even when loading or reading failed
            driver.quit()
    return all_hrefs


In [3]:
### Test Block ! DO NOT EXECUTE
# url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=100"
# all_hrefs_list = extract_hrefs(n=2, sleep = 15, 
#                          url = url)
                          
# # Check the length of all_hrefs list
# # Each list link contains between 400-500 hrefs
# print(len(all_hrefs_list))
# # Check for duplicates
# print(len(set(all_hrefs_list)))

# # Test for function extract_hrefs passed

In [4]:
def unique_urls(urls_list):
    """
    Removes repeated entries from a list of urls while keeping first-seen order.

    Repeats can occur because the same link appears several times on a page or
    because a regex matched it more than once. Order is preserved (instead of
    going through a ``set``) so that the result is identical on every run;
    later steps slice this list by index to form batches.

    Args:
        urls_list: iterable of hashable url entries, possibly with repeats
    Returns: list containing each distinct entry once, in order of first occurrence
    """
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(urls_list))


In [5]:
# ## Test-Block !DO NOT EXECUTE
# unique_hrefs = unique_urls(all_hrefs_list)
# print(len(unique_hrefs))

# # Test for function unique_urls passed

In [6]:
def regex_pattern_urls(pattern, hrefs_list): 
    """
    Keeps the part of each href that matches a regex pattern.

    pattern: regex pattern identifying the urls of interest, e.g. the company
        profile page pattern used in this notebook
    hrefs_list: list of hrefs to filter; each entry is converted with str()
        before it is searched
    returns: list with the matched text (match.group()) of every entry in which
        the pattern was found, in input order; entries without a match are dropped
    """
    # Search every entry lazily, then keep the matched text of the hits only
    search_results = (re.search(pattern, str(candidate)) for candidate in hrefs_list)
    return [hit.group() for hit in search_results if hit]

In [7]:
# ## Test-Block !DO NOT EXECUTE

# pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'
# company_profile_urls =  regex_pattern_urls(pattern, unique_hrefs)
# print(len(company_profile_urls))
# unique_company_profile_urls = unique_urls(company_profile_urls)
# print(len(unique_company_profile_urls)) # should be 200 (slight variation of up to 5% is acceptable due to variation in internet speed)
# print(unique_company_profile_urls[:10]) #must contain the string "info" in them

# # Test for regex_pattern_urls passed

In [8]:
def extract_info_of_interest(url_list, batch_size = 1000, batch_start_index = 0, sleep=10):
    """
    Scrapes the contact details shown on each company profile page of one batch.

    Every url of the batch is opened in its own Chrome session, given ``sleep``
    seconds to render, and read with Selenium locators.

    Args:
    url_list: list of the pages from where info is to be extracted
    batch_size (int): size of the batch. NOTE: the batch is
        ``url_list[batch_start_index : batch_start_index + batch_size + 1]``,
        i.e. up to batch_size + 1 pages are processed. This is kept unchanged
        because the batch checkpoints in this notebook expect batch_size + 1 rows.
    batch_start_index(int): index of the url_list from where parsing should start
    sleep(int): seconds to wait for a page to load before info is parsed; used
        in place of WebDriverWait
    Returns:
    list of lists, one per processed page. Each inner list always has seven
    items: company name, industry, website href, telephone href, fax href,
    address text, address map href. A field that could not be read is 'NA'.
    Raises:
    Any exception raised while starting the browser or loading a page. The
    browser of the failing page is still closed before the exception propagates.
    """
    # prepare the urls batch to be processed (end-inclusive, see docstring)
    batch = url_list[batch_start_index:batch_start_index+batch_size+1]

    company_data = [] # one seven-item list per company

    for count, url in enumerate(batch, start=1):
        driver = webdriver.Chrome(service=service)
        try:
            driver.get(url)
            time.sleep(sleep) # fixed pause avoids WebDriverWait errors, at the cost of time
            company_data.append(_scrape_company_profile(driver))
        finally:
            # Always release the browser, even when loading the page failed
            driver.quit()

        print("count", count)

    return company_data


def _scrape_company_profile(driver):
    """Reads the seven profile fields from the page currently loaded in ``driver``."""
    # Company name: text of the first element with class "inline-block"
    try:
        company_name = driver.find_elements(By.CLASS_NAME, "inline-block")[0].text
    except Exception:
        company_name = 'NA'

    # Industry: text of the twenty-first <li> on the page. This is purely
    # positional, so it silently depends on the page layout.
    try:
        industry_name = driver.find_elements(By.TAG_NAME, "li")[20].text
    except Exception:
        industry_name = 'NA'

    # Website, telephone and fax are anchors with fixed element ids
    website = _href_by_id(driver, "68-lnk")
    company_telephone = _href_by_id(driver, "66-lnk")
    company_fax_no = _href_by_id(driver, "fax-lnk")

    # Address text and its map link come from the same anchor. On failure BOTH
    # fields are filled so that every row keeps exactly seven items.
    try:
        anchor = driver.find_element(By.ID, "6-lnk")
        address = anchor.text
        address_map_link = anchor.get_attribute('href')
    except Exception:
        address = 'NA'
        address_map_link = 'NA'

    return [company_name, industry_name, website, company_telephone,
            company_fax_no, address, address_map_link]


def _href_by_id(driver, element_id):
    """Returns the href of the element with id ``element_id``, or 'NA' if the lookup fails."""
    try:
        return driver.find_element(By.ID, element_id).get_attribute('href')
    except Exception:
        return 'NA'


In [9]:
# ## Test-Block !DO NOT Execute
# company_data_lists = extract_info_of_interest(url_list = unique_company_profile_urls, batch_size = 200, batch_start_index = 0, sleep=10)

In [10]:
# # Test-Block !Do Not Execute
# print(len(company_data_lists)) # should be equal to batch size i.e. 200
# # test for extract_info_of_interest passed

In [11]:
# # Test-Block !DO NOT ExECUTE
# print(len(company_data_lists[-1]))
# print(company_data_lists[:5]) 
# # Should be a list of lists
# # Each list should have seven items
# # First item in each list should be 'company name', second: 'industry', third: website, 
# # fourth: 'tel', fifth 'fax', sixth: 'address', seventh: 'address maplink'
# # data not available should be represented by appropriate remarks

# # test for extract_info_of_interest_passed

In [12]:
def extract_emails_from_homepage(info_list, pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', index=2, timeout=10):
    """Looks up one email address on the homepage of every company in info_list.

    The homepage is downloaded with requests. The first text match of
    ``pattern`` in the page source is used; if there is none, the first
    non-empty ``mailto:`` link on the page is used instead.

    Args:
    info_list: list of lists containing extracted info including website address
    pattern: regex pattern used to find email addresses in the page source.
        Note that the default's ``[A-Z|a-z]`` class also admits a literal ``|``.
    index(int): index number in the lists where website address is located, default=2
    timeout: seconds to wait for the website before giving up, default=10
    Returns: info_list_with_emails, a new list of new rows. Each row is a copy
        of the input row with exactly ONE extra item appended: the email found,
        'NA' if the page has none, 'bad link' if the status code is not 200, or
        'website not accessible' if the page could not be fetched or parsed.
        The rows of info_list itself are left untouched.
    """

    import re
    import requests
    from bs4 import BeautifulSoup

    # Copy every row: list.copy() alone would still share (and mutate) the caller's rows
    info_list_with_emails = [list(page) for page in info_list]
    all_links = []  # emails found, only used for the summary print

    for page in info_list_with_emails:
        try:
            response = requests.get(str(page[index]), timeout=timeout)

            if response.status_code != 200:
                remark = 'bad link'
            else:
                email = None
                # search for emails in homepage response text; the first match
                # is taken so the result is the same on every run
                matches = re.findall(pattern, response.text)
                if matches:
                    email = matches[0]
                else:
                    # search for emails in homepage hrefs
                    soup = BeautifulSoup(response.text, 'html.parser')
                    for anchor in soup.find_all('a'):
                        href = anchor.get('href')
                        if href and href.startswith('mailto:'):
                            # drop the scheme and any "?subject=..." query part
                            address = href[len('mailto:'):].split('?')[0].strip()
                            if address:
                                email = address
                                break

                if email is None:
                    remark = 'NA'
                else:
                    remark = email
                    all_links.append(email)
        except Exception:
            # best effort: an unreachable or unparsable site must not stop the batch
            remark = 'website not accessible'

        # exactly one item per row keeps all rows the same length
        page.append(remark)

    print('No of emails found: ', len(all_links))

    return info_list_with_emails


In [13]:
def extract_emails_from_allpages(info_list, pattern=r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', index=2, timeout=10):
    """Looks up one email address on the homepage, or its linked sub-pages, of every company.

    Search order per company:
      1. first text match of ``pattern`` in the homepage source;
      2. otherwise the homepage anchors are walked in document order: a
         non-empty ``mailto:`` link is used directly, and a relative link
         (no 'http' in the href) is downloaded and searched with ``pattern``.
    Links containing 'http' are not followed, matching the original behaviour.

    Args:
    info_list: list of lists containing extracted info including website address
    pattern: regex pattern used to find email addresses in page source.
        Note that the default's ``[A-Z|a-z]`` class also admits a literal ``|``.
    index(int): index number in the lists where website address is located, default=2
    timeout: seconds to wait for each request before giving up, default=10
    Returns: info_list_with_emails, a new list of new rows. Each row is a copy
        of the input row with exactly ONE extra item appended: the email found,
        'NA' if none was found, 'bad link' if the homepage status code is not
        200, or 'website not accessible' if the homepage could not be fetched
        or parsed. The rows of info_list itself are left untouched.
    """

    import re
    import requests
    from urllib.parse import urljoin, urlparse
    from bs4 import BeautifulSoup

    # Copy every row: list.copy() alone would still share (and mutate) the caller's rows
    info_list_with_emails = [list(page) for page in info_list]
    all_links = []  # emails found, only used for the summary print

    for page in info_list_with_emails:
        try:
            home_page = str(page[index])
            response = requests.get(home_page, timeout=timeout)

            if response.status_code != 200:
                remark = 'bad link'
            else:
                email = None
                # search for emails on the homepage; first match keeps the result repeatable
                matches = re.findall(pattern, response.text)
                if matches:
                    email = matches[0]
                else:
                    # search for emails on all links found
                    soup = BeautifulSoup(response.text, 'html.parser')
                    visited = set()  # sub-pages already downloaded for this site
                    for anchor in soup.find_all('a'):
                        href = anchor.get('href')
                        if not href:
                            continue  # anchors without href used to abort the whole site
                        if href.startswith('mailto:'):
                            # drop the scheme and any "?subject=..." query part
                            address = href[len('mailto:'):].split('?')[0].strip()
                            if address:
                                email = address
                                break
                            continue
                        if 'http' in href or href.startswith('#'):
                            continue  # absolute links and in-page fragments are not followed
                        # Resolve against the URL actually served (after redirects)
                        link = urljoin(response.url, href)
                        if urlparse(link).scheme not in ('http', 'https') or link in visited:
                            continue  # skips tel:, javascript:, ... and repeats
                        visited.add(link)
                        try:
                            link_response = requests.get(link, timeout=timeout)
                        except Exception:
                            continue  # one broken sub-page must not discard the site
                        if link_response.status_code != 200:
                            continue
                        matches = re.findall(pattern, link_response.text)
                        if matches:
                            email = matches[0]
                            break

                if email is None:
                    remark = "NA"
                else:
                    remark = email
                    all_links.append(email)
        except Exception:
            # best effort: an unreachable or unparsable homepage must not stop the batch
            remark = 'website not accessible'

        # exactly one item per row keeps all rows the same length
        page.append(remark)

    print('No of emails found: ', len(all_links), all_links)

    return info_list_with_emails


In [14]:
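# ## Test_Block !DO NOT EXECUTE
# # NOTE: extract_emails_from_websites is not defined in the cells above;
# # use extract_emails_from_homepage or extract_emails_from_allpages instead.
# company_data_lists_with_emails = extract_emails_from_websites(company_data_lists)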

In [15]:
# ## Test_Block !DO NOT EXECUTE
# print(len(company_data_lists_with_emails)) # should be same as company_data_lists i.e. 200
# print(len(company_data_lists_with_emails[-1])) # should be 8
# print(company_data_lists_with_emails[-5:]) #should have 8 items in each list with emails or remarks added

# # Test for extract_emails_from_websites passed

In [16]:
# ## Test-Block !DO NOT EXECUTE
# # Convert to Dataframe first 100 items
# columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']

# df1 = pd.DataFrame(company_data_lists_with_emails[:100], columns=columns)
 

In [17]:
# ## Test-Block !DO NOT EXECUTE
# df1.info()

In [18]:
# ## Test_Block !DO NOT EXECUTE
# df1.head()

In [19]:
# ## Test-Block !DO NOT EXECUTE
# # Convert to DataFrame next 100 items
# columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']

# df2 = pd.DataFrame(company_data_lists_with_emails[100:200], columns=columns)


In [20]:
# ## Test-Block !DO NOT EXECUTE
# df2.info()

In [21]:
# ## Test_Block !DO NOT EXECUTE
# df2.head()

In [22]:
# ## Test_Block !DO NOT EXECUTE
# df2.tail()

In [23]:
# # Test-Block !DO nOT EXECUTE
# # Join both Dataframes
# df = pd.concat([df1, df2])

In [24]:
## Test-Block DO NOT EXECUTE
# df.info()

In [25]:
# ## Test_Block !DO NOT EXECUTE
# df.head()==df1.head() # should be same as df1.head()
# # test passed

In [26]:
# ## Test_Block !DO NOT EXECUTE
# df.tail()==df2.tail() # should be same as df2.tail()
# # test passed

### xxxxxxxxxxxxxxxxxxxxxx END OF CODE BLOCKXXXXXXXXXXXXXXXXXXXXXXXX

## XXXXXXXXXXXXX START OF EXECUTION BLOCK XXXXXXXXXXXXXX

### Part-I Extract links to all company profile pages

In [27]:
## all_hrefs already extracted. Uncomment and execute only if extracting all_hrefs again
# # extract all_hrefs from 224 list pages on the site 
# start_time = time.time()
# # url = f"https://wirmarket.wir.ch/de/members/list/?page={i}&resultAmount=25"
# all_hrefs = extract_hrefs( n=224, sleep = 17)

# end_time = time.time()
# execution_time = end_time - start_time
# print("execution_time: ", execution_time)


In [28]:
## Test-Block !Uncomment and execute only if extracting all_hrefs again
# print(len(all_hrefs))

In [29]:
# ## all_hrefs.csv already saved. Uncomment and execute only if extracting all_hrefs again
# all_hrefs_dict = {"hrefs": all_hrefs}
# all_hrefs_df = pd.DataFrame(all_hrefs_dict)
# all_hrefs_df.to_csv("all_hrefs.csv", index = False)

In [30]:
# # Duplicates already removed. !Uncomment and execute only if extracting again
# # Remove duplicate or repeated entries from the all_hrefs
# all_hrefs_unique = unique_urls(all_hrefs) 

NameError: name 'all_hrefs' is not defined

In [None]:
# # Test-Block !Uncomment and execute only if extracting again
# print(len(all_hrefs_unique))

In [None]:
# ## Already extracted !Uncomment and execute only if extracting again
# # Extract company profile page urls from all unique hrefs
# pattern = r'https:\/\/\w+\.wir\.ch\/de\/companyProfile\/profile\/[0-9A-F]{32}\/info\/\?promo=false$'
# company_profile_urls = regex_pattern_urls(pattern = pattern, hrefs_list = all_hrefs_unique)

In [None]:
# # Test_Block !Uncomment and execute only if extracting again
# print(len(company_profile_urls))

In [None]:
# # Uncomment and execute only if extracting again
# # Remove duplicate or repeated entries from company_profile_urls
# unique_company_profile_urls = unique_urls(company_profile_urls)

#### Checkpoint-1

In [None]:
# ## Test-Block. !Uncomment and execute only if extracting again
# print(len(unique_company_profile_urls)) # should be 14227 (slight variation of up to 5% is acceptable due to variation in internet speed)
# print(unique_company_profile_urls[:10])
# print(unique_company_profile_urls[-10:])

In [None]:
# # company_profile_url already saved as profile_urls.csv. Uncomment and execute only if extracting again
# # Save as csv
# company_profile_url_dict = {"urls": unique_company_profile_urls}
# company_profile_url_df = pd.DataFrame(company_profile_url_dict)
# company_profile_url_df.to_csv('profile_urls.csv', index = False)

In [None]:
## Comment this code (Ctrl+A followed by Ctrl+/) if extracting hrefs and urls again
# import profile_urls.csv as dataframe
unique_company_profiles_df = pd.read_csv("profile_urls.csv")

In [None]:
## Comment this code (Ctrl+A followed by Ctrl+/) if extracting hrefs and urls again
unique_company_profiles_df.info()

In [None]:
## Comment this code (Ctrl+A followed by Ctrl+/) if extracting hrefs and urls again
unique_company_profile_urls = unique_company_profiles_df['urls'].tolist()


In [None]:
## Comment this code (Ctrl+A followed by Ctrl+/) if extracting hrefs and urls again
print(len(unique_company_profile_urls))

In [None]:
## Comment this code (Ctrl+A followed by Ctrl+/) if extracting hrefs and urls again
print(unique_company_profile_urls[-5:])

### xxxxxxxxxxxxxxxxxxx END OF PART-1 xxxxxxxxxxxxxxxxxxxxxxxxxxx

## Part-2 Batch Processing

### Batch-1, batch_size = 1000, batch_start_index = 0

#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch1_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 0, sleep=10)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### checkpoint-1

In [None]:
# # Test-Block ! Execute only if you run the batch again
# print(len(batch1_initial)) # Should be equal to batch_size + 1
# print(len(batch1_initial[-1])) # Should be equal to 7

In [None]:
## Test_Block ! Execute only if you run the batch again
# print(batch1_initial[:3])

### End of checkpoint-1

In [None]:
batch1df = pd.read_csv('batch1.csv')

In [None]:
batch1df_initial = batch1df.drop(['email'], axis = 1)


In [None]:
batch1df_initial.info()

In [None]:
batch1_initial = batch1df_initial.values.tolist()
print(batch1_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch1 = extract_emails_from_homepage(batch1_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### checkpoint-2

In [None]:
print(len(batch1)) # expected batch_size + 1, i.e. 1001, if batch1.csv holds the full Batch-1 run (extract_info_of_interest slices end-inclusively)
print(len(batch1[-1])) # should be 8
print(batch1[-5:]) #should have 8 items in each list with emails or remarks added


### End of checkpoint-2

In [None]:
# Columns to be used for all batches
columns=['company_name', 'industry', 'website', 'telephone', 'fax', 'address', 'address_maplink', 'email']


In [None]:
# # Convert to Dataframe batch1
df1 = pd.DataFrame(batch1, columns=columns)


### Checkpoint-3

In [None]:
# # Test-Block ! Execute only if you run the batch again
df1.info()

In [None]:
df1.head()

### End of Checkpoint-3

In [None]:
# Save as csv file
df1.to_csv('batch1.csv', index = False)

### xxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-2, batch_size = 1000, batch_start_index = 1001

#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch2_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 1001, sleep=10)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### checkpoint-1

In [None]:
# ## Test_Block ! Execute only if you run the batch again
# print(len(batch2_initial))
# print(len(batch2_initial[-1]))

In [None]:
# ## Test_Block ! Execute only if you run the batch again
# print(batch2_initial[:3])

### End of Checkpoint-1

In [None]:
batch2df = pd.read_csv('batch2.csv')

In [None]:
batch2df_initial = batch2df.drop(['email'], axis = 1)

In [None]:
batch2df_initial.info()

In [None]:
batch2_initial = batch2df_initial.values.tolist()
print(batch2_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch2 = extract_emails_from_homepage(batch2_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

#### Following Function Extracts emails from All Pages of a Site (Takes A Lot of Time: 15 sec per page)
#### Entire Batch can take up to 5 hours - Uncomment (Ctrl + A followed by Ctrl + /) to enable the code.
#### Execute (Shift + Enter) only if needed

### Checkpoint-2

In [None]:
print(len(batch2)) # should be same as batch_size i.e. 1000
print(len(batch2[-1])) # should be 8
print(batch2[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch2 to a DataFrame using the shared column names
df2 = pd.DataFrame(batch2, columns=columns)


### Checkpoint 3

In [None]:
df2.info()

In [None]:
df2.head()

### End of Checkpoint-3

In [None]:
# save as csv
df2.to_csv('batch2.csv', index = False)

In [None]:
# Merge the two dataframes
df = pd.concat([df1, df2])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df2.tail()

### End of Checkpoint-4

### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-3, batch_size = 1000, batch_start_index = 2001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch3_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 2001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### checkpoint-1

In [None]:
# # Test-Block ! Execute only if running the batch again
# print(len(batch3_initial))
# print(len(batch3_initial[-1]))

In [None]:
# # Test-Block ! Execute only if running the batch again
# print(batch3_initial[:3])

### End of Checkpoint-1

In [None]:
batch3df = pd.read_csv('batch3.csv')

In [None]:
batch3df_initial = batch3df.drop(['email'], axis = 1)

In [None]:
batch3df_initial.info()

In [None]:
batch3_initial = batch3df_initial.values.tolist()
print(batch3_initial[:3])

### End of Checkpoint-1

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch3 = extract_emails_from_homepage(batch3_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch3)) # should be same as batch_size i.e. 1000
print(len(batch3[-1])) # should be 8
print(batch3[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch3
df3 = pd.DataFrame(batch3, columns=columns)


### Checkpoint 3

In [None]:
df3.info()

In [None]:
df3.head()

### End of Checkpoint-3

In [None]:
#save as csv
df3.to_csv('batch3.csv', index = False)

In [None]:
# Merge the two dataframes
df = pd.concat([df, df3])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df3.tail()

### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-4, batch_size = 1000, batch_start_index = 3001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch4_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 3001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### checkpoint-1

In [None]:
# # Test-Block ! Uncomment and execute only if batch is run again
# print(len(batch4_initial))
# print(len(batch4_initial[-1]))

In [None]:
# # Test-Block ! Uncomment and execute only if batch is run again
# print(batch4_initial[:3])

### End of Checkpoint-1

In [None]:
batch4df = pd.read_csv('batch4.csv')

In [None]:
batch4df_initial = batch4df.drop(['email'], axis = 1)

In [None]:
batch4df_initial.info()

In [None]:
# Rows of batch 4 (without the email column) as a list of lists
batch4_initial = batch4df_initial.values.tolist()
# Preview the batch just loaded (previously printed batch1_initial by mistake)
print(batch4_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails
start_time = time.time()
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
batch4 = extract_emails_from_homepage(batch4_initial, pattern = pattern , index=2)
end_time = time.time()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch4)) # should be same as batch_size i.e. 1000
print(len(batch4[-1])) # should be 8
print(batch4[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch4 to a DataFrame using the shared column names
df4 = pd.DataFrame(batch4, columns=columns)


### Checkpoint 3

In [None]:
df4.info()

In [None]:
df4.head()

### End of Checkpoint-3

In [None]:
#save as csv
df4.to_csv('batch4.csv', index = False)

In [None]:
# Merge the two dataframes
df = pd.concat([df, df4])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df4.tail()

### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-5, batch_size = 1000, batch_start_index = 4001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch5_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 4001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if the batch is run again
# print(len(batch5_initial))
# print(len(batch5_initial[-1]))

In [None]:
# # Test-Block !Uncomment and execute only if the batch is run again
# print(batch5_initial[:3])

### End of Checkpoint-1

In [None]:
batch5df = pd.read_csv('batch5.csv')

In [None]:
batch5df_initial = batch5df.drop(['email'], axis = 1)

In [None]:
batch5df_initial.info()

In [None]:
# Rebuild the list-of-rows input for the e-mail extractor from the cached batch5 frame
batch5_initial = batch5df_initial.values.tolist()
# Preview the first rows of THIS batch (the cell used to print batch1_initial by mistake)
print(batch5_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 5 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch5 = extract_emails_from_homepage(batch5_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch5)) # should be same as batch_size i.e. 1000
print(len(batch5[-1])) # should be 8
print(batch5[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch5 to a DataFrame
df5 = pd.DataFrame(batch5, columns=columns)


### Checkpoint 3

In [None]:
df5.info()

In [None]:
df5.head()

### End of Checkpoint-3

In [None]:
# save as csv
df5.to_csv('batch5.csv', index = False)

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df5])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df5.tail()

### xxxxxxxxxxxx Push to Github xxxxxxxxxxxxxx

### Batch-6, batch_size = 1000, batch_start_index = 5001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch6_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 5001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if batch is run again
# print(len(batch6_initial))
# print(len(batch6_initial[-1]))

In [None]:
# # Test-Block !Uncomment and execute only if batch is run again
# print(batch6_initial[:3])

### End of Checkpoint-1

In [None]:
batch6df = pd.read_csv('batch6.csv')

In [None]:
batch6df_initial = batch6df.drop(['email'], axis = 1)

In [None]:
batch6df_initial.info()

In [None]:
batch6_initial = batch6df_initial.values.tolist()
print(batch6_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 6 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch6 = extract_emails_from_homepage(batch6_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch6)) # should be same as batch_size i.e. 1000
print(len(batch6[-1])) # should be 8
print(batch6[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch6 to a DataFrame (this cell was duplicated verbatim; one copy is enough)
df6 = pd.DataFrame(batch6, columns=columns)


### Checkpoint-3

In [None]:
df6.info()

In [None]:
# save as csv
df6.to_csv('batch6.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df6])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head()==df1.head()

In [None]:
df.tail()==df6.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-7, batch_size = 1000, batch_start_index = 6001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch7_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 6001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Execute only if running the batch again
# print(len(batch7_initial))
# print(len(batch7_initial[-1]))

In [None]:
# # Test-Block !Execute only if running the batch again
# print(batch7_initial[:3])

### End of Checkpoint-1

In [None]:
batch7df = pd.read_csv('batch7.csv')

In [None]:
batch7df_initial = batch7df.drop(['email'], axis = 1)

In [None]:
batch7df_initial.info()

In [None]:
# Rebuild the list-of-rows input for the e-mail extractor from the cached batch7 frame
batch7_initial = batch7df_initial.values.tolist()
# Preview the first rows of THIS batch (the cell used to print batch1_initial by mistake)
print(batch7_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 7 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch7 = extract_emails_from_homepage(batch7_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch7)) # should be same as batch_size i.e. 1000
print(len(batch7[-1])) # should be 8
print(batch7[:15]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch7
df7 = pd.DataFrame(batch7, columns=columns)


### Checkpoint-3

In [None]:
df7.info()

In [None]:
# save as csv
df7.to_csv('batch7.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df7])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head()==df1.head()

In [None]:
df.tail()==df7.tail()

### xxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxx

### Batch-8, batch_size = 1000, batch_start_index = 7001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch8_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 7001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if running batch again
# print(len(batch8_initial))
# print(len(batch8_initial[-1]))

In [None]:
# # Test-Block !Uncomment and execute only if running batch again
# print(batch8_initial[:3])

### End of Checkpoint-1

In [None]:
batch8df = pd.read_csv('batch8.csv')

In [None]:
batch8df_initial = batch8df.drop(['email'], axis = 1)

In [None]:
batch8df_initial.info()

In [None]:
# Rebuild the list-of-rows input for the e-mail extractor from the cached batch8 frame
batch8_initial = batch8df_initial.values.tolist()
# Preview the first rows of THIS batch (the cell used to print batch1_initial by mistake)
print(batch8_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 8 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch8 = extract_emails_from_homepage(batch8_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch8)) # should be same as batch_size i.e. 1000
print(len(batch8[-1])) # should be 8
print(batch8[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch8
df8 = pd.DataFrame(batch8, columns=columns)


### Checkpoint-3

In [None]:
df8.info()

In [None]:
# save as csv
df8.to_csv('batch8.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df8])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df8.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-9, batch_size = 1000, batch_start_index = 8001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch9_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 8001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(len(batch9_initial))
# print(len(batch9_initial[-1]))

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(batch9_initial[:3])

### End of Checkpoint-1

In [None]:
batch9df = pd.read_csv('batch9.csv')

In [None]:
batch9df_initial = batch9df.drop(['email'], axis = 1)

In [None]:
batch9df_initial.info()

In [None]:
batch9_initial = batch9df_initial.values.tolist()
print(batch9_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 9 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch9 = extract_emails_from_homepage(batch9_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch9)) # should be same as batch_size i.e. 1000
print(len(batch9[-1])) # should be 8
print(batch9[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch9
df9 = pd.DataFrame(batch9, columns=columns)


### Checkpoint-3

In [None]:
df9.info()

In [None]:
# save as csv
df9.to_csv('batch9.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df9])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df9.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-10, batch_size = 1000, batch_start_index = 9001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch10_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 9001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(len(batch10_initial))
# print(len(batch10_initial[-1]))

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(batch10_initial[:3])

### End of Checkpoint-1

In [None]:
batch10df = pd.read_csv('batch10.csv')

In [None]:
batch10df_initial = batch10df.drop(['email'], axis = 1)

In [None]:
batch10df_initial.info()

In [None]:
batch10_initial = batch10df_initial.values.tolist()
print(batch10_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 10 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch10 = extract_emails_from_homepage(batch10_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch10)) # should be same as batch_size i.e. 1000
print(len(batch10[-1])) # should be 8
print(batch10[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch10
df10 = pd.DataFrame(batch10, columns=columns)


### Checkpoint-3

In [None]:
df10.info()

In [None]:
# save as csv
df10.to_csv('batch10.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df10])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df10.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxx

### Batch-11, batch_size = 1000, batch_start_index = 10001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch11_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 10001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(len(batch11_initial))
# print(len(batch11_initial[-1]))

In [None]:
# # # Test-Block !Uncomment and execute only if running the batch again
# print(batch11_initial[:3])

### End of Checkpoint-1

In [None]:
batch11df = pd.read_csv('batch11.csv')

In [None]:
batch11df_initial = batch11df.drop(['email'], axis = 1)

In [None]:
batch11df_initial.info()

In [None]:
batch11_initial = batch11df_initial.values.tolist()
print(batch11_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 11 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch11 = extract_emails_from_homepage(batch11_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch11)) # should be same as batch_size i.e. 1000
print(len(batch11[-1])) # should be 8
print(batch11[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# # Convert to Dataframe batch11
df11 = pd.DataFrame(batch11, columns=columns)


### Checkpoint-3

In [None]:
df11.info()

In [None]:
# save as csv
df11.to_csv('batch11.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df11])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df11.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx

### Batch-12, batch_size = 1000, batch_start_index = 11001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch12_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 11001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(len(batch12_initial))
# print(len(batch12_initial[-1]))

In [None]:
# # # Test-Block !Uncomment and execute only if running the batch again
# print(batch12_initial[:3])

### End of Checkpoint-1

In [None]:
batch12df = pd.read_csv('batch12.csv')

In [None]:
batch12df_initial = batch12df.drop(['email'], axis = 1)

In [None]:
batch12df_initial.info()

In [None]:
batch12_initial = batch12df_initial.values.tolist()


In [None]:
print(batch12_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 12 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch12 = extract_emails_from_homepage(batch12_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch12)) # should be same as batch_size i.e. 1000
print(len(batch12[-1])) # should be 8
print(batch12[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch12 to a DataFrame
df12 = pd.DataFrame(batch12, columns=columns)


### Checkpoint-3

In [None]:
df12.info()

In [None]:
# save as csv
df12.to_csv('batch12.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df12])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df12.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxx

### Batch-13, batch_size = 1000, batch_start_index = 12001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch13_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1000, batch_start_index = 12001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test-Block !Uncomment and execute only if running the batch again
# print(len(batch13_initial))
# print(len(batch13_initial[-1]))

In [None]:
# # # Test-Block !Uncomment and execute only if running the batch again
# print(batch13_initial[:3])

### End of Checkpoint-1

In [None]:
batch13df = pd.read_csv('batch13.csv')

In [None]:
batch13df_initial = batch13df.drop(['email'], axis = 1)

In [None]:
batch13df_initial.info()

In [None]:
batch13_initial = batch13df_initial.values.tolist()
print(batch13_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 13 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch13 = extract_emails_from_homepage(batch13_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch13)) # should be same as batch_size i.e. 1000
print(len(batch13[-1])) # should be 8
print(batch13[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch13 to a DataFrame
df13 = pd.DataFrame(batch13, columns=columns)


### Checkpoint-3

In [None]:
df13.info()

In [None]:
# save as csv
df13.to_csv('batch13.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df13])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df13.tail()

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxx

### Batch-14, batch_size = 1227, batch_start_index = 13001


#### !Cell is executed. Uncomment (Ctrl + A followed by Ctrl + /) and execute (Shift + Enter) again only if you want to run the batch again.

In [None]:
# # Extract info of interest
# start_time = time.time()
# batch14_initial = extract_info_of_interest(unique_company_profile_urls, batch_size = 1227, batch_start_index = 13001, sleep=5)
# end_time = time.time()
# execution_time = end_time - start_time
# print('execution time: ', execution_time)

### Checkpoint-1

In [None]:
# # Test_Block !Uncomment and execute only if running the batch again
# print(len(batch14_initial))
# print(len(batch14_initial[-1]))

In [None]:
# # Test_Block !Uncomment and execute only if running the batch again
# print(batch14_initial[:3])

### End of Checkpoint-1

In [None]:
batch14df = pd.read_csv('batch14.csv')

In [None]:
batch14df_initial = batch14df.drop(['email'], axis = 1)

In [None]:
batch14df_initial.info()

In [None]:
batch14_initial = batch14df_initial.values.tolist()
print(batch14_initial[:3])

#### Following Function Extracts emails from Home Page Only (Takes Less Time: 1 sec per page)

In [None]:
# Extract emails for batch 14 and time the run
start_time = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
# Inside a character class '|' is a literal pipe, so the former TLD class
# [A-Z|a-z] also accepted '|' (e.g. "x@y.ch|foo"); [A-Za-z] allows letters only.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
batch14 = extract_emails_from_homepage(batch14_initial, pattern=pattern, index=2)
end_time = time.perf_counter()
execution_time = end_time - start_time
print('execution time: ', execution_time)

### Checkpoint-2

In [None]:
print(len(batch14)) # should be same as batch_size i.e. 1227 for this final batch
print(len(batch14[-1])) # should be 8
print(batch14[-5:]) #should have 8 items in each list with emails or remarks added

### End of Checkpoint-2

In [None]:
# Convert batch14 to a DataFrame
df14 = pd.DataFrame(batch14, columns=columns)


### Checkpoint-3

In [None]:
df14.info()

In [None]:
# save as csv
df14.to_csv('batch14.csv', index = False)

### End of Checkpoint-3

In [None]:
#### Merge the two dataframes
df = pd.concat([df, df14])

### Checkpoint-4

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.head() == df1.head()

In [None]:
df.tail() == df14.tail()

In [None]:
## Remove duplicates
df = df.drop_duplicates()

In [None]:
## verify
df.info()

In [None]:
## Save all batches in a csv file
df.to_csv('all_batches_home.csv', index = False)

### End of Checkpoint-4

### xxxxxxxxxxxxxxxxxxxxxx Push to Github xxxxxxxxxxxxxxxxxxxxxxxxx