## **This script pulls data on companies from various sectors/industries and saves it into a file**

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup

import time
import pandas as pd
import pprint

from IPython.display import clear_output

## Constants and Definition of Functions

`wait()` and `long_wait()` pause the program so that the webpage can finish loading before the next steps are executed

In [2]:
def wait(seconds=3):
    """Pause execution so the page has time to load before the next step.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 3, which is the
            pause every existing ``wait()`` call in this notebook gets.
    """
    time.sleep(seconds)
def long_wait(seconds=10):
    """Pause execution for a longer period, for slow page loads.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 10, which is the
            pause every existing ``long_wait()`` call in this notebook gets.
    """
    time.sleep(seconds)

In [3]:
# Placeholder written into cells that have no data (np.nan would work as well)
empty_item = "na"

# No longer needed, can be used for testing
# CB_Insight_Firms.xlsx is just a file with a list of company names to search for
'''
firm_names_file = "CB_Insight_Firms.xlsx" 
firm_names = pd.read_excel(firm_names_file, sheet_name="Firms", header=0)["Firm"].tolist()
firm_names[0]
'''

# Types (columns) of data collected for each company, in the order they are added
good_items_to_add = "URL, Total Funding, Description, Total Headcount, Country, Sector, Industry, Sub-Industry, Latest Valuation"
items_to_add = good_items_to_add.split(", ")

# Full header of the company DataFrame: the search term and the company name
# come first, followed by every collected item
data_column_names = ["Search Input", "Company Name", *items_to_add]
print(data_column_names)
wait()

# Types (columns) of data collected for each sector
sector_columns_str = "Sector, No. Companies, Total Funding (Pre-Exit/IPO)"
sector_columns = sector_columns_str.split(", ")
print(sector_columns)
wait()

`remove_criteria()`:  
remove all possible criteria (columns)  

`add_criteria()`:  
add all the columns and types of values for the companies that you want (edit from `items_to_add`)  

`filter_columns()`:  
1. click on button to change the columns shown  
2. `remove_criteria()` to remove all the columns  
3. `add_criteria()` to add wanted columns in the order stated
4. click on button to apply change in columns

In [24]:
def remove_criteria(driver):
    """Click the remove button of every column that may be staged.

    For each known column label, looks up the button whose ``data-test``
    attribute is ``"<label>-staged-column-remove-button"`` and clicks it.
    Labels whose button cannot be found or clicked are skipped, so the
    function is best-effort and never raises for a missing column.

    Args:
        driver: Selenium WebDriver with the column-selection dialog open.
    """
    # NOTE(review): "Total Headcount", "Total Funding" and "Latest Valuation"
    # appear twice in this list; kept as in the original - confirm whether the
    # second pass is intentional.
    all_items_remove = (
        "Connect, URL, Description, Company Status, Company ID, Organization ID, Founded Year, All People, "
        "Total Headcount, Market Cap, Stock Price, Stock Price (1 week % change), Latest Revenue, Latest Revenue Multiple, "
        "Mosaic (Overall), Mosaic (Momentum), Mosaic (Money), Mosaic (Market), Mosaic (Management), Competitors, Parent Company, "
        "Subsidiaries, Sector, Industry, Sub-Industry, Market Reports, Team Tag, Country, Continent, State, City, Street, ZIP Code, "
        "Total Funding, All Investors, Latest Exit Round, Acquirer, VC Backed, Latest Funding Investors, "
        "Latest Funding Simplified Round, Latest Funding Amount, Latest Funding Date, Latest Funding Round, Date of Exit, "
        "Latest Valuation, Your Collections, Expert Collections, Total Headcount, Total Funding, Latest Valuation"
    )
    all_items_to_remove = all_items_remove.split(", ")
    remove_string_postfix = "-staged-column-remove-button"

    items_not_found = []  # kept for debugging, see the commented print below
    for item in all_items_to_remove:
        try:
            btn = driver.find_element(By.XPATH, f"//button[@data-test='{item}{remove_string_postfix}']")
            btn.click()
        except Exception:
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit still stop the notebook; Selenium lookup/click
            # failures are still tolerated.
            items_not_found.append(item)
    # print(items_not_found)
def add_criteria(driver):
    """Add every column listed in ``items_to_add`` to the results table.

    For each label, clicks the button whose ``data-test`` attribute is
    ``"<label>-column-button"``. A label whose button cannot be found or
    clicked is reported on stdout and skipped.

    Args:
        driver: Selenium WebDriver with the column-selection dialog open.
    """
    for item in items_to_add:
        try:
            btn = driver.find_element(By.XPATH, f"//button[@data-test='{item}-column-button']")
            btn.click()
        except Exception:
            # Report the label itself: ``btn`` is unbound when the very first
            # lookup fails (NameError) and otherwise still refers to the
            # previous, successfully found button.
            print(f"{item} not found")
def filter_columns(driver):
    """Replace the displayed table columns with the ones this notebook collects.

    Opens the "Add Column" dialog (waiting up to 10 seconds for its button to
    become clickable), removes the currently staged columns, adds the wanted
    ones, then confirms the dialog. A ``wait()`` follows every step.

    Args:
        driver: Selenium WebDriver showing the search results page.
    """
    # 1. open the column dialog
    open_dialog_locator = (By.XPATH, "//button[@title='Add Column']")
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable(open_dialog_locator)).click()
    wait()

    # 2. clear the staged columns, 3. add the wanted ones - pausing after each
    for step in (remove_criteria, add_criteria):
        step(driver)
        wait()

    # 4. apply the new column selection
    driver.find_element(By.XPATH, "//button[@data-test='columns-modal-done-button']").click()
    wait()

`close_cookie()`:  
wait for up to 10 seconds for the cookie button to appear and close it, else print "Cookie button not found"  

`login()`:  
click on the relevant buttons to help to login, else print "Login not successful"


`setup()`:  
1. declares any options for the driver if needed
2. go to the site
3. maximise the window to ensure everything that needs to be seen can be seen
4. `close_cookie()`, then proceed to login
5. change search to advanced search to see the entire output
6. `filter_columns()`
7. change search outputs to be sorted by funding from High to Low
8. return the driver after all these are done

In [25]:
def close_cookie(driver):
    """Dismiss the pop-up shown on page load, if it appears.

    Waits up to 10 seconds for the button with id
    ``pendo-close-guide-75f328f5`` to become clickable, clicks it and pauses.
    If the button never shows up (or the click fails) a message is printed
    and the function returns normally.

    Args:
        driver: Selenium WebDriver that has just loaded the site.
    """
    try:
        close_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((
            By.XPATH, "//button[@id='pendo-close-guide-75f328f5']"))
        )
        close_btn.click()
        wait()
    except Exception:
        # Narrowed from a bare ``except`` so an interrupt during the
        # 10-second wait still stops the notebook.
        print("Cookie button not found")
def login(driver):
    """Log in through the email / password form.

    Credentials are taken from the ``CB_USERNAME`` and ``CB_PASSWORD``
    environment variables; whichever is missing is prompted for
    interactively. They used to be hard-coded in this function, which exposes
    the account to anyone who can read the notebook - the previously
    committed password should be rotated.

    Best-effort: if any step of the form interaction fails,
    "Login not successful" is printed and the function returns normally.

    Args:
        driver: Selenium WebDriver showing the login page.
    """
    import getpass
    import os

    username = os.environ.get("CB_USERNAME") or input("CB Insights email: ")
    password = os.environ.get("CB_PASSWORD") or getpass.getpass("CB Insights password: ")

    try:
        email_field = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@placeholder='Enter your email address']"))
        )
        email_field.click()
        email_field.send_keys(username)
        continue_btn = driver.find_element(By.XPATH, "//span[text()='Continue']")
        continue_btn.click()

        # The password field only becomes available after "Continue"
        password_field = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//input[@placeholder='Enter your password']"))
        )
        password_field.click()
        password_field.send_keys(password)
        login_btn = driver.find_element(By.XPATH, "//span[text()='Log in']")
        login_btn.click()
        wait(); wait()
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        print("Login not successful")
def setup(headless=False):
    """Start Chrome, log in and prepare the results table for scraping.

    Steps: open the site, maximise the window, dismiss the pop-up, log in,
    switch to advanced search, select the wanted columns and sort the table
    by "Total Funding" from high to low.

    Args:
        headless: When True the browser is started with ``--headless`` so no
            window is displayed.

    Returns:
        The ready-to-use Selenium WebDriver.

    Raises:
        Whatever Selenium raises when a step fails. In that case the browser
        is closed before the exception propagates, because the caller never
        receives the driver handle and could not close it.
    """
    options = webdriver.ChromeOptions()
    if headless:
        # headless means that the browser will not be displayed when running the code
        options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get("https://app.cbinsights.com/i/emerging-tech")
        # maximise so that every element that has to be clicked is visible
        driver.maximize_window()

        close_cookie(driver)
        login(driver)

        advanced_search_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[@data-testid='advanced-search-button']"))
        )
        advanced_search_btn.click()

        filter_columns(driver)

        # Second "flexItem--shrink" element inside the "Total Funding" column header
        funding_dropdown_btn = driver.find_element(By.XPATH, "//div[@data-rbd-drag-handle-draggable-id='Total Funding']").find_elements(By.CLASS_NAME, "flexItem--shrink")[1]
        funding_dropdown_btn.click()
        wait()

        desc_btn = driver.find_elements(By.XPATH, "//span[@data-test='High to low']")
        desc_btn[0].click()
    except BaseException:
        # Do not leave an orphaned (possibly invisible) Chrome process behind
        driver.quit()
        raise

    return driver

`search(company)`: (legacy, used in the past but not anymore)  
clear any prior search inputs and search for `company`

In [26]:
def search(company, driver):
    """Clear any previous search input and search for ``company`` (legacy).

    Args:
        company: Text typed into the top search field.
        driver: Selenium WebDriver showing the search page.
    """
    try:
        clear_input_btn = driver.find_element(By.XPATH, "//span[@data-testid='pointer']")
        clear_input_btn.click()
    except Exception:
        # Deliberately best-effort: if the clear control cannot be found or
        # clicked, carry on with the search. Narrowed from a bare ``except``
        # so KeyboardInterrupt / SystemExit still propagate.
        pass

    search_field = driver.find_element(By.XPATH, "//input[@id='topSearchInput']")
    search_field.click()
    search_field.send_keys(company)
    wait()

    search_btn = driver.find_element(By.XPATH, "//button[@data-testid='search-button']")
    search_btn.click()
    long_wait()

`save_table(df, company)`:  
takes the table from the html and uses `BeautifulSoup` to parse and save the information from the table and save it into `df`  
`company` is the thing that was used to search - to check back on the input/search item  
**make sure to pass `save=True` to save the data** - currently have a big file that took 3hrs to scrape 

`save_sector_data(df, industry)`:  
scrapes the information from the site to get the funding data by sector/industry, saving `industry` and funding into `df`

`save_all_tables(df, company, iterations=400)`:  
`save_table()` for every page, up until the next button is unclickable/400 pages (limit of 10 000 companies per search, 25 companies a page, 400 pages)  
`iterations` can be changed to reduce the number of pages of data scraped

In [27]:
def save_table(df, company, driver, save=False):
    """Append every row of the currently displayed results table to ``df``.

    The second element carrying the results-table CSS class is parsed with
    BeautifulSoup. For each row, the ``flex-0`` cells between the first and
    the last one are stored under the matching name from
    ``data_column_names``; empty cells are stored as ``empty_item``.

    Args:
        df: DataFrame that receives the rows (modified in place).
        company: The search term, stored in the "Search Input" column.
        driver: Selenium WebDriver showing the results page.
        save: When True, ``df`` is written to 'CBI_Firm_Data.csv' afterwards.
    """
    # NOTE(review): captured output shows names with a doubled first letter
    # ("TTenglong", "CCoreSite") - presumably extra text inside the name
    # cell; confirm against the page markup.
    tables = driver.find_elements(By.CLASS_NAME, "client-reusable-Table-___styles__maxContent___wQTEa")
    soup = BeautifulSoup(tables[1].get_attribute('outerHTML'), "html.parser")

    row_divs = soup.find_all("div", {"class": "client-modules-top-search-components-TableView-___styles__rowWrapper___WLBoH"})
    for row_div in row_divs:
        cells = row_div.find_all("div", {"class": "flex-0"})
        record = {'Search Input': company}

        # The first and the last cell of a row are skipped
        for position, cell in enumerate(cells[1:-1], start=1):
            record[data_column_names[position]] = cell.text or empty_item

        df.loc[len(df)] = record

    if save:
        df.to_csv('CBI_Firm_Data.csv', index=False)
def save_sector_data(df, industry, driver):
    """Append the company count and total funding of ``industry`` to ``df``.

    Reads the text of the first child ``div`` of the ``shadow-sm`` element.
    The company count is the text before the first space; the funding is the
    part between the first "|" and "Total Funding".

    Args:
        df: DataFrame with the ``sector_columns`` columns (modified in place).
        industry: Name stored in the first sector column.
        driver: Selenium WebDriver with the industry filter applied.

    Raises:
        IndexError: If the summary text contains no "|" separator.
    """
    info = driver.find_element(By.CLASS_NAME, "shadow-sm").find_elements(By.XPATH, "./div")[0]
    # Fetch the text once so both values come from the same snapshot
    summary_text = info.text

    # Strip the whitespace left around the amount (previously "$173.32B ")
    funding = summary_text.split("Total Funding")[0].split("|")[1].strip()
    num_companies = summary_text.split(" ")[0]

    row_insert = {
        sector_columns[0]: industry,
        sector_columns[1]: num_companies,
        sector_columns[2]: funding,
    }
    df.loc[len(df)] = row_insert
def save_all_tables(df, company, driver, iterations=400, save=False):
    """Scrape the results table page by page into ``df``.

    Saves the current page, then keeps clicking "next" and saving until the
    next button is disabled or ``iterations`` pages have been scraped.

    Args:
        df: DataFrame that receives the rows (modified in place).
        company: The search term, stored with every row.
        driver: Selenium WebDriver showing the first results page.
        iterations: Maximum number of pages to scrape.
        save: Forwarded to ``save_table`` for every page, so the CSV always
            reflects everything scraped so far. Previously only the first
            page was saved, leaving later pages out of the file.
    """
    save_table(df, company, driver, save=save)
    next_btn = driver.find_element(By.XPATH, "//div[@data-test='Pagebar-Next']").find_element(By.XPATH, "..")

    count = 1
    while next_btn.is_enabled():
        if count >= iterations:
            # used to ensure it doesnt go past 400 pages (2nd failsafe, up to 10 000 members), 1 run had 10 025 for electronics sector
            break
        if not count % 10:
            # progress report every 10 pages
            clear_output()
            print(count, company)
        next_btn.click()
        wait()
        save_table(df, company, driver, save=save)
        count += 1


## Start of Logic

**Rerun this cell for testing to clear the dataframes**

`pp`: `PrettyPrinter` used to debug and view items nicely (in lines and nested)  
`df`: pandas `DataFrame` used to hold all company data  
`sector_df`: pandas `DataFrame` used to hold sector funding data  
`lst_industries`: currently the list of sectors that we are scraping data from for sectors and companies in that sector

In [28]:
# Pretty-printer for debugging nested structures
pp = pprint.PrettyPrinter()

# Empty result containers: one for company rows, one for sector funding rows
df = pd.DataFrame(columns=data_column_names)
sector_df = pd.DataFrame(columns=sector_columns)

# Briefly show both (empty) frames to confirm they were reset, then clear the output
print(df, sector_df)
wait()
clear_output()

# Sectors whose funding data and companies are scraped
lst_industries = [
    'Computer Hardware & Services',
    'Electronics',
    'Media (Traditional)',
    'Internet',
    'Mobile & Telecommunications',
    'Software (non-internet/mobile)',
]

Logic:  
1. Setup the driver/browser 
2. Click and open up `Industries`
3. For every sector/industry in `lst_industries`:  
        - Scrape the funding data of the sector/industry  
        - Scrape the data of companies of every page unless otherwise specified (eg. `iterations=3`)  
        - Save the data into the file if saving has been enabled (the `save` argument of `save_table()`)

In [29]:
# Wall-clock start of the whole scraping run
TIME_START = time.time()

# Start the browser, log in and prepare the results table (see setup())
driver = setup()

# Open the "Industries" filter section
industries_dropdown_btn = driver.find_element(By.XPATH, "//button[@data-test='filter-section-open-Industries']")
industries_dropdown_btn.click()
wait()

for industry in lst_industries:
    # Open the industry picker ...
    industry_selector_btn = driver.find_element(By.XPATH, "//button[@data-test='tree-filter-popover-trigger-Industries']")
    industry_selector_btn.click()
    wait()

    # ... and select the entry whose text equals the industry name
    btn = driver.find_element(By.XPATH, f"//span[text()='{industry}']")
    btn.click()
    wait()
    
    # Record the sector totals, then the company rows.
    # iterations=3 limits the run to 3 pages per industry; save=False means no CSV is written.
    save_sector_data(sector_df, industry, driver)
    save_all_tables(df, industry, driver, iterations=3, save=False)

    # Clear the industry filter before moving on to the next industry
    clear_industry_btn = driver.find_element(By.XPATH, "//button[@data-test='Industries-on-clear']")
    clear_industry_btn.click()
    wait()


# Report how long the run took and show both result frames
TIME_END = time.time()
TIME_TAKEN = TIME_END - TIME_START
print(f"Execution time: {TIME_TAKEN:.2f} seconds")
print(sector_df)
print(df)

Execution time: 127.24 seconds
                           Sector No. Companies Total Funding (Pre-Exit/IPO)
0    Computer Hardware & Services         39460                    $173.32B 
1                     Electronics         21514                    $224.53B 
2             Media (Traditional)         11307                     $74.77B 
3                        Internet        193583                      $1.79T 
4     Mobile & Telecommunications         51925                    $693.89B 
5  Software (non-internet/mobile)         38932                    $240.06B 
                       Search Input              Company Name  \
0      Computer Hardware & Services             Global Switch   
1      Computer Hardware & Services                EdgeConneX   
2      Computer Hardware & Services                    Ligado   
3      Computer Hardware & Services   TTenglong Holding Group   
4      Computer Hardware & Services                 CCoreSite   
..                              ...     

In [3]:
# Scratch cell: start a headless Chrome session directly, without going through setup()
options = webdriver.ChromeOptions()
# headless means that the browser will not be displayed when running the code
options.add_argument('--headless')

driver = webdriver.Chrome(options=options)

## DEBUGGING

In [373]:
# Column of the company DataFrame to inspect
item = "Sector"

lst = df[item].to_list()
# distinct values, total rows, and rows holding the "na" placeholder - one per line
print(len(set(lst)), len(lst), lst.count("na"), sep="\n")
# print(lst.value)
print(df[item].value_counts())

6
60025
0
Sector
Electronics                       10025
Computer Hardware & Services      10000
Media (Traditional)               10000
Internet                          10000
Mobile & Telecommunications       10000
Software (non-internet/mobile)    10000
Name: count, dtype: int64


### Get no. of companies and total funding

In [255]:
# First child div of the "shadow-sm" element holds the summary text
info = driver.find_element(By.CLASS_NAME, "shadow-sm").find_elements(By.XPATH, "./div")[0]
# Funding: the part between the first "|" and "Total Funding"
fun = info.text.partition("Total Funding")[0].split("|")[1]
# Number of companies: everything before the first space
num = info.text.partition(" ")[0]
print(num, fun, sep="\n")

39148
$172.27B 


## APPENDIX - May or may not be useful in the future

In [None]:
# Comma-separated column labels kept for reference.
# NOTE(review): presumably the labels usable with add_criteria() - confirm before relying on it.
ALL_ITEMS_ADD_CRITERIA = ", ".join((
    "Connect, URL, Description, Company Status, Company ID, Organization ID, Founded Year, All People",
    "Total Headcount, Market Cap, Stock Price, Stock Price (1 week % change), Latest Revenue, Latest Revenue Multiple",
    "Mosaic Overall, Momentum, Money, Market, Management, Competitors, Parent Company, Subsidiaries",
    "Sector, Industry, Sub-Industry, Market Reports, Team Tag, Country, Continent, State, City, Street, ZIP Code",
    "Total Funding, All Investors, Latest Exit Round, Exit Date, Acquirer, VC Backed, Round, Date, Amount",
    "Simplified Round, Investors, Latest Valuation, Your Collections, Expert Collections",
))