In [12]:
# Import statements
import pandas as pd
import numpy as np 
import csv 
from parsel import Selector
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import json
import re

In [2]:
'''
Class to describe a given company 
    @field: name: a string for the company name
    @field: description: a string for the company description
    @field: founders: a list of Founder objects
    @field: industries: a list of strings for different industries
    @field: website: a string for the website
    @field: lastStage: a string for the last stage of funding (eg. Series A)
    @field: linkedin: a string for the company's LinkedIn profile
    @field: location: a string for the company's location
'''
class Company:
    def __init__(self, companyName):
        self.name = companyName
        self.description = None
        self.founders = []
        self.industries = []
        self.website = None
        self.lastStage = None
        self.linkedin = None
    
    def toJson(self):
        return json.dumps(self, default=lambda o: o.__dict__)
    
'''
Class to describe a founder
    @field: name: a string for the founder's name
    @field: education: an list of education objects
    @field: experience: a list of experience objects
'''
class Founder:
    def __init__(self, founderName):
        self.name = founderName
        self.connections = None
        self.location = None
        self.education = []
        self.experience = []

'''
Class to help describe a founder's education
    @field: degree: a string to describe the degree objective
    @field: school: a string for the school attended
    @field: field: a string to describe the major
'''
class Education:
    def __init__(self, schoolName):
        self.school = schoolName
        self.degree = None
        self.field = None
'''
Class to help describe a founder's experience
    @field: companyName: a string to describe the company's name
    @field: title: a string to describe the title held
    @field: description: a string to describe the job description
'''       
class Experience:
    def __init__(self, companyName):
        self.companyName = companyName
        self.title = None
        self.dates = None

In [3]:
'''
Loads the dataframe from Query 1 and Query 2 and merges + drops duplicates and NaNs
    @param: csv1: path to CSV 1 (formed by Query 1)
    @param: csv2: path to CSV 2 (formed by Query 2)
    @return: df1: a merged dataframe of csv1 and csv2
'''
def loadBacktestData(csv1, csv2):
    df1 = pd.read_csv(csv1)
    df2 = pd.read_csv(csv2)
    df1 = df1.append(df2)
    df1 = df1.drop_duplicates(subset=['Organization Name'])
    df1 = df1[df1.Founders.notna()]
    df1 = df1[df1.LinkedIn.notna()]
    df1 = df1.reset_index()
    for i in range(len(df1)):
        if df1.iloc[i].LinkedIn.count('about') > 0:
            df1['LinkedIn'][i] = df1['LinkedIn'][i].split('about')[0]
    return df1

# Loads and combines both CSVs into df1 dataframe
df1 = loadBacktestData('backtest1.csv', 'backtest2.csv')
# Creates a company_data dictionary to store scraped data
company_data = {}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
'''
Method to set up and log into the LinkedIn using chromedriver
    @param: driverPath: path to the chromedriver.exe file
    @param: liUsername: string of LI username
    @param: liPassword: string of LI password
    @return: driver: the Chrome Webdriver (can be passed into future function arguments)
'''
def setupDriver(driverPath, liUsername, liPassword):
    # Sets up Chrome Webdriver and navigates to LinkedIn
    driver = webdriver.Chrome(driverPath)
    driver.get('https://www.linkedin.com/')
    sleep(2.0)
    
    # Signs in with given credentials and returns the driver
    driver.find_element_by_xpath('//a[text()="Sign in"]').click()
    sleep(2.0)
    username_input = driver.find_element_by_name('session_key')
    username_input.send_keys(liUsername)
    password_input = driver.find_element_by_name('session_password')
    password_input.send_keys(liPassword)
    sleep(2.0)
    driver.find_element_by_xpath('//button[text()="Sign in"]').click()
    return driver

# Launches LinkedIn and logs in
driver = setupDriver('./chromedriver', 'LIUsername', 'LIPassword')

In [5]:
'''
Method to save the company data as a txt file (loadable as JSON)
    @param: data: company_data dictionary of Company objects
    @return: None
'''
def saveData(data, fileName):
    data_json = json.dumps(data, default=lambda x: x.__dict__)
    with open(fileName, 'w') as outfile:
        json.dump(data_json, outfile)

'''
Method to load company data into a dictionary (same structure as Company object)
    @param: dataFile: string path to saved txt file
    @return: dataDict: a dictionary with the same structure as a Company object
'''
def loadData(dataFile):
    with open(dataFile) as json_file:
        data = json.load(json_file)
    dataDict = json.loads(data)
    return dataDict

In [6]:
'''
Method to get the number of connections for a founder on that founder's LI profile
    @param: driver - the chromedriver used
    @param: founder_ - the founder object
    @return: None
Updates the founder object with the number of connections found on the profile
'''
def addConnections(driver, founder_):
    conn_ = driver.find_elements_by_xpath('//ul[@class = "pv-top-card--list pv-top-card--list-bullet mt1"]/li[@class = "inline-block"]')
    try:
        founder_.connections = conn_[0].text
    except:
        print("Connections not found for {}".format(founder_.name))
        
'''
Method to get extract the founder location
    @param: driver - the chromedriver used 
    @param: founder_ - the founder object
    @return: None
Updates the founder object with the location found on the profile
'''
def updateLocation(driver, founder_):
    loc_ = driver.find_elements_by_xpath('//li[@class = "t-16 t-black t-normal inline-block"]')
    try:
        founder_.location = loc_[0].text
    except:
        print("Location not found for {}".format(founder_.name))

In [7]:
'''
Method to get the School Name, FoS and Degree Name for all educational experiences for a founder
    @param: driver - the chromedriver used 
    @param: founder_ - the founder object
    @return: None
Updates the founder object with all education experiences
'''
def getSchools(driver, founder_):
    schools = driver.find_elements_by_xpath('//div[@class="pv-entity__degree-info"]')
    for school_ in schools:
        try:
            educ_ = Education(school_.find_element_by_class_name('pv-entity__school-name').text)
            try:
                educ_.field = school_.find_element_by_class_name('pv-entity__fos').text.split('\n')[1]
                educ_.degree = school_.find_element_by_class_name('pv-entity__degree-name').text.split('\n')[1]
            except:
                print("No degree info found for {} at {}".format(founder_.name, educ_.school))
            
            # Appends the temporary education object to the founder
            founder_.education.append(educ_)
        except:
            print("No schools found for {}".format(founder_.name))

In [8]:
'''
Method to get the Company Name, Title and Dates for all work experiences for a founder 
    @param: driver - the chromedriver used 
    @param: founder_ - the founder object
    @return: None
Updates the founder object with all work experiences
'''
def getWorkExperience(driver, founder_):
    experiences = driver.find_elements_by_xpath('//a[@data-control-name="background_details_company"]')
    for exp in experiences:
        exp_lst = exp.text.split('\n')
        # The exp_ is formatted as [title, 'companyname', companyname, 'datesemployed', datesemployed, ..]
        try:
            if exp_lst[0] == 'Company Name':
                exp_ = Experience(exp_lst[1])
                founder_.experience.append(exp_)
            else:
                exp_ = Experience(exp_lst[2])
                try:
                    exp_.title = exp_lst[0]
                    exp_.dates = exp_lst[4]
                    founder_.experience.append(exp_)
                except:
                    founder_.experience.append(exp_)
        except:
            print("No experience found for {}".format(str(founder_.name)))

In [9]:
'''
Extracts the founder information from a LinkedIn search for a founder and adds the founder to the company
    @param: driver - the chromedriver used
    @param: founder - a string for the founder's name
    @param: company_ - the Company object for the company the founder is working at
'''

def getFounderInfo(driver, founder, company_):
    res_ = driver.find_element_by_xpath('//a[@data-control-name="search_srp_result"]')
    if res_:
        res_.click()
        sleep(2.0)
        # Scrolls to the bottom of the webpage (if no scroll, error where the full webpage doens't load)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        sleep(0.75)
        # Creates a founder object for the given founder
        founder_ = Founder(founder)
        
        #Extracts data for connections
        addConnections(driver, founder_)
        
        #Extract founder location
        updateLocation(driver, founder_)

        # Extracts degree information (formatted as a list of items)
        getSchools(driver, founder_)
 
        # Extracts experience information (formatted as a list of items)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
        sleep(1.0)
        getWorkExperience(driver, founder_)
        
        # Appends the temporary founder object to the company
        company_.founders.append(founder_)
        sleep(1.0)
    else:
        founder_ = Founder(founder)
        company_.founders.append(founder_)
        print("{} not found for {}".format(company_.name, founder_.name))
        sleep(1.0)

In [15]:
'''
Method to search for founder and company on the search bar and extract founder information if found
    @param: driver - the chromedriver used 
    @param: founder - a string for the name of the founder to be searched
    @param: company_ = the company being searched
    @return: None
The founder object is populated and added to the company
'''
def searchForFounder(driver, founder, company_):
    # Finds the search bar on top of the LinkedIn page
    search = driver.find_elements_by_xpath('//input[@class="search-global-typeahead__input always-show-placeholder"]')
    sleep(0.5)
    search[0].click()
    for i in range(50):
        search[0].send_keys(Keys.RIGHT)
    # Clears any existing search
    for i in range(80):
        search[0].send_keys(Keys.BACKSPACE)
    # Types in founder name + company name and searches
#     search[0].send_keys(founder + " " + company_.name.split(" ")[0])
    search[0].send_keys(founder + " " + re.split(" |,", company_.name)[0])
    search[0].send_keys(Keys.ENTER)
    sleep(2.0)
    try:
        getFounderInfo(driver, founder, company_)
    except:
        print("{} not found for {}".format(founder, company_.name))

In [11]:
'''
Method to add LI information for a company into the company_data dictionary with a new company object
    @param: entry: a pandas series extracted from a single row in the dataframe from loadBacktestData
    @return: None
This method updates the company_data dictionary and returns nothing
'''
def scrapeLI(entry):
    # Adds a new company entry to the company_data dictionary and populates fields
    company_ = Company(entry['Organization Name'])
    company_.description = entry['Description']
    company_.industries = [i.strip() for i in entry['Industries'].split(',')]
    company_.website = entry['Website']
    company_.lastStage = entry['Last Funding Type']
    # Edge case where the LinkedIn link does not end in '/'
    if entry['LinkedIn'][-1] != '/':
        entry['LinkedIn'] = entry['LinkedIn'] + '/'
    company_.linkedin = entry['LinkedIn']
    company_.location = entry['Headquarters Location']
    
    # Generates a list of founder names
    founderNames = [i.strip() for i in entry['Founders'].split(',')]
    try:
        # For each founder in the list, the school name, degree, and major is extracted
        for founder in founderNames:
            searchForFounder(driver, founder, company_)
    except:
        print("Structural Error")
    # Adds the company to the company_data dictionary
    company_data[company_.name] = company_

In [13]:
for i in range(100,200):
    print(i)
    if i%5 == 0:
        sleep(5.0)
    scrapeLI(df1.iloc[i])

0
No degree info found for Amar Hanspal at Stanford University
1
2
No degree info found for Vishal Sikka at Holy Rosary Academy & High School
3
No degree info found for Paz Eshel at New York University
No degree info found for Paz Eshel at Institut d'Etudes politiques de Paris
4
No degree info found for Matthew Baier at European School Karlsruhe
5
No degree info found for Thomas Jermoluk at Virginia Tech
6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


No degree info found for Sharath Keshava Narayana at Y Combinator
7
8
No degree info found for Robert Leshner at CFA Institute
9
No degree info found for Jason Ross at Danville Community High School
10
No degree info found for Sugu Sougoumarane at Birla Institute of Technology and Science
11
No degree info found for Thomas Graham at Massachusetts Institute of Technology - Sloan School of Management
12
No degree info found for Amanda Kelly at Stanford University
13
No degree info found for Ion Stoica at Carnegie Mellon University
Michael I. ​ Jordan not found for
14
No degree info found for Rohan Sathe at University of California, Davis
No degree info found for Rohan Sathe at Stanford University
No degree info found for Rohan Sathe at Lynbrook High School
15
No degree info found for Jeremy Hermann at Stanford University
16
No degree info found for Bryan Cantrill at George Washington High School
No degree info found for Jessie Frazelle at University of Arizona
No degree info found for Je

93
Connections not found for Bobby Tinsley
Location not found for Bobby Tinsley
Connections not found for Jim Sampey
Location not found for Jim Sampey
Connections not found for Kenneth Douglas
Location not found for Kenneth Douglas
Connections not found for Robert Zaccardo
Location not found for Robert Zaccardo
94
95
Cedric Montet not found for
96
97
No degree info found for Arlo Gilbert at The University of Texas at Austin
98
No degree info found for Deepak Chhugani at Y Combinator
No degree info found for Deepak Chhugani at The London School of Economics and Political Science (LSE)
99
No degree info found for Anna Kopp at Columbia University in the City of New York
No degree info found for Banu Guler at New York University


In [14]:
saveData(company_data, 'data100.txt')

In [None]:
# Used to extract and analyze the data from the saved text file

schools = {}
fields = {}
companynames = {}
jobtitle = {}

for key in loaded.keys():
    for founder in loaded[key]['founders']:
        if founder['education']:
            for educ_ in founder['education']:
                school = educ_['school'].lower()
                for word in school.split(' '):
                    if word in schools:
                        schools[word] += 1
                    else:
                        schools[word] = 1
                if educ_['field']:
                    field = educ_['field'].lower()
                    for word in field.split(' '):
                        if word in fields:
                            fields[word] += 1
                        else:
                            fields[word] = 1
        if founder['experience']:
            for exp_ in founder['experience']:
                companyName = exp_['companyName'].lower()
                for word in companyName.split(' '):
                    if word in companynames:
                        companynames[word] += 1
                    else:
                        companynames[word] = 1
                if exp_['title']:
                    title = exp_['title'].lower()
                    if title in jobtitle:
                        jobtitle[title] += 1
                    else:
                        jobtitle[title] = 1
                
            

In [None]:
''' Saves the extracted data to a CSV'''

In [None]:
df_schools = pd.DataFrame.from_dict(schools, orient='index', columns=['freq'])
df_schools.sort_values(['freq'], ascending=False).to_csv('schools.csv')

In [None]:
df_fields = pd.DataFrame.from_dict(fields, orient='index', columns=['freq'])
df_fields.sort_values(['freq'], ascending=False).to_csv('fields.csv')

In [None]:
df_compnames = pd.DataFrame.from_dict(companynames, orient='index', columns=['freq'])
df_compnames.sort_values(['freq'], ascending=False).to_csv('companynames.csv')

In [None]:
df_titles = pd.DataFrame.from_dict(jobtitle, orient='index', columns=['freq'])
df_titles.sort_values(['freq'], ascending=False).to_csv('titles.csv')

In [7]:
df1.iloc[1404]

index                                                                                  753
Organization Name                                            Xomi, Inc. (d/b/a Chargeback)
Organization Name URL                    https://www.crunchbase.com/organization/http-c...
Industries                               Fraud Detection, Payments, Software, Transacti...
Headquarters Location                                  Salt Lake City, Utah, United States
Description                              Chargeback is a dispute management platform fo...
Last Funding Amount                                                                6.6e+06
Last Funding Amount Currency                                                           USD
Last Funding Amount Currency (in USD)                                              6.6e+06
Last Funding Type                                                                 Series A
CB Rank (Company)                                                                    4,699