In [50]:
# Import statements
import pandas as pd
import numpy as np 
import csv 
from parsel import Selector
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import json
import re

In [51]:
'''
Class to describe a given company 
    @field: name: a string for the company name
    @field: description: a string for the company description
    @field: founders: a list of Founder objects
    @field: industries: a list of strings for different industries
    @field: website: a string for the website
    @field: lastStage: a string for the last stage of funding (eg. Series A)
    @field: linkedin: a string for the company's LinkedIn profile
    @field: location: a string for the company's location
'''
class Company:
    def __init__(self, companyName):
        self.name = companyName
        self.description = None
        self.founders = []
        self.industries = []
        self.website = None
        self.lastStage = None
        self.linkedin = None
    
    def toJson(self):
        return json.dumps(self, default=lambda o: o.__dict__)
    
'''
Class to describe a founder
    @field: name: a string for the founder's name
    @field: education: an list of education objects
    @field: experience: a list of experience objects
'''
class Founder:
    def __init__(self, founderName):
        self.name = founderName
        self.connections = None
        self.location = None
        self.education = []
        self.experience = []

'''
Class to help describe a founder's education
    @field: degree: a string to describe the degree objective
    @field: school: a string for the school attended
    @field: field: a string to describe the major
'''
class Education:
    def __init__(self, schoolName):
        self.school = schoolName
        self.degree = None
        self.field = None
'''
Class to help describe a founder's experience
    @field: companyName: a string to describe the company's name
    @field: title: a string to describe the title held
    @field: description: a string to describe the job description
'''       
class Experience:
    def __init__(self, companyName):
        self.companyName = companyName
        self.title = None
        self.dates = None

In [52]:
'''
Loads the dataframe from Query 1 and Query 2 and merges + drops duplicates and NaNs
    @param: csv1: path to CSV 1 (formed by Query 1)
    @param: csv2: path to CSV 2 (formed by Query 2)
    @return: df1: a merged dataframe of csv1 and csv2
'''
def loadBacktestData(csv1, csv2):
    df1 = pd.read_csv(csv1)
    df2 = pd.read_csv(csv2)
    df1 = df1.append(df2)
    df1 = df1.drop_duplicates(subset=['Organization Name'])
    df1 = df1[df1.Founders.notna()]
    df1 = df1[df1.LinkedIn.notna()]
    df1 = df1.reset_index()
    for i in range(len(df1)):
        if df1.iloc[i].LinkedIn.count('about') > 0:
            df1['LinkedIn'][i] = df1['LinkedIn'][i].split('about')[0]
    df1 = df1.set_index('Organization Name')
    df1 = df1.drop('index', 1)
    df1['Organization Name'] = df1.index
    return df1

# Loads and combines both CSVs into df1 dataframe
df1 = loadBacktestData('backtest1.csv', 'backtest2.csv')
# Creates a company_data dictionary to store scraped data
df2 = loadBacktestData('notFundedCompanies.csv', 'FailedCompanies.csv')

company_data = {}

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [53]:
'''
Method to set up and log into the LinkedIn using chromedriver
    @param: driverPath: path to the chromedriver.exe file
    @param: liUsername: string of LI username
    @param: liPassword: string of LI password
    @return: driver: the Chrome Webdriver (can be passed into future function arguments)
'''
def setupDriver(driverPath, liUsername, liPassword):
    # Sets up Chrome Webdriver and navigates to LinkedIn
    driver = webdriver.Chrome(driverPath)
    driver.get('https://www.linkedin.com/')
    sleep(2.0)
    
    # Signs in with given credentials and returns the driver
    driver.find_element_by_xpath('//a[text()="Sign in"]').click()
    sleep(2.0)
    username_input = driver.find_element_by_name('session_key')
    username_input.send_keys(liUsername)
    password_input = driver.find_element_by_name('session_password')
    password_input.send_keys(liPassword)
    sleep(2.0)
    driver.find_element_by_xpath('//button[text()="Sign in"]').click()
    return driver

# Launches LinkedIn and logs in
driver = setupDriver('./chromedriver', 'beasley.sarahe@gmail.com', 'Che$$ie2020')

In [54]:
'''
Method to save the company data as a txt file (loadable as JSON)
    @param: data: company_data dictionary of Company objects
    @return: None
'''
def saveData(data, fileName):
    data_json = json.dumps(data, default=lambda x: x.__dict__)
    with open(fileName, 'w') as outfile:
        json.dump(data_json, outfile)

'''
Method to load company data into a dictionary (same structure as Company object)
    @param: dataFile: string path to saved txt file
    @return: dataDict: a dictionary with the same structure as a Company object
'''
def loadData(dataFile):
    with open(dataFile) as json_file:
        data = json.load(json_file)
    dataDict = json.loads(data)
    return dataDict

In [55]:
'''
Method to get the number of connections for a founder on that founder's LI profile
    @param: driver - the chromedriver used
    @param: founder_ - the founder object
    @return: None
Updates the founder object with the number of connections found on the profile
'''
def addConnections(driver, founder_):
    conn_ = driver.find_elements_by_xpath('//ul[@class = "pv-top-card--list pv-top-card--list-bullet mt1"]/li[@class = "inline-block"]')
    try:
        founder_.connections = conn_[0].text
    except:
        print("Connections not found for {}".format(founder_.name))
        
'''
Method to get extract the founder location
    @param: driver - the chromedriver used 
    @param: founder_ - the founder object
    @return: None
Updates the founder object with the location found on the profile
'''
def updateLocation(driver, founder_):
    loc_ = driver.find_elements_by_xpath('//li[@class = "t-16 t-black t-normal inline-block"]')
    try:
        founder_.location = loc_[0].text
    except:
        print("Location not found for {}".format(founder_.name))

In [56]:
'''
Method to get the School Name, FoS and Degree Name for all educational experiences for a founder
    @param: driver - the chromedriver used 
    @param: founder_ - the founder object
    @return: None
Updates the founder object with all education experiences
'''
def getSchools(driver, founder_):
    schools = driver.find_elements_by_xpath('//div[@class="pv-entity__degree-info"]')
    for school_ in schools:
        try:
            educ_ = Education(school_.find_element_by_class_name('pv-entity__school-name').text)
            try:
                educ_.degree = school_.find_element_by_class_name('pv-entity__degree-name').text.split('\n')[1]
            except:
                print("No degree info found for {} at {}".format(founder_.name, educ_.school))
            try:
                educ_.field = school_.find_element_by_class_name('pv-entity__fos').text.split('\n')[1]
            except:
                print("No field of study for {} at {}".format(founder_.name, educ_.school))

            # Appends the temporary education object to the founder
            founder_.education.append(educ_)
        except:
            print("No schools found for {}".format(founder_.name))

In [57]:
'''
Method to get the Company Name, Title and Dates for all work experiences for a founder 
    @param: driver - the chromedriver used 
    @param: founder_ - the founder object
    @return: None
Updates the founder object with all work experiences
'''
def getWorkExperience(driver, founder_):
    experiences = driver.find_elements_by_xpath('//a[@data-control-name="background_details_company"]')
    for exp in experiences:
        exp_lst = exp.text.split('\n')
        # The exp_ is formatted as [title, 'companyname', companyname, 'datesemployed', datesemployed, ..]
        try:
            if exp_lst[0] == 'Company Name':
                exp_ = Experience(exp_lst[1])
                founder_.experience.append(exp_)
            else:
                exp_ = Experience(exp_lst[2])
                try:
                    exp_.title = exp_lst[0]
                    exp_.dates = exp_lst[4]
                    founder_.experience.append(exp_)
                except:
                    founder_.experience.append(exp_)
        except:
            print("No experience found for {}".format(str(founder_.name)))

In [58]:
'''
Extracts the founder information from a LinkedIn search for a founder and adds the founder to the company
    @param: driver - the chromedriver used
    @param: founder - a string for the founder's name
    @param: company_ - the Company object for the company the founder is working at
'''

def getFounderInfo(driver, founder, company_):
    res_ = driver.find_element_by_xpath('//a[@data-control-name="search_srp_result"]')
    if res_:
        res_.click()
        sleep(2.0)
        # Scrolls to the bottom of the webpage (if no scroll, error where the full webpage doens't load)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
        sleep(0.75)
        # Creates a founder object for the given founder
        founder_ = Founder(founder)
        
        #Extracts data for connections
        addConnections(driver, founder_)
        
        #Extract founder location
        updateLocation(driver, founder_)

        # Extracts degree information (formatted as a list of items)
        getSchools(driver, founder_)
 
        # Extracts experience information (formatted as a list of items)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
        sleep(1.0)
        getWorkExperience(driver, founder_)
        
        # Appends the temporary founder object to the company
        company_.founders.append(founder_)
        sleep(1.0)
    else:
        founder_ = Founder(founder)
        company_.founders.append(founder_)
        print("{} not found for {}".format(company_.name, founder_.name))
        sleep(1.0)

In [59]:
'''
Method to search for founder and company on the search bar and extract founder information if found
    @param: driver - the chromedriver used 
    @param: founder - a string for the name of the founder to be searched
    @param: company_ = the company being searched
    @return: None
The founder object is populated and added to the company
'''
def searchForFounder(driver, founder, company_):
    # Finds the search bar on top of the LinkedIn page
    search = driver.find_elements_by_xpath('//input[@class="search-global-typeahead__input always-show-placeholder"]')
    sleep(0.5)
    search[0].click()
    for i in range(50):
        search[0].send_keys(Keys.RIGHT)
    # Clears any existing search
    for i in range(80):
        search[0].send_keys(Keys.BACKSPACE)
    # Types in founder name + company name and searches
#     search[0].send_keys(founder + " " + company_.name.split(" ")[0])
    search[0].send_keys(founder + " " + re.split(" |,", company_.name)[0])
    search[0].send_keys(Keys.ENTER)
    sleep(2.0)
    try:
        getFounderInfo(driver, founder, company_)
    except:
        print("{} not found for {}".format(founder, company_.name))

In [60]:
'''
Method to add LI information for a company into the company_data dictionary with a new company object
    @param: entry: a pandas series extracted from a single row in the dataframe from loadBacktestData
    @return: None
This method updates the company_data dictionary and returns nothing
'''
def scrapeLI(entry):
    # Adds a new company entry to the company_data dictionary and populates fields
    company_ = Company(entry['Organization Name'])
    company_.description = entry['Description']
    company_.industries = [i.strip() for i in entry['Industries'].split(',')]
    company_.website = entry['Website']
    company_.lastStage = entry['Last Funding Type']
    # Edge case where the LinkedIn link does not end in '/'
    if entry['LinkedIn'][-1] != '/':
        entry['LinkedIn'] = entry['LinkedIn'] + '/'
    company_.linkedin = entry['LinkedIn']
    company_.location = entry['Headquarters Location']
    
    # Generates a list of founder names
    founderNames = [i.strip() for i in entry['Founders'].split(',')]
    try: 
        # For each founder in the list, the school name, degree, and major is extracted
        for founder in founderNames:
            searchForFounder(driver, founder, company_)
    except:
        print("Structural Error")
    # Adds the company to the company_data dictionary
#     company_data[company_.name] = company_
    company_data[company_.name] = company_

In [34]:
for i in range(40, len(df2)):
    print(i)
    scrapeLI(df2.iloc[i])


40


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


41
42
No degree info found for Scott Miller at CCCL
No field of study for Scott Miller at CCCL
43
Steven Sevic not found for EconEvo
44
No field of study for David Vara Espuga at Universidad Europea
45
46
47
No degree info found for Dylan Wheeler at Bow High School
No field of study for Dylan Wheeler at Bow High School
48
Simeon Miighty not found for Piing
49
No degree info found for Peter Hsieh at Torrey Pines High School
No field of study for Peter Hsieh at Torrey Pines High School
50
No degree info found for Omari Fennell at George Mason University - School of Business
No field of study for Omari Fennell at George Mason University - School of Business
No field of study for Omari Fennell at University of California, Irvine
No field of study for Omari Fennell at Institute of Project Management
51
52
No field of study for Behan Webster at University of Manitoba
53
No degree info found for Chris Rampey at California State University-Long Beach
No field of study for Chris Rampey at Calif

148
No degree info found for Josh Winter at Xenia Nazarene Christian
Lodewijk Veldhuizen not found for Anything App Holding B.V.
149
150
No degree info found for Jan Vorcak at ISC Paris
151
Tomer Shalit not found for MapLauncher
152
No degree info found for Ozan YALCINKAYA at Universität Wien
No field of study for Ozan YALCINKAYA at Universität Wien
No degree info found for Ozan YALCINKAYA at Universität Wien
No field of study for Ozan YALCINKAYA at Universität Wien
153
Constance Lavergne-Pouillaude not found for MYFABLAB
154
Connections not found for Andrea Wallace
Location not found for Andrea Wallace
155
156
No field of study for Archie Reed at S P Jain School of Global Management
157
Domantas Bakutis not found for Cvsite.io
158
No field of study for Cyril Desmoinaux at EFREI - Ecole Française d'Electronique et d'Informatique
No degree info found for Cyril Desmoinaux at Collège Saint-Exupéry, Noisy-le-Grand
No field of study for Cyril Desmoinaux at Collège Saint-Exupéry, Noisy-le-Gr

264
Imran Selimkhanov not found for Productive Shop
265
No field of study for Sönke Liebau at Fachhochschule Wedel
No degree info found for Sönke Liebau at University of Waikato
No field of study for Sönke Liebau at University of Waikato
266
No degree info found for Aaron John at Trinity Academy
No field of study for Aaron John at Trinity Academy
No degree info found for George Metcalfe at Year Here
No field of study for George Metcalfe at Year Here
267
No degree info found for Jon Doughty at Lehigh University
268
269
Nina Baliga not found for <div> ersity
270
No degree info found for Matthieu Pierrot at Université de Nantes
No field of study for Yann Pierrot at AFPA d'Angers
271
No degree info found for Thomas Doki-Thonon at IMD Business School
272
Scott Tompkins not found for Oilmar Inc.
273
No degree info found for George Dousa at First Faculty of Medicine, Charles University in Prague
No field of study for George Dousa at First Faculty of Medicine, Charles University in Prague
Pete

327
328
No degree info found for Allan James at Australian Institute of Company Directors
No field of study for Allan James at Australian Institute of Company Directors
No degree info found for Allan James at Oakhill College
No field of study for Allan James at Oakhill College
329
No degree info found for John Wells at Northwestern State University
330
No field of study for Lucas Brown at Babson College
331
332
No field of study for Emmanuel Olatunji at University of Oxford
No field of study for Emmanuel Olatunji at Loxford School of Science and Technology
333
Arancha Riestra not found for GoMadrid
No field of study for José Nistal at Massachusetts Institute of Technology
No field of study for José Nistal at Harvard University
334
No field of study for Eric Yeung at University of Calgary
No field of study for Eric Yeung at Crescent Heights High School
335
No degree info found for Olivier Clair at Unversité catholique de Lille
No field of study for Olivier Clair at Unversité catholique 

437
No degree info found for Subramanian Parameswaran at Xaviers institute of Counselling
438
No field of study for Lucia Gallardo at American School of Tegucigalpa, Honduras
439
440
Jeppe Hallgren not found for Hallex Technologies
441
442
443
No degree info found for Colin Hause at Homeschool
No field of study for Colin Hause at Homeschool
444
445
446
447
No degree info found for Pam Nelligan at University at Buffalo
448
No degree info found for Steven Hughes at Harbor High School
449
No degree info found for Mike Halder at University of Iowa
No field of study for Mike Halder at University of Iowa
450
No degree info found for Daniele Bianchi at Università degli Studi di Sassari
451
No degree info found for Andreas Markewärn at DigJourney
452
No degree info found for Rory Skinner at Hutchesons' Grammar School
No field of study for Rory Skinner at Hutchesons' Grammar School
453
454
455
No degree info found for Fahad Jahanzeb at The Institute of Chartered Accountants of Pakistan
No field

Structural Error
602
Structural Error
603
Structural Error
604
Structural Error
605
Structural Error
606
Structural Error
607
Structural Error
608
Structural Error
609
Structural Error
610
Structural Error
611
Structural Error
612
Structural Error
613
Structural Error
614
Structural Error
615
Structural Error
616
Structural Error
617
Structural Error
618
Structural Error
619
Structural Error
620
Structural Error
621
Structural Error
622
Structural Error
623
Structural Error
624
Structural Error
625
Structural Error
626
Structural Error
627
Structural Error
628
Structural Error
629
Structural Error
630
Structural Error
631
Structural Error
632
Structural Error
633
Structural Error
634
Structural Error
635
Structural Error
636
Structural Error
637
Structural Error
638
Structural Error
639
Structural Error
640
Structural Error
641
Structural Error
642
Structural Error
643
Structural Error
644
Structural Error
645
Structural Error
646
Structural Error
647
Structural Error
648
Structural Er

In [35]:
saveData(company_data, 'failedpt2.txt')

In [48]:
for i in range(545, 917):
    print(i)
    scrapeLI(df2.iloc[i])


545
No field of study for Mostapha Sadeghipour Roudsari at University of Pennsylvania
No field of study for Mostapha Sadeghipour Roudsari at Shahid Beheshti University
No field of study for Mostapha Sadeghipour Roudsari at University of Tehran
546
Ergo Sõõru not found for Medicy
547
No field of study for Jakeer Mohammad at Kakatiya University
548
No degree info found for Remco Vriesema at IPD / NIMA-C
No degree info found for Remco Vriesema at HES Consultancy / Post HBO
549
Anthony Grivet not found for ShiftMe
550


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


No degree info found for Samuel Miller at Rutgers University-New Brunswick
No field of study for Samuel Miller at Rutgers University-New Brunswick
551
552
No field of study for Marisa Denker at University of Pennsylvania
No field of study for Naomi Murphy at Dublin Institute of Technology
553
554
No degree info found for Joe Stewart at Warner University
555
556
Miki Devic not found for doyodu GmbH
557
558
Alegra Namshi Horne not found for RMSF Corporation
559
No field of study for Patrick Billiet at Vlerick Business School
No degree info found for Patrick Billiet at EHSAL Management School
560
No field of study for Moritz Kothe at INSEAD
561
Rohit Sood not found for Garage Data
562
563
No degree info found for Edouard Blin at Universität Karlsruhe (TH)
No degree info found for Max MyLeanMBA at Università degli Studi di Milano
No degree info found for Max MyLeanMBA at Mälardalens högskola / Mälardalen University
564
No degree info found for Eboni J.D. Freeman at Harvard University
No de

649
Connections not found for Joan Fabrégat
Location not found for Joan Fabrégat
650
No degree info found for Zdravko Loborec at Stanford University
No degree info found for Zdravko Loborec at UC Berkeley College of Engineering
No degree info found for Zdravko Loborec at Harvard University
651
Horace Ho not found for Nextlayer
652
No degree info found for Natalie Torin at University of Virginia
No degree info found for Natalie Torin at Mandarin House, Shanghai
653
Chirstoph Stettner not found for Heronius
Christian Reik not found for Heronius
No field of study for Michel Thost at Robert-Bosch-Schule Ulm
No degree info found for Michel Thost at Gewerbliche Berufsfachschule Bad Saulgau
No field of study for Michel Thost at Gewerbliche Berufsfachschule Bad Saulgau
654
655
No degree info found for Jeroen Sakkers at Murmelliusgymnasium
No field of study for Jeroen Sakkers at Murmelliusgymnasium
656
657
658
No field of study for David Ellzey at University of Denver
No field of study for Davi

746
No degree info found for Sven Kristjansen at Viimsi High School
No field of study for Sven Kristjansen at Viimsi High School
747
Basil Hangarter not found for MaxBrain
Stefan Fraude not found for MaxBrain
748
ervin ruci not found for Geocode.xyz
749
750
751
No field of study for Nicol Pasuit at Oakland University
752
No degree info found for Tomasz Grzegorczyk at Massachusetts Institute of Technology
753
No degree info found for Maxwell Perry at St. John's Prep
No field of study for Maxwell Perry at St. John's Prep
754
Antoine Hage not found for Herron Tech
antoine hage not found for Herron Tech
755
Morten Lauridsen not found for Manpremo
756
No field of study for Mayur Motgi at Stanford University Graduate School of Business
757
Carl Kaiser not found for Easy-Voice
758
No field of study for Matt Schaubroeck at University of Manitoba
759
760
No degree info found for James Cerna at Serra High School
No field of study for James Cerna at Serra High School
761
762
Dale Branch not found

In [49]:
saveData(company_data, 'failedpt3.txt')

In [67]:
'''
Method to load and merge dictionaries
'''
listDicts = ['faileddata_final.txt', 'failedpt4.txt']
def mergeDicts(listDicts, newName):
    merged = {**loadData(listDicts[0]), **loadData(listDicts[1])}
    for i in listDicts[2:]:
        merged = {**merged, **loadData(i)}
    saveData(merged, newName)

mergeDicts(listDicts, 'failedData.txt')

In [61]:
for i in range(803, 917):
    print(i)
    scrapeLI(df2.iloc[i])


803
No degree info found for Kathryn Loewen at Abraham Baldwin Agricultural College
No field of study for Kathryn Loewen at Abraham Baldwin Agricultural College
No degree info found for Kathryn Loewen at Perry High School
No field of study for Kathryn Loewen at Perry High School
804
805


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


806
807
808
No degree info found for Brian Zotter at Stony Brook University
809
No field of study for Matthieu Vollmer at University of Wisconsin-Stevens Point
No degree info found for Matthieu Vollmer at Chilton High School
No field of study for Matthieu Vollmer at Chilton High School
810
811
No degree info found for Nick Williamson at Illinois Institute of Technology
812
813
No field of study for Joseph Nicklas at University of New Haven
No degree info found for Khashab Khashab at Gateway Community College
Mia Sumra not found for Scroll
Nathan Pitruzzello not found for Scroll
No degree info found for Samuel Opper at Lord Fairfax Community College
No field of study for Samuel Opper at Lord Fairfax Community College
814
815
Dawson Wheeler not found for RootsRated
No degree info found for Gordon Seabury at Hamilton College
No field of study for Gordon Seabury at Hamilton College
No degree info found for Gordon Seabury at St. Lawrence University
No field of study for Gordon Seabury at St

900
No degree info found for Keith Brisson at University of Chicago
No field of study for Keith Brisson at University of Chicago
901
902
No degree info found for Taze R. Ellis at Arizona State University
No field of study for Taze R. Ellis at Arizona State University
903
Kevin Desai not found for TAFi, Inc.
Structural Error
904
Structural Error
905
Structural Error
906
Structural Error
907
Structural Error
908
Structural Error
909
Structural Error
910
Structural Error
911
Structural Error
912
Structural Error
913
Structural Error
914
Structural Error
915
Structural Error
916
Structural Error


In [62]:
saveData(company_data, 'failedpt4.txt')