In [1]:
from collections import defaultdict
import pandas as pd
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import chromedriver_binary
from bs4 import BeautifulSoup


from pymongo import MongoClient

In [2]:
def get_login(path='../../data/LI_login.txt'):
    '''
    Read the session keys from an external text file into a list.

    Parameters
    ----------
    path: (str)
        Location of the keys file, one value per line. Defaults to the
        file this notebook has always read from.

    Returns
    ----------
    creds: (list)
        One string per line of the file, in file order, with newline
        characters removed. li_login unpacks this as (email, pw).
    '''
    # The context manager closes the handle even if reading raises.
    with open(path, 'r') as f:
        creds = [line.replace('\n', '') for line in f.readlines()]

    return creds

In [3]:
def li_login():
    '''
    Open a Chrome webdriver session and log into LinkedIn.
    The returned session is what the search/scrape steps drive afterwards.

    Parameters
    ----------
    None:

    Returns
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Return webdriver session for web manipulation.

    Raises
    ----------
    Any exception raised while loading the page or filling the form is
    re-raised after the browser has been closed.
    '''
    # Local import: By is not among the notebook's top-level imports.
    # find_element(By.ID, ...) replaces find_element_by_id, which newer
    # Selenium releases no longer provide.
    from selenium.webdriver.common.by import By

    # session keys for LI instance
    email, pw = get_login()

    # selenium webdriver
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.linkedin.com/')
        # log in
        driver.find_element(By.ID, 'session_key').send_keys(email)
        # Locate the password field once and reuse it for the RETURN key.
        pw_field = driver.find_element(By.ID, 'session_password')
        pw_field.send_keys(pw)
        pw_field.send_keys(Keys.RETURN)
    except Exception:
        # The caller never receives the handle on failure, so close the
        # browser here instead of leaving an orphaned Chrome process.
        driver.quit()
        raise

    return driver

In [4]:
def scrape_contacts(driver, co, query='technical recruiter', pause=5):
    '''
    Search company (co) in LinkedIn head search bar and scrape that company's contacts of interest.
    Returns dictionary of {co: {name: link}}

    Parameters
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Webdriver session for web manipulation.
    co: (str)
        Company string to search in head search bar.
    query: (str)
        Text typed into the company's People search box. Defaults to
        'technical recruiter', the term this function always used.
    pause: (int or float)
        Seconds to sleep between page interactions. Defaults to 5.

    Returns
    ----------
    d: (dict)
        Return dictionary for mongo DB insert. Empty when the page
        contains no people list.
    '''
    # Local import: By is not among the notebook's top-level imports.
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
    # newer Selenium releases no longer provide.
    from selenium.webdriver.common.by import By

    # XPaths
    # Click first item when search company
    srch_x_path = '//*[@id="ember16"]/input'
    co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
    ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'
    ppl_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'

    # Fixed sleeps stand in for real page-load checks.
    sleep(pause)
    driver.find_element(By.XPATH, srch_x_path).send_keys(co + Keys.RETURN)
    sleep(pause)  # TODO check page load
    driver.find_element(By.XPATH, co_x_path).click()
    sleep(pause)  # TODO check page load
    driver.find_element(By.XPATH, ppl_x_path).click()
    sleep(pause)  # TODO check page load
    # Empty the head search bar before typing into the People search box.
    driver.find_element(By.XPATH, srch_x_path).clear()
    sleep(pause)
    driver.find_element(By.XPATH, ppl_search_xpath).send_keys(query + Keys.RETURN)
    sleep(pause)
    # Scroll until the page height stops growing, so the page source
    # captured below is taken after scrolling has finished.
    scroll_to_end(driver, 3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # TODO insert mongo raw scrape
    results = soup.find('ul', 'org-people-profiles-module__profile-list')

    # soup.find returns None when the list is absent; hand back an empty
    # mapping instead of failing inside the dictionary build.
    if results is None:
        return defaultdict(dict)

    return mongo_insert(results, co)
    
    
#     try:
#         driver.find_element_by_xpath(co_x_path).click()
#         driver.find_element_by_xpath(ppl_x_path).click()
#         driver.find_element_by_xpath(srch_x_path).clear()
#         return 'PASS', co
#     except:
#         driver.find_element_by_xpath(srch_x_path).clear()
#         return 'FAIL', co
#     sleep(2)

In [59]:
def mongo_insert(results, co):
    '''
    Build the {co: {name: link}} dictionary from a scraped people list.
    Nothing is written to a database here; the function only builds the
    dictionary.

    Parameters
    ----------
    results: (bs4 element or None)
        The people-list element to read profile items from. None yields
        an empty dictionary.
    co: (str)
        Company string used as the top-level key.

    Returns
    ----------
    d: (defaultdict)
        {co: {name: link}}. Items without a title div, a name or a link
        are skipped. When a name repeats, the first link is kept.
    '''
    d = defaultdict(dict)
    if results is None:
        return d

    contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

    for contact in contact_elements:
        name = contact.find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view')
        if name is None:
            continue
        # strip() trims all surrounding whitespace, so names survive
        # whatever padding the card text carries.
        name = name.text.strip()

        anchor = contact.a
        href = anchor.get('href') if anchor is not None else None
        if not name or not href:
            continue

        # Dedupe against the names already stored for this company
        # (the earlier check compared the name to the company string).
        contacts = d[co]
        if name not in contacts:
            contacts[name] = 'https://www.linkedin.com' + href

    return d

In [7]:
# Company list: distinct names from the Glassdoor scrape, leaving out rows
# whose industry text contains 'College'. Rows with any missing value are
# dropped first, so the membership test below only sees populated cells.
df = pd.read_csv('../../data/glassdoor_scrape_1.csv').dropna()
edu_flag = df['industry'].apply(lambda industry: 'College' in industry)
cos = pd.Series(df.loc[~edu_flag, 'name'].unique())

In [23]:
driver = li_login()

In [34]:
def scroll_to_end(driver, timeout):
    '''
    Keep scrolling to the bottom of the page until its height stops changing.

    Parameters
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Webdriver session whose current page is scrolled.
    timeout: (int or float)
        Seconds to sleep after each scroll before the height is re-read.

    Returns
    ----------
    None:
    '''
    height_js = "return document.body.scrollHeight"

    # Height before the first scroll
    previous_height = driver.execute_script(height_js)

    at_bottom = False
    while not at_bottom:
        # Jump to the current bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Pause before measuring again
        sleep(timeout)

        # An unchanged height ends the loop
        current_height = driver.execute_script(height_js)
        at_bottom = current_height == previous_height
        previous_height = current_height

In [36]:
# Manual check of the scroll helper against the live session. The helper in
# this notebook is named scroll_to_end; no function called `scroll` exists.
scroll_to_end(driver, 3)

In [9]:
sample_test = cos[:10]
sample_test

0                     Apple
1         Dell Technologies
2                       IBM
3      National Instruments
4                    NVIDIA
5                 Atlassian
6              Silicon Labs
7    SailPoint Technologies
8                  Facebook
9                    Google
dtype: object

In [39]:
# Return value kept in `results` only for testing in the cells below.
# NOTE(review): scrape_contacts as defined in this notebook returns the
# {co: {name: link}} dict, while a later cell calls results.find_all(...) —
# presumably this cell ran against an earlier version that returned the
# parsed people list; confirm before re-running.
results = scrape_contacts(driver, sample_test[4])

In [10]:
# test single co with scroll
d = scrape_contacts(driver, sample_test[5])

In [11]:
d

defaultdict(dict,
            {'Atlassian': {'Jamie Hayes': 'https://www.linkedin.com/in/jamieghayes/',
              'Amanda Sloup': 'https://www.linkedin.com/in/amandapadellaro/',
              'Monica Harris': 'https://www.linkedin.com/in/monicarickenbacher/',
              'Elena Wester, PHR': 'https://www.linkedin.com/in/westerelenac/',
              'Dipti Sood': 'https://www.linkedin.com/in/dipti-sood-108ba37/',
              'Mark Grantham': 'https://www.linkedin.com/in/mgrantham/',
              'Michelle Rivera': 'https://www.linkedin.com/in/michellerivera3/',
              'Amali Siedlecki': 'https://www.linkedin.com/in/amalids/',
              'Jake Foster': 'https://www.linkedin.com/in/jacobfoster1/',
              'Mitra Mahdavi': 'https://www.linkedin.com/in/mitra-mahdavi-15413611/',
              'Kristin Barger': 'https://www.linkedin.com/in/kristintbarger/'}})

In [10]:
# Run the scrape once per company in sample_test; the per-company dicts are
# kept in `ds` only for testing. The recorded output below shows this run
# ended with a KeyboardInterrupt.
ds = sample_test.apply(lambda x: scrape_contacts(driver, x))

KeyboardInterrupt: 

## Selenium scroll done
## Scraping results

In [40]:
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

In [41]:
len(contact_elements)

97

In [43]:
d = defaultdict(dict)

In [37]:
# returned None from scrape
# TODO: add None handling to dict construction
#type(contact_elements[19].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view'))

NoneType

In [44]:
# Exploratory build of {co_test: {name: link}} from the scraped list items.
# Relies on contact_elements, d and co_test already being set in the kernel.
for contact in contact_elements:
    name = contact.find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view')
    # Some list items have no title div (find returns None); skip them
    # instead of failing on .text.
    if name is None:
        continue
    name = name.text.strip()
    link = 'https://www.linkedin.com' + contact.a['href']

    # d is a defaultdict, so d[co_test] creates the inner dict on first use.
    # Dedupe against the names already stored, not against the company string.
    if name and name not in d[co_test]:
        d[co_test][name] = link

In [45]:
d

defaultdict(dict,
            {'NVIDIA': {'Steve Stepan Kehayan': 'https://www.linkedin.com/in/steve-stepan-kehayan-77b3753/',
              'Bella Yanovsky': 'https://www.linkedin.com/in/bella-yanovsky-4a17ba3/',
              'Thaddeus Dickens': 'https://www.linkedin.com/in/thaddeusdickens/',
              'Linda Trias': 'https://www.linkedin.com/in/lindatrias2005/',
              'Marilyn Ibanez': 'https://www.linkedin.com/in/marilyn-ibanez-1b7347141/',
              'Larry Gonzales': 'https://www.linkedin.com/in/larrygonzales1/',
              'Jennifer Jones': 'https://www.linkedin.com/in/jenniferannkim/',
              'Meryl Kaiser': 'https://www.linkedin.com/in/merylkaiser/',
              'Lisa Calderon': 'https://www.linkedin.com/in/lisacal/',
              'Anita Rexinger': 'https://www.linkedin.com/in/rexinger/',
              'DAN PATEL': 'https://www.linkedin.com/in/recruiterdan/',
              'Julia Tyson': 'https://www.linkedin.com/in/julia-tyson-b935ab/',
           

In [None]:
cos_scrape = cos.apply(lambda x: scrape_contacts(driver, x))

### Unused below
### ~~defining flow for `li_login` function~~

In [3]:
# Step-by-step prototype of the login flow (same steps as li_login).
# Pull email/password from the external keys file.
email, pw = get_login()
# Start a Chrome webdriver session.
driver = webdriver.Chrome()
# Open the LinkedIn landing page.
driver.get('https://www.linkedin.com/')
# Type the email into the login form.
driver.find_element_by_id('session_key').send_keys(email)
# Type the password into the login form.
driver.find_element_by_id('session_password').send_keys(pw)
# The bare string below is the cell's last expression, so the notebook
# echoes it as the cell output.
'''TODO: .click() with xpath'''

# driver.find_element_by_id('homepage-basic_signin-form_submit-button').click()

'TODO: .click() with xpath'

In [5]:
driver.find_element_by_id('session_password').send_keys(Keys.RETURN)
# press enter to log in

In [4]:
co_test = 'apple'

In [5]:
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(co_test)

In [6]:
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(Keys.RETURN)
# enter search for co. name

In [7]:
co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div[2]/div/div/div/ul/li[1]/div/div/div[2]/a/h3'

In [8]:
driver.find_element_by_xpath(co_x_path).click()
# click co. link

In [9]:
ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'

In [10]:
driver.find_element_by_xpath(ppl_x_path).click()
# click People

## ~~edit button testing~~
### moving to general search, b/c more flexibility in search

In [16]:
gen_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
gal_alum = 'galvanize'
tech_rec = 'technical recruiter'

In [17]:
driver.find_element_by_xpath(gen_search_xpath).send_keys(gal_alum)
driver.find_element_by_xpath(gen_search_xpath).send_keys(Keys.RETURN)

In [37]:
r = driver.page_source

In [38]:
soup = BeautifulSoup(r, 'html.parser')

In [39]:
results = soup.find('ul', 'org-people-profiles-module__profile-list')

In [60]:
len(results)

3007

In [41]:
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

### iterate through indices of `contact_elements` to pull k-v pairs

In [56]:
name = contact_elements[1].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view')

In [58]:
name = name.text.rstrip().replace(' ', '', 2)

In [22]:
#k example
name = contact_elements[-1].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view').text.rstrip()
name = name.replace(' ', '', 2)
name

'Antwan Little'

In [23]:
#v example
#link to contact
link = 'https://www.linkedin.com' + contact_elements[-1].a['href']
link

'https://www.linkedin.com/in/antwanlittle/'

In [45]:
co_test = 'apple'

In [69]:
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

d = defaultdict(dict)
# Number of list items skipped because they have no profile-title div.
cnt = 0
for contact in contact_elements:
    name = contact.find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view')
    if name is None:
        # `cnt=+1` reset the counter to 1 on every miss; increment it instead.
        cnt += 1
        continue
    name = name.text.strip()
    link = 'https://www.linkedin.com' + contact.a['href']

    # d is a defaultdict, so d[co_test] creates the inner dict on first use.
    # Dedupe against the names already stored, not against the company string.
    if name and name not in d[co_test]:
        d[co_test][name] = link

In [70]:
len(d['apple'])

697

In [66]:
d

defaultdict(dict,
            {'apple': {'Joy Baughman': 'https://www.linkedin.com/in/joy-baughman/',
              'Jaquala Couch': 'https://www.linkedin.com/in/jaquala-couch-a74102116/',
              'Sanam Delrooz, CIR': 'https://www.linkedin.com/in/sanamdelrooz/',
              'Janelle Parkin': 'https://www.linkedin.com/in/janelleparkin/',
              'Tiffany Landayan': 'https://www.linkedin.com/in/tiffany-landayan-9b46847/',
              'Kimberly Janes': 'https://www.linkedin.com/in/kimberlypuzzo/',
              'Kal Luben': 'https://www.linkedin.com/in/kallubenrecruiter/',
              'Joseph Verdugo': 'https://www.linkedin.com/in/jverdugo78/',
              'Lauren Bokum Alaee': 'https://www.linkedin.com/in/laurenalaee/',
              'Jenny Voges': 'https://www.linkedin.com/in/jennyvoges/',
              'Long Phung': 'https://www.linkedin.com/in/lphung/',
              'Nataly Cortes': 'https://www.linkedin.com/in/natalygracecortes/',
              'Kashan Shami': '

In [49]:
d

defaultdict(dict,
            {'apple': {'Joy Baughman': 'https://www.linkedin.com/in/joy-baughman/',
              'Jaquala Couch': 'https://www.linkedin.com/in/jaquala-couch-a74102116/',
              'Sanam Delrooz, CIR': 'https://www.linkedin.com/in/sanamdelrooz/',
              'Janelle Parkin': 'https://www.linkedin.com/in/janelleparkin/',
              'Tiffany Landayan': 'https://www.linkedin.com/in/tiffany-landayan-9b46847/',
              'Kimberly Janes': 'https://www.linkedin.com/in/kimberlypuzzo/',
              'Kal Luben': 'https://www.linkedin.com/in/kallubenrecruiter/',
              'Joseph Verdugo': 'https://www.linkedin.com/in/jverdugo78/',
              'Lauren Bokum Alaee': 'https://www.linkedin.com/in/laurenalaee/',
              'Jenny Voges': 'https://www.linkedin.com/in/jennyvoges/',
              'Long Phung': 'https://www.linkedin.com/in/lphung/',
              'Nataly Cortes': 'https://www.linkedin.com/in/natalygracecortes/',
              'Kashan Shami': '

OK, interacting with LI works just fine. ~~Need to refine 'add' edu "Galvanize" and~~ Move to general search for flexibility. Able to search a co. for my desired fields... Need to scrape the portion of the page with the returned employees. 

--> ~~beautifulsoup to scrape return cells for (k) employee name (v) url to profile~~

--> Script just this portion; only need a DF out for the list of co's to search

--> MongoDB for k-v's

--> Dataframes

*perhaps load into psql for py wrapper struc. RDBMS experience*