In [1]:
# libraries used in production
from collections import defaultdict

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import chromedriver_binary
from bs4 import BeautifulSoup


# testing
from pymongo import MongoClient
from time import sleep
import pandas as pd

In [2]:
def get_login(path='../data/LI_login.txt'):
    '''
    Read session keys from an external file, one key per line.

    Parameters
    ----------
    path: (str)
        Location of the credentials file. Defaults to '../data/LI_login.txt',
        resolved relative to the current working directory.

    Returns
    ----------
    creds: (list)
        Return keys used for session, in file order, with newline
        characters removed.

    Raises
    ----------
    OSError
        If the file cannot be opened.
    '''
    # The context manager closes the handle even if reading fails.
    with open(path, 'r') as f:
        creds = [key.replace('\n', '') for key in f]

    return creds

In [3]:
def li_login():
    '''
    Log in to LinkedIn and return the webdriver session for further web
    manipulation (e.g. searching in the LinkedIn head search bar).

    Parameters
    ----------
    None:

    Returns
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Return webdriver session for web manipulation.

    Raises
    ----------
    ValueError
        If the credentials file does not yield exactly two entries
        (email, password).
    Exception
        Any error raised while loading the page or filling in the form is
        re-raised after the browser window has been closed.
    '''
    # Local import: the generic find_element(By.<strategy>, value) call works on
    # both Selenium 3 and 4, whereas the find_element_by_* helpers were removed
    # in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # session keys for LI instance
    email, pw = get_login()

    # selenium webdriver
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.linkedin.com/')
        # log in
        driver.find_element(By.ID, 'session_key').send_keys(email)
        # Look the password field up once and reuse it to submit the form.
        pw_field = driver.find_element(By.ID, 'session_password')
        pw_field.send_keys(pw)
        pw_field.send_keys(Keys.RETURN)
    except Exception:
        # Do not leave an orphaned browser process behind on a failed login.
        driver.quit()
        raise

    return driver

In [4]:
def scrape_contacts(driver, co):
    '''
    Search company (co) in LinkedIn head search bar and click the first
    company link on the results page, reporting whether that worked.

    Scraping the company's contacts is not implemented in this function
    yet; it currently only performs the search and click.

    Parameters
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Webdriver session for web manipulation.
    co: (str)
        Company string to search in head search bar.

    Returns
    ----------
    result: (tuple)
        ('PASS', co) when the company link was clicked and the search bar
        was cleared; ('FAIL', co) when either of those steps raised.
    '''
    # Local import: find_element(By.XPATH, ...) works on both Selenium 3 and 4,
    # whereas find_element_by_xpath was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # XPaths
    # Head search bar input
    srch_x_path = '//*[@id="ember16"]/input'
    # Click first item when search company
    co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'

    # Fixed pauses, presumably to let the page finish rendering before each step.
    sleep(5)
    driver.find_element(By.XPATH, srch_x_path).send_keys(co + Keys.RETURN)
    sleep(2)
    try:
        driver.find_element(By.XPATH, co_x_path).click()
        driver.find_element(By.XPATH, srch_x_path).clear()
        return 'PASS', co
    except Exception:
        # Best effort: report the failure instead of aborting a batch run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be stopped.
        driver.find_element(By.XPATH, srch_x_path).clear()
        return 'FAIL', co




In [5]:
# Open a Chrome session and log in to LinkedIn; `driver` is reused by the cells below.
driver = li_login()

In [7]:
# Load the company listing CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../data/glassdoor_scrape_1.csv')

In [8]:
# Preview the first 10 rows.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the output look like
# index columns written by earlier to_csv calls - confirm and drop them if so.
df.head(10)

Unnamed: 0.1,Unnamed: 0,name,size,industry,job_url,overview_url
0,0,University of Texas at Austin,10000+ Employees,Colleges & Universities,glassdoor.com/Jobs/University-of-Texas-at-Aust...,glassdoor.com/Overview/Working-at-University-o...
1,1,Apple,10000+ Employees,Computer Hardware & Software,glassdoor.com/Jobs/Apple-Jobs-E1138.htm,glassdoor.com/Overview/Working-at-Apple-EI_IE1...
2,2,Dell Technologies,10000+ Employees,IT Services,glassdoor.com/Jobs/Dell-Technologies-Jobs-E132...,glassdoor.com/Overview/Working-at-Dell-Technol...
3,3,IBM,10000+ Employees,IT Services,glassdoor.com/Jobs/IBM-Jobs-E354.htm,glassdoor.com/Overview/Working-at-IBM-EI_IE354...
4,4,National Instruments,5001 to 10000 Employees,Electrical & Electronic Manufacturing,glassdoor.com/Jobs/National-Instruments-Jobs-E...,glassdoor.com/Overview/Working-at-National-Ins...
5,5,NVIDIA,10000+ Employees,Computer Hardware & Software,glassdoor.com/Jobs/NVIDIA-Jobs-E7633.htm,glassdoor.com/Overview/Working-at-NVIDIA-EI_IE...
6,6,Atlassian,1001 to 5000 Employees,Computer Hardware & Software,glassdoor.com/Jobs/Atlassian-Jobs-E115699.htm,glassdoor.com/Overview/Working-at-Atlassian-EI...
7,7,Silicon Labs,1001 to 5000 Employees,Electrical & Electronic Manufacturing,glassdoor.com/Jobs/Silicon-Labs-Jobs-E9122.htm,glassdoor.com/Overview/Working-at-Silicon-Labs...
8,8,SailPoint Technologies,1001 to 5000 Employees,Enterprise Software & Network Solutions,glassdoor.com/Jobs/SailPoint-Technologies-Jobs...,glassdoor.com/Overview/Working-at-SailPoint-Te...
9,9,Facebook,10000+ Employees,Internet,glassdoor.com/Jobs/Facebook-Jobs-E40772.htm,glassdoor.com/Overview/Working-at-Facebook-EI_...


In [18]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) avoids mutating the frame in place and keeps the
# statement chainable.
df = df.dropna()

In [19]:
# Boolean mask: True where the industry text contains "College" (case-sensitive).
# regex=False makes this a plain substring test, matching the `in` check it
# replaces; na=False maps a missing industry to False instead of raising.
edu_flag = df['industry'].str.contains('College', regex=False, na=False)

In [20]:
# Count how many rows are flagged as education (True) vs. everything else (False).
edu_flag.value_counts()

False    5330
True       59
Name: industry, dtype: int64

In [28]:
# Unique names of the rows NOT flagged as education, in first-seen order.
cos = pd.Series(df.loc[~edu_flag, 'name'].unique())

In [31]:
# Unique names of the rows flagged as education, in first-seen order.
edu = pd.Series(df.loc[edu_flag, 'name'].unique())

In [None]:
# Show every company name except the last four.
cos[:-4]

In [36]:
# Run the LinkedIn search once per company name; each element of the result is
# whatever scrape_contacts returns for that company.
test_series = cos.apply(lambda company: scrape_contacts(driver, company))

KeyboardInterrupt: 

In [35]:
# Inspect the per-company results.
# NOTE(review): this cell's execution count (35) is lower than that of the cell
# that assigns test_series above (36), so the output shown here comes from an
# earlier run - re-run top to bottom to refresh it.
test_series

1       (FAIL, Dell Technologies)
2                     (PASS, IBM)
3    (FAIL, National Instruments)
4                  (FAIL, NVIDIA)
5               (PASS, Atlassian)
dtype: object

In [56]:
# Spot-check the search flow with a single company name.
scrape_contacts(driver, '3m')

### The cells below are not used by the functions above; they are kept here for reference.

In [135]:
# Step-by-step version of the login flow, kept for reference.

# Pull the email/password pair from the external credentials file.
email, pw = get_login()
# Instantiate the Chrome webdriver.
driver = webdriver.Chrome()
# Open the LinkedIn landing page.
driver.get('https://www.linkedin.com/')
# Type the email into the login form.
driver.find_element_by_id('session_key').send_keys(email)
# Type the password into the login form.
driver.find_element_by_id('session_password').send_keys(pw)
# Submit the form by clicking its button, located by absolute XPath.
driver.find_element_by_xpath('/html/body/main/section[1]/div[2]/form/button').click()
# NOTE: the bare string below is an expression, not a comment; as the last
# expression in the cell it is echoed as the cell's output.
'''TODO: .click() with xpath'''

# Alternative kept for reference: click the sign-in button by its element id.
# driver.find_element_by_id('homepage-basic_signin-form_submit-button').click()

'TODO: .click() with xpath'

In [136]:
# Type the company name into the head search bar.
# NOTE(review): `co_test` is not assigned in any cell shown in this notebook;
# this line relies on leftover kernel state - define co_test before running.
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(co_test)

In [137]:
# Press Enter in the head search bar to submit the company-name search.
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(Keys.RETURN)

In [22]:
# Absolute XPath to the heading inside the first list item's link on the results
# page (presumably the top company hit). Absolute paths like this break whenever
# the page layout changes.
co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div[2]/div/div/div/ul/li[1]/div/div/div[2]/a/h3'

In [23]:
# Click the company link. The recorded run of this cell raised
# NoSuchElementException, i.e. the XPath did not match the page at that time.
driver.find_element_by_xpath(co_x_path).click()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[7]/div[3]/div/div[2]/div/div[2]/div/div/div/ul/li[1]/div/div/div[2]/a/h3"}
  (Session info: chrome=84.0.4147.89)


In [48]:
# Absolute XPath to the fifth link in the page's nav list, used below as the
# "People" tab.
ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'

In [49]:
# Open the People tab.
driver.find_element_by_xpath(ppl_x_path).click()

## ~~edit button testing~~
### moving to general search, b/c more flexibility in search

In [11]:
# Absolute XPath of the text input used for the general keyword search.
gen_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
# Keywords to search for.
gal_alum = 'galvanize'
# NOTE(review): tech_rec is not used by any cell shown in this notebook.
tech_rec = 'technical recruiter'

In [12]:
# Type the 'galvanize' keyword into the general search input...
driver.find_element_by_xpath(gen_search_xpath).send_keys(gal_alum)
# ...then press Enter to run the search.
driver.find_element_by_xpath(gen_search_xpath).send_keys(Keys.RETURN)

In [13]:
# Snapshot the HTML source of the page currently loaded in the browser.
r = driver.page_source

In [14]:
# Parse the snapshot with Python's built-in html.parser backend.
soup = BeautifulSoup(r, 'html.parser')

In [94]:
# First <ul> with CSS class 'org-people-profiles-module__profile-list'
# (a string passed as the second argument to find() is matched against the
# class attribute). find() returns None when nothing matches.
results = soup.find('ul', 'org-people-profiles-module__profile-list')

In [95]:
# len() of a BeautifulSoup Tag is the number of its direct children (text nodes
# included), so this is not necessarily the number of profile cards.
len(results)

101

In [96]:
# Collect every <li> with CSS class 'org-people-profiles-module__profile-item'
# under the list; the cells below read one contact from each.
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

### Iterate through the indices of `contact_elements` to pull key-value (name → link) pairs

In [103]:
#k example
name = contact_elements[0].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view').text.rstrip()
name = name.replace(' ', '', 2)
name

'Erin Gong'

In [107]:
#v example
#link to contact
# Absolute profile URL: site root + the href of the first <a> inside the card
# (the href is expected to be a site-relative path such as '/in/<id>/').
link = 'https://www.linkedin.com' + contact_elements[0].a['href']
link

'https://www.linkedin.com/in/eringong/'

In [118]:
# for loop and dictionary pop init
d = defaultdict(dict)

for contact in contact_elements:
    name = contact.find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view').text.rstrip()
    name = name.replace(' ', '', 2)
    link = 'https://www.linkedin.com' + contact.a['href']
    
    if co_test not in d:
        d[co_test]
        
    if name not in co_test:
        d[co_test][name] = link
        

In [119]:
# Display the collected {company: {name: link}} mapping.
# NOTE(review): the rendered output contains real people's names and profile
# URLs - clear or redact it before sharing this notebook.
d

defaultdict(dict,
            {'apple': {'Erin Gong': 'https://www.linkedin.com/in/eringong/',
              'Kendal Holcombe (Chilcott)': 'https://www.linkedin.com/in/kendal-holcombe/',
              'Ryan Hunter': 'https://www.linkedin.com/in/ryan-hunter-1313a53b/',
              'Bob Mickus': 'https://www.linkedin.com/in/bobmickus/',
              'Isaac Lessard': 'https://www.linkedin.com/in/isaaclessard/',
              'Zhifan (Jeff) Sang': 'https://www.linkedin.com/in/zfsang/',
              'Sierra Murphy': 'https://www.linkedin.com/in/sierra-murphy-81b75ba2/',
              'Hao Ding': 'https://www.linkedin.com/in/haoding1/',
              'Udaiveer Singh': 'https://www.linkedin.com/in/udaiveers/',
              'Kevin Becerra': 'https://www.linkedin.com/in/kevinbece55/',
              'Charlie (Changsong) Ding': 'https://www.linkedin.com/in/changsongding/',
              'Bryce Schmidtchen': 'https://www.linkedin.com/in/bryceschmidtchen/',
              'Omar Sobh': 'https://

OK, interacting with LI works fine. ~~Need to refine 'add' edu "Galvanize" and~~ Moved to general search for flexibility. Able to search a company for my desired fields... Still need to scrape the portion of the page that lists the returned employees.

--> ~~beautifulsoup to scrape return cells for (k) employee name (v) url to profile~~

--> Script just this portion. Just need a DataFrame out for the list of companies to search.

--> MongoDB for k-v's

--> Dataframes

*Perhaps load into PostgreSQL (psql) through a Python wrapper, for structured RDBMS experience.*