In [2]:
import pickle
from collections import defaultdict
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import chromedriver_binary
from bs4 import BeautifulSoup

from pymongo import MongoClient

In [3]:
def get_login():
    '''
    Read the LinkedIn session credentials from an external file.

    Parameters
    ----------
    None:

    Returns
    ----------
    creds: (list)
        One string per line of '../../data/LI_login.txt', in file order,
        with newline characters removed.
    '''
    # The context manager closes the file even if reading raises;
    # previously the handle was opened and never closed.
    with open('../../data/LI_login.txt', 'r') as f:
        return [line.replace('\n', '') for line in f.readlines()]

In [4]:
def li_login():
    '''
    Start a Chrome webdriver session and sign in to LinkedIn.
    The returned session is what later cells use to drive the site.

    Parameters
    ----------
    None:

    Returns
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Webdriver session after the sign-in form has been submitted.
    '''
    # credentials come from the external login file
    email, pw = get_login()

    # open the LinkedIn landing page in a fresh browser
    driver = webdriver.Chrome()
    driver.get('https://www.linkedin.com/')

    # (element id, keys to send), in order: email, password, then Enter
    # in the password field to submit the form
    keystrokes = (
        ('session_key', email),
        ('session_password', pw),
        ('session_password', Keys.RETURN),
    )
    for field_id, keys in keystrokes:
        driver.find_element_by_id(field_id).send_keys(keys)

    return driver

In [5]:
def scrape_contacts(driver, co):
    '''
    Search company (co) in the LinkedIn head search bar and try to open the
    first company result. Reports whether the click worked; it does not
    collect contacts yet.

    Parameters
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Webdriver session for web manipulation.
    co: (str)
        Company string to search in head search bar.

    Returns
    ----------
    result: (tuple)
        ('PASS', co) when the company link was found and clicked,
        ('FAIL', co) when locating or clicking it raised an exception.
    '''
    # XPaths
    # NOTE(review): the `ember16` id and the absolute path look brittle --
    # confirm they are stable across sessions and page layouts.
    srch_x_path = '//*[@id="ember16"]/input'
    # First item when searching a company
    co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'

    # fixed waits for the page to render before and after the search
    sleep(5)
    driver.find_element_by_xpath(srch_x_path).send_keys(co + Keys.RETURN)
    sleep(2)

    try:
        driver.find_element_by_xpath(co_x_path).click()
    except Exception:
        # Best effort: any lookup/click failure marks this company as FAIL.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long scrape can still be interrupted.
        status = 'FAIL'
    else:
        status = 'PASS'

    # Reset the search bar so the next company starts from an empty field.
    driver.find_element_by_xpath(srch_x_path).clear()
    return status, co

In [6]:
# Load the Glassdoor scrape and keep only rows with no missing values.
df = pd.read_csv('../../data/glassdoor_scrape_1.csv').dropna()
# True for rows whose industry text contains 'College'.
edu_flag = df['industry'].apply(lambda industry: 'College' in industry)
# Unique company names from the rows that are not flagged.
cos = pd.Series(df.loc[~edu_flag, 'name'].unique())

In [7]:
# Start the browser session and sign in; `driver` is passed to `scrape_contacts`.
driver = li_login()

OSError: [Errno 8] Exec format error: 'chromedriver'

In [None]:
# Run the LinkedIn search once per company name in `cos`.
cos_scrape = cos.apply(lambda company: scrape_contacts(driver, company))

In [None]:
# Persist the scrape results. The context manager closes the file once the
# dump finishes; previously the handle from open() was never closed.
with open("save.pkl", "wb") as fh:
    pickle.dump(cos_scrape, fh)

### Unused below
### ~~defining flow for `li_login` function~~

In [3]:
# Scratch walkthrough of the same steps that `li_login` performs.
# Read the email and password from the external credentials file.
email, pw = get_login()
# Start a Chrome webdriver session.
driver = webdriver.Chrome()
# Open the LinkedIn landing page.
driver.get('https://www.linkedin.com/')
# Type the email into the sign-in form.
driver.find_element_by_id('session_key').send_keys(email)
# Type the password into the sign-in form.
driver.find_element_by_id('session_password').send_keys(pw)
# Bare string: as the cell's last expression it is echoed as the cell output.
'''TODO: .click() with xpath'''

# Alternative submit via the sign-in button, left disabled:
# driver.find_element_by_id('homepage-basic_signin-form_submit-button').click()

'TODO: .click() with xpath'

In [5]:
# Press Enter in the password field to submit the login form.
driver.find_element_by_id('session_password').send_keys(Keys.RETURN)

In [4]:
# Company name used to exercise the search flow by hand.
co_test = 'apple'

In [5]:
# Type the test company name into the search input (not submitted yet).
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(co_test)

In [6]:
# Press Enter in the search input to run the company-name search.
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(Keys.RETURN)

In [7]:
# Absolute XPath of the company link to click in the search results.
# NOTE(review): this is a different path from the `co_x_path` inside `scrape_contacts` -- confirm which one matches the current page.
co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div[2]/div/div/div/ul/li[1]/div/div/div[2]/a/h3'

In [8]:
# Click the company link located by `co_x_path`.
driver.find_element_by_xpath(co_x_path).click()

In [9]:
# Absolute XPath of the "People" navigation link (per the comment on the click below).
ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'

In [10]:
# Click the "People" link located by `ppl_x_path`.
driver.find_element_by_xpath(ppl_x_path).click()

## ~~edit button testing~~
### moving to general search, b/c more flexibility in search

In [11]:
# Absolute XPath of the general search input used in the next cell.
gen_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
# Candidate search terms to type into that input.
gal_alum = 'galvanize'
tech_rec = 'technical recruiter'

In [12]:
# Type the 'galvanize' term into the general search input, then press Enter to run it.
driver.find_element_by_xpath(gen_search_xpath).send_keys(gal_alum)
driver.find_element_by_xpath(gen_search_xpath).send_keys(Keys.RETURN)

In [13]:
# Snapshot the current page's HTML from the browser session.
r = driver.page_source

In [14]:
# Parse the HTML snapshot with the built-in 'html.parser' backend.
soup = BeautifulSoup(r, 'html.parser')

In [94]:
# First <ul> whose class is 'org-people-profiles-module__profile-list'.
results = soup.find('ul', 'org-people-profiles-module__profile-list')

In [95]:
# NOTE(review): len() of a bs4 Tag counts all direct child nodes (including
# whitespace text nodes), so this is likely not the number of contact cards.
len(results)

101

In [96]:
# All <li> elements with class 'org-people-profiles-module__profile-item' inside the list.
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

### iterate through indices of `contact_elements` to pull k-v pairs

In [103]:
# k example: display name taken from the first contact card.
# strip() trims whitespace on both sides. The earlier rstrip() + replace(' ', '', 2)
# removed the first two spaces wherever they were, which would eat the space
# inside the name whenever the text had fewer than two leading spaces.
name = contact_elements[0].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view').text.strip()
name

'Erin Gong'

In [107]:
# v example: absolute profile URL for the first contact card,
# built from the href of the card's first <a> tag.
link = 'https://www.linkedin.com' + contact_elements[0].a['href']
link

'https://www.linkedin.com/in/eringong/'

In [118]:
# Build {company: {contact name: profile URL}} from the scraped contact cards.
title_class = 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view'
d = defaultdict(dict)

for contact in contact_elements:
    title = contact.find('div', title_class)
    anchor = contact.a
    # Skip cards with no title or no profile link instead of raising
    # AttributeError/TypeError/KeyError partway through the loop.
    if title is None or anchor is None or not anchor.get('href'):
        continue

    # strip() trims both sides; the earlier rstrip() + replace(' ', '', 2)
    # could remove the space inside a name.
    name = title.text.strip()
    link = 'https://www.linkedin.com' + anchor['href']

    # Keep the first link seen for each name. The earlier test,
    # `name not in co_test`, was a substring check against the company
    # string rather than a lookup in the dict. defaultdict creates
    # d[co_test] on first access, so no explicit initialisation is needed.
    if name not in d[co_test]:
        d[co_test][name] = link
        

In [119]:
# Display the assembled {company: {name: link}} mapping.
d

defaultdict(dict,
            {'apple': {'Erin Gong': 'https://www.linkedin.com/in/eringong/',
              'Kendal Holcombe (Chilcott)': 'https://www.linkedin.com/in/kendal-holcombe/',
              'Ryan Hunter': 'https://www.linkedin.com/in/ryan-hunter-1313a53b/',
              'Bob Mickus': 'https://www.linkedin.com/in/bobmickus/',
              'Isaac Lessard': 'https://www.linkedin.com/in/isaaclessard/',
              'Zhifan (Jeff) Sang': 'https://www.linkedin.com/in/zfsang/',
              'Sierra Murphy': 'https://www.linkedin.com/in/sierra-murphy-81b75ba2/',
              'Hao Ding': 'https://www.linkedin.com/in/haoding1/',
              'Udaiveer Singh': 'https://www.linkedin.com/in/udaiveers/',
              'Kevin Becerra': 'https://www.linkedin.com/in/kevinbece55/',
              'Charlie (Changsong) Ding': 'https://www.linkedin.com/in/changsongding/',
              'Bryce Schmidtchen': 'https://www.linkedin.com/in/bryceschmidtchen/',
              'Omar Sobh': 'https://

OK, interacting with LI works just fine. ~~Need to refine 'add' edu "Galvanize" and~~ Move to general search for flexibility. Able to search a co. for my desired fields... Need to scrape the portion of the page with the returned employees.

--> ~~beautifulsoup to scrape return cells for (k) employee name (v) url to profile~~

--> script just this portion. Just need a DF out, for the list of co's to search

--> MongoDB for k-v's

--> Dataframes

*perhaps load into psql for py wrapper struc. RDBMS experience*