In [16]:
from collections import defaultdict
import pandas as pd
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import chromedriver_binary
from bs4 import BeautifulSoup

import mongo

Add project overview and nexts steps

In [17]:
def get_login():
    '''
    Access keys from external file and placed in a list.

    Parameters
    ----------
    None:

    Returns
    ----------
    creds: (list)
        Return keys used for session.
    '''
    f = open('../../data/LI_login.txt', 'r')
    creds = f.readlines()

    for idx, key in enumerate(creds):
        creds[idx] = key.replace('\n', '')

    return creds

In [18]:
def li_login():
    '''
    Login into LinkedIn and webdriver session for more web manipulation.
    Sets up flow to search in LinkedIn head search bar.

    Parameters
    ----------
    None:

    Returns
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Return webdriver session for web manipulation.
    '''
    # session keys for LI instance
    email, pw = get_login()
    
    # selenium webdriver
    driver = webdriver.Chrome()
    driver.get('https://www.linkedin.com/')
    # log in
    sleep(2)
    driver.find_element_by_id('session_key').send_keys(email)
    sleep(1)
    driver.find_element_by_id('session_password').send_keys(pw+Keys.RETURN)

    return driver

In [47]:
def scrape_contacts(driver, co):
    '''
    Search company (co) in LinkedIn head search bar and scrape that company's contacts of interest.
    Returns dictionary of {co: {name: link}}

    Parameters
    ----------
    driver: (selenium.webdriver.chrome.webdriver.WebDriver)
        Webdriver session for web manipulation.
    co: (str)
        Company string to search in head search bar.

    Returns
    ----------
    d: (dict)
        Return dictionary for mongo DB insert.
    '''
    global_srch = 'https://www.linkedin.com/search/results/companies/?keywords=&origin=SWITCH_SEARCH_VERTICAL'
    driver.get(global_srch)
    wait = WebDriverWait(driver, 10)
    # XPaths
    srch_x_path = '//*[@id="ember16"]/input'
#     co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
#     co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
#     co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
#     ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'
    ppl_search_xpath = '//*[@id="people-search-keywords"]'
#     ppl_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
    gal_alum = 'galvanize'
    tech_rec = 'technical recruiter'
    
    
    driver.find_element_by_xpath(srch_x_path).send_keys(co + Keys.RETURN)
    sleep(3)
    r = driver.page_source
    soup = BeautifulSoup(r, 'html.parser')
    first_hit = soup.find_all('a')[16]['id']
    
    if first_hit == 'globalfooter-accessibility':
        mongo.insert_one({co: 'Company Page 404'})
    
    up = ActionChains(driver)
    up.send_keys(Keys.HOME)
    up.perform()
    sleep(3)
    driver.find_element_by_id(first_hit).click()
    
    try:
        wait.until(EC.element_to_be_clickable((By.LINK_TEXT, 'People')))
        driver.find_element_by_link_text('People').click()
    except:
        return {co: 'People Link 404'}
        mongo.insert_one({co: 'People Link 404'})
        
    wait.until(EC.element_to_be_clickable((By.XPATH, ppl_search_xpath))) 
    driver.find_element_by_xpath(ppl_search_xpath).send_keys(gal_alum + Keys.RETURN)
    
    wait.until(EC.element_to_be_clickable((By.TAG_NAME, 'ul'))) 
    scroll_to_end(driver, 3)
    r = driver.page_source
    soup = BeautifulSoup(r, 'html.parser')
    # TODO insert mongo raw scrape
    results = soup.find('ul', 'org-people-profiles-module__profile-list')
    
    if results is None:
        mongo.insert_one({co: 'No results'})
        return None
    
    d = construct_record(results, co)
    mongo.insert_one(d)

In [20]:
def construct_record(results, co):
    contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')
    
    d = defaultdict(dict)
    
    for contact in contact_elements:
        name = contact.find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view')
        if name is None:
            continue
        name = name.text.rstrip().replace(' ', '', 2)
        name = name.replace('.', '')
        link = 'https://www.linkedin.com' + contact.a['href']

        if co not in d:
            d[co]

        if name not in co:
            d[co][name] = link
    
    return d

In [21]:
def scroll_to_end(driver, timeout):
    scroll_pause_time = timeout

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        sleep(scroll_pause_time)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # If heights are the same it will exit the function
            break
        last_height = new_height

In [22]:
df = pd.read_csv('../../data/glassdoor_scrape_1.csv')
df.dropna(inplace=True)
edu_flag = df['industry'].apply(lambda x: True if 'College' in x else False)
cos = pd.Series(df.name[~edu_flag].unique())

In [23]:
driver = li_login()
mongo.connect_mongo()
mongo.connect_coll('script_test', 'gal_alum')

In [50]:
df.loc[:0, 'name']

0    University of Texas at Austin
Name: name, dtype: object

In [51]:
edu = df.loc[:0, 'name'].apply(lambda x: scrape_contacts(driver, x))

In [34]:
global_srch = 'https://www.linkedin.com/search/results/companies/?keywords=&origin=SWITCH_SEARCH_VERTICAL'
driver.get(global_srch)
wait = WebDriverWait(driver, 10)
# XPaths
srch_x_path = '//*[@id="ember16"]/input'
#     co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
#     co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
#     co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
#     ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'
ppl_search_xpath = '//*[@id="people-search-keywords"]'
#     ppl_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
gal_alum = 'galvanize'
tech_rec = 'technical recruiter'


driver.find_element_by_xpath(srch_x_path).send_keys(co + Keys.RETURN)
sleep(3)
r = driver.page_source
soup = BeautifulSoup(r, 'html.parser')
first_hit = soup.find_all('a')[16]['id']

In [36]:
r = driver.page_source
soup = BeautifulSoup(r, 'html.parser')
first_hit = soup.find_all('a')[16]['id']

In [37]:
first_hit

'globalfooter-accessibility'

In [None]:
try:
    up = ActionChains(driver)
    up.send_keys(Keys.HOME)
    up.perform()
    sleep(3)
    driver.find_element_by_id(first_hit).click()
except:
    driver.find_element_by_xpath(srch_x_path).clear()
    mongo.insert_one({co: 'Company Page 404'})

## Slice of first 10 companies

In [9]:
sample_test = cos[:10]
sample_test

0                     Apple
1         Dell Technologies
2                       IBM
3      National Instruments
4                    NVIDIA
5                 Atlassian
6              Silicon Labs
7    SailPoint Technologies
8                  Facebook
9                    Google
dtype: object

## Testing scrape function

# PASS first ten companies

In [15]:
scrape_results = sample_test.apply(lambda x: scrape_contacts(driver, x))
scrape_results

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object

### Mongo insert db test

In [26]:
sample_test.apply(lambda x: test_scrape_contacts(driver, x))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
9    None
dtype: object

## Expected results
```python
0    {'Apple': {'Erin Gong': 'https://www.linkedin....
1    {'Dell Technologies': 'People Xpath clickable,...
2    {'IBM': {'James Helfrich': 'https://www.linked...
3               {'National Instruments': 'No results'}
4    {'NVIDIA': {'Deep Narain Singh': 'https://www....
5    {'Atlassian': {'Tristan Rubadeau': 'https://ww...
6                       {'Silicon Labs': 'No results'}
7       {'SailPoint Technologies': 'Company Page 404'}
8    {'Facebook': {'Michael Suttles': 'https://www....
9    {'Google': {'Jennifer Paige': 'https://www.lin...
dtype: object
```

### slice company array for testing

In [None]:
sample_test[:1].apply(lambda x: test_scrape_contacts(driver, x))

# Testing random samples from company list

In [141]:
random_sample = cos.sample(10)
random_sample

1277                            Clarks
2545      Supernova Software Solutions
3463                Hill International
0                                Apple
3978         Global Water Intelligence
392                                BP3
4231    Nicholson Construction Company
2519                      We Are Blood
2457                            Corgan
3835                     Polymershapes
dtype: object

In [142]:
scrape_results = random_sample.apply(lambda x: test_scrape_contacts(driver, x))
scrape_results

1277    {'Clarks': {'Doug Searle': 'https://www.linked...
2545       {'Supernova Software Solutions': 'No results'}
3463                 {'Hill International': 'No results'}
0       {'Apple': {'Erin Gong': 'https://www.linkedin....
3978          {'Global Water Intelligence': 'No results'}
392     {'BP3': {'Michael D. Quiroz': 'https://www.lin...
4231     {'Nicholson Construction Company': 'No results'}
2519                       {'We Are Blood': 'No results'}
2457                             {'Corgan': 'No results'}
3835                      {'Polymershapes': 'No results'}
dtype: object

# Mongo testing

In [4]:
mongo.connect_mongo()

In [5]:
mongo.connect_coll('script_test', 'gal_alum')

In [6]:
mongo.insert_one({'key':'values'})

## Testing waits

### start company search on global company search page

In [None]:
global_srch = 'https://www.linkedin.com/search/results/companies/?keywords=&origin=SWITCH_SEARCH_VERTICAL'
driver.get(global_srch)

In [34]:
wait = WebDriverWait(driver, 10)

In [48]:
srch_x_path = '//*[@id="ember16"]/input'
co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
# co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
co_x_path2 = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a'
ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'
ppl_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
gal_alum = 'galvanize'
tech_rec = 'technical recruiter'

In [None]:
driver.find_element_by_xpath(srch_x_path).send_keys(co + Keys.RETURN)

In [60]:
co_path_2 = '/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[2]/ul/li[1]/div/div/div/div[2]/div[1]/div/div[1]/span/div/span/span/a'

In [61]:
co_x_path2 == co_x_path

True

In [21]:
driver.find_element_by_xpath(co_x_path).click()

In [None]:
driver.find_element_by_xpath(ppl_x_path).click()

In [None]:
driver.find_element_by_xpath(srch_x_path).clear()

In [None]:
driver.find_element_by_xpath(ppl_search_xpath).send_keys(gal_alum + Keys.RETURN)

In [63]:
scroll_to_end(driver, 3)

In [69]:
driver.find_element_by_tag_name('a', 'app-aware-link ember-view').click()

TypeError: find_element_by_tag_name() takes 2 positional arguments but 3 were given

In [73]:
driver.find_element_by_class_name('a', 'app-aware-link ember-view')

TypeError: find_element_by_class_name() takes 2 positional arguments but 3 were given

In [67]:
driver.find_element_by_link_text("BigID").click()

In [78]:
driver.find_element_by_css_selector

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".a"}
  (Session info: chrome=86.0.4240.111)


In [74]:
driver.find_element_by_xpath(co_x_path).click()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[7]/div[3]/div/div[2]/div/div/div/div[1]/div/div/div/div[2]/div[1]/div[1]/span/div/span/span/a"}
  (Session info: chrome=86.0.4240.111)


In [36]:
driver.find_element_by_xpath(ppl_x_path).click()

wait.until(EC.element_to_be_clickable((By.XPATH, ppl_search_xpath)))

driver.find_element_by_xpath(ppl_search_xpath).send_keys(gal_alum + Keys.RETURN)


In [28]:
EC.element_to_be_clickable((By.XPATH, ppl_search_xpath))

## selenium scroll done
## scrapping results

In [40]:
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

In [41]:
len(contact_elements)

97

In [43]:
d = defaultdict(dict)

In [37]:
# returned None from scrape
# TODO: add None handling to dict construction
#type(contact_elements[19].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view'))

NoneType

In [44]:
for contact in contact_elements:
    name = contact.find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view').text.rstrip()
    name = name.replace(' ', '', 2)
    link = 'https://www.linkedin.com' + contact.a['href']
    
    if co_test not in d:
        d[co_test]
        
    if name not in co_test:
        d[co_test][name] = link

In [7]:
#d

NameError: name 'd' is not defined

In [None]:
cos_scrape = cos.apply(lambda x: scrape_contacts(driver, x))

### Unused below
### ~~defining flow for `li_login` function~~

In [3]:
email, pw = get_login()
# pull pw/login from external file
driver = webdriver.Chrome()
# inst chrome webdriver
driver.get('https://www.linkedin.com/')
# open LI page
driver.find_element_by_id('session_key').send_keys(email)
# pass in email
driver.find_element_by_id('session_password').send_keys(pw)
# pass in pw
'''TODO: .click() with xpath'''

# driver.find_element_by_id('homepage-basic_signin-form_submit-button').click()

'TODO: .click() with xpath'

In [5]:
driver.find_element_by_id('session_password').send_keys(Keys.RETURN)
# press enter for to log in

In [4]:
co_test = 'apple'

In [5]:
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(co_test)

In [6]:
driver.find_element_by_xpath('//*[@id="ember16"]/input').send_keys(Keys.RETURN)
# enter search for co. name

In [7]:
co_x_path = '/html/body/div[7]/div[3]/div/div[2]/div/div[2]/div/div/div/ul/li[1]/div/div/div[2]/a/h3'

In [8]:
driver.find_element_by_xpath(co_x_path).click()
# click co. link

In [9]:
ppl_x_path = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[1]/div/div/nav/ul/li[5]/a'

In [10]:
driver.find_element_by_xpath(ppl_x_path).click()
# click People

## ~~edit button testing~~
### moving to general search, b/c more flexibility in search

In [16]:
gen_search_xpath = '/html/body/div[7]/div[3]/div/div[3]/div[2]/div[2]/div[1]/div[2]/div/div/div/input'
gal_alum = 'galvanize'
tech_rec = 'technical recruiter'

In [17]:
driver.find_element_by_xpath(gen_search_xpath).send_keys(gal_alum)
driver.find_element_by_xpath(gen_search_xpath).send_keys(Keys.RETURN)

In [27]:
r = driver.page_source

In [28]:
soup = BeautifulSoup(r, 'html.parser')

In [29]:
results = soup.find('ul', 'org-people-profiles-module__profile-list')

In [30]:
results is None

False

In [107]:
a = soup.find_all('a')

In [108]:
for idx, button in enumerate(a):
    if 'BigID' in button:
        print(idx)

16


In [109]:
a[16]['id']

'ember1589'

In [110]:
soup.find_all('a')[16]['id']

'ember1589'

In [86]:
print(a[0].prettify())

<a class="nav-item__link js-nav-item-link" data-alias="" data-control-name="" data-link-to="feed" data-resource="feed/badge" href="/feed/">
 <span class="nav-item__icon nav-item__icon--inbug" lang="en" role="presentation">
  <li-icon aria-hidden="true" color="brand" size="34dp" type="linkedin-bug">
   <svg focusable="false" preserveaspectratio="xMinYMin meet" xmlns="http://www.w3.org/2000/svg">
    <g class="scaling-icon" style="fill-opacity: 1">
     <defs>
     </defs>
     <g class="bug-14dp" fill="none" fill-rule="evenodd" stroke="none" stroke-width="1">
      <g class="dp-1">
       <path class="bug-text-color" d="M14,1.25 L14,12.75 C14,13.44 13.44,14 12.75,14 L1.25,14 C0.56,14 0,13.44 0,12.75 L0,1.25 C0,0.56 0.56,0 1.25,0 L12.75,0 C13.44,0 14,0.56 14,1.25" fill="#FFFFFF">
       </path>
       <path class="background" d="M14,1.25 L14,12.75 C14,13.44 13.44,14 12.75,14 L1.25,14 C0.56,14 0,13.44 0,12.75 L0,1.25 C0,0.56 0.56,0 1.25,0 L12.75,0 C13.44,0 14,0.56 14,1.25 Z M5,5 L6.85,5 L

In [115]:
driver.find_element_by_id(soup.find_all('a')[16]['id']).click()

In [60]:
len(results)

3007

### iterate through indecies of `contact_elements` to pull k-v pairs

In [None]:
contact_elements = results.find_all('li', 'org-people-profiles-module__profile-item')

In [56]:
#k example to test for NoneType
# used in script functions
name = contact_elements[1].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view')
name = name.text.rstrip().replace(' ', '', 2)
name

In [22]:
#k example
name = contact_elements[-1].find('div', 'org-people-profile-card__profile-title t-black lt-line-clamp lt-line-clamp--single-line ember-view').text.rstrip()
name = name.replace(' ', '', 2)
name

'Antwan Little'

In [23]:
#v example
#link to contact
link = 'https://www.linkedin.com' + contact_elements[-1].a['href']
link

'https://www.linkedin.com/in/antwanlittle/'

Ok, interacting with LI just fine. ~~Need to refine 'add' edu "Galvanize" and~~ Move to general search for flexbility. Able to search co. for my desired fields... Need to scrape portion of return employees. 

--> ~~beautifulsoup to scrape return cells for (k) employee name (v) url to profile~~

--> script Just portion. Just need DF out. for list of co's to search

--> MongoDB for k-v's

--> Dataframes

*perhaps load into psql for py wrapper struc. RDBMS experience*