In [3]:
import json
import os
import re
import time
from datetime import datetime
from getpass import getpass

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
def write_result_to_json(results, filename):
    """Append ``results`` to the JSON array stored in ``filename``.

    The file is read, extended with ``results`` and rewritten in full.

    Args:
        results: List of JSON-serialisable records to append.
        filename: Path of the JSON file. It is created (together with its
            parent directory) when it does not exist yet.

    Raises:
        TypeError: If a record cannot be serialised to JSON. The existing
            file content is left untouched in that case.
    """
    try:
        with open(filename, 'r') as f:
            data = json.load(f)
    except (FileNotFoundError, ValueError):
        # First write to this file, or the file is empty / not valid JSON:
        # start from an empty list instead of failing.
        data = []

    data += results
    # Serialise before opening for writing: opening with 'w' truncates the
    # file, so a serialisation error afterwards would destroy earlier results.
    payload = json.dumps(data)

    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, 'w') as f:
        f.write(payload)

In [5]:
def check_exists_by_class_name(webelement, class_name):
    """Tell whether ``webelement`` contains an element with ``class_name``.

    Returns ``True`` when the lookup succeeds and ``False`` when it raises
    ``NoSuchElementException``.
    """
    try:
        webelement.find_element_by_class_name(class_name)
    except NoSuchElementException:
        return False
    else:
        return True

In [6]:
class ProfileHeader:
    """Header data of one LinkedIn profile, stamped with its creation time."""

    def __init__(self, profile_id, profile_name, headline, email, num_connection):
        self.profile_id = profile_id
        # The public profile URL is rebuilt from the id.
        self.profile_link = "".join(("https://www.linkedin.com/in/", profile_id, "/"))
        self.profile_name = profile_name
        self.headline = headline
        self.email = email
        self.num_connection = num_connection
        # Local wall-clock time at which this record was built.
        self.updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def dictionarize(self):
        """Return the instance attribute dictionary (not a copy)."""
        return vars(self)

class Experience:
    """One work-experience entry of a profile, stamped with its creation time."""

    def __init__(self, profile_id, company, position, beg_date, end_date, location):
        self.profile_id = profile_id
        self.company = company
        self.position = position
        # Stored under ``begin_date`` although the parameter is ``beg_date``.
        self.begin_date = beg_date
        self.end_date = end_date
        self.location = location
        # Local wall-clock time at which this record was built.
        self.updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def dictionarize(self):
        """Return the instance attribute dictionary (not a copy)."""
        return vars(self)

class Education:
    """One education entry of a profile, stamped with its creation time."""

    def __init__(self, profile_id, school, degree_name, fos, start_year, end_year):
        self.profile_id = profile_id
        # ``school`` and ``fos`` are stored under longer attribute names.
        self.school_name = school
        self.degree_name = degree_name
        self.field_of_study = fos
        self.start_year = start_year
        self.end_year = end_year
        # Local wall-clock time at which this record was built.
        self.updated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def dictionarize(self):
        """Return the instance attribute dictionary (not a copy)."""
        return vars(self)

In [7]:
def extract_from_education_section(education_section, profile_id):
    """Collect every education entry found inside ``education_section``.

    Uses the module-level ``browser`` (not a parameter) to wait for the
    section to be present, which therefore has to exist before the call.

    Args:
        education_section: Web element wrapping the education cards.
        profile_id: Identifier copied into every returned record.

    Returns:
        A list of dictionaries, one per card, as produced by
        ``Education.dictionarize()``.
    """
    # Wait (up to 60 s) until the education section is present in the page.
    section_locator = (By.CLASS_NAME, "education-section")
    WebDriverWait(browser, 60).until(EC.presence_of_element_located(section_locator))

    # Click the "see more" control when there is one.
    if check_exists_by_class_name(education_section, "pv-profile-section__see-more-inline"):
        education_section.find_element_by_class_name('pv-profile-section__see-more-inline').click()

    educations = []
    cards = education_section.find_elements_by_class_name("pv-profile-section__sortable-card-item")
    for card in cards:
        school = card.find_element_by_class_name("pv-entity__school-name").text
        degree_name = fos = start_year = end_year = None

        # Degree and field of study share one class; the element's full class
        # string tells them apart. The value is the last line of the text.
        for title in card.find_elements_by_class_name("pv-entity__secondary-title"):
            if "degree" in title.get_attribute("class"):
                degree_name = title.text.split('\n')[-1]
            if "fos" in title.get_attribute("class"):
                fos = title.text.split('\n')[-1]

        if check_exists_by_class_name(card, "pv-entity__dates"):
            # The dates are on the second text line; every run of characters
            # other than letters, digits and spaces becomes a plain '-'.
            raw_dates = card.find_element_by_class_name("pv-entity__dates").text.split("\n")[1]
            year = re.sub('[^0-9a-zA-Z ]+', '-', raw_dates)
            year_parts = year.split(' - ')
            start_year = year_parts[0]
            # Without any separator the end year is recorded as "Present".
            end_year = year_parts[-1] if '-' in year else "Present"

        record = Education(profile_id, school, degree_name, fos, start_year, end_year)
        educations.append(record.dictionarize())
    return educations

In [8]:
def extract_from_experience_section(experience_section, profile_id):
    """Collect every work-experience entry found inside ``experience_section``.

    Uses the module-level ``browser`` (not a parameter) to wait for the
    section to be present, which therefore has to exist before the call.

    Args:
        experience_section: Web element wrapping the experience cards.
        profile_id: Identifier copied into every returned record.

    Returns:
        A list of dictionaries as produced by ``Experience.dictionarize()``.
    """
    # Wait (up to 60 s) until the experience section is present in the page.
    section_locator = (By.CLASS_NAME, "experience-section")
    WebDriverWait(browser, 60).until(EC.presence_of_element_located(section_locator))

    # Click the "see more" control when there is one.
    expander_class = 'pv-profile-section__see-more-inline'
    if check_exists_by_class_name(experience_section, expander_class):
        experience_section.find_element_by_class_name(expander_class).click()

    # Two card flavours are collected: sortable cards first, then "v2" cards.
    cards = experience_section.find_elements_by_class_name('pv-profile-section__sortable-card-item')
    cards = cards + experience_section.find_elements_by_class_name('pv-profile-section__card-item-v2')

    work_experiences = []
    for card in cards:
        parsed = extract_experience_from_experience_sortable_card_item(card, profile_id)
        work_experiences.extend(experience.dictionarize() for experience in parsed)
    return work_experiences

In [9]:
def extract_experience_from_experience_sortable_card_item(experience_sortable_card_item, profile_id):
    """Parse one experience card into a list of ``Experience`` objects.

    The card layout is recognised from the first class name of each ``div``
    inside the card:

    * ``pv-entity__summary-info``: parsed by
      ``extract_from_entity_summary_info`` into a single experience;
    * ``pv-entity__company-details`` together with
      ``pv-entity__company-summary-info``: parsed by
      ``extract_from_entity_summary_info_v2``.

    Args:
        experience_sortable_card_item: Web element of the card.
        profile_id: Identifier copied into every returned record.

    Returns:
        A list of ``Experience`` objects. The list is empty when the card
        matches neither layout, so callers can always iterate the result
        (previously ``None`` was returned, which broke iteration).
    """
    container = set()
    for card_item in experience_sortable_card_item.find_elements_by_tag_name("div"):
        # A div without a class attribute may yield None or ""; skip it.
        class_attribute = card_item.get_attribute("class")
        if class_attribute:
            container.add(class_attribute.split(" ")[0])

    if "pv-entity__summary-info" in container:
        entity = experience_sortable_card_item.find_element_by_class_name("pv-entity__summary-info")
        return [extract_from_entity_summary_info(entity, profile_id)]
    if "pv-entity__company-details" in container and "pv-entity__company-summary-info" in container:
        return extract_from_entity_summary_info_v2(experience_sortable_card_item, profile_id)
    # Unknown layout: nothing to extract.
    return []

In [10]:
def extract_from_entity_summary_info_v2(sortable_card_item, profile_id):
    """Parse a card that lists several positions under one company.

    Args:
        sortable_card_item: Web element of the experience card.
        profile_id: Identifier copied into every returned record.

    Returns:
        A list with one ``Experience`` per role item, all sharing the company.
    """
    company_summary = sortable_card_item.find_element_by_class_name("pv-entity__company-summary-info")
    # The company name is the last text line of the heading.
    company = company_summary.find_element_by_tag_name("h3").text.split("\n")[-1]

    # Keep only the list items whose class marks them as a role.
    role_items = []
    for list_item in sortable_card_item.find_elements_by_tag_name("li"):
        if "pv-entity__position-group-role-item" in list_item.get_attribute("class"):
            role_items.append(list_item)

    experiences = []
    for role_item in role_items:
        summary = role_item.find_element_by_class_name("pv-entity__summary-info-v2")
        position = summary.find_element_by_tag_name("h3").text.split("\n")[-1]

        # The range is on the second text line; every run of characters other
        # than letters, digits and spaces becomes a plain '-'.
        raw_range = summary.find_element_by_class_name("pv-entity__date-range").text.split("\n")[1]
        range_parts = re.sub('[^0-9a-zA-Z ]+', '-', raw_range).split(' - ')
        from_date, to_date = range_parts[0], range_parts[1]

        location = None
        if check_exists_by_class_name(summary, "pv-entity__location"):
            location = summary.find_element_by_class_name("pv-entity__location").text.split("\n")[-1]

        experiences.append(Experience(profile_id, company, position, from_date, to_date, location))
    return experiences

In [11]:
def extract_from_entity_summary_info(entity, profile_id):
    """Parse a single-position summary element into one ``Experience``.

    Args:
        entity: Web element carrying the ``pv-entity__summary-info`` content.
        profile_id: Identifier copied into the returned record.

    Returns:
        The ``Experience`` described by ``entity``.
    """
    position = entity.find_element_by_tag_name("h3").text
    company = entity.find_element_by_class_name("pv-entity__secondary-title").text

    # The location is optional; its value is the last text line.
    location = None
    if check_exists_by_class_name(entity, "pv-entity__location"):
        location = entity.find_element_by_class_name("pv-entity__location").text.split("\n")[-1]

    # The range is on the second text line; every run of characters other
    # than letters, digits and spaces becomes a plain '-'.
    raw_range = entity.find_element_by_class_name("pv-entity__date-range").text.split("\n")[1]
    range_parts = re.sub('[^0-9a-zA-Z ]+', '-', raw_range).split(' - ')
    from_date, to_date = range_parts[0], range_parts[1]
    return Experience(profile_id, company, position, from_date, to_date, location)

In [12]:
def get_profile_link(browser, profile):
    """Read the contact e-mail and the connection count of the open profile.

    The links of the top card are scanned: a link whose class contains
    ``contact-info`` gives the contact page, one whose class contains
    ``connection`` gives the connection count. When a contact link exists it
    is opened, ``mailto:`` anchors are searched for an address, and the first
    button whose class contains ``dismiss`` is clicked.

    Args:
        browser: Selenium driver currently showing the profile page.
        profile: Profile URL. Not used here; kept for interface compatibility.

    Returns:
        A ``(contact_email, connection)`` tuple. ``contact_email`` is a string
        or ``None``; ``connection`` is an ``int`` or ``None`` when the link is
        absent or its text holds no digits.
    """
    top_card_links = browser.find_element_by_class_name(
        "pv-top-card-v2-section__links").find_elements_by_tag_name("a")
    contact_info_link = None
    contact_email = None
    connection = None
    for link in top_card_links:
        class_name = str(link.get_attribute("class"))
        if "contact-info" in class_name:
            contact_info_link = link.get_attribute("href")
        elif "connection" in class_name:
            digits = re.sub('[^0-9]+', '', link.text)
            # int('') would raise ValueError; leave the count unset instead.
            if digits:
                connection = int(digits)

    if contact_info_link is not None:
        browser.get(str(contact_info_link))
        for anchor in browser.find_elements_by_tag_name("a"):
            # Anchors without an href attribute yield None.
            href = anchor.get_attribute("href")
            if href and href.lower().startswith("mailto:"):
                # Keep the address only: drop the scheme and any "?subject=..." part.
                contact_email = href[len("mailto:"):].split("?", 1)[0]
        for button in browser.find_elements_by_tag_name("button"):
            if "dismiss" in (button.get_attribute("class") or ""):
                button.click()
                break

    return (contact_email, connection)

def get_profile_header(browser, profile):
    """Build the header record of the profile currently shown in ``browser``.

    Args:
        browser: Selenium driver showing the profile page.
        profile: Profile URL; its second-to-last ``/``-separated segment is
            used as the profile id.

    Returns:
        The dictionary produced by ``ProfileHeader.dictionarize()``.
    """
    profile_id = profile.split("/")[-2]
    name_text = browser.find_element_by_class_name("pv-top-card-section__name").text
    headline_text = browser.find_element_by_class_name("pv-top-card-section__headline").text
    contact_email, connection_count = get_profile_link(browser, profile)
    return ProfileHeader(
        profile_id, name_text, headline_text, contact_email, connection_count
    ).dictionarize()

In [13]:
def extract_linkedin_information(browser, profile):
    """Scrape one profile and append its data to the output JSON files.

    Experiences, educations and the profile header are written to
    ``output_data/experiences.json``, ``output_data/educations.json`` and
    ``output_data/profiles.json``. When an element is missing or stale the
    page is scrolled with ``explore_page`` and the extraction is retried,
    skipping the parts that were already written.

    Args:
        browser: Selenium driver with an authenticated session.
        profile: Profile URL; its second-to-last ``/``-separated segment is
            used as the profile id.

    Raises:
        StaleElementReferenceException, NoSuchElementException: When the
            extraction still fails after ``max_attempt`` retries.
    """
    profile_id = profile.split("/")[-2]
    browser.get(profile)
    attempt, max_attempt = (1, 3)
    # One flag per part, so a retry never rewrites a part that already
    # succeeded. (These flags were never set before: other names were assigned
    # instead, which duplicated output on retry and overwrote ``profile``.)
    get_experience, get_education, get_profile = (False, False, False)
    while True:
        try:
            if not get_experience:
                experience_section = browser.find_element_by_id("experience-section")
                write_result_to_json(
                    extract_from_experience_section(experience_section, profile_id),
                    "output_data/experiences.json")
                get_experience = True
            if not get_education:
                education_section = browser.find_element_by_id("education-section")
                write_result_to_json(
                    extract_from_education_section(education_section, profile_id),
                    "output_data/educations.json")
                get_education = True
            if not get_profile:
                write_result_to_json(
                    [get_profile_header(browser, profile)], "output_data/profiles.json")
                get_profile = True
            return
        except (StaleElementReferenceException, NoSuchElementException):
            if attempt > max_attempt:
                raise
            # Scrolling through the page is used as the recovery step before
            # the remaining parts are tried again.
            explore_page(browser)
            attempt += 1

In [14]:
def explore_page(browser, offsets=(100, 500, 1000, 2000, 3000, 0), pause=2):
    """Scroll the page through a series of vertical offsets.

    Args:
        browser: Selenium driver whose current page is scrolled.
        offsets: Vertical scroll positions in pixels, visited in order. The
            default ends at 0, i.e. back at the top of the page.
        pause: Seconds to sleep after each scroll.

    Scrolling is best effort: an error raised for one offset is ignored and
    the next offset is tried. Only ``Exception`` subclasses are ignored, so a
    kernel interrupt (``KeyboardInterrupt``) still stops the run.
    """
    for offset in offsets:
        try:
            browser.execute_script("window.scrollTo(0, {})".format(offset))
            time.sleep(pause)
        except Exception:
            # Deliberately ignored: a failed scroll must not abort the scrape.
            continue

### Load the List of User Profile URLs

In [15]:
# Load the profile URLs to scrape, one per line of the text file.
with open("linkedin_user_profile_list.txt", 'r') as f:
    # rstrip("\n") removes only the line terminator, so a last line that has
    # no trailing newline keeps its final character (slicing with [:-1] cut it).
    profile_id_list = [line.rstrip("\n") for line in f]

In [16]:
# Position of a known profile URL in the list. Presumably the last profile
# handled by an earlier run; the scraping loop below starts at the next index.
profile_id_list.index("https://www.linkedin.com/in/alberttriadrian/")

635

## Extract Profile

In [18]:
# Login to LinkedIn first
# The driver path and the credentials are read from the environment so that no
# secret is stored in the notebook; missing credentials are asked for
# interactively (the password without echo).
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH", r"D:\Data Science Software\chromedriver")
linkedin_username = os.environ.get("LINKEDIN_USERNAME") or input("LinkedIn e-mail: ")
linkedin_password = os.environ.get("LINKEDIN_PASSWORD") or getpass("LinkedIn password: ")

browser = webdriver.Chrome(executable_path=chromedriver_path)
browser.get('https://www.linkedin.com')
username_input = browser.find_element_by_id("login-email")
password_input = browser.find_element_by_id('login-password')
login_button = browser.find_element_by_id('login-submit')

username_input.send_keys(linkedin_username)
password_input.send_keys(linkedin_password)
login_button.click()

In [None]:
# Index of the first profile to scrape in this run.
start = 636
for i, profile in enumerate(profile_id_list[start:]):
    # Throttle: pause 90 seconds before every fifth profile, the first included.
    if not i % 5:
        time.sleep(90)
    try:
        extract_linkedin_information(browser, profile)
        # Report progress on every tenth profile that succeeded.
        if not i % 10:
            print("Successfully write {} profile. Latest: {}".format(i+start, profile))
    except Exception as e:
        # Record the failure and carry on with the next profile.
        write_result_to_json(
            [{'profile': profile, 'exception': str(e)}], "exceptions.json")

Successfully write 666 profile. Latest: https://www.linkedin.com/in/wahyuutomo101/
Successfully write 676 profile. Latest: https://www.linkedin.com/in/ario-dean-wirawan/
Successfully write 686 profile. Latest: https://www.linkedin.com/in/putu-eka-pramudita-bb295927/
Successfully write 696 profile. Latest: https://www.linkedin.com/in/demetrius-bagas-primanto/
Successfully write 716 profile. Latest: https://www.linkedin.com/in/hafidahadiani/
Successfully write 726 profile. Latest: https://www.linkedin.com/in/diaz-ekaputra-6985bb130/
Successfully write 766 profile. Latest: https://www.linkedin.com/in/azka-fathininta-windaningrum-830229125/
