This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/.

#### Importing libraries

In [None]:
import datetime
import getpass
import json
import operator
import os
import re
import time
from random import randint

import gender_guesser.detector as gg
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from gender import GenderDetector
from nameparser import HumanName
from requestium import Session,Keys
from selenium.common.exceptions import NoSuchElementException

#### Creates a webdriver session

In [None]:
# Opens the Viadeo sign-in page; the login form itself is filled in manually
def login_viadeo():
    """Start a Chrome-backed requestium session on the Viadeo sign-in page.

    Only the sign-in page is loaded here; no credentials are submitted by
    this function.

    Returns:
        The requestium ``Session`` whose driver has the sign-in page loaded.
    """
    # Append 'headless' to this list to avoid opening the browser window
    chrome_arguments = ['--incognito', '--no-sandbox']

    browser_session = Session(
        webdriver_path='/usr/bin/chromedriver',
        browser='chrome',
        default_timeout=15,
        webdriver_options={'arguments': chrome_arguments},
    )
    browser_session.driver.get('https://secure.viadeo.com/en/signin')

    return browser_session

#### Professions considered per country

In [None]:
# Returns a dict composed by lists of professions written in different languages
def get_professions():
    """Return the profession search terms, grouped by language code.

    Returns:
        dict: ``{'en': [...], 'es': [...], 'fr': [...]}`` where every list
        holds the lower-case profession names used as search queries.

    Note:
        The lists are not index-aligned across languages (for example the
        Spanish list places "médico" later than "doctor" sits in the English
        one), so do not translate by position.
    """
    professions_en = [
        "developer",
        "teacher",
        "lawyer",
        "doctor",
        "veterinarian",
        "economist",
        "designer",
        "consultant",
        "architect",
        "dentist",
        "mechanic",
        "scientist",
        "chef",
        "musician",
        "engineer",
        "pharmacist",
        "psychologist",
        "surgeon",
        "judge",
        "journalist",
        "photographer",
        "translator",
        "receptionist",  # was misspelt "recepcionist"
        "pilot",
        "firefighter",
        "police",
        "nurse",
        "writer",
        "plumber",
        "bookseller",
        "instructor",
        "hairdresser",
        "politician",
        "psychiatrist",
        "reporter",
        "salesman",
        "jeweller",
        "gardener",
        "postman",
        "recruiter",
        "surveyor",
        "physiotherapist",
        "radiographer",
        "secretary",
    ]

    professions_es = [
        "desarrollador",
        "profesor",
        "abogado",
        "veterinario",
        "economista",
        "diseñador",
        "consultor",
        "arquitecto",
        "dentista",
        "mecánico",
        "científico",
        "cocinero",
        "músico",
        "ingeniero",
        "farmacéutico",
        "psicólogo",
        "cirujano",
        "juez",
        "periodista",
        "fotógrafo",
        "traductor",
        "recepcionista",
        "piloto",
        "bombero",
        "policía",
        "médico",
        "enfermero",
        "escritor",
        "fontanero",
        "librero",
        "instructor",
        "peluquero",
        "político",
        "psiquiatra",
        "reportero",
        "vendedor",
        "joyero",
        "jardinero",
        "cartero",
        "reclutador",
        "topógrafo",
        "fisioterapeuta",
        "radiógrafo",
        "secretario",
    ]

    professions_fr = [
        "développeur",
        "professeur",
        "avocat",
        "docteur",
        "vétérinaire",
        "économiste",
        "designer",
        "consultant",
        "architecte",
        "dentiste",
        "mécanicien",  # was "mécanique" (the field, not the profession)
        "scientifique",
        "cuisinier",
        "musicien",
        "ingénieur",
        "pharmacien",
        "psychologue",
        "chirurgien",
        "juge",
        "journaliste",
        "photographe",
        "traducteur",
        "réceptionniste",
        "pilote",
        "pompier",
        "police",
        "infirmière",
        "écrivain",
        "plombier",
        "libraire",
        "instructeur",
        "coiffeur",
        "politique",
        "psychiatre",
        "reporter",
        "vendeur",
        "bijoutier",
        "jardinier",
        "facteur",  # was the English word "mailman"
        "recruteur",
        "arpenteur",
        "kinésithérapeute",
        "radiographe",
        "secrétaire",
    ]

    professions = {'en': professions_en, 'es': professions_es, 'fr': professions_fr}
    return professions

#### Gathering 25 profile URLs per profession

In [None]:
# Loads one search-result page and appends the profile links not seen yet
def _collect_result_urls(session, search_url, seen_urls, echo=False):
    """Append the candidate profile links found on *search_url* to *seen_urls*.

    Args:
        session: requestium session whose ``driver`` is used for browsing.
        search_url: full URL of the search-result page to load.
        seen_urls: list mutated in place; links already present are skipped.
        echo: when True, print every newly added link.
    """
    session.driver.get(search_url)
    session.driver.execute_script("document.body.style.zoom='80%'")
    time.sleep(5)

    candidates = session.driver.find_elements_by_xpath('//a[contains(@href,"consultationType=29")]')
    session.driver.execute_script("window.scrollTo(0,0);")
    time.sleep(2)

    # Avoid getting 1st element (url own profile)
    for candidate in candidates[1:]:
        # Scroll each link into view before reading it
        session.driver.execute_script("window.scrollTo(0," + str(candidate.location['y'] + 10) + ");")
        time.sleep(2)
        url = candidate.get_attribute('href')
        if url not in seen_urls:
            seen_urls.append(url)
            if echo:
                print(url)


# Returns a dictionary of the url of the candidates per profession and language
# Format of the dictionary: {'es': {'profession': [url_candidate1,...]},...}
def get_candidates_url(session):
    """Collect candidate profile URLs for every profession and language.

    For each profession the first two search-result pages are visited and
    the distinct profile links are gathered.

    Args:
        session: requestium session whose ``driver`` is used for browsing.

    Returns:
        dict: ``{language: {profession: [profile_url, ...]}}`` with the
        language codes returned by ``get_professions``.

    Raises:
        KeyError: if ``get_professions`` returns a language that has no
            search URL configured here.
    """
    professions_dict = get_professions()

    url_countries = {'es': 'https://www.viadeo.com/en/search/#/?q=',
                     'en': 'https://www.viadeo.com/en/search/#/?facet=%5B%22country%3Agb%22%5D&q=',
                     'fr': 'https://www.viadeo.com/en/search/#/?facet=%5B%22country%3Afr%22%5D&q='}

    url_candidates = {}
    for language, professions in professions_dict.items():
        # Direct lookup: an unsupported language fails loudly instead of
        # silently reusing the URL of the previous language
        url_to_visit = url_countries[language]
        urls = {}

        for p in professions:
            # Random pause between searches
            time.sleep(randint(5,15))

            urls[p] = []
            # Only the links of the first page are echoed, as before
            _collect_result_urls(session, url_to_visit + p, urls[p], echo=True)
            _collect_result_urls(session, url_to_visit + p + '&page=2', urls[p])
            print(p, len(urls[p]))

        url_candidates[language] = urls

    return url_candidates

#### Elasticsearch mapping initialization 

In [None]:
# Returns Elasticsearch instance
def init_elasticsearch():
    """Create the Elasticsearch client and register the candidates index template.

    The template ``template_job_candidates`` applies to the index pattern
    ``job_candidates`` and declares the mapping of the ``doc`` type.

    Returns:
        The ``Elasticsearch`` client bound to ``localhost:9200``.
    """
    def field(es_type):
        # Leaf mapping entry: {"type": <es_type>}
        return {"type": es_type}

    client = Elasticsearch('localhost', port=9200)

    experience_mapping = {
        "type": "nested",
        "properties": {
            "job_position": field("keyword"),
            "company": field("keyword"),
            "date_init": field("keyword"),
            "date_end": field("keyword"),
            "location": field("keyword"),
        },
    }
    education_mapping = {
        "type": "nested",
        "properties": {
            "school": field("keyword"),
            "name": field("keyword"),
            "year_init": field("integer"),
            "year_end": field("integer"),
        },
    }
    accomplishments_mapping = {
        "type": "nested",
        "properties": {
            "certificates": field("integer"),
            "languages": field("keyword"),
            "courses": field("integer"),
            "publications": field("integer"),
        },
    }

    properties = {
        "timestamp": field("date"),
        "source": field("keyword"),
        "ranking": field("integer"),
        "query": field("keyword"),
        "country": field("keyword"),
        "has_photo": field("boolean"),
        "is_premium": field("boolean"),
        "experience": experience_mapping,
        "experience_years": field("float"),
        "experience_level": field("keyword"),
        "ethnea": field("keyword"),
        "is_foreigner": field("boolean"),
        "education": education_mapping,
        "accomplishments": accomplishments_mapping,
        "age": field("integer"),
        "name": field("keyword"),
        "surnames": field("text"),
        "gender": field("keyword"),
        "connections": field("integer"),
        "url": field("keyword"),
    }

    template = {
        "index_patterns": ["job_candidates"],
        "settings": {"number_of_shards": 1},
        "mappings": {
            "doc": {
                "_source": {"enabled": True},
                "properties": properties,
            }
        },
    }
    client.indices.put_template(name='template_job_candidates', body=template)

    return client

# Indexes a document into a Elasticsearch index
def insert_to_elasticsearch(es, body):
    """Index *body* as a ``doc`` document in the ``job_candidates`` index.

    Args:
        es: client object exposing an ``index(index=, doc_type=, body=)`` method.
        body: the document to store.
    """
    index_arguments = {
        'index': 'job_candidates',
        'doc_type': 'doc',
        'body': body,
    }
    es.index(**index_arguments)

#### Collecting information present in the candidate's profile

In [None]:
# Returns a HTML element if is present
def get_element_if_exists(method, css_selector):
    """Call ``method(css_selector)`` and return its result, or None if absent.

    The lookup is attempted up to 5 times and stops at the first success.
    Previously all 5 lookups always ran, and a failure on the last one
    discarded an earlier successful result.

    Args:
        method: a finder callable (e.g. a bound ``find_element_by_*`` method).
        css_selector: the locator string handed to *method*.

    Returns:
        Whatever *method* returns, or None when every attempt raised
        ``NoSuchElementException``.
    """
    for _ in range(5):
        try:
            return method(css_selector)
        except NoSuchElementException:
            # Retry immediately; no delay is applied between attempts
            continue
    return None

# Converts the text of the contacts counter into an integer
def _parse_connections(counter_text):
    """Return the contact count in *counter_text* as an int, or None.

    Spaces, thousands dots and the "+" marker are dropped first, so
    ``"1.000 +"`` becomes ``1000``.
    """
    digits = counter_text.strip().replace(" ", "").replace(".", "").replace("+", "")
    try:
        return int(digits)
    except ValueError:
        return None


# Returns the basic information (name, premium, photo, connections) of a candidate
def get_basic_info(session):
    """Read the header information of the profile currently loaded.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        dict with keys ``name``, ``surnames``, ``connections``,
        ``has_photo`` and ``is_premium``. ``name``, ``surnames`` and
        ``connections`` are None when they cannot be read from the page.
    """
    time.sleep(5)

    MAX_ATTEMPTS = 3

    name = None
    surnames = None

    # Maximum of 3 attempts to collect the name. Every iteration counts as an
    # attempt, so a page without headings can no longer loop forever.
    for _ in range(MAX_ATTEMPTS):
        headings = get_element_if_exists(session.driver.find_elements_by_tag_name, "h1")
        if headings:
            # Only scroll when at least one heading exists (avoids IndexError)
            session.driver.execute_script("window.scrollTo(0," + str(headings[0].location['y']) + ");")
        time.sleep(5)

        # The complete name is read from the second h1 heading of the page
        if headings and len(headings) > 1:
            raw_name = headings[1].text.strip().replace('\n', '')
            parsed_name = HumanName(raw_name)
            name = parsed_name.first
            # strip() drops the leading space left when there is no middle name
            surnames = (parsed_name.middle + " " + parsed_name.last).strip()
            break

    time.sleep(5)

    # Premium profile
    badge = get_element_if_exists(session.driver.find_element_by_css_selector, "span.profile-premium-icon")
    premium = bool(badge)

    # Photo present: the lookup targets one specific image URL, and finding it
    # is treated as "no photo"
    # NOTE(review): presumably that URL is Viadeo's default avatar — confirm
    avatar = get_element_if_exists(session.driver.find_element_by_xpath, "//img[@src='//static8.viadeo-static.com/kAsAmZypf-o_fFGfmsNsk6WnqwE=/140x140/member/0024yj3iq0wlq2d%3Fts%3D1320760759000']")
    photo = not avatar

    # Number of connections, read from the second counter element
    connections = None
    counters = get_element_if_exists(session.driver.find_elements_by_css_selector, "span.profile-card__contacts__count")
    if counters and len(counters) > 1:
        connections = _parse_connections(counters[1].text)

    return {'name': name, 'surnames': surnames, 'connections': connections, 'has_photo': photo, 'is_premium': premium}

# Converts a duration label of the experience timeline into years
def _duration_to_years(duration_text):
    """Return the years expressed by a label such as ``"2 ans 3 mois"``.

    A number followed by "an"/"ans" counts as years and a number followed by
    a word starting with "m" counts as months. Labels with no recognisable
    number yield 0.0 instead of raising ``ValueError``.
    """
    total_years = 0.0

    years_match = re.search(r'(\d+(?:[.,]\d+)?)\s*an', duration_text)
    if years_match:
        total_years += float(years_match.group(1).replace(',', '.'))

    months_match = re.search(r'(\d+(?:[.,]\d+)?)\s*m', duration_text)
    if months_match:
        total_years += float(months_match.group(1).replace(',', '.')) / 12.0

    return total_years


# Returns the experience information of a candidate
def get_experience_info(session):
    """Collect the work experience entries of the profile currently loaded.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        dict with ``experience`` (list of dicts with ``job_position``,
        ``company``, ``date_init`` and ``date_end``) and
        ``experience_years`` (sum of the entry durations, rounded to two
        decimals).
    """
    # Experience section
    experience_section = get_element_if_exists(session.driver.find_element_by_css_selector, "div.profile-experience")

    # List of experiences to fill
    list_exp = []

    # Sum of years of experience
    experience_years = 0

    if experience_section:
        session.driver.execute_script("window.scrollTo(0," + str(experience_section.location['y']) + ");")
        time.sleep(3)

        # Get experience elements
        experiences = experience_section.find_elements_by_css_selector("section.profile-positions")

        for e in experiences:
            # Scroll to the entry before reading it
            session.driver.execute_script("window.scrollTo(0," + str(e.location['y'] + 50) + ");")
            time.sleep(3)

            # Position name
            job_position = e.find_element_by_css_selector('h1.title').text.strip()

            # Company name: the first word of the line is dropped
            company = e.find_element_by_css_selector('p.location.mbn').text.strip().split()
            company = ' '.join(company[1:])

            # Dates: taken by token position, skipping the first token and the
            # separator between the two dates
            date_range = get_element_if_exists(e.find_element_by_css_selector, 'p.period')
            date_init = None
            date_end = None
            if date_range:
                date_range = date_range.text.strip().split()
                if len(date_range) >= 5:
                    date_init = " ".join(date_range[1:3])
                    date_end = " ".join(date_range[4:])
                elif len(date_range) == 4:
                    date_init = date_range[1]
                    date_end = date_range[3]

            # Accumulate years of experience
            duration = get_element_if_exists(e.find_element_by_css_selector, 'div.timeline-duration')
            if duration:
                experience_years += _duration_to_years(duration.text)

            # Collect them in a list
            list_exp.append({'job_position': job_position,
                             'company': company,
                             'date_init': date_init,
                             'date_end': date_end})

    return {'experience': list_exp, 'experience_years': round(experience_years, 2)}

# Returns the educational information of a candidate
def get_education_info(session):
    """Collect the education entries of the profile currently loaded.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        dict ``{"education": [...]}`` where each item has ``school``,
        ``name``, ``year_init`` and ``year_end``. The two year fields are
        None when the period text has neither 4 nor 6 tokens.
    """
    # Token count of the period text -> positions of (start year, end year)
    year_token_positions = {6: (2, 5), 4: (1, 3)}

    driver = session.driver
    entries = []

    # NOTE(review): the container is looked up with the same selector the
    # experience section uses ("div.profile-experience") — confirm it is intended
    container = get_element_if_exists(driver.find_element_by_css_selector, "div.profile-experience")
    if not container:
        return {"education": entries}

    driver.execute_script("window.scrollTo(0," + str(container.location['y']) + ");")
    time.sleep(3)

    for item in container.find_elements_by_css_selector('section.profile-educations'):
        driver.execute_script("window.scrollTo(0," + str(item.location['y']+50) + ");")
        time.sleep(3)

        school_name = item.find_element_by_css_selector('h1.title').text
        degree = item.find_element_by_css_selector('p.location.mbn').text

        start_year = None
        end_year = None
        period = get_element_if_exists(item.find_element_by_css_selector, 'p.period')
        if period:
            tokens = period.text.strip().split()
            positions = year_token_positions.get(len(tokens))
            if positions is not None:
                start_year = tokens[positions[0]]
                end_year = tokens[positions[1]]
                # An ongoing education is closed with the current year
                if end_year == 'Present':
                    end_year = datetime.datetime.now().year

        entries.append({
            'school': school_name,
            'name': degree,
            'year_init': start_year,
            'year_end': end_year,
        })

    # Leave the page scrolled at the skills section when it exists
    skills = get_element_if_exists(driver.find_element_by_css_selector, "div#profile-skills")
    if skills:
        driver.execute_script("window.scrollTo(0," + str(skills.location['y']+50) + ");")
        time.sleep(2)

    return {"education": entries}

# Returns the accomplishments information of a candidate
def get_accomplishments_info(session):
    """Collect the languages listed on the profile currently loaded.

    The former lookup of ``'div#profile-skills'`` was removed: it passed a CSS
    selector to ``find_element_by_class_name`` and its result was never used.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        dict ``{'accomplishments': {'languages': [...]}}``; the list is empty
        when the languages section is not present.
    """
    languages = []

    languages_section = get_element_if_exists(session.driver.find_element_by_css_selector, 'div.profile-languages')
    if languages_section:
        session.driver.execute_script("window.scrollTo(0," + str(languages_section.location['y']) + ");")
        languages_li = languages_section.find_elements_by_css_selector('h3.profile-list__name')
        languages = [language.text for language in languages_li]

    return {'accomplishments': {'languages': languages}}

#### Estimating information not present in the candidate's profile 

In [None]:
# Maps the one-letter codes 'f' / 'm' to 'female' / 'male'
def _normalize_gender_code(code):
    """Return 'female' for 'f', 'male' for 'm' and 'unknown' otherwise."""
    if code == 'f':
        return 'female'
    if code == 'm':
        return 'male'
    return 'unknown'


# Picks the most voted gender among the estimations
def _majority_gender(votes):
    """Return the label ('female', 'male' or 'unknown') with most votes.

    Votes with any other value are ignored. Ties are resolved in the order
    female, male, unknown, because ``max`` keeps the first maximum.
    """
    counts = {'female': 0, 'male': 0, 'unknown': 0}
    for vote in votes:
        if vote in counts:
            counts[vote] += 1
    return max(counts.items(), key=operator.itemgetter(1))[0]


# Returns the estimated gender of a candidate
def gender_estimation(name):
    """Estimate the gender of a first name by majority vote of three sources.

    The sources are ``gender_guesser``, ``GenderDetector`` and the 'Genni'
    field of the Illinois ethnea web service. A failure of the web service
    counts as an 'unknown' vote.

    Args:
        name: first name of the candidate; may be None or empty.

    Returns:
        dict ``{'gender': 'female' | 'male' | 'unknown'}``.
    """
    # Without a name there is nothing to estimate
    if not name:
        return {'gender': 'unknown'}

    # Source 1: gender_guesser ("mostly_male" / "mostly_female" are reduced
    # to "male" / "female")
    gender_gg = gg.Detector().get_gender(name)
    if "mostly" in gender_gg:
        gender_gg = gender_gg.split("_")[1]

    # Source 2: GenderDetector
    gender_gd = _normalize_gender_code(GenderDetector().gender(name))

    # Source 3: Illinois web service. `params` URL-encodes the name.
    gender_illinois = None
    try:
        response = requests.get('http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py',
                                params={'Fname': name, 'format': 'json'},
                                timeout=30)
        if response.status_code == 200:
            # The body is parsed after turning single quotes into double quotes
            gender_illinois = json.loads(response.text.replace("'", "\""))['Genni'].lower()
    except (requests.RequestException, ValueError, KeyError, AttributeError):
        # Best effort: network or parsing problems only lose this vote
        gender_illinois = None
    gender_illinois = _normalize_gender_code(gender_illinois)

    # Get the gender with higher number of occurrences (by voting)
    return {'gender': _majority_gender([gender_gg, gender_gd, gender_illinois])}
    
# Returns the estimated age of a candidate
def estimate_age(list_edu):
    """Estimate the age from the start year of the last education entry.

    The age is the distance between that year and the current year, plus 18.

    Args:
        list_edu: list of education dicts as built by ``get_education_info``;
            only the ``year_init`` key of the last item is read.

    Returns:
        dict ``{'age': int | None}``. The age is None when the list is empty,
        when the year is missing, or when it is not a valid integer.
    """
    age = None
    if list_edu:
        # The last entry of the list is taken as the first year of university
        first_year = list_edu[-1].get('year_init')

        if first_year:
            try:
                age = abs(int(first_year) - datetime.datetime.now().year) + 18
            except (TypeError, ValueError):
                # Non-numeric token scraped as year: no estimation
                age = None

    return {'age': age}

# Returns the estimated experience level of a candidate
def estimate_experience_level(experience_years):
    """Map a number of years of experience to a category label.

    Buckets: [0, 3] entry-level, (3, 5] mid-level, (5, 8] senior-level and
    above 8 'amateur-level'. Negative values give None.

    Args:
        experience_years: years of experience as a number.

    Returns:
        dict ``{'experience_level': str | None}``.
    """
    # NOTE(review): the bucket above 8 years is labelled 'amateur-level';
    # confirm that this label is intended before relying on it
    upper_bounds = ((3, 'entry-level'), (5, 'mid-level'), (8, 'senior-level'))

    if experience_years >= 0:
        for upper_bound, label in upper_bounds:
            if experience_years <= upper_bound:
                return {'experience_level': label}
        return {'experience_level': 'amateur-level'}

    return {'experience_level': None}

# Returns the estimated ethnea of a candidate
def estimate_ethnicity(name, surnames, country):
    """Estimate the ethnicity of a candidate and whether they look foreign.

    The 'Ethnea' field of the Illinois ethnea web service is queried with the
    name and surnames. The candidate counts as a local when the label
    contains the marker of the country ('hispanic' for Spain, 'french' for
    France, 'english' for United Kingdom); any other case is a foreigner.

    Args:
        name: first name; may be None.
        surnames: surnames; may be None.
        country: "Spain", "France" or "United Kingdom".

    Returns:
        dict ``{'is_foreigner': bool | None, 'ethnicity': str | None}``.
        Both values are None when the names are missing or the service
        cannot be queried or parsed.
    """
    foreigner = None
    ethnicity = None

    # Nothing can be asked to the service without both parts of the name
    if name is None or surnames is None:
        return {'is_foreigner': foreigner, 'ethnicity': ethnicity}

    # Substring of the ethnea label that identifies a local of each country
    local_markers = {'Spain': 'hispanic', 'France': 'french', 'United Kingdom': 'english'}

    try:
        # `params` URL-encodes the names (spaces, accents, '&', ...)
        response = requests.get('http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py',
                                params={'Fname': name, 'Lname': surnames, 'format': 'json'},
                                timeout=30)
        if response.status_code == 200:
            # The body is parsed after turning single quotes into double quotes
            ethnicity = json.loads(response.text.replace("'", "\""))['Ethnea'].lower()

            # Foreigner unless the label carries the marker of the country.
            # The United Kingdom case used to assign an undefined name, which
            # left English candidates flagged as foreigners.
            marker = local_markers.get(country)
            foreigner = not (marker is not None and marker in ethnicity)
    except (requests.RequestException, ValueError, KeyError, AttributeError):
        # Best effort: keep whatever was estimated before the failure
        pass

    return {'is_foreigner': foreigner, 'ethnicity': ethnicity}

#### Joining all information about a candidate

In [None]:
# Visits a candidate's profile, saves its HTML and returns all the collected information
def collect_profile(url, session, rank_position, query, country):
    """Scrape one candidate profile and build its document.

    The page source is always written to
    ``viadeo_html/<country>/<query>/<rank_position>.html``, even when the
    page turns out to be an error page.

    Args:
        url: URL of the candidate's profile.
        session: requestium session whose ``driver`` is used for browsing.
        rank_position: position of the candidate in the search results.
        query: profession used as search query.
        country: country label of the search.

    Returns:
        dict with the scraped and estimated fields plus ``source``, ``url``,
        ``ranking``, ``query``, ``country`` and ``timestamp``. When the page
        shows an error, only ``'error': True`` and those metadata keys are
        returned.
    """
    # Random pause before visiting the profile
    time.sleep(randint(10,40))
    session.driver.get(url)
    session.driver.execute_script("document.body.style.zoom='70%'")
    session.driver.execute_script("window.scrollTo(0,0);")

    # Keep a copy of the raw HTML of the profile
    query_directory = os.path.join("viadeo_html", country, query)
    os.makedirs(query_directory, exist_ok=True)

    html_content = session.driver.page_source
    html_path = os.path.join(query_directory, str(rank_position) + ".html")
    # Explicit UTF-8 so non-ASCII pages do not depend on the platform default
    with open(html_path, "w", encoding="utf-8") as html_file:
        html_file.write(html_content)

    generic_error_list = session.driver.find_elements_by_xpath('//h3[contains(text(),"An error occurred")]')
    not_found_list = session.driver.find_elements_by_xpath('//h3[contains(text(),"Page not found")]')

    if not not_found_list and not generic_error_list:
        # Information present in the profile
        dict_basic_info = get_basic_info(session)
        dict_exp_info = get_experience_info(session)
        dict_edu_info = get_education_info(session)
        dict_accomp_info = get_accomplishments_info(session)

        # Information estimated from the collected values
        dict_age_est = estimate_age(dict_edu_info['education'])
        dict_gender_estimation = gender_estimation(dict_basic_info['name'])
        dict_level_est = estimate_experience_level(dict_exp_info['experience_years'])
        dict_ethnicity_est = estimate_ethnicity(dict_basic_info['name'], dict_basic_info['surnames'], country)

        # Join all information
        profile = {}
        for partial_info in (dict_basic_info, dict_exp_info, dict_edu_info, dict_accomp_info,
                             dict_age_est, dict_gender_estimation, dict_level_est, dict_ethnicity_est):
            profile.update(partial_info)
    else:  # Page not found or generic error page
        print('page not found')
        profile = {'error': True}

    # Metadata shared by successful and failed profiles
    profile.update({'source': 'viadeo', 'url': url, 'ranking': rank_position, 'query': query,
                    'country': country, 'timestamp': datetime.datetime.now()})

    # Full information of the candidate
    return profile


# Collects and stores all the information of a candidate into ES
def collect_and_store_info(url, session, rank_position, query, country, es):
    """Scrape a candidate profile, retrying while it looks empty, and index it.

    The profile is collected at most 3 times. Collection stops as soon as
    the profile has both education and experience entries, or immediately
    when the profile is flagged with ``error``. The last profile collected
    is the one stored.

    Args:
        url: URL of the candidate's profile.
        session: browsing session handed to ``collect_profile``.
        rank_position: position of the candidate in the search results.
        query: profession used as search query.
        country: country label of the search.
        es: Elasticsearch client handed to ``insert_to_elasticsearch``.
    """
    MAX_ATTEMPTS = 3

    profile = None
    for _ in range(MAX_ATTEMPTS):
        # Gather profile's info
        profile = collect_profile(url, session, rank_position, query, country)

        # Error pages are stored as they are, without retrying
        if profile.get('error') is not None:
            break

        # A profile with both sections filled needs no further attempt
        if profile['education'] and profile['experience']:
            break

    # Store in ES
    insert_to_elasticsearch(es, profile)
    
# Scrapes and stores every candidate of every query, printing timing statistics
def scraping_viadeo(session, url_candidates_dict, country):
    """Scrape all candidates of *url_candidates_dict* and index them.

    Args:
        session: browsing session handed to ``collect_and_store_info``.
        url_candidates_dict: mapping ``{query: [profile_url, ...]}``.
        country: country label stored with every profile.
    """
    # Init Elasticsearch Instance
    es = init_elasticsearch()

    # Key: query used for searching candidates; value: urls of their profiles
    for query, url_list in url_candidates_dict.items():
        header = "Scraping " + str(len(url_list)) + " candidates using as query '" + query +"'..."
        print(header)

        if not url_list:
            continue

        # The ranking is the position of the url within the list, from 0
        times = []
        for rank_position, url in enumerate(url_list):
            start = time.time()

            # Collect and store information about candidate
            collect_and_store_info(url, session, rank_position, query, country, es)

            times.append(time.time() - start)

        # Execution times
        total_seconds = sum(times)
        print("\t- Avg. time:", round(float(total_seconds/len(times))/60.0,2),"min")
        print("\t- Total time:", round(float(total_seconds)/60.0,2),"min")

#### Main executor

In [None]:
# Open the Viadeo sign-in page; the login itself is done manually in the browser
session = login_viadeo()

#### Collect the URLs of the candidates whose content will be scraped

In [None]:
# Candidates per query on Viadeo for all countries: Spain, France and UK
url_candidates = get_candidates_url(session)

# Store candidates' url into a file
with open("viadeo_urls.txt", "w") as f:
    f.write(json.dumps(url_candidates))

# Read the file back once: a second read() on the same handle returns an
# empty string, which made the later json.loads calls fail
with open("viadeo_urls.txt", "r") as f:
    stored_url_candidates = json.loads(f.read())

url_candidates_es = stored_url_candidates['es']
url_candidates_en = stored_url_candidates['en']
url_candidates_fr = stored_url_candidates['fr']

#### Start scraping

In [None]:
# Gathering information from Viadeo France
# (url_candidates_fr_rest was never defined; use the urls loaded for 'fr')
scraping_viadeo(session, url_candidates_fr, country="France")

In [None]:
# Gathering information from Viadeo Spain
# (url_candidates_es_rest was never defined; use the urls loaded for 'es')
scraping_viadeo(session, url_candidates_es, country="Spain")

In [None]:
# Gathering information from Viadeo UK
# (url_candidates_uk_rest was never defined; the UK urls are loaded as 'en')
scraping_viadeo(session, url_candidates_en, country="United Kingdom")