This work is licensed under the Creative Commons Attribution 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/.

#### Importing libraries

In [None]:
import getpass
import json
import requests
import operator
from bs4 import BeautifulSoup
from requestium import Session,Keys
import gender_guesser.detector as gg
from gender import GenderDetector
import datetime
from elasticsearch import Elasticsearch
import time
from random import randint
from nameparser import HumanName
from selenium.common.exceptions import NoSuchElementException

#### Opening webdriver session

In [None]:
# Login to Linkedin using the webpage form manually
def login_linkedin():
    """Open a Chrome session on the LinkedIn home page.

    Only the browser is launched and the page loaded here; the login form
    itself is filled in by hand in the opened window.

    Returns:
        The requestium ``Session`` driving the browser.
    """
    # Add 'headless' to the 'arguments' list to avoid opening the browser window
    browser_options = {'arguments': ['--incognito']}
    session = Session(
        webdriver_path='/usr/bin/chromedriver',
        browser='chrome',
        default_timeout=60,
        webdriver_options=browser_options,
    )

    # Load the page that hosts the login form
    linkedin_home = "https://www.linkedin.com"
    session.driver.get(linkedin_home)

    return session

#### Professions considered per country

In [None]:
# Returns the list of professions written in the requested language
def get_professions(language):
    """Return the job titles used as search queries for one language.

    Args:
        language: Language code, one of 'en', 'es' or 'fr'.

    Returns:
        A new list of profession names written in that language.

    Raises:
        KeyError: If ``language`` is not one of the supported codes.
    """
    # NOTE(review): the three lists are not aligned entry by entry (the
    # Spanish and French lists contain extra "doctor"-like entries, and the
    # Spanish list contains the verb "cocinar"); confirm whether that is intended.
    professions_en = [
        "developer", "teacher", "lawyer", "veterinarian", "economist",
        "designer", "consultant", "architect", "dentist", "mechanic",
        "scientist", "chef", "musician", "engineer", "pharmacist",
        "psychologist", "surgeon", "judge", "journalist", "photographer",
        "translator",
        "receptionist",  # was misspelled "recepcionist"
        "pilot", "firefighter", "police", "nurse", "writer", "plumber",
        "bookseller", "instructor", "hairdresser", "politician",
        "psychiatrist", "reporter", "salesman", "jeweller", "gardener",
        "postman", "recruiter", "surveyor", "physiotherapist",
        "radiographer", "secretary",
    ]

    professions_es = [
        "desarrollador", "profesor", "abogado", "doctor", "veterinario",
        "economista", "diseñador", "consultor", "arquitecto", "dentista",
        "mecánico", "científico", "cocinero", "músico", "ingeniero",
        "farmacéutico", "psicólogo", "cirujano", "juez", "periodista",
        "fotógrafo", "traductor", "recepcionista", "piloto", "bombero",
        "policía", "médico", "enfermera", "escritor", "fontanero",
        "librero", "instructor", "peluquero", "político", "psiquiatra",
        "reportero", "vendedor", "joyero", "jardinero", "cartero",
        "cocinar", "reclutador", "topógrafo", "fisioterapeuta",
        "radiógrafo", "secretario",
    ]

    professions_fr = [
        "développeur", "professeur", "avocat", "docteur", "vétérinaire",
        "économiste", "designer", "consultant", "architecte", "dentiste",
        "mécanique", "scientifique", "cuisinier", "musicien", "ingénieur",
        "pharmacien", "psychologue", "chirurgien", "juge", "journaliste",
        "photographe", "traducteur", "réceptionniste", "pilote", "pompier",
        "police", "docteur", "infirmière", "écrivain", "plombier",
        "libraire", "instructeur", "coiffeur", "politique", "psychiatre",
        "reporter", "vendeur", "bijoutier", "jardinier",
        "facteur",  # was the untranslated English word "mailman"
        "recruteur", "arpenteur", "kinésithérapeute", "radiographe",
        "secrétaire",
    ]

    professions = {'en': professions_en, 'es': professions_es, 'fr': professions_fr}

    try:
        return professions[language]
    except KeyError:
        raise KeyError(
            "Unsupported language %r; expected one of %s"
            % (language, sorted(professions))
        ) from None

#### Gathering the top 25 profile URLs per job title

In [None]:
# Returns a dictionary with the profile urls of the candidates listed for each profession
# Format of the dictionary: {'profession': [url_candidate1, ...], ...}
def get_candidates_url(session, language):
    """Collect candidate profile URLs for every profession of a language.

    For each profession returned by ``get_professions(language)`` the
    matching LinkedIn title page is opened and the link inside every
    ``h3.name`` heading is collected.

    Args:
        session: Browser session whose ``driver`` is used for navigation.
        language: Language code forwarded to ``get_professions``.

    Returns:
        Dict mapping each profession to the list of profile URLs found.
    """
    urls = {}

    for profession in get_professions(language):
        # Random pause before each search
        time.sleep(randint(5,15))

        # Open the title page of the profession and let it render
        driver = session.driver
        driver.get("https://www.linkedin.com/title/" + profession)
        driver.execute_script("document.body.style.zoom='80%'")
        time.sleep(5)

        # Headings of the candidates listed on the page
        headings = driver.find_elements_by_css_selector('h3.name')
        profile_urls = []
        urls[profession] = profile_urls

        for heading in headings:
            # Scroll to the candidate before reading the profile link
            driver.execute_script("window.scrollTo(0," + str(heading.location['y']) + ");")
            link = heading.find_element_by_tag_name('a')
            profile_urls.append(link.get_attribute('href'))

    return urls

#### Elasticsearch mapping initialization

In [None]:
# Returns Elasticsearch instance
def init_elasticsearch():
    """Connect to the local Elasticsearch node and register the index template.

    The template 'template_job_candidates' applies to the 'job_candidates'
    index and declares the mapping of the 'doc' type.

    Returns:
        The ``Elasticsearch`` client instance.
    """
    # Nested mapping of each work-experience entry
    experience_mapping = {
        "type": "nested",
        "properties": {
            "job_position": {"type": "keyword"},
            "company": {"type": "keyword"},
            "date_init": {"type": "keyword"},
            "date_end": {"type": "keyword"},
            "location": {"type": "keyword"},
        },
    }

    # Nested mapping of each education entry
    education_mapping = {
        "type": "nested",
        "properties": {
            "school": {"type": "keyword"},
            "name": {"type": "keyword"},
            "year_init": {"type": "integer"},
            "year_end": {"type": "integer"},
        },
    }

    # Nested mapping of the accomplishments summary
    accomplishments_mapping = {
        "type": "nested",
        "properties": {
            "certificates": {"type": "integer"},
            "languages": {"type": "keyword"},
            "courses": {"type": "integer"},
            "publications": {"type": "integer"},
        },
    }

    # NOTE(review): the mapping declares 'ethnea' while estimate_ethnicity()
    # returns the key 'ethnicity' -- confirm which field name is intended.
    candidate_properties = {
        "ingest_time": {"type": "date"},
        "source": {"type": "keyword"},
        "ranking": {"type": "integer"},
        "query": {"type": "keyword"},
        "country": {"type": "keyword"},
        "has_photo": {"type": "boolean"},
        "is_premium": {"type": "boolean"},
        "experience": experience_mapping,
        "experience_years": {"type": "float"},
        "experience_level": {"type": "keyword"},
        "ethnea": {"type": "keyword"},
        "is_foreigner": {"type": "boolean"},
        "education": education_mapping,
        "accomplishments": accomplishments_mapping,
        "age": {"type": "integer"},
        "name": {"type": "keyword"},
        "surnames": {"type": "text"},
        "gender": {"type": "keyword"},
        "connections": {"type": "integer"},
        "url": {"type": "keyword"},
    }

    template = {
        "index_patterns": ["job_candidates"],
        "settings": {"number_of_shards": 1},
        "mappings": {
            "doc": {
                "_source": {"enabled": True},
                "properties": candidate_properties,
            }
        },
    }

    es = Elasticsearch('localhost', port=9200)
    es.indices.put_template(name='template_job_candidates', body=template)

    return es

# Indexes a document into a Elasticsearch index
def insert_to_elasticsearch(es, body):
    """Index ``body`` as a 'doc' document in the 'job_candidates' index.

    Args:
        es: Client object exposing an ``index`` method.
        body: Document to store.
    """
    target = {'index': 'job_candidates', 'doc_type': 'doc'}
    es.index(body=body, **target)

#### Collecting information present in the candidate's profile

In [None]:
# Returns a HTML element if is present
def get_element_if_exists(method, css_selector):
    """Look an element up and return it, or None when it cannot be found.

    Args:
        method: Finder callable (e.g. a bound Selenium ``find_element_*``
            method) invoked with ``css_selector``.
        css_selector: Locator string passed to ``method``.

    Returns:
        Whatever ``method`` returns on its first successful call, or None if
        every attempt raised ``NoSuchElementException``.
    """
    max_attempts = 5
    for _ in range(max_attempts):
        try:
            # Stop at the first hit: retrying after a success would only
            # repeat the lookup and could replace the hit with None.
            return method(css_selector)
        except NoSuchElementException:
            continue
    return None

# Returns basic information of a candidate
def _parse_connections(text):
    """Return the first integer contained in ``text``, or None if there is none."""
    for token in text.split():
        # Keep only ASCII digits so labels such as "500+" or "1,234" still parse
        digits = "".join(ch for ch in token if '0' <= ch <= '9')
        if digits:
            return int(digits)
    return None


def get_basic_info(session):
    """Read the header information of the profile currently loaded.

    Args:
        session: Browser session whose ``driver`` shows the profile page.

    Returns:
        Dict with the keys 'name', 'surnames', 'connections', 'has_photo'
        and 'is_premium'. 'name', 'surnames' and 'connections' are None when
        the corresponding element is not found.
    """
    find = session.driver.find_element_by_css_selector

    # Premium profile: the premium badge is present
    premium = bool(get_element_if_exists(find, "a.pv-member-badge__premium-upsell"))

    # Photo present: the placeholder avatar is NOT shown
    photo = not get_element_if_exists(find, "div.ghost-person")

    # Complete name
    name = None
    surnames = None
    full_name = get_element_if_exists(find, "h1.pv-top-card-section__name")
    if full_name:
        parsed_name = HumanName(full_name.text.strip().replace('\n', ''))
        name = parsed_name.first
        # Join only the non-empty parts to avoid a leading space when there
        # is no middle name
        surnames = " ".join(part for part in (parsed_name.middle, parsed_name.last) if part)

    # Number of connections
    connections = None
    connections_label = get_element_if_exists(find, "span.pv-top-card-v2-section__connections")
    if connections_label:
        connections = _parse_connections(connections_label.text)

    # Scroll to highlights section
    highlights_section = get_element_if_exists(find, "section.pv-highlights-section")
    if highlights_section:
        session.driver.execute_script("window.scrollTo(0," + str(highlights_section.location['y']+50) + ");")
        time.sleep(3)

    # Scroll to recent activity section
    recent_activity_section = get_element_if_exists(find, "div.pv-recent-activity-section-v2")
    if recent_activity_section:
        session.driver.execute_script("window.scrollTo(0," + str(recent_activity_section.location['y']+50) + ");")
        time.sleep(3)

    return {'name': name, 'surnames': surnames, 'connections': connections, 'has_photo': photo, 'is_premium': premium}


# Returns the experience information of a candidate
def _scroll_to_y(driver, y):
    """Scroll the browser window vertically to the pixel offset ``y``."""
    driver.execute_script("window.scrollTo(0," + str(y) + ");")


def _duration_to_years(duration_text):
    """Convert a duration label such as '2 yrs 3 mos' into a number of years.

    Every numeric token is paired with the token that follows it (its unit).
    NOTE(review): a unit starting with 'm' is assumed to mean months (mos,
    meses, mois) and any other unit to mean years -- confirm for the
    interface languages actually used.
    """
    tokens = duration_text.split()
    years = 0.0
    for value, unit in zip(tokens, tokens[1:]):
        if not value[:1].isdigit():
            continue
        try:
            amount = float(value)
        except ValueError:
            continue
        if unit.lower().startswith('m'):
            years += amount / 12.0
        else:
            years += amount
    return years


def get_experience_info(session):
    """Collect the work experience listed on the profile currently loaded.

    Args:
        session: Browser session whose ``driver`` shows the profile page.

    Returns:
        Dict with 'experience' (list of dicts with the keys 'job_position',
        'company', 'date_init', 'date_end' and 'location') and
        'experience_years' (sum of the parsed durations, rounded to two
        decimals). 'job_position' is a list when the entry groups several
        positions in the same company, otherwise a string.
    """
    driver = session.driver

    # Experience section
    experience_section = get_element_if_exists(driver.find_element_by_id, "experience-section")

    # List of experiences to fill
    list_exp = []

    # Sum of years of experience
    experience_years = 0

    if experience_section:
        _scroll_to_y(driver, experience_section.location['y'])
        time.sleep(3)

        def list_entries():
            # Experience entries currently rendered inside the section
            section_list = experience_section.find_element_by_css_selector("ul.pv-profile-section__section-info")
            return section_list.find_elements_by_css_selector("li.pv-position-entity")

        # Click buttons to toggle information of truncated experiences
        for entry in list_entries():
            _scroll_to_y(driver, entry.location['y'] + 50)
            time.sleep(3)
            buttons = get_element_if_exists(entry.find_elements_by_class_name, "pv-profile-section__text-truncate-toggle")
            if buttons:
                for button in buttons:
                    driver.execute_script("arguments[0].click();", button)

        # The entries are looked up again after the toggles were clicked
        for entry in list_entries():
            # Get job position elements
            positions = entry.find_elements_by_css_selector("li.pv-entity__position-group-role-item")
            _scroll_to_y(driver, entry.location['y'] + 50)
            time.sleep(3)

            exp_details = None

            # Multiple positions in the same company
            if positions:
                # Company name (whitespace collapsed to single spaces)
                company_spans = entry.find_element_by_css_selector('h3.t-16').find_elements_by_tag_name('span')
                company = " ".join(company_spans[-1].text.split())

                # Collect position names and their date ranges
                job_position = []
                dates = []
                for position in positions:
                    job_position.append(position.find_element_by_css_selector('h3.t-14').find_elements_by_tag_name('span')[-1].text.strip())
                    dates.append(position.find_element_by_css_selector('h4.pv-entity__date-range').find_elements_by_tag_name('span')[-1].text.strip())

                # Dates: end of the first listed range, start of the last one
                recent_date = dates[0]
                old_date = dates[-1]
                if "–" in recent_date and "–" in old_date:
                    date_end = recent_date[recent_date.index("–") + 1:].strip()
                    # Slice the oldest range with its OWN dash position
                    date_init = old_date[:old_date.index("–")].strip()
                else:
                    date_end = recent_date
                    date_init = old_date

                # Experience details
                details_candidates = entry.find_elements_by_css_selector('h4.t-14')
                if details_candidates:
                    exp_details = details_candidates[0]

            # One position in the same company
            else:
                # Position name
                job_position = entry.find_element_by_css_selector('h3.t-bold').text.strip()

                # Company name
                company = entry.find_element_by_css_selector('h4.t-normal').find_elements_by_tag_name('span')[-1].text.strip()

                # Dates
                date_init = None
                date_end = None
                date_range = get_element_if_exists(entry.find_element_by_css_selector, 'h4.pv-entity__date-range')
                if date_range:
                    range_parts = date_range.find_elements_by_tag_name('span')[-1].text.strip().split("–")
                    date_init = range_parts[0].strip()
                    if len(range_parts) > 1:
                        date_end = range_parts[1].strip()

                    # Experience details (skipped when no candidate element exists)
                    details_candidates = get_element_if_exists(entry.find_elements_by_css_selector, 'h4.t-black--light')
                    if details_candidates:
                        if len(details_candidates) > 2:
                            exp_details = details_candidates[1]
                        else:
                            exp_details = details_candidates[-1]

            # Accumulate years of experience
            if exp_details:
                _scroll_to_y(driver, exp_details.location['y'])
                time.sleep(3)
                duration = get_element_if_exists(exp_details.find_elements_by_tag_name, 'span')
                if duration:
                    experience_years += _duration_to_years(duration[-1].text)

            # Location
            location = get_element_if_exists(entry.find_element_by_css_selector, 'h4.pv-entity__location')
            if location:
                location = location.find_elements_by_tag_name('span')[-1].text

            # Collect them in a list
            list_exp.append({'job_position': job_position,
                             'company': company,
                             'date_init': date_init,
                             'date_end': date_end,
                             'location': location})

    return {'experience': list_exp, 'experience_years': round(experience_years, 2)}


# Returns the educational information of a candidate
def get_education_info(session):
    """Collect the education entries of the profile currently loaded.

    After reading the entries the page is also scrolled to the skill and
    recommendation sections when they are present.

    Args:
        session: Browser session whose ``driver`` shows the profile page.

    Returns:
        Dict with the single key 'education': a list of dicts with the keys
        'school', 'name', 'year_init' and 'year_end' (missing values are None).
    """
    driver = session.driver

    # Education section
    education_section = get_element_if_exists(driver.find_element_by_id, 'education-section')

    # List of educations to fill
    list_edu = []

    if education_section:
        driver.execute_script("window.scrollTo(0," + str(education_section.location['y']) + ");")
        time.sleep(3)

        # Education elements
        section_list = education_section.find_element_by_css_selector('ul.pv-profile-section__section-info')
        educations = section_list.find_elements_by_css_selector('div.pv-entity__summary-info')

        for education in educations:
            driver.execute_script("window.scrollTo(0," + str(education.location['y']+50) + ");")
            time.sleep(3)

            # School name
            school = education.find_element_by_css_selector('h3.pv-entity__school-name').text

            # Degree name, followed by the field of study when present
            degree_name = get_element_if_exists(education.find_element_by_css_selector, 'p.pv-entity__degree-name')
            if degree_name:
                degree_name = degree_name.find_elements_by_tag_name('span')[1].text
                discipline = get_element_if_exists(education.find_element_by_css_selector, 'p.pv-entity__fos')
                if discipline:
                    discipline = discipline.find_elements_by_tag_name('span')[-1].text.strip()
                    degree_name = degree_name + " " + discipline

            # Years
            year_init = None
            year_end = None
            dates = get_element_if_exists(education.find_element_by_css_selector, 'p.pv-entity__dates')
            if dates:
                times = dates.find_elements_by_tag_name('time')
                if len(times) > 1:
                    year_init = times[0].text
                    year_end = times[1].text
                elif times:
                    # A single <time> element is stored as the end year
                    year_end = times[0].text
                # No <time> element at all: both years stay None

            # Collect them in a list
            list_edu.append({'school': school,
                             'name': degree_name,
                             'year_init': year_init,
                             'year_end': year_end})

    # Scroll to skill section
    skill_section = get_element_if_exists(driver.find_element_by_css_selector, "section.pv-skill-categories-section")
    if skill_section:
        driver.execute_script("window.scrollTo(0," + str(skill_section.location['y']+50) + ");")
        time.sleep(2)

    # Scroll to recommendation section
    recommendation_section = get_element_if_exists(driver.find_element_by_css_selector, "section.pv-recommendations-section")
    if recommendation_section:
        driver.execute_script("window.scrollTo(0," + str(recommendation_section.location['y']+50) + ");")
        time.sleep(2)

    return {"education": list_edu}


# Returns the accomplishments information of a candidate
def get_accomplishments_info(session):
    """Summarise the accomplishments block of the profile currently loaded.

    Args:
        session: Browser session whose ``driver`` shows the profile page.

    Returns:
        An empty dict when the accomplishments section is not found;
        otherwise ``{'accomplishments': {...}}`` with the number of
        certificates, courses and publications and the list of languages.
    """
    driver = session.driver

    def scroll_to(element):
        driver.execute_script("window.scrollTo(0," + str(element.location['y']) + ");")

    def count_of(block_class):
        # Number displayed in the counter heading of one block (0 if the block is absent)
        block = get_element_if_exists(driver.find_element_by_class_name, block_class)
        if not block:
            return 0
        scroll_to(block)
        counter = block.find_element_by_css_selector('h3.pv-accomplishments-block__count')
        return int(counter.find_elements_by_tag_name('span')[-1].text)

    # Accomplishments section
    accomplishments_section = get_element_if_exists(driver.find_element_by_class_name, 'pv-accomplishments-section')
    if not accomplishments_section:
        return {}

    scroll_to(accomplishments_section)
    time.sleep(2)

    # Counters, read in the order certificates -> courses -> publications
    certificates = count_of('certifications')
    courses = count_of('courses')
    publications = count_of('publications')

    # Languages
    languages = []
    languages_section = get_element_if_exists(driver.find_element_by_class_name, 'languages')
    if languages_section:
        scroll_to(languages_section)
        items = languages_section.find_elements_by_css_selector('li.pv-accomplishments-block__summary-list-item')
        languages = [item.text for item in items]

    summary = {'certificates': certificates,
               'courses': courses,
               'publications': publications,
               'languages': languages}
    return {'accomplishments': summary}

### Estimating information not present in the candidate's profile 

In [None]:
# Returns the estimated gender of a candidate
def gender_estimation(name):
    """Estimate a gender from a first name by majority vote of three sources.

    The sources are the gender-guesser detector, ``GenderDetector`` and the
    'Genni' field of the abel.lis.illinois.edu web service.

    Args:
        name: First name of the candidate (may be None or empty).

    Returns:
        Dict ``{'gender': value}`` where value is 'female', 'male' or
        'unknown'.
    """
    # Nothing to estimate without a name
    if not name:
        return {'gender': 'unknown'}

    # Source 1: gender-guesser ("mostly_x" is reduced to "x")
    gender_gg = gg.Detector().get_gender(name)
    if "mostly" in gender_gg:
        gender_gg = gender_gg.split("_")[1]

    # Source 2: GenderDetector
    gender_gd = GenderDetector().gender(name)

    # Source 3: web service. Best effort: a network or parsing problem only
    # makes this source vote 'unknown'.
    gender_illinois = None
    try:
        response = requests.get(
            'http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py',
            # Let requests URL-encode the name (spaces, accents, '&', ...)
            params={'Fname': name, 'format': 'json'},
            timeout=30,
        )
        if response.status_code == 200:
            gender_illinois = json.loads(response.text.replace("'", "\""))['Genni'].lower()
    except (requests.RequestException, ValueError, KeyError):
        gender_illinois = None

    # Normalize values for the gender attribute
    long_form = {'f': 'female', 'm': 'male'}
    gender_gd = long_form.get(gender_gd, 'unknown')
    gender_illinois = long_form.get(gender_illinois, 'unknown')

    # Count the votes of the 3 different methods
    votes = [gender_gg, gender_gd, gender_illinois]
    gender_occurrencies = {label: votes.count(label) for label in ('female', 'male', 'unknown')}

    # Get the gender with the higher number of occurrences (by voting).
    # NOTE(review): on a tie the first label in the order female, male,
    # unknown wins -- confirm that this tie-break is intended.
    gender_max_occurrencies = max(gender_occurrencies.items(), key=operator.itemgetter(1))[0]

    return {'gender': gender_max_occurrencies}


# Returns the estimated age of a candidate
def estimate_age(list_edu):
    """Estimate the age of a candidate from the last listed education entry.

    The age is 18 plus the distance in years between the 'year_init' of the
    last element of ``list_edu`` and the current year.

    Args:
        list_edu: List of education dicts (each may carry 'year_init').

    Returns:
        Dict ``{'age': value}``; value is None when the list is empty or the
        year is missing or not a number.
    """
    age = None
    if list_edu:
        # First year of university (taken from the last entry of the list)
        first_year = list_edu[-1].get('year_init', None)

        if first_year:
            try:
                start_year = int(first_year)
            except (TypeError, ValueError):
                # Scraped text that is not a year: the age stays unknown
                start_year = None

            if start_year is not None:
                # First year of university until now
                age = abs(start_year - datetime.datetime.now().year) + 18

    return {'age': age}


# Returns the estimated experience level of a candidate
def estimate_experience_level(experience_years):
    """Map a number of years of experience to a category label.

    Returns:
        Dict ``{'experience_level': label}``; the label is None when the
        value falls in none of the ranges (e.g. a negative number).
    """
    # NOTE(review): the label used above 8 years is 'amateur-level', which
    # looks odd next to the other labels -- confirm before changing, it is
    # stored data.
    if experience_years > 8:
        label = 'amateur-level'
    elif experience_years > 5:
        label = 'senior-level'
    elif experience_years > 3:
        label = 'mid-level'
    elif experience_years >= 0:
        label = 'entry-level'
    else:
        label = None

    return {'experience_level': label}

# Returns the estimated ethnea of a candidate
def estimate_ethnicity(name, surnames, country):
    """Estimate the ethnicity of a candidate and whether they look foreign.

    The 'Ethnea' field returned by the abel.lis.illinois.edu web service is
    lowercased and compared with the value expected for ``country``.

    Args:
        name: First name of the candidate.
        surnames: Surnames of the candidate.
        country: 'Spain', 'France' or 'United Kingdom'; with any other value
            every classified candidate is reported as foreigner.

    Returns:
        Dict with 'is_foreigner' (bool, or None when no estimation could be
        made) and 'ethnicity' (lowercase string or None).
    """
    foreigner = None
    ethnicity = None

    # Without both parts of the name there is nothing to look up
    if name is None or surnames is None:
        return {'is_foreigner': foreigner, 'ethnicity': ethnicity}

    # Ethnicity (lowercased) considered local in each country
    local_ethnicity = {'Spain': 'hispanic', 'France': 'french', 'United Kingdom': 'english'}

    try:
        # Get ethnea based on name
        response = requests.get(
            'http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py',
            # Let requests URL-encode the names (spaces, accents, '&', ...)
            params={'Fname': name, 'Lname': surnames, 'format': 'json'},
            timeout=30,
        )
        if response.status_code == 200:
            ethnicity = json.loads(response.text.replace("'", "\""))['Ethnea'].lower()

            # Foreigner unless the ethnicity is the local one of the country.
            # The comparison uses lowercase values because the ethnicity was
            # lowercased above.
            foreigner = local_ethnicity.get(country) != ethnicity
    except (requests.RequestException, ValueError, KeyError, AttributeError):
        # Best effort: keep whatever could be determined so far
        pass

    return {'is_foreigner': foreigner, 'ethnicity': ethnicity}

#### Joining all information about a candidate

In [None]:
def collect_profile(url, session, rank_position, query, country):
    """Scrape one candidate profile and return all its information.

    The page is loaded after a random pause, its HTML is saved under
    'linkedin_html/', and the scraped sections are merged with the estimated
    attributes (age, gender, experience level, ethnicity).

    Args:
        url: Profile URL of the candidate.
        session: Browser session whose ``driver`` is used for navigation.
        rank_position: Position of the candidate in the result list.
        query: Profession used as search query.
        country: Country associated with the query.

    Returns:
        Dict with every collected and estimated field plus 'source', 'url',
        'ranking', 'query', 'country' and 'ingest_time'.
    """
    import os  # local import: only needed to create the snapshot folder

    time.sleep(randint(10,40))
    session.driver.get(url)
    session.driver.execute_script("document.body.style.zoom='80%'")
    time.sleep(5)

    # Keep a copy of the raw page; create the folder on first use and write
    # as UTF-8 so non-ASCII profile text cannot fail on the platform default
    html_content = session.driver.page_source
    os.makedirs("linkedin_html", exist_ok=True)
    snapshot_path = "linkedin_html/linkedin_"+country+"_"+query+"_"+str(rank_position)+".html"
    with open(snapshot_path, "w", encoding="utf-8") as f:
        f.write(html_content)

    # Basic information
    dict_basic_info = get_basic_info(session)

    # Experience section
    dict_exp_info = get_experience_info(session)

    # Education section
    dict_edu_info = get_education_info(session)

    # Accomplishments section
    dict_accomp_info = get_accomplishments_info(session)

    # Age estimation
    dict_age_est = estimate_age(dict_edu_info['education'])

    # Gender estimation
    dict_gender_estimation = gender_estimation(dict_basic_info['name'])

    # Experience category
    dict_level_est = estimate_experience_level(dict_exp_info['experience_years'])

    # Ethnea estimation
    dict_ethnicity_est = estimate_ethnicity(dict_basic_info['name'], dict_basic_info['surnames'], country)

    # Join all information (later dicts win on duplicated keys)
    profile = {}
    for part in (dict_basic_info, dict_exp_info, dict_edu_info, dict_accomp_info,
                 dict_age_est, dict_gender_estimation, dict_level_est, dict_ethnicity_est):
        profile.update(part)

    # ISO 8601 timestamp ('T' separator): 'ingest_time' is mapped as a date
    # and the space-separated str(datetime) form is not in the default
    # date format
    now = datetime.datetime.now().isoformat()

    profile.update({'source': 'linkedin', 'url': url, 'ranking': rank_position,
                    'query': query, 'country': country, 'ingest_time': now})

    # Full information of the candidate
    return profile


# Collects and stores all the information of a candidate into ES
def collect_and_store_info(url, session, rank_position, query, country, es):
    """Scrape a candidate (retrying on empty results) and index it in ES.

    The profile is collected up to three times, stopping as soon as both the
    education and the experience lists are non-empty. The last collected
    profile is stored even if it is still incomplete.
    """
    MAX_ATTEMPTS = 3

    for _ in range(MAX_ATTEMPTS):
        # Gather profile's info
        profile = collect_profile(url, session, rank_position, query, country)
        has_education = len(profile['education']) != 0
        has_experience = len(profile['experience']) != 0
        if has_education and has_experience:
            break

    # Store in ES
    insert_to_elasticsearch(es, profile)
    
    
# Gathers information about candidates of different countries
def scraping_linkedin(session, pair_candidate_query, country):
    """Scrape and store every candidate of every query of one country.

    Args:
        session: Browser session used for navigation.
        pair_candidate_query: Dict mapping each query to its list of
            candidate profile URLs.
        country: Country name stored with every profile.
    """
    # Init Elasticsearch Instance
    es = init_elasticsearch()

    # Each item pairs the query with the urls of its candidates
    for query, url_list in pair_candidate_query.items():
        print("Scraping " + str(len(url_list)) + " candidates using as query '" + query +"'...")

        if len(url_list) == 0:
            continue

        times = []
        # The ranking is the 0-based position of the url in the list
        for rank_position, url in enumerate(url_list):
            start = time.time()

            # Collect and store information about candidate
            collect_and_store_info(url, session, rank_position, query, country, es)

            times.append(time.time() - start)

        # Execution times
        print("\t- Avg. time:", round(float(sum(times)/len(times))/60.0,2),"min")
        print("\t- Total time:", round(float(sum(times))/60.0,2),"min")

### Main executor

In [None]:
# Open the browser session on linkedin.com; the login itself is done manually
# in the opened window before running the following cells
session = login_linkedin()

#### Collect the URLs of the candidates whose content will be scraped

In [None]:
# Each list of candidate urls is saved as JSON so the scraping cells can
# reload it later. The files are written inside `with` blocks so they are
# closed even if the write fails.

# Candidates per query Linkedin France
url_candidates_fr = get_candidates_url(session, 'fr')
with open("linkedin_urls_fr.txt", "w+") as f:
    f.write(json.dumps(url_candidates_fr))

# Candidates per query Linkedin Spain
url_candidates_es = get_candidates_url(session, 'es')
with open("linkedin_urls_es.txt", "w+") as f:
    f.write(json.dumps(url_candidates_es))

# Candidates per query Linkedin UK
url_candidates_uk = get_candidates_url(session, 'en')
with open("linkedin_urls_uk.txt", "w+") as f:
    f.write(json.dumps(url_candidates_uk))

#### Start scraping

In [None]:
# Reload the saved French candidate urls (the file is closed on leaving the
# `with` block) and scrape them
with open("linkedin_urls_fr.txt", "r") as f:
    url_candidates_fr = json.loads(f.read())
scraping_linkedin(session, url_candidates_fr, country='France')

In [None]:
# Reload the saved UK candidate urls (the file is closed on leaving the
# `with` block) and scrape them
with open("linkedin_urls_uk.txt", "r") as f:
    url_candidates_en = json.loads(f.read())
scraping_linkedin(session, url_candidates_en, country='United Kingdom')

In [None]:
# Reload the saved Spanish candidate urls (the file is closed on leaving the
# `with` block) and scrape them
with open("linkedin_urls_es.txt", "r") as f:
    url_candidates_es = json.loads(f.read())
scraping_linkedin(session, url_candidates_es, country='Spain')