This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/.

#### Importing libraries

In [None]:
import getpass
import json
import requests
import operator
from bs4 import BeautifulSoup
from requestium import Session,Keys
import gender_guesser.detector as gg
from gender import GenderDetector
import datetime
from elasticsearch import Elasticsearch
import time
import re
from random import randint
import unidecode
from nameparser import HumanName
from selenium.common.exceptions import NoSuchElementException

#### Specialities by country

In [None]:
# Speciality slugs per country. get_doctors_url substitutes each slug into the
# country's listing-URL template (the `$medical_speciality` placeholder of
# top_doctors_url), so the values must match the site's URL path segments.
medical_specialities = {'Spain':['acupuntura', 'alergologia', 'alergologia-infantil', 'analisis-clinicos-especialidad', 'anatomia-patologica', 'andrologia', 'angiologia-y-cirugia-vascular', 'antiaging', 'aparato-digestivo', 'cardiologia-adultos', 'cardiologia-infantil', 'cirugia-cardiaca', 'cirugia-cardiaca-infantil', 'cirugia-general', 'cirugia-oral-y-maxilofacial', 'cirugia-pediatrica', 'cirugia-plastica-estetica-y-reparadora', 'cirugia-toracica', 'coloproctologia', 'dermatologia', 'dermatologia-infantil', 'endocrinologia', 'endocrinologia-infantil-especialidad', 'fisioterapia', 'flebologia-especialidad', 'geriatria', 'ginecologia-y-obstetricia', 'hematologia', 'inmunologia', 'logopedia', 'mastologia', 'medicina-a-domicilio', 'medicina-del-deporte-especialidad', 'medicina-estetica', 'medicina-familiar', 'medicina-fisica-y-rehabilitacion', 'neurorehabilitacion-especialidad', 'medicina-forense', 'medicina-intensiva', 'medicina-interna', 'medicina-nuclear-especialidad', 'medicina-regenerativa-especialidad', 'nefrologia-especialidad', 'nefrologia-infantil', 'neumologia', 'neumologia-infantil', 'neurocirugia', 'neurofisiologia-clinica-especialidad', 'neurologia', 'neurologia-infantil-especialidad', 'neurorradiologia-intervencionista', 'nutricion-y-dietetica', 'odontologia-y-estomatologia', 'oftalmologia', 'oftalmologia-infantil-especialidad', 'oncologia-medica-especialidad', 'oncologia-radioterapica', 'osteopatia', 'otorrinolaringologia', 'ozonoterapia-especialidad', 'pediatria', 'podologia-especialidad', 'psicologia', 'psiquiatria', 'psiquiatria-infantil-y-adolescente-especialidad', 'radiologia', 'reproduccion-asistida', 'reumatologia', 'sexologia', 'traumatologia', 'traumatologia-infantil', 'unidad-del-dolor', 'urologia', 'urologia-infantil'],
                       'Mexico':['acupuntura', 'alergologia', 'andrologia', 'angiologia-y-cirugia-vascular', 'audiologia', 'cardiologia-especialidad', 'cardiologia-pediatrica', 'cirugia-bariatrica-especialidad', 'cirugia-cardiaca', 'cirugia-cardiaca-pediatrica', 'cirugia-de-columna-especialidad', 'cirugia-general-y-bariatrica', 'cirugia-oncologica-especialidad', 'cirugia-oral-y-maxilofacial', 'cirugia-pediatrica', 'cirugia-plastica-estetica-y-reconstructiva', 'cirugia-toracica', 'coloproctologia', 'dermatologia', 'endocrinologia', 'endoscopia-digestiva', 'fisiatria-y-rehabilitacion', 'fisioterapia', 'foniatria', 'gastroenterologia-especialidad', 'genetica', 'geriatria', 'ginecologia-y-obstetricia', 'hematologia', 'inmunologia', 'mastologia', 'medicina-del-deporte-especialidad', 'medicina-del-dolor', 'medicina-familiar-y-comunitaria', 'medicina-interna', 'medicina-materno-fetal', 'medicina-nuclear-especialidad', 'medicina-y-terapia-intensiva', 'microbiologia-e-infectologia', 'nefrologia-especialidad', 'neumologia', 'neurocirugia', 'neurofisiologia-clinica-especialidad', 'neurologia', 'neurorehabilitacion', 'neurorradiologia-intervencionista', 'nutricion-y-dietetica-especialidad', 'odontologia-y-dentistas', 'oftalmologia', 'oncologia-medica-especialidad', 'oncologia-radioterapica', 'ortopedia-y-traumatologia', 'osteopatia', 'otorrinolaringologia', 'pediatria', 'psicologia', 'psiquiatria', 'psiquiatria-infantil-y-adolescente-especialidad', 'quiropractica', 'radiologia', 'reproduccion-asistida', 'reumatologia', 'traumatologia-pediatrica', 'urologia'],
                       'Colombia': ['acupuntura', 'alergologia', 'andrologia', 'audiologia', 'cardiologia-adultos', 'cardiologia-infantil', 'cirugia-bariatrica-especialidad', 'cirugia-cardiaca', 'cirugia-cardiaca-infantil', 'cirugia-general', 'cirugia-oncologica-especialidad', 'cirugia-oral-y-maxilofacial', 'cirugia-pediatrica', 'cirugia-plastica-facial', 'cirugia-plastica-estetica-y-reparadora', 'cirugia-toracica', 'angiologia-y-cirugia-vascular', 'coloproctologia', 'dermatologia', 'endocrinologia', 'endoscopia-digestiva', 'medicina-fisica-y-rehabilitacion', 'fisioterapia', 'foniatria', 'gastroenterologia-especialidad', 'genetica', 'geriatria', 'ginecologia-y-obstetricia', 'hematologia', 'hepatologia', 'homeopatia-especialidad', 'inmunologia', 'mastologia', 'medicina-alternativa', 'unidad-del-dolor', 'medicina-del-deporte-especialidad', 'medicina-estetica', 'medicina-familiar-y-comunitaria', 'medicina-interna', 'medicina-nuclear-especialidad', 'medicina-intensiva', 'infectologia-especialidad', 'nefrologia-especialidad', 'neumologia', 'neurocirugia', 'neurocirugia-pediatrica-especialidad', 'neurofisiologia-clinica-especialidad', 'neurologia', 'neurorehabilitacion', 'neurorradiologia-intervencionista', 'nutricion-y-dietetica', 'odontologia-y-estomatologia', 'oftalmologia', 'oncologia-medica-especialidad', 'oncologia-radioterapica', 'traumatologia-y-ortopedia', 'traumatologia-infantil', 'osteopatia', 'otorrinolaringologia', 'pediatria', 'psicologia', 'psiquiatria', 'psiquiatria-infantil-y-adolescente-especialidad', 'radiologia', 'reproduccion-asistida', 'reumatologia', 'urologia']}
# Listing-page URL template per country. Each template points at one city
# (Barcelona, Ciudad de Mexico, Bogota) and carries the `$medical_speciality`
# placeholder that get_doctors_url replaces with a speciality slug.
top_doctors_url = {
                    'Spain': 'https://www.topdoctors.es/barcelona/$medical_speciality/',
                    'Mexico': 'https://www.topdoctors.mx/ciudad-de-mexico/$medical_speciality/',
                    'Colombia': 'https://www.topdoctors.com.co/bogota/$medical_speciality/'
                  }

#### Get profile URLs

In [None]:
def get_doctors_url(medical_specialities, top_doctors_url):
    """Collect the profile links shown on the first two listing pages.

    For every country in ``top_doctors_url`` and every speciality slug listed
    for that country in ``medical_specialities``, the listing page and its
    ``page:2/`` follow-up are opened with the module-level ``session`` and the
    ``href`` of each ``h2.item_name`` heading link is recorded.

    Args:
        medical_specialities: mapping country -> list of speciality slugs.
        top_doctors_url: mapping country -> URL template containing the
            ``$medical_speciality`` placeholder.

    Returns:
        dict of the form ``{country: {speciality: [profile_url, ...]}}``.
    """
    def _collect_links(links):
        # Gather the headings first, then read the anchor href of each one.
        headings = session.driver.find_elements_by_css_selector('h2.item_name')
        for heading in headings:
            links.append(heading.find_element_by_tag_name('a').get_attribute('href'))

    profiles = {}
    for country, url_template in top_doctors_url.items():
        country_links = {}
        profiles[country] = country_links
        for speciality in medical_specialities[country]:
            links = []
            country_links[speciality] = links

            first_page = url_template.replace("$medical_speciality", speciality)
            session.driver.get(first_page)
            session.driver.execute_script("document.body.style.zoom='80%'")
            # Only the first page is given time to render before it is read.
            time.sleep(5)
            _collect_links(links)

            session.driver.get(first_page + 'page:2/')
            _collect_links(links)
    return profiles

def save_doctors_url(medical_specialities, top_doctors_url):
    """Crawl the profile URLs and store them as JSON on disk.

    The result of ``get_doctors_url`` is written to
    ``topdoctors_url_candidates.txt`` in the current working directory,
    replacing any previous content.

    Args:
        medical_specialities: mapping country -> list of speciality slugs.
        top_doctors_url: mapping country -> listing URL template.
    """
    profiles = get_doctors_url(medical_specialities, top_doctors_url)
    # Serialise before opening the file so that a serialisation failure does
    # not leave an already-truncated file behind.
    payload = json.dumps(profiles)
    # The context manager closes the handle even if the write fails.
    with open("topdoctors_url_candidates.txt", "w+") as f:
        f.write(payload)
    
# Collect the profile URLs for every country/speciality and store them in
# topdoctors_url_candidates.txt.
# NOTE(review): get_doctors_url reads the module-level `session`, which is
# created in the "Main executor" cell further down - run that cell first.
save_doctors_url(medical_specialities, top_doctors_url)

#### Main executor

In [None]:
# Create the requestium browser session: Chrome driven by the chromedriver
# binary at /usr/bin/chromedriver, incognito mode, default timeout of 60.
# The crawling functions of this notebook read this module-level `session`.
session = Session(webdriver_path='/usr/bin/chromedriver', 
                  browser='chrome', default_timeout=60, webdriver_options={'arguments':['--incognito']})# 'headless' is not passed, so a browser window is shown
# Create the Elasticsearch client and register the index template.
# NOTE(review): init_elasticsearch is defined in a later cell; that cell has to
# be executed before this one.
es = init_elasticsearch()

#### Read the collected URLs

In [None]:
# Load the {country: {speciality: [profile_url, ...]}} mapping written by
# save_doctors_url. The context manager closes the file handle, which the
# previous bare open() never did.
with open("topdoctors_url_candidates.txt", "r") as f:
    profiles_doctors = json.loads(f.read())

# Per-country views used by the crawl cells at the end of the notebook.
profiles_doctors_es = profiles_doctors['Spain']
profiles_doctors_mx = profiles_doctors['Mexico']
profiles_doctors_co = profiles_doctors['Colombia']

#### Functions to crawl information from a top doctor's profile

In [None]:
# Returns a HTML element if is present
def get_element_if_exists(method, css_selector):
    """Look an element up, returning ``None`` instead of raising when absent.

    The lookup is attempted up to five times and the first successful result
    is returned immediately. Previously the loop kept calling ``method`` after
    a success, so the lookup always ran five times and a failure on the last
    attempt discarded an element that had already been found.

    Args:
        method: a bound Selenium finder (e.g. ``driver.find_element_by_css_selector``)
            called with ``css_selector`` as its only argument.
        css_selector: the locator passed to ``method``; despite the name the
            callers also pass tag names and XPath expressions.

    Returns:
        Whatever ``method`` returns, or ``None`` if every attempt raised
        ``NoSuchElementException``.
    """
    for _ in range(5):
        try:
            return method(css_selector)
        except NoSuchElementException:
            # Not present (yet): try again until the attempts run out.
            continue
    return None

In [None]:
# Returns the estimated sex of a candidate
def sex_estimation(name):
    """Estimate a person's sex from a first name by majority vote.

    Three estimators are consulted: ``gender_guesser``, the ``gender``
    package's ``GenderDetector`` and the ``Genni`` field of the Ethnea web
    service at abel.lis.illinois.edu. The web lookup is best effort: network
    errors, a non-200 status or an unparsable answer count as an ``unknown``
    vote. Unlike the previous ``return`` inside ``finally``, unrelated
    exceptions (e.g. ``KeyboardInterrupt``) are no longer swallowed.

    Args:
        name: first name; hyphens are treated as spaces, accents are removed
            and the result is capitalised before the lookups.

    Returns:
        ``{'sex': label}`` where label is ``'female'``, ``'male'`` or
        ``'unknown'``. Ties are resolved in that order, so one female vote
        against one male vote yields ``'female'``.
    """
    name = unidecode.unidecode(name.lower().replace("-", " ")).capitalize()

    # Estimator 1: gender_guesser. "mostly_female" / "mostly_male" are folded
    # into the plain label; any other answer matches none of the counted labels.
    sex_gg = gg.Detector().get_gender(name)
    if "mostly" in sex_gg:
        sex_gg = sex_gg.split("_")[1]

    # Estimator 2: gender.GenderDetector ('f' / 'm' are the codes handled below).
    sex_gd = GenderDetector().gender(name)

    # Estimator 3: Ethnea web service. The query is passed via `params` so the
    # name is URL-encoded, and a timeout keeps a stalled server from blocking
    # the crawl forever.
    sex_illinois = None
    try:
        response = requests.get(
            'http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py',
            params={'Fname': name, 'format': 'json'},
            timeout=60,
        )
        if response.status_code == 200:
            # The service answers with single-quoted pseudo-JSON.
            sex_illinois = json.loads(response.text.replace("'", "\""))['Genni'].lower()
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        sex_illinois = None

    def _normalise(code):
        # Map the one-letter codes onto the labels used for voting.
        if code == 'f':
            return 'female'
        if code == 'm':
            return 'male'
        return 'unknown'

    votes = (sex_gg, _normalise(sex_gd), _normalise(sex_illinois))

    # Insertion order female -> male -> unknown is what breaks ties in max().
    tally = {label: votes.count(label) for label in ('female', 'male', 'unknown')}
    winner = max(tally.items(), key=operator.itemgetter(1))[0]

    return {'sex': winner}


In [None]:
def get_header_info(session):
    """Extract the headline data of the profile currently loaded in the browser.

    Reads ``section#profile_header_title`` and derives name, sex, registration
    number, star score, number of reviews and the photo/award flags. When the
    header section is missing, the defaults below are returned.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        dict with the keys ``name``, ``surnames``, ``sex``,
        ``registration_number``, ``num_stars``, ``num_reviews``, ``has_photo``
        and ``was_awarded``. ``registration_number`` is a string or ``None``
        (the "Profesional:" branch used to return a list of tokens).
    """
    name = None
    surnames = None
    sex = None
    registration_number = None
    total_num_stars = 0
    num_reviews = 0
    has_photo = False
    awarded = False

    header = get_element_if_exists(session.driver.find_element_by_css_selector, 'section#profile_header_title')
    if header:
        full_name = header.find_element_by_tag_name('span').text

        # The first token is dropped as the title ("Dr.", "Dra.", ...).
        name_tokens = ' '.join(full_name.split(" ")[1:]).split()
        if len(name_tokens) > 2:
            name = name_tokens[0]
            surnames = " ".join(name_tokens[1:])
        else:
            # With one or two tokens everything is stored as surnames.
            surnames = " ".join(name_tokens)

        # Sex: trust the title when present, otherwise estimate from the name.
        if 'Dra.' in full_name:
            sex = 'female'
        elif 'Dr.' in full_name:
            sex = 'male'
        elif name is not None:
            sex = sex_estimation(name)['sex']
        else:
            sex = 'unknown'

        if sex is None:
            sex = 'unknown'

        # Registration info: everything after "Profesional:" when that label is
        # present, otherwise the last token of the paragraph.
        registration_info = get_element_if_exists(header.find_element_by_tag_name, 'p')
        if registration_info:
            tokens = registration_info.text.split(" ")
            if "Profesional:" in tokens:
                trailing = tokens[tokens.index("Profesional:") + 1:]
                # Joined so both branches yield a string; empty -> None.
                registration_number = " ".join(trailing) or None
            else:
                registration_number = tokens[-1]

        # Score: full stars count 1, both half-star icon variants count 0.5.
        score = get_element_if_exists(header.find_element_by_css_selector, 'span.star_icons')
        total_num_stars = 0.0
        if score:
            num_star_icons = len(score.find_elements_by_css_selector('i.glyphicon-star'))
            num_half_star_icons = len(score.find_elements_by_css_selector('i.glyphicon-star-half')) * 0.5
            num_half_o_star_icons = len(score.find_elements_by_css_selector('i.glyphicon-star-half-o')) * 0.5
            total_num_stars = num_star_icons + num_half_star_icons + num_half_o_star_icons

        # Number of reviews: first purely numeric token of the <strong> text;
        # stays 0 when there is none instead of raising IndexError.
        score_description = get_element_if_exists(header.find_element_by_tag_name, 'strong')
        if score_description:
            num_reviews = next(
                (int(token) for token in score_description.text.split() if token.isdigit()),
                0,
            )

        # NOTE(review): an XPath starting with "//" is evaluated against the
        # whole document even when called on `header`; kept as is because the
        # images may live outside the header section - confirm before
        # narrowing it to ".//img".
        # Has default image?
        image = get_element_if_exists(header.find_element_by_xpath, "//img[contains(@src, '/img/placeholders/defaultm.jpg')]")
        has_photo = not image

        # Was awarded?
        award = get_element_if_exists(header.find_element_by_xpath, "//img[contains(@src, '/img/ico_td_awards_ganador.png')]")
        awarded = bool(award)

    return {'name':name,'surnames':surnames,'sex':sex,'registration_number':registration_number,'num_stars':total_num_stars,'num_reviews':num_reviews,'has_photo':has_photo,'was_awarded':awarded}

In [None]:
# Year-like numbers: "1976-1978", "1976-78" or a single "1976".
_YEAR_RE = re.compile('.*([1-3][0-9]{3}-[0-9]{4})|([1-3][0-9]{3}-[0-9]{2})|([1-3][0-9]{3})')

# An academic entry is only kept when it mentions one of these words.
_EDUCATION_KEYWORDS = ('licenciado', 'licenciatura', 'titulo', 'grado', 'graduado', 'doctorado', 'doctor', 'especialista', 'formado', 'diplomado')


def _parse_year_range(text):
    """Return ``(year_init, year_end)`` found in ``text``, or ``None`` if no year is present."""
    matches = _YEAR_RE.findall(text)
    if not matches:
        return None

    # Each match is a tuple of groups with a single non-empty member.
    if len(matches) == 2:
        # Two separate years ("1976 1978"); of a range only its first year is used.
        first, second = ("".join(groups).split("-")[0] for groups in matches)
        return int(first), int(second)

    span = "".join(matches[0])
    if "-" not in span:
        # A single year is taken as the end year.
        return None, int(span)

    start, _, end = span.partition("-")
    if len(end) == 2:
        # "1976-78": complete the short year with the century of the first one,
        # moving to the next century on roll-over ("1999-02" -> 2002). The
        # original did `year_init[0:2]+1` (str + int), which raised TypeError.
        century = int(start[:2])
        if start[2:] > end:
            century += 1
        return int(start), int(str(century) + end)
    return int(start), int(end)


def _read_section_items(session, div, find_button, button_selector, extra_chars=""):
    """Scroll to ``div``, expand its hidden part and return its cleaned bullet items."""
    session.driver.execute_script("window.scrollTo(0," + str(div.location['y']) + ");")
    time.sleep(1)
    button = get_element_if_exists(find_button, button_selector)
    if button:
        # Clicked through JavaScript to reveal the collapsed entries.
        session.driver.execute_script("arguments[0].click();", button)
        time.sleep(1)

    paragraphs = div.find_elements_by_tag_name('p')
    if len(paragraphs) < 2:
        # The bullet list lives in the second paragraph; nothing to read without it.
        return []

    items = []
    for raw in paragraphs[1].text.split("•"):
        item = raw.strip().replace("\n", "").replace(".", "")
        for char in extra_chars:
            item = item.replace(char, "")
        if len(item) > 0:
            items.append(item)
    return items


def get_content_info(session):
    """Extract subspecialities, education and experience from the loaded profile.

    Walks the ``div.item`` children of ``section.item_summary_list`` and
    dispatches on the Spanish heading contained in each one.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        dict with ``subspecialities`` (list of str), ``education`` and
        ``experience`` (lists of dicts with ``name`` and, when a year was
        found, ``year_init``/``year_end``) and ``experience_years`` (int or
        ``None``).
    """
    subspecialities = []
    education = []
    experience = []
    experience_years = None

    content = get_element_if_exists(session.driver.find_element_by_css_selector, 'section.item_summary_list')
    if content:
        for div in content.find_elements_by_css_selector('div.item'):
            section_text = div.text

            if "Subespecialización" in section_text:
                subspecialities.extend(_read_section_items(
                    session, div, div.find_element_by_xpath,
                    "//a[contains(@href,'#cv_list_doctor_hidden')]"))

            elif "Logros académicos destacados" in section_text:
                items = _read_section_items(
                    session, div, div.find_element_by_css_selector,
                    "a.link_sm.link_toggle", extra_chars="()")
                for item in items:
                    lowered = item.lower()
                    if not any(word in lowered for word in _EDUCATION_KEYWORDS):
                        continue
                    years = _parse_year_range(item)
                    if years is not None:
                        education.append({'year_init': years[0], 'year_end': years[1], 'name': item})
                    elif len(item.split()) > 5:
                        # NOTE(review): long undated academic entries are filed
                        # under experience, as in the original - confirm intent.
                        experience.append({'name': item.replace('"', '')})
                    else:
                        education.append({'name': item})

            elif "Experiencia profesional" in section_text:
                items = _read_section_items(
                    session, div, div.find_element_by_css_selector,
                    "a.link_sm.link_toggle", extra_chars="()")
                for item in items:
                    years = _parse_year_range(item)
                    lowered = item.lower()
                    if years is not None:
                        experience.append({'year_init': years[0], 'year_end': years[1], 'name': item})
                    elif "años" in lowered and "experiencia" in lowered and len(item.split()) == 6:
                        # Six-word phrase mentioning years of experience: keep
                        # the number if there is one (no IndexError otherwise).
                        digits = [int(token) for token in lowered.split() if token.isdigit()]
                        if digits:
                            experience_years = digits[0]
                    elif len(item.split()) > 5:
                        experience.append({'name': item.replace('"', '')})

    return {'subspecialities':subspecialities,'education':education,'experience':experience, 'experience_years': experience_years}

In [None]:
def get_accomplishments_info(session):
    """Read the languages listed on the loaded profile.

    Every ``section.item`` whose text contains "Idiomas" contributes the
    stripped text of each of its ``<li>`` elements.

    Args:
        session: requestium session whose ``driver`` shows a profile page.

    Returns:
        ``{'languages': [...]}`` in page order.
    """
    languages = [
        entry.text.strip()
        for section in session.driver.find_elements_by_css_selector('section.item')
        if "Idiomas" in section.text
        for entry in section.find_elements_by_tag_name('li')
    ]
    return {'languages': languages}

In [None]:
def estimate_age(education):
    """Estimate an age from the last entry of the education list.

    If that entry has a start year, the age is the years elapsed since then
    plus 18. Otherwise, if it has an end year, a further 5.25 is added on top.
    Without either year (or with an empty list) the age is ``None``.

    Args:
        education: list of dicts as produced by ``get_content_info``.

    Returns:
        ``{'age': value}`` where value is an int, a float or ``None``.
    """
    if len(education) == 0:
        return {'age': None}

    entry = education[-1]
    started = entry.get('year_init')
    finished = entry.get('year_end')

    if started:
        return {'age': (datetime.datetime.now().year - int(started)) + 18}
    if finished:
        return {'age': (datetime.datetime.now().year - int(finished)) + 18 + 5.25}
    return {'age': None}

In [None]:
# Returns the estimated ethnea of a candidate
def estimate_ethnea(name, surnames, country):
    """Estimate ethnicity and foreigner status from a person's full name.

    Queries the Ethnea web service at abel.lis.illinois.edu. The lookup is
    best effort: a missing name or surname, a network error, a non-200 status
    or an unparsable answer all yield ``None`` for both fields. Unlike the
    previous ``return`` inside ``finally``, unrelated exceptions (e.g.
    ``KeyboardInterrupt``) are no longer swallowed.

    Args:
        name: first name, or ``None`` when it could not be determined.
        surnames: surnames, or ``None``.
        country: country of the listing ("Spain", "Mexico", "Colombia", ...).

    Returns:
        ``{'is_foreigner': bool or None, 'ethnicity': str or None}``. For the
        three countries above a person is not a foreigner when the ethnicity
        contains "hispanic"; in every other answered case ``is_foreigner`` is
        ``True``.
    """
    foreigner = None
    ethnicity = None

    # Without both parts there is nothing to query (this previously ended in a
    # TypeError that the finally block silenced).
    if name is None or surnames is None:
        return {'is_foreigner': foreigner, 'ethnicity': ethnicity}

    try:
        # `params` URL-encodes the names; the timeout keeps a stalled server
        # from blocking the crawl forever.
        response = requests.get(
            'http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py',
            params={'Fname': name, 'Lname': surnames, 'format': 'json'},
            timeout=60,
        )
        if response.status_code == 200:
            # The service answers with single-quoted pseudo-JSON.
            ethnicity = json.loads(response.text.replace("'", "\""))['Ethnea'].lower()
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        ethnicity = None

    if ethnicity is not None:
        # Spanish, Mexican or Colombian person
        is_local = country in ("Spain", "Mexico", "Colombia") and 'hispanic' in ethnicity
        foreigner = not is_local

    return {'is_foreigner': foreigner, 'ethnicity': ethnicity}

In [None]:
# Returns Elasticsearch instance
def init_elasticsearch():
    """Connect to Elasticsearch on localhost:9200 and register the index template.

    The template ``template_medical_specialists`` applies to the
    ``medical_specialists`` index and declares the mapping of the ``doc`` type
    used by ``insert_to_elasticsearch``.

    Returns:
        The connected ``Elasticsearch`` client.
    """
    def _timeline():
        # Nested entries carrying a label plus an optional span of years.
        return {
            "type": "nested",
            "properties": {
                "name": {"type": "keyword"},
                "year_init": {"type": "integer"},
                "year_end": {"type": "integer"},
            },
        }

    properties = {
        "ingest_time": {"type": "date"},
        "source": {"type": "keyword"},
        "ranking": {"type": "integer"},
        "speciality": {"type": "keyword"},
        "country": {"type": "keyword"},
        "has_photo": {"type": "boolean"},
        "was_awarded": {"type": "boolean"},
        "experience": _timeline(),
        "experience_years": {"type": "integer"},
        "experience_level": {"type": "keyword"},
        "ethnicity": {"type": "keyword"},
        "is_foreigner": {"type": "boolean"},
        "education": _timeline(),
        "num_stars": {"type": "integer"},
        "num_reviews": {"type": "integer"},
        "registration_number": {"type": "keyword"},
        "languages": {"type": "keyword"},
        "age": {"type": "integer"},
        "name": {"type": "keyword"},
        "surnames": {"type": "text"},
        "sex": {"type": "keyword"},
        "url": {"type": "keyword"},
    }
    template = {
        "index_patterns": ["medical_specialists"],
        "settings": {"number_of_shards": 1},
        "mappings": {
            "doc": {
                "_source": {"enabled": True},
                "properties": properties,
            },
        },
    }

    client = Elasticsearch('localhost', port=9200)
    client.indices.put_template(name='template_medical_specialists', body=template)
    return client

# Indexes a document into a Elasticsearch index
def insert_to_elasticsearch(es, body):
    """Index ``body`` into ``medical_specialists``, overwriting by URL.

    A term query on ``url`` looks for a document with the same profile URL.
    If one exists it is re-indexed under the same id; otherwise
    ``not exists`` is printed and a new document is created.

    Args:
        es: Elasticsearch client.
        body: document to store; must contain the key ``url``.
    """
    url_query = {"query": {"term": {"url": {"value": body['url']}}}}
    found = es.search(index='medical_specialists', body=url_query, size=1)['hits']['hits']

    if len(found) == 0:
        print('not exists')
        es.index(index='medical_specialists', doc_type='doc', body=body)
        return

    existing_id = found[0]['_id']
    es.index(index='medical_specialists', doc_type='doc', body=body, id=existing_id)

#### Start crawling

In [None]:
def scraping_topdoctors(country, profiles_doctors):
    """Crawl every profile of a country and index it into Elasticsearch.

    For each speciality the profile URLs are visited in list order (the list
    position is stored as ``ranking``). The raw HTML of every page is saved
    under ``topdoctors_html/`` - the directory must already exist - and the
    extracted profile is upserted through ``insert_to_elasticsearch``. After
    each speciality the elapsed minutes and the number of URLs are printed.

    Uses the module-level ``session`` and ``es`` objects.

    Args:
        country: country label stored with each profile ("Spain", ...).
        profiles_doctors: mapping speciality slug -> list of profile URLs.
    """
    for speciality, urls in profiles_doctors.items():
        speciality = speciality.replace("-", " ")
        start = time.time()
        for ranking, url in enumerate(urls):
            session.driver.get(url)
            session.driver.execute_script("document.body.style.zoom='50%'")
            session.driver.execute_script("window.scrollTo(0,0);")
            # Random pause of 0-5 seconds between profiles.
            time.sleep(randint(0, 5))

            # Keep a copy of the page as crawled. The context manager closes
            # the file even if the write fails; UTF-8 avoids encoding errors
            # on accented text where the platform default is narrower.
            html_content = session.driver.page_source
            html_path = "topdoctors_html/topdoctors_" + country + "_" + speciality + "_" + str(ranking) + ".html"
            with open(html_path, "w+", encoding="utf-8") as f:
                f.write(html_content)

            profile = {}
            profile.update({'country': country, 'speciality': speciality, 'url': url, 'ranking': ranking})
            basic_info = get_header_info(session)
            profile.update(basic_info)
            profile.update(get_content_info(session))
            profile.update(get_accomplishments_info(session))
            profile.update(estimate_ethnea(basic_info['name'], basic_info['surnames'], country))
            profile.update(estimate_age(profile['education']))
            now = datetime.datetime.now()
            profile.update({'ingest_time': now, 'source': 'top doctors'})

            insert_to_elasticsearch(es, profile)

        end = time.time()
        print(speciality, round(float(end - start) / 60.0, 2), 'min')
        print(len(urls))


In [None]:
# Crawl and index every profile URL collected for Spain.
scraping_topdoctors('Spain', profiles_doctors_es)

In [None]:
# Crawl and index every profile URL collected for Mexico.
scraping_topdoctors('Mexico', profiles_doctors_mx)

In [None]:
# Crawl and index every profile URL collected for Colombia.
scraping_topdoctors('Colombia', profiles_doctors_co)