In [None]:
import time
import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import csv
import logging
from dotenv import load_dotenv

In [None]:
def create_logfile():
    """Configure root logging to write to a fresh, timestamped file under ``log/``.

    The file is named after the current local date and time
    (``log/<dd-Mon-yy_HH:MM:SS>.log``) and is opened in write mode. Handlers
    already attached to the root logger are replaced (``force=True``), so
    calling this again redirects logging to a new file.

    Returns:
        The ``logging`` module itself, so callers can pass it around and call
        ``info`` / ``error`` / ``exception`` on it.
    """
    log_dir = "log"
    # basicConfig opens the log file right away and raises FileNotFoundError
    # when the directory is missing, so make sure it exists first.
    os.makedirs(log_dir, exist_ok=True)

    date_time = datetime.datetime.today().strftime('%d-%b-%y_%H:%M:%S')
    # NOTE(review): the ':' characters in the timestamp are not allowed in
    # Windows file names; the format is kept unchanged for compatibility.
    logfile = f"{log_dir}/{date_time}.log"
    logging.basicConfig(
        filename=logfile,
        filemode='w',
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        datefmt='%d-%b-%y %H:%M:%S',
        force=True,
    )
    logging.info(f'Log file {logfile} created')
    return logging

def create_file(file, logging):
    """Create (or recreate) the daily CSV file so it contains only the header row.

    Args:
        file: Path of the CSV file to create. An existing file at this path
            is deleted first; a missing parent directory is created.
        logging: Logger-like object used for progress messages (this script
            passes the ``logging`` module).
    """
    # Delete the existing file if re-running.
    logging.info("Checking if current daily csv exists...")
    if os.path.exists(file):
        os.remove(file)
        logging.info(f"{file} deleted")
    else:
        logging.info(f"{file} ain't exist")

    # open() does not create intermediate directories.
    parent_dir = os.path.dirname(file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Create the file and add the header.
    logging.info("Creating daily csv file...")
    header = ['fecha', 'pais', 'busqueda', 'numero_resultados', 'titulo_vacantes']
    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows); the encoding is explicit so the file does not
    # depend on the platform default.
    with open(file, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(header)
    logging.info(f"{file} created")

def login(logging):
    """Start Chrome, sign in to LinkedIn and return the webdriver.

    Credentials are read from the ``LINKEDIN_USERNAME`` and
    ``LINKEDIN_PASSWORD`` environment variables after ``load_dotenv()`` has
    loaded a local ``.env`` file.

    Args:
        logging: Logger-like object used for progress messages (this script
            passes the ``logging`` module).

    Returns:
        The Selenium Chrome webdriver after the sign-in form was submitted.

    Raises:
        RuntimeError: If either credential is missing or empty.
    """
    url_login = "https://www.linkedin.com/"

    # Pull login information from a file called '.env'.
    # Per the original author, that file is listed in .gitignore so login
    # details are not shared. Expected structure:
    #   LINKEDIN_USERNAME=email@gmail.com
    #   LINKEDIN_PASSWORD=password
    # (The credentials could be hard-coded here instead of using .env, but
    # that is not recommended for security reasons.)
    load_dotenv()

    username = os.getenv('LINKEDIN_USERNAME')
    password = os.getenv('LINKEDIN_PASSWORD')
    # Fail early with a clear message instead of launching a browser and
    # then failing while typing a missing value into the form.
    if not username or not password:
        raise RuntimeError(
            "LINKEDIN_USERNAME and LINKEDIN_PASSWORD must be set (e.g. in a .env file)"
        )

    # Chrome options; headless mode is currently disabled (uncomment to enable).
    chrome_options = Options()
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")

    # Log in to LinkedIn.
    logging.info(f"Logging in to LinkedIn as {username}...")
    # NOTE(review): `executable_path` is kept as in the original; newer
    # Selenium 4 releases expect the driver path via a Service object —
    # confirm against the installed Selenium version.
    wd = webdriver.Chrome(executable_path='./chromedriver', options=chrome_options)
    try:
        wd.get(url_login)
        # Use By locators throughout, matching the XPATH lookups below.
        wd.find_element(By.ID, "session_key").send_keys(username)
        wd.find_element(By.ID, "session_password").send_keys(password)
        wd.find_element(By.XPATH, "//button[@class='sign-in-form__submit-button']").click()

        # A "confirm account information" pop-up may or may not appear;
        # dismissing it is best-effort, so a failure here is not an error.
        try:
            wd.find_element(By.XPATH, "//button[@class='primary-action-new']").click()
        except Exception:
            logging.debug("No account confirmation pop-up to dismiss")
    except Exception:
        # Do not leave an orphaned browser behind when sign-in fails.
        wd.quit()
        raise
    logging.info("Log in complete. Scraping data...")

    return wd

def page_search(wd, geo_id, search_location, search_keyword, search_remote, search_posted, search_page, search_count, file, logging):
    """Load one LinkedIn job-search results page and append a summary row to the CSV.

    The row holds: timestamp, ``search_location``, the keyword (with ``%20``
    turned back into spaces), the total result count read from the page, and
    the list of job titles found on the page.

    Args:
        wd: Logged-in Selenium webdriver.
        geo_id: Value for the ``geoId`` URL parameter.
        search_location: Value for the ``location`` URL parameter; also
            written to the CSV.
        search_keyword: Value for the ``keywords`` URL parameter (the caller
            passes it with spaces replaced by ``%20``).
        search_remote: Value for the ``f_WRA`` URL parameter.
        search_posted: Value for the ``f_TPR`` URL parameter.
        search_page: Value for the ``start`` URL parameter (result offset).
        search_count: Ignored on input; it is overwritten with the count
            parsed from the page.
        file: Path of the CSV file to append to.
        logging: Logger-like object used for progress messages.

    Returns:
        Tuple ``(1, 1, url_search)``. The first two values are fixed
        placeholders (the original ``search_page, search_count`` return is
        disabled), so a caller looping while ``search_page < search_count``
        stops after a single page.

    Raises:
        Whatever Selenium raises when the count element is not found, and
        ``ValueError`` if the count text does not start with a number.
    """
    # Seconds to wait after loading the page.
    page_wait = 20

    # Navigate to the search page. (A variant of this URL without the
    # `f_WRA` parameter was previously used by the author.)
    url_search = f"https://www.linkedin.com/jobs/search/?f_TPR={search_posted}&f_WRA={search_remote}&geoId={geo_id}&keywords={search_keyword}&location={search_location}&start={search_page}"
    print(url_search)
    wd.get(url_search)
    time.sleep(page_wait)  # add sleep so don't get caught

    # NOTE(review): absolute XPath to the results-count element; the author
    # also noted "//header/div[1]/small[1]" as a candidate locator.
    count_text = wd.find_element(By.XPATH, "/html/body/div[5]/div[3]/div[3]/div[2]/div/section[1]/div/header/div[1]/small").text
    # Take the number before the first space and strip thousands separators
    # (e.g. "1,245 results" or "1.245 resultados").
    search_count = int(count_text.split(' ')[0].replace(',', '').replace(".", ""))

    # The `start` offset is advanced by the caller in steps of 25, i.e. 25
    # results per page.
    current_page = search_page // 25 + 1
    total_pages = -(-search_count // 25)  # ceiling division
    logging.info(f"Loading page {current_page} of {total_pages} for {search_keyword}'s {search_count} results...")

    job_elements = wd.find_elements(By.CLASS_NAME, "job-card-list__title")
    job_titles = [title.text for title in job_elements]
    print("Registro de cambio", job_titles)

    date_time = datetime.datetime.now().strftime("%d%b%Y-%H:%M:%S")
    search_keyword = search_keyword.replace("%20", " ")
    row = [date_time, search_location, search_keyword, search_count, job_titles]

    # newline='' lets the csv module control line endings; the explicit
    # encoding keeps non-ASCII values (e.g. "España") portable.
    with open(file, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)

    logging.info(f"Page {current_page} of {total_pages} loaded for {search_keyword}")

    # Previously: return search_page, search_count, url_search
    return 1, 1, url_search


In [None]:
# Create the logging file.
logging = create_logfile()
# NOTE(review): this list is not referenced by the loop below, which iterates
# over location_dict; "Remoto" has no geoId entry there.
search_location = ["Mexico", "Colombia", "Peru", "España", "Argentina", "Chile", "Ecuador", "Bolivia", "Guatemala", "Venezuela", "USA", "Remoto"] 
# Country name -> LinkedIn geoId used in the search URL.
location_dict = {"España": "105646813", "Mexico": "103323778", "Colombia": "100876405", "Peru": "102927786",
                "Argentina": "100446943", "Chile": "104621616", "Ecuador": "106373116", "Bolivia": "104379274",
                "Guatemala": "100877388", "Venezuela": "101490751", "USA": "103644278"}
search_remote = "true" # filter for remote positions
# search_posted = "r86400" # vacancies from the last 24 hours
search_posted = "r2592000" # vacancies from the last month

# Create the daily csv file.
date = datetime.date.today().strftime('%d-%b-%y')
file = f"output/{date}_adicional.csv"
create_file(file, logging)

# Log in to LinkedIn and assign the webdriver to a variable.
wd = login(logging)

# URL search terms focusing on what type of skills are required for Data Analyst & Data Scientist
search_keywords = ['tensorflow', "pytorch", "python", "scikit", "numpy", "azure", "ing machine learning", "machine learning", "pandas", "docker", "aws", "deep learning", 
                "inteligencia artificial", "redes neuronales", "vision por computadora", "cv2", "open cv", "hugging face", "MLOps", "Data Science",
                "GCP", "XGBoost", "NLP", "Chatbot", "Transformers", "GAN", "SQL", "Google cloud platform"]

# Counting exceptions
exception_first = 0
exception_second = 0
conteo = []
# Pre-set so the error handlers can log it even when the very first
# page_search call raises before ever returning a URL. On later failures it
# holds the URL returned by the most recent successful call.
url_search = None
try:
    for country in location_dict:
        for search_keyword in search_keywords:
            search_keyword = search_keyword.lower().replace(" ", "%20")

            # Loop through each page and write results to csv.
            search_page = 0 # start on page 1
            search_count = 1 # initial search count until it is read from the page
            while (search_page < search_count) and (search_page != 1000):
                # Search each page and return location after each completion.
                print("PRIMER INTENTO")
                try:
                    search_page, search_count, url_search = page_search(wd, location_dict[country], country, search_keyword, search_remote, search_posted, search_page, search_count, file, logging)
                    conteo.append(search_count)
                except Exception as e:
                    logging.error(f'(1) FIRST exception for {search_keyword} on {search_page} of {search_count}, retrying...')
                    logging.error(f'Current URL: {url_search}')
                    logging.error(e)
                    logging.exception('Traceback ->')
                    exception_first += 1
                    time.sleep(5)
                    try:
                        search_page, search_count, url_search = page_search(wd, location_dict[country], country, search_keyword, search_remote, search_posted, search_page, search_count, file, logging)
                        logging.warning(f'Solved Exception for {search_keyword} on {search_page} of {search_count}')
                    except Exception as e:
                        logging.error(f'(2) SECOND exception remains for {search_keyword}. Skipping to next page...')
                        logging.error(f'Current URL: {url_search}')
                        logging.error(e)
                        logging.exception('Traceback ->')
                        search_page += 25 # skip to next page to avoid entry
                        exception_second += 1
                        logging.error(f'Skipping to next page for {search_keyword}, on {search_page} of {search_count}...')
finally:
    # Close the browser even if the run is interrupted or fails unexpectedly.
    wd.quit()

logging.info(f'LinkedIn data scraping complete with {exception_first} first and {exception_second} second exceptions')
logging.info('Regard all further alarms...')