In [26]:
from bs4 import BeautifulSoup
import pandas as pd
import polars as pl
import json
import requests
import time
import random
import google.generativeai as genai
from typing import Optional
from tqdm import tqdm

In [3]:
# Configuration
date_posted = 604800  # 86400 -> 1 day, 2592000 -> 1 month, 604800 -> 1 week
job_name = 'data engineer'
location = 'Chile'

In [4]:
# with open('config.json') as f:
#     config = json.load(f)

In [5]:
def name_format(job_name):
    return job_name.replace(' ', '%20')

In [None]:
def get_data(url):
    r = requests.get(url, headers={"headers": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"}, timeout=5)

    return BeautifulSoup(r.content, 'html.parser')

In [7]:
def get_jobcards_soup():
    formatted_job_name = name_format(job_name)
    url = f"https://linkedin.com/jobs/search?keywords={formatted_job_name}&location={location}&f_TPR=r{date_posted}"
    return get_data(url)


In [8]:
def get_list_of_jobcards(soup):
    # Parsing the job card info (title, company, location, date, job_url) from the beautiful soup object
    joblist = []
    try:
        divs = soup.find_all('div', class_='base-search-card__info')
    except:
        print("Empty page, no jobs found")
        return joblist
    for item in divs:
        title = item.find('h3').text.strip()
        company = item.find('a', class_='hidden-nested-link')
        location = item.find('span', class_='job-search-card__location')
        parent_div = item.parent
        entity_urn = parent_div['data-entity-urn']
        job_posting_id = entity_urn.split(':')[-1]
        job_url = 'https://www.linkedin.com/jobs/view/'+job_posting_id+'/'

        date_tag_new = item.find('time', class_ = 'job-search-card__listdate--new')
        date_tag = item.find('time', class_='job-search-card__listdate')
        date = date_tag['datetime'] if date_tag else date_tag_new['datetime'] if date_tag_new else ''
        job_description = ''
        job = {
            'title': title,
            'company': company.text.strip().replace('\n', ' ') if company else '',
            'location': location.text.strip() if location else '',
            'date': date,
            'job_url': job_url,
            'job_description': job_description,
        }
        joblist.append(job)

    return joblist

In [9]:
def get_job_info(soup):

    job_info = {}
    # Get the job description from the job page
    desc_div = soup.find('div', class_='description__text description__text--rich')
    if desc_div:
        # Remove unwanted elements
        for element in desc_div.find_all(['span', 'a']):
            element.decompose()

        # Replace bullet points
        for ul in desc_div.find_all('ul'):
            for li in ul.find_all('li'):
                li.insert(0, '-')

        text = desc_div.get_text(separator='\n').strip()
        text = text.replace('\n\n', '')
        text = text.replace('::marker', '-')
        text = text.replace('-\n', '- ')
        text = text.replace('Show less', '').replace('Show more', '')
        job_info['job_description'] = text
    else:
        job_info['job_description'] = "Could not find Job Description"
    
    # Get the job salary from the job page
    #TODO

    # Get the job contract type from the job page
    # Find the main list container (optional, but good practice if multiple lists exist)
    criteria_list_ul = soup.find('ul', class_='description__job-criteria-list')


    # Check if the main list was found
    if criteria_list_ul:
        # Find all list items within this specific list
        list_items = criteria_list_ul.find_all('li', class_='description__job-criteria-item')

        # Iterate through each list item
        for item in list_items:
            # Find the subheader (h3) for the criterion name
            subheader_tag = item.find('h3', class_='description__job-criteria-subheader')
            # Find the text span for the criterion value
            # Using the more specific class 'description__job-criteria-text--criteria' is slightly safer
            value_tag = item.find('span', class_='description__job-criteria-text--criteria')

            # Ensure both tags were found before extracting text
            if subheader_tag and value_tag:
                # Extract text and clean whitespace (strip removes leading/trailing spaces/newlines)
                criterion_name = subheader_tag.get_text(strip=True)
                criterion_value = value_tag.get_text(strip=True)

                # Add the key-value pair to the dictionary
                job_info[criterion_name] = criterion_value
            else:
                # Optional: Print a warning if the structure is unexpected within an item
                print(f"Warning: Skipping item, couldn't find expected h3/span: {item.prettify()}")

    else:
        print("Error: Could not find the 'ul' with class 'description__job-criteria-list'.")


    return job_info

In [10]:
keywords = ["data engineer",
            "data enginer",
            "ingeniero de datos",
            "ingeniero datos",
            ]

In [11]:
if __name__ == "__main__":

    # Obtiene el objeto soup que contiene las jobcards
    soup = get_jobcards_soup()

    # Devuelve una lista de diccionarios con la información de las jobcards (title, company, location, date, job_url)
    joblist = get_list_of_jobcards(soup)
    job_description = []



In [12]:
joblist

[{'title': 'Data Engineer Junior',
  'company': 'LISIT',
  'location': 'Santiago, Santiago Metropolitan Region, Chile',
  'date': '2025-04-10',
  'job_url': 'https://www.linkedin.com/jobs/view/4205294902/',
  'job_description': ''},
 {'title': 'Data Analyst I',
  'company': 'Principal Chile',
  'location': 'Santiago, Santiago Metropolitan Region, Chile',
  'date': '2025-04-11',
  'job_url': 'https://www.linkedin.com/jobs/view/4187514059/',
  'job_description': ''},
 {'title': 'Data Engineer',
  'company': 'Xepelin',
  'location': 'Santiago, Santiago Metropolitan Region, Chile',
  'date': '2025-04-09',
  'job_url': 'https://www.linkedin.com/jobs/view/4205474021/',
  'job_description': ''},
 {'title': 'Data Engineer',
  'company': 'BC Tecnología',
  'location': 'Santiago, Santiago Metropolitan Region, Chile',
  'date': '2025-04-10',
  'job_url': 'https://www.linkedin.com/jobs/view/4205297472/',
  'job_description': ''},
 {'title': 'Data Engineer',
  'company': '2Brains',
  'location': 'C

In [13]:
# A la lista de trabajos se le agrega info extra de cada trabajo
for job in joblist:
    
    # Verifica si el titulo del trabajo contiene alguna de las palabras clave
    if any(keyword in job['title'].lower() for keyword in keywords):
        
        # Agregar descripción del trabajo, salario, tipo de contrato
        try:
            print('-' * 30)
            print(f'Getting job description for {job["title"]} in {job["company"]}')

            time.sleep(random.randint(2, 5))

            # Acá se busca la descripcion del trabajo
            job_info = get_job_info(get_data(job['job_url']))
            job.update(job_info)
            print('Job description starts with:', job_info['job_description'][:10])
        
        except Exception as e:
            print(f'Error getting job description for {job["title"]} in {job["company"]}: {e}')
    
        


------------------------------
Getting job description for Data Engineer Junior in LISIT
Job description starts with: En Lisit, 
------------------------------
Getting job description for Data Engineer in Xepelin
Job description starts with: Somos una 
------------------------------
Getting job description for Data Engineer in BC Tecnología
Error: Could not find the 'ul' with class 'description__job-criteria-list'.
Job description starts with: Could not 
------------------------------
Getting job description for Data Engineer in 2Brains
Job description starts with: 2Brains es
------------------------------
Getting job description for Data Engineer in Falabella
Job description starts with: Descripció
------------------------------
Getting job description for Data Engineer in NeuralWorks
Job description starts with: NeuralWork
------------------------------
Getting job description for Ingeniero de Datos in Devaid
Job description starts with: En Devaid>
------------------------------
Gett

In [14]:

# Convert joblist to JSON
joblist_json = json.dumps(joblist, indent=4)

# Export to JSON file
with open('joblist.json', 'w') as json_file:
    json_file.write(joblist_json)

In [15]:
for job in joblist:
    print('-' * 30)
    print(job)
    # {print(v) for k, v in job.items() if job['job_description'] != ''}

------------------------------
{'title': 'Data Engineer Junior', 'company': 'LISIT', 'location': 'Santiago, Santiago Metropolitan Region, Chile', 'date': '2025-04-10', 'job_url': 'https://www.linkedin.com/jobs/view/4205294902/', 'job_description': 'En Lisit, nos dedicamos a crear, desarrollar e implementar herramientas y servicios de software que automatizan y optimizan procesos, siempre con un fuerte enfoque en la innovación y los desafíos que se presentan. Nuestro objetivo es fomentar la eficacia operativa de nuestros clientes, ayudándoles a alcanzar sus metas de transformación mediante un acompañamiento consultivo integral. Actualmente, estamos en búsqueda de un Data Engineer Junior que se una a nuestro equipo apasionado por la tecnología y el aprendizaje continuo.\n Funciones del Rol\nComo Data Engineer Junior, Serás Parte Esencial Del Equipo Encargado De Manejar y Optimizar El Flujo De Datos De La Organización. Tus Principales Responsabilidades Incluirán\n- Colaborar en la recopil

In [16]:
df = pl.DataFrame(joblist)

In [17]:
display(df.filter(pl.col('job_description') != ''))

title,company,location,date,job_url,job_description,Seniority level,Employment type,Job function,Industries
str,str,str,str,str,str,str,str,str,str
"""Data Engineer Junior""","""LISIT""","""Santiago, Santiago Metropolita…","""2025-04-10""","""https://www.linkedin.com/jobs/…","""En Lisit, nos dedicamos a crea…","""Entry level""","""Full-time""","""Information Technology""","""Technology, Information and In…"
"""Data Engineer""","""Xepelin""","""Santiago, Santiago Metropolita…","""2025-04-09""","""https://www.linkedin.com/jobs/…","""Somos una FinTech que busca de…","""Not Applicable""","""Full-time""","""Information Technology""","""Software Development, IT Servi…"
"""Data Engineer""","""BC Tecnología""","""Santiago, Santiago Metropolita…","""2025-04-10""","""https://www.linkedin.com/jobs/…","""Could not find Job Description""",,,,
"""Data Engineer""","""2Brains""","""Chile""","""2025-04-10""","""https://www.linkedin.com/jobs/…","""2Brains es una empresa dedicad…","""Mid-Senior level""","""Full-time""","""Information Technology""","""Technology, Information and In…"
"""Data Engineer""","""Falabella""","""Santiago, Santiago Metropolita…","""2025-04-11""","""https://www.linkedin.com/jobs/…","""Descripción Empresa Somos más …","""Mid-Senior level""","""Full-time""","""Information Technology""","""Retail"""
…,…,…,…,…,…,…,…,…,…
"""Ingeniero de Datos""","""Amaris Consulting""","""Chile""","""2025-04-10""","""https://www.linkedin.com/jobs/…","""Who are we? Amaris Consulting …","""Entry level""","""Full-time""","""Information Technology""","""IT Services and IT Consulting"""
"""Data Engineer - GCP Ssr""","""axity""","""Santiago, Santiago Metropolita…","""2025-04-09""","""https://www.linkedin.com/jobs/…","""Company Description: axity Job…","""Mid-Senior level""","""Full-time""","""Information Technology""","""IT Services and IT Consulting"""
"""Data Engineer - AWS Ssr""","""axity""","""Santiago, Santiago Metropolita…","""2025-04-09""","""https://www.linkedin.com/jobs/…","""Company Description: axity Job…","""Mid-Senior level""","""Full-time""","""Information Technology""","""IT Services and IT Consulting"""
"""Data Engineer Azure""","""""","""Santiago, Santiago Metropolita…","""2025-04-10""","""https://www.linkedin.com/jobs/…","""Resumen del Cargo: Busco Ingen…","""Entry level""","""Full-time""","""Information Technology""",


In [18]:
df = df.filter(pl.col('job_description') != '')

In [19]:
display(df)

title,company,location,date,job_url,job_description,Seniority level,Employment type,Job function,Industries
str,str,str,str,str,str,str,str,str,str
"""Data Engineer Junior""","""LISIT""","""Santiago, Santiago Metropolita…","""2025-04-10""","""https://www.linkedin.com/jobs/…","""En Lisit, nos dedicamos a crea…","""Entry level""","""Full-time""","""Information Technology""","""Technology, Information and In…"
"""Data Engineer""","""Xepelin""","""Santiago, Santiago Metropolita…","""2025-04-09""","""https://www.linkedin.com/jobs/…","""Somos una FinTech que busca de…","""Not Applicable""","""Full-time""","""Information Technology""","""Software Development, IT Servi…"
"""Data Engineer""","""BC Tecnología""","""Santiago, Santiago Metropolita…","""2025-04-10""","""https://www.linkedin.com/jobs/…","""Could not find Job Description""",,,,
"""Data Engineer""","""2Brains""","""Chile""","""2025-04-10""","""https://www.linkedin.com/jobs/…","""2Brains es una empresa dedicad…","""Mid-Senior level""","""Full-time""","""Information Technology""","""Technology, Information and In…"
"""Data Engineer""","""Falabella""","""Santiago, Santiago Metropolita…","""2025-04-11""","""https://www.linkedin.com/jobs/…","""Descripción Empresa Somos más …","""Mid-Senior level""","""Full-time""","""Information Technology""","""Retail"""
…,…,…,…,…,…,…,…,…,…
"""Ingeniero de Datos""","""Amaris Consulting""","""Chile""","""2025-04-10""","""https://www.linkedin.com/jobs/…","""Who are we? Amaris Consulting …","""Entry level""","""Full-time""","""Information Technology""","""IT Services and IT Consulting"""
"""Data Engineer - GCP Ssr""","""axity""","""Santiago, Santiago Metropolita…","""2025-04-09""","""https://www.linkedin.com/jobs/…","""Company Description: axity Job…","""Mid-Senior level""","""Full-time""","""Information Technology""","""IT Services and IT Consulting"""
"""Data Engineer - AWS Ssr""","""axity""","""Santiago, Santiago Metropolita…","""2025-04-09""","""https://www.linkedin.com/jobs/…","""Company Description: axity Job…","""Mid-Senior level""","""Full-time""","""Information Technology""","""IT Services and IT Consulting"""
"""Data Engineer Azure""","""""","""Santiago, Santiago Metropolita…","""2025-04-10""","""https://www.linkedin.com/jobs/…","""Resumen del Cargo: Busco Ingen…","""Entry level""","""Full-time""","""Information Technology""",


# GENAI PART

In [20]:
# --- Configuration ---

API_KEY = 'AIzaSyCaIGQDXLA-jmSCRl7NSE64uODswGHQ9tQ'
TEXT_COLUMN_NAME = "text_content" # CHANGE if your text column has a different name
OUTPUT_COLUMN_NAME = "cloud_focus"
# Optional: Add a small delay between API calls to avoid rate limits
API_CALL_DELAY_SECONDS = 0.5 # Adjust as needed, 0 for no delay

# --- Gemini Setup ---
if not API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable not set.")

genai.configure(api_key=API_KEY)

# Choose a Gemini model (e.g., 'gemini-1.5-flash' or 'gemini-pro')
# Flash is faster and cheaper, Pro might be slightly more capable.
model = genai.GenerativeModel('gemini-1.5-flash')

# Define allowed classification outputs (plus handling for errors/unknown)
ALLOWED_OUTPUTS = {"GCP", "Azure", "AWS", "Other"}
ERROR_OUTPUT = "API_Error"
UNKNOWN_OUTPUT = "Other" # Default if Gemini doesn't give a clear answer

In [21]:
def classify_cloud_focus(job_description_text: str) -> str:
    """
    Uses the Gemini API to classify the primary cloud focus of a job description.

    Args:
        job_description_text: The text of the job description.

    Returns:
        A string indicating the classification: "GCP", "Azure", "AWS", "Other",
        or "Error" if the API call fails.
    """
    if not job_description_text or not isinstance(job_description_text, str) or len(job_description_text.strip()) < 20:
         # Handle empty or very short descriptions to save API calls
        return "Invalid_Input"

    # --- Prompt Engineering ---
    # Be very specific about the desired output format.
    prompt = f"""
    Analyze the following job description text and determine its primary cloud platform focus.
    Possible categories are: GCP, Azure, AWS.

    - If the text strongly emphasizes Google Cloud Platform skills (like BigQuery, GKE, Cloud Functions, App Engine), classify it as 'GCP'.
    - If the text strongly emphasizes Microsoft Azure skills (like Azure Functions, AKS, Azure SQL, Cosmos DB, Entra ID), classify it as 'Azure'.
    - If the text strongly emphasizes Amazon Web Services skills (like EC2, S3, Lambda, RDS, EKS, DynamoDB), classify it as 'AWS'.
    - If multiple platforms are mentioned significantly without a clear primary focus, or if no cloud platform is the main focus, classify it as 'Other'.

    Job Description:
    ---
    {job_description_text}
    ---

    Output only one word: GCP, Azure, AWS, or Other.
    """

    # --- API Call with Error Handling ---
    max_retries = 3
    retry_delay = 5 # seconds
    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            # Basic cleaning and validation
            classification = response.text.strip().upper()
            if classification in ["GCP", "AZURE", "AWS", "OTHER"]:
                return classification
            else:
                 # The model might sometimes output extra text or fail the instruction.
                 # Let's try to find the keyword within the response as a fallback.
                if "GCP" in classification: return "GCP"
                if "AZURE" in classification: return "Azure" # Keep consistent casing
                if "AWS" in classification: return "AWS"
                if "OTHER" in classification: return "Other" # Keep consistent casing
                print(f"Warning: Unexpected response format: '{response.text}'. Defaulting to 'Other'.")
                return "Other" # Fallback if model output is unexpected

        except Exception as e:
            print(f"API Error: {e}. Attempt {attempt + 1}/{max_retries}. Retrying in {retry_delay}s...")
            if attempt < max_retries - 1:
                 time.sleep(retry_delay) # Wait before retrying
                 retry_delay *= 2 # Exponential backoff
            else:
                print("API Error: Max retries reached.")
                return "API_Error" # Indicate an API failure

    return "API_Error" # Should not be reached if retries work, but good failsafe

In [None]:
classifications = []
# Iterating with tqdm to show progress
for desc in tqdm(df["job_description"], desc="Classifying Jobs"):
    classifications.append(classify_cloud_focus(desc))
    # Optional: Add a small delay to respect potential API rate limits
    time.sleep(2) # Adjust delay as needed (e.g., 1 second for free tier)


# Add the results as a new column
df = df.with_columns(
    pl.Series("cloud_focus", classifications)
)


# --- Display Results ---
print("\nDataFrame with Cloud Focus Classification:")
print(df)

Classifying Jobs:  52%|█████▏    | 16/31 [00:11<00:09,  1.60it/s]

API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 2
}
]. Attempt 1/3. Retrying in 5s...


Classifying Jobs:  55%|█████▍    | 17/31 [00:16<00:30,  2.18s/it]

API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 57
}
]. Attempt 1/3. Retrying in 5s...
API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 51
}
]. Attempt 2/3. Retrying in 10s...


Classifying Jobs:  58%|█████▊    | 18/31 [00:32<01:21,  6.26s/it]

API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 41
}
]. Attempt 3/3. Retrying in 20s...
API Error: Max retries reached.
API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 41
}
]. Attempt 1/3. Retrying in 5s...
API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-l

Classifying Jobs:  61%|██████▏   | 19/31 [00:48<01:49,  9.10s/it]

API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 26
}
]. Attempt 3/3. Retrying in 20s...
API Error: Max retries reached.
API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 25
}
]. Attempt 1/3. Retrying in 5s...
API Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-l

Classifying Jobs: 100%|██████████| 31/31 [01:10<00:00,  2.29s/it]


DataFrame with Cloud Focus Classification:
shape: (31, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ title     ┆ company   ┆ location  ┆ date      ┆ … ┆ Employmen ┆ Job       ┆ Industrie ┆ cloud_fo │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ t type    ┆ function  ┆ s         ┆ cus      │
│ str       ┆ str       ┆ str       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ Data      ┆ LISIT     ┆ Santiago, ┆ 2025-04-1 ┆ … ┆ Full-time ┆ Informati ┆ Technolog ┆ OTHER    │
│ Engineer  ┆           ┆ Santiago  ┆ 0         ┆   ┆           ┆ on Techno ┆ y, Inform ┆          │
│ Junior    ┆           ┆ Metropoli ┆           ┆   ┆           ┆ logy      ┆ ation and ┆          │
│           ┆           ┆ ta…  




In [23]:
from datetime import date

# Get today's date
today = date.today()
print("Today's date:", today)

Today's date: 2025-04-13


In [24]:
# --- Export to CSV ---
df.write_csv(f'joblist_{today}_.csv')

In [29]:
df = pl.read_csv('joblist_2025-04-13_.csv')

In [33]:
df.select(pl.col('job_description'), pl.col('cloud_focus'))

job_description,cloud_focus
str,str
"""En Lisit, nos dedicamos a crea…","""OTHER"""
"""Somos una FinTech que busca de…","""GCP"""
"""Could not find Job Description""","""OTHER"""
"""2Brains es una empresa dedicad…","""OTHER"""
"""Descripción Empresa Somos más …","""GCP"""
…,…
"""Who are we? Amaris Consulting …","""OTHER"""
"""Company Description: axity Job…","""GCP"""
"""Company Description: axity Job…","""AWS"""
"""Resumen del Cargo: Busco Ingen…","""AZURE"""
