# LinkedIn Job Hunter (Anti-Algoritmo)

Este notebook busca vagas no LinkedIn usando o endpoint público `jobs-guest` e pagina automaticamente.

**Busca padrão**
- cargo: Analista de Dados
- localidade: Brasil
- modalidade: Remoto
- período de publicação: últimas 24 horas (`f_TPR=r86400`)

**Saídas**
- Tabela com colunas: `cargo`, `empresa`, `publicada à`, `link da vaga no linkedin`
- Export para CSV


In [0]:
# Install missing dependencies (notebook-only: relies on IPython's %pip magic).
import importlib

# Module names to probe with an import.
pkgs_needed = ['requests', 'bs4', 'pandas', 'tqdm']

# The import name and the pip distribution name differ for Beautiful Soup:
# the module is `bs4`, the distribution is `beautifulsoup4`.
pip_name_by_module = {'bs4': 'beautifulsoup4'}

missing_pkgs = []
for pkg_name in pkgs_needed:
    try:
        importlib.import_module(pkg_name)
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so "not installed"
        # is covered here without hiding unrelated errors.
        missing_pkgs.append(pkg_name)

if missing_pkgs:
    install_names = [pip_name_by_module.get(name, name) for name in missing_pkgs]
    print('Installing: ' + ', '.join(install_names))
    get_ipython().run_line_magic('pip', 'install ' + ' '.join(install_names))
else:
    print('All required packages already installed')


In [0]:
# Imports
import time as time_mod
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.auto import tqdm


In [0]:
# Search settings (adapted to the filters of the reference link below)
# Base link (reference): https://www.linkedin.com/jobs/search/?currentJobId=4377411077&distance=25&f_TPR=r86400&f_WT=2&geoId=106057199&keywords=analista%20de%20dados&origin=JOB_SEARCH_PAGE_JOB_FILTER&refresh=true

# Sent as the `keywords` query parameter.
query_role = 'analista de dados'

# LinkedIn geoId (used instead of a textual location so it matches the link's filter)
geo_id = '106057199'  # Brazil

# Remote on LinkedIn is usually f_WT=2
workplace_type = '2'

# f_TPR=r86400 = last 24 hours
time_posted_range = 'r86400'  # r604800 = last week

# NOTE: the reference link uses distance=25, but the value sent here is '9999'.
# The guest endpoint does not always honor this parameter; it is sent anyway.
distance_miles = '9999'

# Pagination
max_pages = 10    # upper bound on the number of pages requested
page_size = 50    # amount added to the `start` offset for each page
sleep_s = 1.2     # pause between page requests, passed to time.sleep
timeout_s = 360   # passed as `timeout` to requests.get

# Endpoint queried for each page of results.
base_url = 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search'

# HTTP headers sent with every request.
headers_obj = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive'
}


In [0]:
# Title filter: keep only jobs whose title contains one of the target terms
import re

title_whitelist_terms = ['analista de dados', 'data analyst']

# One case-insensitive alternation built from the escaped terms.
title_whitelist_regex = re.compile(
    '({})'.format('|'.join(map(re.escape, title_whitelist_terms))),
    flags=re.IGNORECASE,
)


def title_matches(title_txt):
    """Return True when the title contains any whitelisted term.

    ``None`` and blank (whitespace-only) titles never match. Any other value
    is converted with ``str`` and stripped before the case-insensitive search.
    """
    if title_txt is None:
        return False
    cleaned = str(title_txt).strip()
    return bool(cleaned) and title_whitelist_regex.search(cleaned) is not None


In [0]:
# Funções de extração
def parse_job_cards(html_txt):
    """Extract the job cards whose title passes ``title_matches``.

    Args:
        html_txt: HTML text in which each job card is an ``<li>`` element.

    Returns:
        A list of dicts with the keys ``'cargo'``, ``'empresa'``,
        ``'publicada à'`` and ``'link da vaga no linkedin'``. A field whose
        element is absent from the card is ``None``. The link has its query
        string (everything from the first ``?``) removed.
    """
    soup_obj = BeautifulSoup(html_txt, 'html.parser')

    rows_list = []
    for li_obj in soup_obj.select('li'):
        # Check the title first so rejected cards cost no further lookups.
        title_el = li_obj.select_one('h3.base-search-card__title')
        cargo_val = title_el.get_text(strip=True) if title_el else None
        if not title_matches(cargo_val):
            continue

        company_el = li_obj.select_one('h4.base-search-card__subtitle')
        time_el = li_obj.select_one('time')
        link_el = li_obj.select_one('a.base-card__full-link')

        empresa_val = company_el.get_text(strip=True) if company_el else None
        publicada_val = time_el.get_text(strip=True) if time_el else None
        link_val = link_el.get('href') if link_el else None

        if link_val is not None:
            # Drop the query string so the same job always yields the same link.
            link_val = link_val.split('?')[0]

        # A card that reaches this point has a non-empty matching title, so
        # there is always something to record.
        rows_list.append({
            'cargo': cargo_val,
            'empresa': empresa_val,
            'publicada à': publicada_val,
            'link da vaga no linkedin': link_val
        })

    return rows_list

def fetch_page(start_idx):
    """Request one page of job postings starting at offset ``start_idx``.

    The query is built from the module-level search settings and sent with
    ``requests.get`` to ``base_url`` using ``headers_obj`` and ``timeout_s``.

    Returns:
        The response object as returned by ``requests.get``; the status code
        is not checked here.
    """
    query = dict(
        keywords=query_role,
        geoId=geo_id,
        f_WT=workplace_type,
        f_TPR=time_posted_range,
        distance=distance_miles,
        start=int(start_idx),
    )
    return requests.get(base_url, params=query, headers=headers_obj, timeout=timeout_s)


In [0]:
# Paginated collection
# Column order of the final table; also guarantees the columns exist when no
# job was collected (an empty DataFrame would otherwise have no columns).
output_columns = ['cargo', 'empresa', 'publicada à', 'link da vaga no linkedin']

all_rows = []
seen_links = set()

for page_idx in tqdm(range(max_pages)):
    start_idx = page_idx * page_size

    try:
        resp_obj = fetch_page(start_idx)
    except requests.RequestException as exc:
        # Keep whatever was collected so far instead of aborting the cell.
        print('Request failed at start=' + str(start_idx) + ': ' + str(exc))
        break

    if resp_obj.status_code != 200:
        print('HTTP ' + str(resp_obj.status_code) + ' at start=' + str(start_idx))
        break

    html_txt = resp_obj.text
    rows_list = parse_job_cards(html_txt)

    if not rows_list:
        # parse_job_cards reads cards from <li> elements, so a response without
        # any <li> means the listing is exhausted.
        if '<li' not in html_txt.lower():
            print('No more results at start=' + str(start_idx))
            break
        # The page had cards but none passed the title filter; later pages may
        # still contain matches, so keep paginating.
        time_mod.sleep(sleep_s)
        continue

    new_count = 0
    for row_obj in rows_list:
        link_val = row_obj.get('link da vaga no linkedin')
        # Rows without a link cannot be de-duplicated and are skipped.
        if link_val is None or link_val in seen_links:
            continue
        seen_links.add(link_val)
        all_rows.append(row_obj)
        new_count += 1

    if new_count == 0:
        print('No new unique jobs at start=' + str(start_idx) + ' (stopping)')
        break

    time_mod.sleep(sleep_s)

jobs_df = pd.DataFrame(all_rows, columns=output_columns)
jobs_df = jobs_df.dropna(how='all').drop_duplicates()

print(jobs_df.shape[0])
display(jobs_df.head(100))


In [0]:
# Export the collected jobs to a CSV file
out_csv_name = 'vagas_linkedin_analista_de_dados_brasil_remoto.csv'
# No index column is written; the utf-8-sig codec prepends a BOM to the file.
jobs_df.to_csv(
    out_csv_name,
    encoding='utf-8-sig',
    index=False,
)
# Last expression of the cell, so the notebook shows the file name.
out_csv_name
