# 📊 LinkedIn Data Science Jobs Tracker  
*por Renan*  

**Objetivo**: Coletar vagas de Data Science em São Paulo diretamente do LinkedIn para análise de mercado.  
**Tecnologias**: Python, BeautifulSoup, Requests, Pandas. 

In [137]:
#Bibliotecas utilizadas
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from IPython.display import clear_output  

## 🔍 Etapa 1: Coletar IDs de Vagas  
O LinkedIn limita a 25 vagas por requisição. Esta função:  
- Usa paginação (`start=0`, `start=25`, etc.)  
- Evita bloqueios com `sleep(1)`  

In [139]:
def coletar_job_ids(keyword, location, limit=50):
    """Collect up to ``limit`` unique job-posting IDs from LinkedIn's guest search.

    Pages through the ``seeMoreJobPostings`` endpoint by advancing the
    ``start`` offset 25 results at a time and reads each ID from the last
    ``:``-separated field of a card's ``data-entity-urn`` attribute.

    Args:
        keyword: Search terms, sent as the ``keywords`` query parameter.
        location: Location filter, sent as the ``location`` query parameter.
        limit: Maximum number of IDs to return.

    Returns:
        A list of at most ``limit`` job-ID strings, in the order they were
        first seen and without duplicates. Whatever was collected so far is
        returned if a request fails or the results run out.

    Side effects:
        Prints each requested URL and a message when the search stops early.
    """
    base_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"
    page_size = 25  # offset step between consecutive result pages
    job_ids = []
    seen = set()
    start = 0

    while len(job_ids) < limit:
        if start:
            sleep(1)  # throttle between pages (never after the last one)

        parametros_busca = {"keywords": keyword, "location": location, "start": start}
        try:
            response = requests.get(base_url, params=parametros_busca, timeout=10)
            print(f"URL da requisição: {response.url}")
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Erro na requisição: {e}")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        jobs_list = soup.find_all("li")
        if not jobs_list:
            print("Fim da busca de vagas!")
            break

        novos = 0
        for job_found in jobs_list:
            card = job_found.find("div", {"class": "base-card"})
            if card is None:
                # An <li> that is not a job card must not abort the whole search.
                continue
            job_id = card.get("data-entity-urn", "").split(":")[-1]
            if job_id and job_id not in seen:
                seen.add(job_id)
                job_ids.append(job_id)
                novos += 1

        if not novos:
            # A page that adds nothing new would otherwise keep the loop going forever.
            print("Fim da busca de vagas!")
            break

        start += page_size  # move on to the next page

    return job_ids[:limit]

In [140]:
def _texto_ou_na(soup, tag, css_class):
    """Return the stripped text of the first ``tag`` carrying ``css_class``, or "N/A"."""
    elemento = soup.find(tag, class_=css_class)
    return elemento.get_text(strip=True) if elemento is not None else "N/A"


def coletar_job_detalhes(job_id):
    """Fetch one job posting from LinkedIn's guest endpoint and extract its fields.

    Args:
        job_id: Job-posting ID, interpolated into the ``jobPosting`` URL.

    Returns:
        A dict with the keys ``title``, ``company``, ``location``, ``posted``,
        ``applicants``, ``work_mode`` and ``link``. A field whose element is
        not found in the page is set to ``"N/A"``. Returns ``None`` (after
        printing the error) when the request fails or answers with an HTTP
        error status.
    """
    url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    try:
        response = requests.get(url, timeout=10)
        # Without this an error/rate-limit page would be stored as an all-"N/A" row.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Erro ao processar vaga {job_id}: {e}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): work_mode is a plain substring test for "remote" over the
    # whole page text; postings written in Portuguese ("remoto") will not
    # match — confirm whether that is intended.
    return {
        "title": _texto_ou_na(soup, "h2", "top-card-layout__title"),
        "company": _texto_ou_na(soup, "a", "topcard__org-name-link"),
        "location": _texto_ou_na(soup, "span", "topcard__flavor--bullet"),
        "posted": _texto_ou_na(soup, "span", "posted-time-ago__text"),
        "applicants": _texto_ou_na(soup, "span", "num-applicants__figure"),
        "work_mode": "Remote" if "remote" in soup.get_text().lower() else "On-site/Hybrid",
        "link": url,
    }

In [141]:
# Run the ID search with its default limit and show what came back.
job_ids = coletar_job_ids(keyword="Data Science", location="São Paulo")
print(f"IDs coletados: {job_ids}")

URL da requisição: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science&location=S%C3%A3o+Paulo&start=0
URL da requisição: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science&location=S%C3%A3o+Paulo&start=25
URL da requisição: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science&location=S%C3%A3o+Paulo&start=50
URL da requisição: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science&location=S%C3%A3o+Paulo&start=75
URL da requisição: https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data+Science&location=S%C3%A3o+Paulo&start=100
IDs coletados: ['4211914738', '4203679114', '4075021707', '4182696673', '4198275911', '4206454807', '4094546542', '4207125333', '4192619747', '4198283062', '4130559690', '4196165300', '4214840838', '4205994264', '4184918031', '4208855015', '4198283063', '4216741444', '421013

In [142]:
# Fetch the details of every collected ID, keeping only successful lookups.
jobs = []
total = len(job_ids)
for feitas, job_id in enumerate(job_ids, start=1):
    job = coletar_job_detalhes(job_id)
    if job:
        jobs.append(job)
    sleep(1)  # delay between requests
    # Redraw the progress line in place instead of stacking one line per job.
    clear_output(wait=True)
    print(f"Progresso: {feitas}/{total} vagas processadas")

Progresso: 50/50 vagas processadas


In [144]:
# Build a DataFrame from the collected job dicts (one row per job, one column per key).
df = pd.DataFrame(jobs)

In [146]:
# Preview the first rows of the scraped table.
display(df.head())

Unnamed: 0,title,company,location,posted,applicants,work_mode,link
0,Programa de Estágio Pfizer 2025,Pfizer,"São Paulo, São Paulo, Brazil",1 week ago,,On-site/Hybrid,https://www.linkedin.com/jobs-guest/jobs/api/j...
1,Analista Estratégia e Inovação Jr,Azul Linhas Aéreas Brasileiras,"Barueri, São Paulo, Brazil",2 weeks ago,,On-site/Hybrid,https://www.linkedin.com/jobs-guest/jobs/api/j...
2,Faça sua Carreira de Design no Itaú 🚀🧡,Itaú Unibanco,"São Paulo, São Paulo, Brazil",2 weeks ago,,On-site/Hybrid,https://www.linkedin.com/jobs-guest/jobs/api/j...
3,Cientista de dados (S&OP),Shopee,"São Paulo, São Paulo, Brazil",3 days ago,,On-site/Hybrid,https://www.linkedin.com/jobs-guest/jobs/api/j...
4,Banco de Talentos - Business Analytics,XP Inc.,"São Paulo, São Paulo, Brazil",6 days ago,,On-site/Hybrid,https://www.linkedin.com/jobs-guest/jobs/api/j...


In [None]:
# Export the table to CSV without the index column, encoded as "utf-8-sig".
# NOTE(review): the file name is spelled "datasciece"; kept unchanged so
# anything that already reads this file keeps working.
caminho_csv = "linkedin_datasciece_jobs.csv"
df.to_csv(caminho_csv, index=False, encoding="utf-8-sig")
print(f"Dados salvos em '{caminho_csv}'")

In [80]:
# The `start=` parameter at the end of the URL is the pagination offset
# (this URL requests the page beginning at result 25); change that number
# to fetch a different page of results.
# NOTE(review): the words in `keywords`/`location` are joined with `%2B`
# (an encoded literal "+") — confirm that is intended rather than "+"/"%20".
list_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords=Data%2BScientist&location=S%C3%A3o%2BPaulo%2C%2BS%C3%A3o%2BPaulo%2C%2BBrasil&geoId=104746682&trk=public_jobs_jobs-search-bar_search-submit&start=25"

In [81]:
# Download the raw HTML of the search-results page.
# A timeout is set so a stalled connection cannot hang the notebook; the
# value matches the one used by the collector functions above.
list_data = requests.get(list_url, timeout=10).text

In [82]:
# Echo the raw HTML to inspect the markup of the result cards.
list_data

'<!DOCTYPE html>\n\n      <li>\n        \n    \n\n    \n    \n    \n      <div class="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card" data-entity-urn="urn:li:jobPosting:4161301056" data-impression-id="jobs-search-result-0" data-reference-id="w21qGT2QEm5sAQj079cwIg==" data-tracking-id="ME0kOy2+rfCHDxZ8f4IKdA==" data-column="1" data-row="81">\n        \n\n        <a class="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]" href="https://br.linkedin.com/jobs/view/cientista-de-dados-pleno-s%C3%AAnior-at-accenture-brasil-4161301056?position=1&amp;pageNum=8&amp;refId=w21qGT2QEm5sAQj079cwIg%3D%3D&amp;trackingId=ME0kOy2%2BrfCHDxZ8f4IKdA%3D%3D" data-tracking-control-name="public_jobs_jserp-result_search-card" data-tracking-client-ingraph data-tracking-will-navigate>\n          \n          <span class="sr-only">\n              \n        \n        Cientista de Dados (Pleno/Sênior)\n      \

In [83]:
# Parse the downloaded HTML with the "html5lib" parser.
# NOTE(review): the other cells in this notebook use "html.parser" —
# confirm the different parser here is deliberate.
list_soup = BeautifulSoup(list_data, "html5lib")

In [84]:
# Every <li> element in the parsed page; the next cell reads one job card from each.
job_page = list_soup.find_all("li")

In [85]:
# Extract the job ID from each result card. The ID is the last ":"-separated
# field of the card's "data-entity-urn" attribute.
id_jobs_list = []
for job in job_page:
    base_card_div = job.find("div", {"class": "base-card"})
    if base_card_div is None:
        # An <li> that is not a job card: skip it instead of raising AttributeError.
        continue
    job_id = base_card_div.get("data-entity-urn", "").split(":")[-1]
    if job_id:
        id_jobs_list.append(job_id)

In [86]:
def _text_or_none(soup, tag, css_class):
    """Return the stripped text of the first ``tag`` carrying ``css_class``, or None."""
    node = soup.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


# Output key -> (tag, CSS class) used to locate that field in a posting page.
# A single identifying class is used (as in coletar_job_detalhes) rather than
# the full class string, which only matches when the attribute is identical.
_JOB_FIELDS = {
    "job_title": ("h2", "top-card-layout__title"),
    "company_name": ("a", "topcard__org-name-link"),
    "time_posted": ("span", "posted-time-ago__text"),
    "applicants_number": ("span", "num-applicants__figure"),
}

# Fetch each posting and collect its fields; a field that is not found is None.
job_list = []

for job_id in id_jobs_list:
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"
    try:
        job_response = requests.get(job_url, timeout=10)
    except requests.RequestException as e:
        # Skip this posting rather than aborting the whole loop.
        print(f"Erro ao processar vaga {job_id}: {e}")
        continue

    job_soup = BeautifulSoup(job_response.text, "html.parser")
    job_post = {
        key: _text_or_none(job_soup, tag, css_class)
        for key, (tag, css_class) in _JOB_FIELDS.items()
    }
    job_list.append(job_post)
    

In [87]:
# Echo the scraped records to check them before building the DataFrame.
job_list

[{'job_title': 'Vem ser Ituber – Oportunidades direcionadas para Mulheres na Tecnologia 💪🏽',
  'company_name': 'Itaú Unibanco',
  'time_posted': '4 days ago',
  'applicants_number': None},
 {'job_title': 'Data Scientist Sr',
  'company_name': 'Banco Bradesco',
  'time_posted': '2 weeks ago',
  'applicants_number': None},
 {'job_title': 'Cientista de dados',
  'company_name': 'Outly',
  'time_posted': '2 weeks ago',
  'applicants_number': None},
 {'job_title': 'Especialista Cientista de Dados II',
  'company_name': 'Serasa Experian',
  'time_posted': '3 days ago',
  'applicants_number': None},
 {'job_title': 'Cientista de Dados - Python Jr - SP',
  'company_name': 'innolevels',
  'time_posted': '4 days ago',
  'applicants_number': None},
 {'job_title': 'Data Scientist Sr',
  'company_name': 'Banco Bradesco',
  'time_posted': '3 days ago',
  'applicants_number': None},
 {'job_title': 'Data Scientist, Research, Search Quality Trust (English, Portuguese)',
  'company_name': 'Google',
  'ti

In [89]:
# Build a DataFrame from the scraped records and display it.
# Note: this rebinds `df`, replacing the DataFrame created in the earlier cell.
df = pd.DataFrame(job_list)
df

Unnamed: 0,job_title,company_name,time_posted,applicants_number
0,Vem ser Ituber – Oportunidades direcionadas pa...,Itaú Unibanco,4 days ago,
1,Data Scientist Sr,Banco Bradesco,2 weeks ago,
2,Cientista de dados,Outly,2 weeks ago,
3,Especialista Cientista de Dados II,Serasa Experian,3 days ago,
4,Cientista de Dados - Python Jr - SP,innolevels,4 days ago,
5,Data Scientist Sr,Banco Bradesco,3 days ago,
6,"Data Scientist, Research, Search Quality Trust...",Google,3 days ago,
7,Data Scientist,BrandLovers,3 weeks ago,
8,Data Scientist Specialist - Vaga Afirmativa pa...,iFood,2 weeks ago,
9,Data Scientist,Enter (formerly Talisman),2 weeks ago,


In [91]:
# Write the table to "ds-jobs.csv".
# NOTE(review): header=None appears to suppress the column-name row while the
# index column is still written — confirm this is the intended file layout.
df.to_csv("ds-jobs.csv", header = None)