# Scraping Indeed.com

Indeed encodes its search filters in the URL, so we only need Pylenium to visit that URL to start scraping.

> Indeed's filtering and search are not very precise, so you'll see lots of irrelevant results.
> However, once we have this list of jobs, we can have an AI analyze all of them and return only what we want!

The current implementation paginates through 5 pages of results and captures all job posts.

In [1]:
from pylenium.driver import Pylenium, PyleniumConfig
from jobs import indeed

# Create a Pylenium instance with a default PyleniumConfig and hand it to the
# Indeed scraper together with the REMOTE_QA_ENGINEER constant from `indeed`
# (presumably the search URL with its filters baked in — confirm in jobs/indeed.py).
# NOTE(review): `py` is never explicitly closed in this notebook — confirm
# whether indeed.scrape() quits the browser when it finishes.
py = Pylenium(PyleniumConfig())
jobs = indeed.scrape(py, indeed.REMOTE_QA_ENGINEER)

# Report how many job posts were collected.
print("Found jobs:", len(jobs))

Found jobs: 75


We can view some of the jobs we found:

In [2]:
# Peek at the first five scraped jobs; the notebook displays the value of the
# cell's last expression.
jobs[:5]

[IndeedJob(origin='https://indeed.com', title='Software Engineer', company='Convera', location='Remote in Denver, CO', share_link='https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0A1hwZZFfe5kqTC0mSB_a3yjAxe1XH-gVzwrkdS4aQAoTCVGPuKr0fJa3a4Z55zWr-PM8x3A1U-Tfbkl7I7WQM11_4YU7rxyGfU_maeka9S4bosphTKuGGY9rdNgtZVOHR2G1iD90F-jO08nI73C2xgmLWZp6n-ZJlvJ6QzNkAoQTFquoVpFShQ2K8K0aTSff5SSTjTDOGO18YR0MI5kjeUGQ5Y2dXswmyNdzhndu4NHdUFmJcp3S0NcyMd6f3bmgHT3LtkvkQEF1H2jaJbtAV6VMhCV3nwlKSOYM-WGTF9_BHPsdO63ktSi_CenXFIE-M6QrO-4ZlMPEJS6xFHboQNTrWttyqLrM8kxxdAhzDAjb6zsvQtkzYwD788Jl0Bnx04H93rJ2owreHyyO_Td3nQPivPlfcLnPXibvfEioeFdQJQkKpjN4kgTBfa8jb99NCj7NUpGi93WWSvPr0DX6jp334E3b9rteYqLVQIykQj1RBCYuEBxmXIDuWziKCEywthMwoNvBB3tCNHbEx6s2p-BzGey2YdaGPJxL-CUvotErQW97jIl4l8ulPZrgl23-CFnQD2qPjFEEwOZ3n7pemg&xkcb=SoBI6_M3FK2MMZgniL0LbzkdCdPP&p=0&fvj=0&vjs=3', salary=None),
 IndeedJob(origin='https://indeed.com', title='Full Stack Software Engineer, Core and Monetization', company='Pinterest', location='Remote in San Francisco,

## Save results to the database

In [3]:
from jobs import database

# Set up the database, then store the scraped jobs in it.
# NOTE(review): presumably create() builds the `indeed_jobs` table that the
# queries below read from — confirm in jobs/database.py.
database.create()
database.insert_indeed_jobs(jobs)

## Query the database

Which jobs have their salary listed?

In [4]:
# Return at most two jobs whose salary text contains "year"
# (e.g. "$70,000 - $90,000 a year"). Rows whose salary is NULL or does not
# mention "year" are not matched by this filter.
database.query(
    """
    SELECT id, title, company, location, salary FROM indeed_jobs
    WHERE salary LIKE '%year%'
    LIMIT 2
    """
)

[(4, 'QA Engineer', 'Actabl', 'Remote in Florida', '$70,000 - $90,000 a year'),
 (5,
  'Software Engineer, Frontend - Consumer',
  'Coinbase',
  'Remote',
  '$140,250 - $165,000 a year')]

Which jobs are remote?

In [5]:
# Return at most four jobs whose location text contains "remote".
# NOTE(review): the pattern is lowercase but the displayed results contain
# "Remote", so LIKE appears to be case-insensitive on this backend (as it is
# for ASCII in SQLite) — confirm which database jobs/database.py uses.
database.query(
    """
    SELECT id, title, company, location, salary FROM indeed_jobs
    WHERE location LIKE '%remote%'
    LIMIT 4
    """
)

[(1,
  'Full Stack Software Engineer, Core and Monetization',
  'Pinterest',
  'Remote in San Francisco, CA',
  None),
 (2, 'Frontend Software Engineer', 'Resourcely', 'Remote', None),
 (3, 'Software QA Engineer', 'Peraton', 'Remote in United States', None),
 (4, 'QA Engineer', 'Actabl', 'Remote in Florida', '$70,000 - $90,000 a year')]

## Convert query results back to Pydantic Models

If needed, we can convert query results back into our `IndeedJob` Pydantic model to continue working with them in Python!

In [6]:
# Fetch at most five rows, selecting the columns in the same order as the
# fields shown in the IndeedJob repr (origin, title, company, location,
# share_link, salary) so each tuple can be mapped back onto the model.
results = database.query(
    """
    SELECT origin, title, company, location, share_link, salary FROM indeed_jobs
    LIMIT 5
    """
)

# Display the raw result tuples.
results

[('https://indeed.com',
  'Full Stack Software Engineer, Core and Monetization',
  'Pinterest',
  'Remote in San Francisco, CA',
  'https://indeed.com/rc/clk?jk=d28f6864ca2641ec&bb=UPkVe9GUNmPlaDhnXCdzjrdcTHA8XaMFb_XO3GlzxXlC78pxHzHOCTQ0eLuyEVbrNc9IWlZ8GXamoM056sMyNjceuXGSUTkBFhtQ8c2VC4Y%3D&xkcb=SoDa67M3FKygqZUEJp0LbzkdCdPP&fccid=43014b1412e0a7b6&vjs=3',
  None),
 ('https://indeed.com',
  'Frontend Software Engineer',
  'Resourcely',
  'Remote',
  'https://indeed.com/rc/clk?jk=0edbac704d39d4f9&bb=UPkVe9GUNmPlaDhnXCdzjn8vw2ulcQugCL_KmFrCnG10Uy9na2MxdNVaxoc-gHX2Pt_Kk82YzeIyv-XLnSGvbMnO03XI7vg52AdKaqwGE3g%3D&xkcb=SoBu67M3FKygqZUEJp0KbzkdCdPP&fccid=b17b7450885ceb79&vjs=3',
  None),
 ('https://indeed.com',
  'Software QA Engineer',
  'Peraton',
  'Remote in United States',
  'https://indeed.com/rc/clk?jk=62e6126a8ebcfce1&bb=UPkVe9GUNmPlaDhnXCdzjrbZ5hRD77BtwgUYR1QeClbyvzDJcgg0g7daWnb1H_zpO2as-ZSiFC3CiJjP2lRpv9YH3rHkXpYUii1N6lU04EM%3D&xkcb=SoDz67M3FKygqZUEJp0JbzkdCdPP&fccid=7dc8be9efe945d3a&v

In [7]:
from jobs.models import IndeedJob

# Rebuild an IndeedJob from every result row. Each row carries the columns in
# the order they were selected: origin, title, company, location, share_link,
# salary. The stored `origin` is passed through explicitly instead of relying
# on the model's default, so a row saved with a different origin round-trips
# unchanged.
[
    IndeedJob(
        origin=origin,
        title=title,
        company=company,
        location=location,
        share_link=share_link,
        salary=salary,
    )
    for origin, title, company, location, share_link, salary in results
]

[IndeedJob(origin='https://indeed.com', title='Full Stack Software Engineer, Core and Monetization', company='Pinterest', location='Remote in San Francisco, CA', share_link='https://indeed.com/rc/clk?jk=d28f6864ca2641ec&bb=UPkVe9GUNmPlaDhnXCdzjrdcTHA8XaMFb_XO3GlzxXlC78pxHzHOCTQ0eLuyEVbrNc9IWlZ8GXamoM056sMyNjceuXGSUTkBFhtQ8c2VC4Y%3D&xkcb=SoDa67M3FKygqZUEJp0LbzkdCdPP&fccid=43014b1412e0a7b6&vjs=3', salary=None),
 IndeedJob(origin='https://indeed.com', title='Frontend Software Engineer', company='Resourcely', location='Remote', share_link='https://indeed.com/rc/clk?jk=0edbac704d39d4f9&bb=UPkVe9GUNmPlaDhnXCdzjn8vw2ulcQugCL_KmFrCnG10Uy9na2MxdNVaxoc-gHX2Pt_Kk82YzeIyv-XLnSGvbMnO03XI7vg52AdKaqwGE3g%3D&xkcb=SoBu67M3FKygqZUEJp0KbzkdCdPP&fccid=b17b7450885ceb79&vjs=3', salary=None),
 IndeedJob(origin='https://indeed.com', title='Software QA Engineer', company='Peraton', location='Remote in United States', share_link='https://indeed.com/rc/clk?jk=62e6126a8ebcfce1&bb=UPkVe9GUNmPlaDhnXCdzjrbZ5hRD77Btw