In [11]:
import json
import sqlite3
from enum import Enum
from typing import Optional
from urllib.parse import quote_plus, urlencode, urljoin

import httpx
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from pydantic import BaseModel, Field, PrivateAttr, field_validator


In [2]:
class JobCard(BaseModel):
    """Summary of a single job posting: title, company, location and URL.

    All four fields are required strings.
    NOTE(review): no other cell visible in this notebook instantiates this
    model; the scraping helpers return plain dicts instead — confirm whether
    it is still needed.
    """

    title: str
    company: str
    location: str
    url: str

class ExperienceLevel(str, Enum):
    """Experience-level choices accepted by ``Config.experience_levels``.

    The member values are the strings used in the configuration data;
    ``Config.model_post_init`` translates each member into the numeric code
    that is sent in the ``f_E`` query parameter.
    """

    INTERNSHIP = "internship"
    ENTRY_LEVEL = "entry_level"
    ASSOCIATE = "associate"
    MID_SENIOR = "mid_senior"
    DIRECTOR = "director"
    EXECUTIVE = "executive"

class Salary(str, Enum):
    """Salary choices accepted by ``Config.salary``.

    ``Config.model_post_init`` translates the selected member into the code
    that is sent in the ``f_SB2`` query parameter.
    NOTE(review): presumably these are minimum-salary thresholds in USD —
    confirm against the search UI.
    """

    S100K = "100k"
    S120K = "120k"
    S140K = "140k"
    S160K = "160k"
    S180K = "180k"
    S200K = "200k"


class Config(BaseModel):
    """User-supplied parameters for a LinkedIn job search.

    The public fields are what the caller provides (for example from a JSON
    file). The private ``_f_*`` attributes are derived from them in
    ``model_post_init`` and hold the ready-to-send query-string values that
    ``build_search_url`` reads.
    """

    keywords: str = Field(
        ...,
        description="Keywords to search for jobs, just as you would type them in the LinkedIn search bar"
    )
    location: str = "United States"
    # Optional so that an explicit ``null`` in the config is accepted; the
    # bounds are inclusive, matching the description.
    time_since_post: Optional[int] = Field(
        None,
        ge=1,
        le=2592000,
        description="Time since post in seconds, must be between 1 and 2592000 (30 days)"
    )
    remote: bool = False
    max_results: int = 10
    experience_levels: Optional[list[ExperienceLevel]] = None
    salary: Optional[Salary] = None

    # Derived query-string values (set in model_post_init).
    _f_E: Optional[str] = PrivateAttr(default=None)
    _f_TPR: Optional[str] = PrivateAttr(default=None)
    # "1" is sent whenever ``remote`` is False and "2" when it is True.
    # NOTE(review): presumably 1 = on-site and 2 = remote — confirm.
    _f_WT: Optional[str] = PrivateAttr(default="1")
    _f_SB2: Optional[str] = PrivateAttr(default=None)

    def model_post_init(self, __context) -> None:
        """Translate the validated public fields into ``_f_*`` query values."""
        level_map = {
            ExperienceLevel.INTERNSHIP: "1",
            ExperienceLevel.ENTRY_LEVEL: "2",
            ExperienceLevel.ASSOCIATE: "3",
            ExperienceLevel.MID_SENIOR: "4",
            ExperienceLevel.DIRECTOR: "5",
            ExperienceLevel.EXECUTIVE: "6",
        }
        salary_map = {
            Salary.S100K: "4",
            Salary.S120K: "5",
            Salary.S140K: "6",
            Salary.S160K: "7",
            Salary.S180K: "8",
            Salary.S200K: "9",
        }

        if self.experience_levels:
            # Multiple levels are sent as one comma-separated value.
            self._f_E = ",".join(level_map[level] for level in self.experience_levels)
        if self.time_since_post is not None:
            self._f_TPR = f"r{self.time_since_post}"
        if self.remote:
            self._f_WT = "2"
        if self.salary:
            self._f_SB2 = salary_map[self.salary]

    @field_validator("keywords")
    @classmethod
    def validate_keywords(cls, v):
        """Reject empty or whitespace-only keywords; return the value unchanged."""
        if not v or not v.strip():
            raise ValueError("Keywords cannot be empty")
        return v

In [3]:
def build_search_url(config: Config, start=0):
    """Assemble the LinkedIn job-search URL for ``config``.

    ``keywords``, ``location``, ``start`` and ``f_WT`` are always present in
    the query string; ``f_E``, ``f_TPR`` and ``f_SB2`` are appended (in that
    order) only when the corresponding private attribute on ``config`` is
    truthy.

    Args:
        config: Search configuration whose public fields and ``_f_*``
            attributes are read.
        start: Value sent as the ``start`` query parameter.

    Returns:
        The full search URL as a string.
    """
    params = {
        "keywords": config.keywords,
        "location": config.location,
        "start": start,
        "f_WT": config._f_WT,
    }

    # Optional filters, listed in the order they must appear in the URL.
    optional_filters = (
        ("f_E", config._f_E),
        ("f_TPR", config._f_TPR),
        ("f_SB2", config._f_SB2),
    )
    params.update({name: value for name, value in optional_filters if value})

    query_string = urlencode(params, quote_via=quote_plus)
    return f"https://www.linkedin.com/jobs/search?{query_string}"

In [4]:
def scrape_job_links_from_search_page(page_html):
    """Extract the unique job-posting links from a search-results page.

    Only anchors matching ``a.base-card__full-link`` whose ``href`` contains
    ``/jobs/view/`` are kept, and everything from the first ``?`` onwards is
    dropped from each link.

    Args:
        page_html: HTML of the search-results page.

    Returns:
        A list of unique links in the order they first appear in the page.
        (A ``set`` would make the order vary between runs, which breaks
        reproducibility of the notebook.)
    """
    soup = BeautifulSoup(page_html, "html.parser")

    # dict keys act as an insertion-ordered set.
    unique_links = {}
    for anchor in soup.select("a.base-card__full-link"):
        href = anchor.get("href")
        if href and "/jobs/view/" in href:
            unique_links.setdefault(href.split("?")[0], None)

    return list(unique_links)

In [12]:
async def _read_first_text(page, selector, timeout_ms):
    """Return the stripped text of the first element matching ``selector``, or ``None`` if it cannot be read."""
    try:
        text = await page.locator(selector).first.text_content(timeout=timeout_ms)
    except Exception as exc:
        # Best effort: a missing element must not abort the whole scrape.
        print(f"Could not read {selector!r}: {exc}")
        return None
    return text.strip() if text else None


async def scrape_job_details(url, timeout_ms=60_000, element_timeout_ms=5_000):
    """Open a job posting in headless Chromium and extract its details.

    Args:
        url: Address of the job posting page.
        timeout_ms: Maximum time, in milliseconds, to wait for the page to
            load and for its ``h1`` to appear. Playwright timeouts are in
            milliseconds, so the default is 60 seconds.
        element_timeout_ms: Maximum time, in milliseconds, to wait for each
            individual element whose text is read.

    Returns:
        A dict with the keys ``title``, ``company``, ``location``,
        ``description`` and ``link``. A field that cannot be read is returned
        as an empty string.

    Raises:
        Whatever Playwright raises when the page cannot be loaded or no
        ``h1`` appears within ``timeout_ms``.
    """
    async with async_playwright() as sp:
        browser = await sp.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=timeout_ms)
            await page.wait_for_selector("h1", timeout=timeout_ms)

            title = await _read_first_text(page, "h1", element_timeout_ms)

            # Prefer the company link; fall back to the first "flavor" span.
            company = await _read_first_text(
                page, "a.topcard__org-name-link", element_timeout_ms
            )
            if not company:
                company = await _read_first_text(
                    page, "span.topcard__flavor", element_timeout_ms
                )

            location = await _read_first_text(
                page, "span.topcard__flavor--bullet", element_timeout_ms
            )
            description = await _read_first_text(
                page, "div.description__text", element_timeout_ms
            )
        finally:
            # Always release the browser, even when navigation fails.
            await browser.close()

    return {
        "title": title or "",
        "company": company or "",
        "location": location or "",
        "description": description or "",
        "link": url
    }
        


In [6]:
def load_config(file_path: str) -> Config:
    """Read a JSON file and build a ``Config`` from its top-level keys.

    Args:
        file_path: Path to a JSON file containing an object whose keys are
            ``Config`` field names.

    Returns:
        The validated ``Config`` instance.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        pydantic.ValidationError: If the data does not satisfy ``Config``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        config_data = json.load(file)
    return Config(**config_data)

In [7]:
# Load the search configuration from the example JSON file
# (path is relative to the current working directory).
config = load_config('./config_example.json')

In [8]:
def scrape_job_cards(html):
    """Extract title, company, location and link from each job card in ``html``.

    A card is any ``div`` with the class ``job-card-container``. The title is
    read from its first ``h3``, the company from its first ``h4``, the
    location from ``.job-search-card__location`` and the link from its first
    anchor that has an ``href`` (query string removed).

    Cards missing a title, company or link are skipped with a message; a
    missing location is returned as an empty string.

    NOTE(review): confirm that ``job-card-container`` is the container class
    on the pages this is run against.

    Args:
        html: HTML of a search-results page.

    Returns:
        A list of dicts with the keys ``title``, ``company``, ``location``
        and ``link``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    jobs = []

    for card in soup.find_all('div', class_='job-card-container'):
        title_el = card.select_one('h3')
        company_el = card.select_one('h4')
        location_el = card.select_one('.job-search-card__location')
        link_el = card.select_one('a[href]')

        if title_el is None or company_el is None or link_el is None:
            print("Skipping job card with a missing title, company or link")
            continue

        href = link_el['href'].split('?')[0]
        jobs.append({
            "title": title_el.get_text(strip=True),
            "company": company_el.get_text(strip=True),
            "location": location_el.get_text(strip=True) if location_el is not None else "",
            # urljoin leaves absolute hrefs untouched and resolves relative ones.
            "link": urljoin("https://www.linkedin.com", href)
        })

    return jobs

In [9]:
# Build the search URL for the loaded config (``start`` defaults to 0).
url = build_search_url(config)

In [13]:
with async_playwright() as sp:
    browser = sp.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto(url, timeout=60)
    page.wait_for_event("ul.jobs-search__results-list", timeout=60)
    html = page.content()
    browser.close()

    job_links = await scrape_job_links_from_search_page(html)

TypeError: 'PlaywrightContextManager' object does not support the context manager protocol

In [10]:
# NOTE(review): ``response`` was not defined in any remaining cell, so the
# fetch is reconstructed here with httpx; confirm this matches the request
# that was originally used.
response = httpx.get(url, follow_redirects=True)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
job_cards = soup.find_all('div', class_='job-card-container')

In [12]:
# Show the parsed document as the cell output for manual inspection.
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="d_jobs_guest_search" name="pageKey"/>
<!-- --><!-- --> <meta content="urlType=jserp_custom;emptyResult=false" name="linkedin:pageTag"/>
<meta content="en_US" name="locale"/>
<meta data-app-version="2.0.2480" data-browser-id="7c3fba49-dfcd-4d5e-8451-48a93e58f714" data-call-tree-id="AAY7ZueTXv+GniXg1ybUWQ==" data-dfp-member-lix-treatment="control" data-disable-jsbeacon-pagekey-suffix="false" data-dna-member-lix-treatment="enabled" data-enable-page-view-heartbeat-tracking="" data-human-member-lix-treatment="enabled" data-member-id="0" data-multiproduct-name="jobs-guest-frontend" data-network-interceptor-lix-value="control" data-page-instance="urn:li:page:d_jobs_guest_search;ugG9B9L9S8SWi3GbTQoYkg==" data-recaptcha-v3-integration-lix-value="control" data-service-name="jobs-guest-frontend" data-should-use-full-url-in-pve-path="true" data-sync-apfc-cb-lix-treatment="enabled" data-sync-apfc-headers-lix-treatment="control" id="config"/>
<

In [3]:
import json

# Load the previously scraped job postings.
# JSON is UTF-8 by specification; do not depend on the platform locale.
with open("../../scraped_jobs_10.json", "r", encoding="utf-8") as f:
    jobs_json = json.load(f)

In [10]:
import os
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a .env file (if any) before reading the key.
load_dotenv()

# The key is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")


In [13]:
def vectorize(json_path: str) -> tuple[list[dict], list[str], list[str]]:
    """Load scraped jobs from a JSON file and split them into parallel lists.

    The file must contain a JSON array of objects, each with the keys
    ``title``, ``link``, ``location``, ``company`` and ``description``.

    Args:
        json_path: Path to the JSON file of scraped jobs.

    Returns:
        A tuple ``(metadatas, ids, descriptions)`` of three equally long
        lists, one entry per job:
        - ``metadatas``: dicts with ``title``, ``link``, ``location`` and
          ``company``.
        - ``ids``: the text after the last ``-`` in the job link, ignoring
          any query string or trailing slash.
        - ``descriptions``: the job description text.
        All three lists are empty when the file contains no jobs.

    Raises:
        KeyError: If a job is missing one of the required keys.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        jobs_json = json.load(f)

    metadatas = []
    ids = []
    descriptions = []

    for job in jobs_json:
        metadatas.append({
            "title": job["title"],
            "link": job["link"],
            "location": job["location"],
            "company": job["company"],
        })

        # The job id is the last dash-separated token of the link path.
        link_path = job["link"].split("?")[0].rstrip("/")
        ids.append(link_path.split("-")[-1])

        descriptions.append(job["description"])

    # Embedding generation is not enabled yet; the client is created only
    # once this call is switched on, so no API key is needed until then.
    # client = OpenAI(api_key=OPENAI_API_KEY)
    # response = client.embeddings.create(
    #     model="text-embedding-3-small",
    #     input=descriptions
    # )
    return metadatas, ids, descriptions

In [None]:
# vectorize() requires the path of the scraped-jobs JSON file.
metadata, ids, descriptions = vectorize("../../scraped_jobs_10.json")