In [2]:
# %%
import os
import datetime
import time
import requests
import json
from dotenv import load_dotenv
from firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List

# Read variables from a .env file (python-dotenv) into the environment
load_dotenv()

# The Firecrawl key comes from the environment rather than the notebook itself
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY")

# Firecrawl SDK client authenticated with that key
app = FirecrawlApp(api_key=firecrawl_api_key)

# Careers page used as the starting URL for the Map API request
jobs_page_url = "https://openai.com/careers"


In [3]:
# %%
# Use the Firecrawl Map API to list pages under the careers URL
api_url = "https://api.firecrawl.dev/v1/map"
payload = {
    "url": jobs_page_url,
    "search": "",  # Empty search term to get all pages
    "limit": 15
}
headers = {
    "Authorization": f"Bearer {firecrawl_api_key}",
    "Content-Type": "application/json"
}
# Without a timeout, requests waits indefinitely if the API never answers
response = requests.post(api_url, json=payload, headers=headers, timeout=60)

# Raise instead of calling exit(): inside a notebook exit() asks the kernel to
# exit, whereas an exception stops this cell (and "Run All") with a clear error.
if response.status_code != 200:
    print(f"Error: {response.status_code}")
    print(response.text)
    raise RuntimeError(f"Map API request failed with HTTP {response.status_code}")

map_result = response.json()
if not map_result.get('success'):
    print("Map API request was not successful")
    raise RuntimeError("Map API request was not successful")

# Keep every mapped URL except the careers landing page itself
links = [link for link in map_result.get('links', []) if link != jobs_page_url]
print(f"Total pages mapped (excluding original URL): {len(links)}")
print(links)


Total pages mapped (excluding original URL): 14
['https://openai.com/careers/research-scientist', 'https://openai.com/careers/analytics-engineer', 'https://openai.com/careers/solutions-architect', 'https://openai.com/careers/iam-engineer', 'https://openai.com/careers/talent-partnerships', 'https://openai.com/careers/product-designer', 'https://openai.com/careers/recruiting-coordinator', 'https://openai.com/careers/av-specialist', 'https://openai.com/careers/it-support', 'https://openai.com/careers/director-edu', 'https://openai.com/careers/research-engineer', 'https://openai.com/careers/solutions-engineer', 'https://openai.com/careers/software-engineer-networking', 'https://openai.com/careers/revenue-operations-leader']


In [4]:
# %%
# JSON Schema describing the fields to extract from each job page.
# Every property is listed under "required".
extract_schema = {
    "type": "object",
    "properties": {
        "job_title": {"type": "string"},
        "sub_division_of_organization": {"type": "string"},
        "key_skills": {"type": "array", "items": {"type": "string"}},
        "compensation": {"type": "string"},
        "apply_link": {"type": "string"},
    },
    "required": [
        "job_title",
        "sub_division_of_organization",
        "key_skills",
        "compensation",
        "apply_link",
    ],
}

# Initialize a list to store the extracted data
extracted_data = []

# Endpoint and headers are the same for every page, so build them once
scrape_url = "https://api.firecrawl.dev/v1/scrape"
scrape_headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {firecrawl_api_key}"
}

# Process each link in the map result; a failure on one page is reported and
# the loop moves on to the next page.
for index, link in enumerate(links):
    try:
        response = requests.post(
            scrape_url,
            headers=scrape_headers,
            json={
                "url": link,
                "formats": ["extract"],
                "extract": {
                    "schema": extract_schema
                }
            },
            # Without a timeout a single stalled request would block the whole loop
            timeout=120,
        )

        if response.status_code != 200:
            print(f"Error {response.status_code} for page {index}: {response.text}")
            continue

        result = response.json()
        # A "success" flag alone is not enough: only keep pages that actually
        # carry an extract payload, so later cells never receive empty rows.
        extract = None
        if result.get('success'):
            extract = (result.get('data') or {}).get('extract')

        if extract:
            extracted_data.append(extract)
            print(f"Data extracted for page {index}")
        else:
            print(f"No data extracted for page {index}")
    except Exception as e:
        print(f"An error occurred for page {index}: {str(e)}")


Error 500 for page 0: {"success":false,"error":"(Internal server error) - JSON parsing error(s): must be object\n\nLLM extraction did not match the extraction schema you provided. This could be because of a model hallucination, or an Error on our side. Try adjusting your prompt, and if it doesn't work reach out to support.  - Could be due to LLM parsing issues"}
Data extracted for page 1
Data extracted for page 2
Data extracted for page 3
Data extracted for page 4
Data extracted for page 5
Data extracted for page 6
Data extracted for page 7
Data extracted for page 8
Data extracted for page 9
Data extracted for page 10
Data extracted for page 11
Data extracted for page 12
Data extracted for page 13


In [5]:
# %%
# Print the extracted data
print("Extracted data:")
for job in extracted_data:
    print(json.dumps(job, indent=2))
    print("-" * 50)  # Separator between jobs

# Save as CSV
import csv
import os

# Get the current directory
current_dir = os.getcwd()

# Create the full path for the CSV file
csv_file = os.path.join(current_dir, "openai_jobs.csv")

if extracted_data:
    # Use the union of keys across all rows (first-seen order). Taking only the
    # first row's keys makes DictWriter raise ValueError as soon as a later row
    # has an extra key; rows that lack a key get an empty cell.
    fieldnames = list(dict.fromkeys(key for job in extracted_data for key in job))
    try:
        # The file is opened only when there is something to write, so a run
        # with no data does not truncate an earlier export. UTF-8 is explicit
        # because the extracted text contains non-ASCII characters.
        with open(csv_file, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(extracted_data)
        print(f"Extracted data saved to {csv_file}")
    except OSError as e:
        print(f"Error saving CSV file: {e}")
else:
    print("No data to save.")


Extracted data:
{
  "job_title": "Analytics Engineer",
  "sub_division_of_organization": "Growth",
  "key_skills": [
    "SQL",
    "Python",
    "business intelligence tools",
    "ETL workflows",
    "data analysis",
    "dashboards",
    "data storytelling"
  ],
  "compensation": "$245K \u2013 $385K + Offers Equity",
  "apply_link": "https://jobs.ashbyhq.com/openai/340ef89c-a746-439a-888a-19580eb8c881/application"
}
--------------------------------------------------
{
  "job_title": "Solutions Architect",
  "sub_division_of_organization": "Technical Success",
  "key_skills": [
    "technical consulting",
    "Generative AI",
    "ML solutions",
    "network architecture",
    "cloud architecture",
    "Python",
    "Javascript"
  ],
  "compensation": "",
  "apply_link": "https://jobs.ashbyhq.com/openai/51721dfd-7bf5-4112-bb28-da5e4fd86e36/application"
}
--------------------------------------------------
{
  "job_title": "IAM Engineer",
  "sub_division_of_organization": "IT",
  "key_

In [6]:
from openai import OpenAI

# Resume text pasted verbatim; it is interpolated into the model prompts below.
# The literal must open with exactly three quotes: a fourth quote becomes part
# of the string and puts a stray '"' at the start of the resume.
resume_paste = """
Eric Ciarla
Co-Founder @ Firecrawl
San Francisco, California, United States
Summary
Building…
Experience
Firecrawl
Co-Founder
April 2024 - Present (6 months)
San Francisco, California, United States
Firecrawl by Mendable. Building data extraction infrastructure for AI. Used by
Amazon, Zapier, and Nvidia (YC S22)
Mendable
2 years 7 months
Co-Founder @ Mendable.ai
March 2022 - Present (2 years 7 months)
San Francisco, California, United States
- Built an AI powered search platform that that served millions of queries for
hundreds of customers (YC S22)
- We were one of the first LLM powered apps adopted by industry leaders like
Coinbase, Snap, DoorDash, and MongoDB
Co-Founder @ SideGuide
March 2022 - Present (2 years 7 months)
San Francisco, California, United States
- Built and scaled an online course platform with a community of over 50,000
developers
- Selected for Y Combinator S22 batch, 2% acceptance rate
Fracta
Data Engineer
2022 - 2022 (less than a year)
Palo Alto, California, United States
- Demoed tool during sales calls and provided technical support during the
entire customer lifecycle
Page 1 of 2
- Mined, wrangled, & visualized geospatial and water utility data for predictive
analytics & ML workflows (Python, QGIS)
Ford Motor Company
Data Scientist
2021 - 2021 (less than a year)
Dearborn, Michigan, United States
- Extracted, cleaned, and joined data from multiple sources using SQL,
Hadoop, and Alteryx
- Used Bayesian Network Structure Learning (BNLearn, R) to uncover the
relationships between survey free response verbatim topics (derived from
natural language processing models) and numerical customer experience
scores
MDRemindME
Co-Founder
2018 - 2020 (2 years)
Durham, New Hampshire, United States
- Founded and led a healthtech startup aimed at improving patient adherence
to treatment plans through an innovative engagement and retention tool
- Piloted the product with healthcare providers and patients, gathering critical
insights to refine functionality and enhance user experience
- Secured funding through National Science Foundation I-CORPS Grant and
UNH Entrepreneurship Center Seed Grant
Education
Y Combinator
S22
University of New Hampshire
Economics and Philosophy
"""

# Use o1-preview to choose which jobs should be applied to based on the resume
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

prompt = f"""
Please analyze the resume and job listings, and return a JSON list of the top 3 roles that best fit the candidate's experience and skills. Include only the job title, compensation, and apply link for each recommended role. The output should be a valid JSON array of objects in the following format, with no additional text:

[
  {{
    "job_title": "Job Title",
    "compensation": "Compensation (if available, otherwise empty string)",
    "apply_link": "Application URL"
  }},
  ...
]

Based on the following resume:
{resume_paste}

And the following job listings:
{json.dumps(extracted_data, indent=2)}
"""

completion = client.chat.completions.create(
    model="o1-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }
    ]
)

# The prompt asks for bare JSON, but the reply may still arrive wrapped in a
# Markdown code fence (optionally tagged "json"), or with no content at all.
# Peel the fence off before parsing so json.loads sees only the payload.
reply_text = (completion.choices[0].message.content or "").strip()
if reply_text.startswith("```"):
    # Drop the opening fence line, then everything from the closing fence on
    fence_body = reply_text.split("\n", 1)[1] if "\n" in reply_text else ""
    reply_text = fence_body.rsplit("```", 1)[0].strip()

recommended_jobs = json.loads(reply_text)

print("Recommended jobs:")
print(json.dumps(recommended_jobs, indent=2))

Recommended jobs:
[
  {
    "job_title": "Analytics Engineer",
    "compensation": "$245K \u2013 $385K + Offers Equity",
    "apply_link": "https://jobs.ashbyhq.com/openai/340ef89c-a746-439a-888a-19580eb8c881/application"
  },
  {
    "job_title": "Solutions Architect",
    "compensation": "",
    "apply_link": "https://jobs.ashbyhq.com/openai/51721dfd-7bf5-4112-bb28-da5e4fd86e36/application"
  },
  {
    "job_title": "Research Engineer",
    "compensation": "$295K \u2013 $440K + Offers Equity",
    "apply_link": "https://jobs.ashbyhq.com/openai/240d459b-696d-43eb-8497-fab3e56ecd9b/application"
  },
  {
    "job_title": "Solutions Engineer",
    "compensation": "",
    "apply_link": "https://jobs.ashbyhq.com/openai/dbfef1b0-9a77-46bd-ad36-67f3d0286924/application"
  }
]


In [None]:
# scrape each of the apply links with firecrawl /v1/scrape
import requests

firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")

def scrape_apply_link(url, timeout=60):
    """Scrape one page through Firecrawl's /v1/scrape endpoint.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the response on HTTP 200, otherwise None.
        Failures (non-200 status or a network error) are printed, not raised.
    """
    api_url = "https://api.firecrawl.dev/v1/scrape"
    headers = {
        "Authorization": f"Bearer {firecrawl_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "url": url
    }

    try:
        # Without a timeout, requests can block forever on a stalled connection
        response = requests.post(api_url, json=payload, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Report network failures the same way as HTTP errors: print, return None
        print(f"Error scraping {url}: {e}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error scraping {url}: {response.status_code}")
    return None

# Build one record per recommended job whose application page was scraped
scraped_job_data = []
for job in recommended_jobs:
    apply_link = job.get('apply_link')
    if not apply_link:
        continue  # nothing to scrape without a link

    scraped_data = scrape_apply_link(apply_link)
    if not scraped_data:
        continue  # the scrape returned nothing usable for this job

    record = dict(
        job_title=job['job_title'],
        compensation=job['compensation'],
        apply_link=apply_link,
        scraped_content=scraped_data,
    )
    scraped_job_data.append(record)

print(f"Scraped {len(scraped_job_data)} job application pages")

In [None]:
# use o1 to write the application for you and return in json
import json


def generate_application(job_data, resume_paste):
    """Ask o1-preview to draft a tailored application for one scraped job.

    Args:
        job_data: Dict with 'job_title', 'compensation' and 'scraped_content'
            (the JSON returned by the scrape request for the apply page).
        resume_paste: Resume text to include in the prompt.

    Returns:
        The model's reply parsed from JSON, or None when the request or the
        parsing fails (the error is printed).
    """
    # The scrape response nests the page under "data", and Firecrawl returns
    # Markdown by default, so a top-level "text" key is normally absent and the
    # prompt would carry no page content. Read data.markdown first and keep the
    # old "text" lookup as a fallback.
    scraped_content = job_data['scraped_content']
    page = scraped_content.get('data')
    if not isinstance(page, dict):
        page = {}
    scraped_text = page.get('markdown') or scraped_content.get('text') or ''

    # Only the first 1000 characters of the page are sent, to limit prompt size.
    # This note must stay a real comment: placed inside the f-string it would be
    # sent to the model as part of the prompt.
    prompt = f"""
    Based on the following job information, scraped content from the application page, and the provided resume, write a tailored job application:

    Job Title: {job_data['job_title']}
    Compensation: {job_data['compensation']}
    Scraped Content: {scraped_text[:1000]}

    Resume:
    {resume_paste}

    Please format the application as a JSON object with the following fields:
    - cover_letter: A personalized cover letter addressing key points from the scraped content and highlighting relevant experience from the resume
    - resume_highlights: Key points from the resume that align with the job requirements mentioned in the scraped content
    - questions: Any questions you have about the position, derived from the available information

    Ensure the content is specifically tailored to the information provided in the scraped content and leverages the experience detailed in the resume.
    """

    try:
        completion = client.chat.completions.create(
            model="o1-preview",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        # The reply may be wrapped in a Markdown code fence; strip it so that
        # json.loads receives only the JSON payload.
        reply_text = (completion.choices[0].message.content or "").strip()
        if reply_text.startswith("```"):
            fence_body = reply_text.split("\n", 1)[1] if "\n" in reply_text else ""
            reply_text = fence_body.rsplit("```", 1)[0].strip()
        return json.loads(reply_text)
    except Exception as e:
        print(f"Error generating application: {str(e)}")
        return None



# Draft an application for each scraped job; jobs for which generation
# returned nothing are left out of the list.
applications = []
for job in scraped_job_data:
    application = generate_application(job, resume_paste)
    if not application:
        continue
    applications.append(
        dict(
            job_title=job["job_title"],
            apply_link=job["apply_link"],
            application=application,
        )
    )

print(f"Generated {len(applications)} job applications based on scraped content and resume")

# Serialize once and reuse the text for both the printout and the file
applications_json = json.dumps(applications, indent=2)
print(applications_json)

# Save the JSON to a file
output_file = "generated_applications.json"
with open(output_file, "w") as f:
    f.write(applications_json)

print(f"Saved generated applications to {output_file}")