# Generate Synthetic Queries with GPT-4o-mini (batch)

Based on the example in the [Sentence Transformers training overview](https://sbert.net/docs/sentence_transformer/training_overview.html#trainer).

### imports

In [1]:
import pandas as pd
from openai import OpenAI
import json
from datasets import load_dataset

# The API key is imported from a separate `top_secret` module instead of being written in this notebook
# (presumably a local file kept out of version control — verify).
from top_secret import my_sk
# Single OpenAI client reused by every API call below
client = OpenAI(api_key=my_sk)

### functions

In [2]:
def prompt_template(job_description: str) -> str:
    """Build the prompt asking the model for a short job search query.

    Args:
        job_description: Text of the job description the query should be
            generated for. It is appended verbatim to the instructions.

    Returns:
        The instruction paragraph, a blank line, and then
        "Here's the job description: " followed by ``job_description``.
    """
    # Written as a `def` rather than a lambda bound to a name (PEP 8, E731) so the
    # function has a real name in tracebacks and can carry a docstring.
    # Adjacent string literals are used instead of backslash line continuations so
    # that significant whitespace (e.g. the space before the first newline) is visible.
    return (
        "Read the following job description and create a concise job search query with at most 3 specialized skills or "
        "areas of expertise that are distinct to the role. Exclude generic data science or software engineering skills like AI, machine "
        "learning, and coding languages unless they are explicitly highlighted as unique or advanced. Keep the query short and human-like, "
        "suitable for typing into a search engine. \n"
        "\n"
        f"Here's the job description: {job_description}"
    )

In [3]:
def generate_query(job_description, *, model="gpt-4o-mini", temperature=0.7):
    """
    Generate a synthetic search query for a single job description.

    The prompt is built with `prompt_template` and sent as one user message
    through the module-level `client`.

    Args:
        job_description: Text of the job description to generate a query for.
        model: Name of the chat model to call (keyword-only). Defaults to
            "gpt-4o-mini".
        temperature: Sampling temperature passed to the API (keyword-only).
            Defaults to 0.7.

    Returns:
        The message content of the first choice in the API response.
    """

    # generate prompt
    prompt = prompt_template(job_description)

    # make api call
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
    )

    # the generated query is the text of the first returned choice
    return response.choices[0].message.content

### Load data

In [4]:
# fetch the LinkedIn job listings dataset from the Hugging Face Hub
ds = load_dataset("datastax/linkedin_job_listings")

# turn the train split into a DataFrame restricted to the title and description columns
df = ds['train'].to_pandas()[['title', 'description']]
df.shape

(123849, 2)

In [5]:
import re

# Job-title phrases that identify the roles to keep
search_terms = ["Data Scientist", "Data Analyst", "Machine Learning Engineer", 
                "Data Engineer", "AI Engineer", "Deep Learning"]

# Build a regex alternation matching any of the phrases.
# Each phrase is escaped so it is always matched literally, even if a term
# containing regex metacharacters (e.g. "C++") is added to the list later.
pattern = '|'.join(re.escape(term) for term in search_terms)

# Keep rows whose title contains any phrase (case-insensitive);
# na=False makes rows with a missing title count as non-matches
df = df[df['title'].str.contains(pattern, case=False, na=False)]
df.shape

(1179, 2)

In [16]:
# save the current `df` to a CSV file
# NOTE(review): `index` is not passed, so pandas' default of also writing the DataFrame index
# as the first column applies; the `data/` directory is assumed to exist already — confirm.
df.to_csv('data/job_data.csv')

### Generate queries for job descriptions (JDs)

In [6]:
# pull the description column out of the DataFrame as a plain Python list
job_description_list = df['description'].tolist()

In [15]:
# # non-batch approach
# synthetic_query_list = []
# for job_description in job_description_list:
#     # generate synthetic query and append to list
#     synthetic_query_list.append(generate_query(job_description).replace('"',''))

# # add queries to df
# df['query'] = synthetic_query_list

# df.to_csv('data/job_data_w_query.csv')

#### 1) Create batch request file

In [8]:
# Build one chat-completions request per job description.
# Requests are numbered from 1 so each custom_id ("request-1", "request-2", ...)
# can be used to track which job description a request belongs to.
batch_requests = [
    dict(
        custom_id=f"request-{request_number}",
        method="POST",
        url="/v1/chat/completions",
        body=dict(
            model="gpt-4o-mini",
            messages=[dict(role="user", content=prompt_template(description))],
            temperature=0.7,
        ),
    )
    for request_number, description in enumerate(job_description_list, start=1)
]

In [9]:
# Serialize every request to JSON and put one object per line (JSONL)
batch_jsonl = "\n".join(map(json.dumps, batch_requests))

In [10]:
# Write the JSONL payload to disk so it can be uploaded in the next step
with open("data/batch_requests.jsonl", mode="w") as jsonl_file:
    jsonl_file.write(batch_jsonl)

#### 2) Upload batch request file

In [11]:
# Upload the request file for batch processing.
# The file is opened in a `with` block so the handle is closed once the upload
# call returns, instead of being left open for the rest of the session.
with open("data/batch_requests.jsonl", "rb") as request_file:
    batch_input_file = client.files.create(
        file=request_file,
        purpose="batch"
    )

# show the returned file object (its id is needed to create the batch)
print(batch_input_file)

FileObject(id='file-3vhvpk8unsjMkWe7U8AfP6', bytes=5047825, created_at=1737469194, filename='batch_requests.jsonl', object='file', purpose='batch', status='processed', status_details=None)


#### 3) Create batch

In [12]:
# Submit the batch job: run the uploaded requests against the chat-completions
# endpoint with a 24h completion window, tagged with a short description
batch_object = client.batches.create(
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file.id,
    metadata={"description": "synthetic queries from job descriptions"},
    completion_window="24h",
)

In [13]:
# show the batch object returned by the API (includes its id and current status)
print(batch_object)

Batch(id='batch_678fad0c34dc8190990016c0ac76b539', completion_window='24h', created_at=1737469197, endpoint='/v1/chat/completions', input_file_id='file-3vhvpk8unsjMkWe7U8AfP6', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1737555597, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'synthetic queries from job descriptions'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))
