In [8]:
import os

import pandas as pd
import httpx, re
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

In [None]:
# Notebook configuration: model identifiers plus gateway credentials.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "gemini-pro"

# Secrets and deployment-specific values are read from the environment so they
# never get committed with the notebook. The key keeps its previous "" default.
OPENAI_KEY = os.environ.get("OPENAI_KEY", "")
# Base URL handed to the chat client in the next cell; None when the variable
# is unset, so it must be exported before running the notebook against the gateway.
OPENAI_ENDPOINT = os.environ.get("OPENAI_ENDPOINT")

In [10]:
def update_base_url(request: httpx.Request):
  """Request hook that reroutes chat-completion calls.

  A request whose path is exactly "/chat/completions" has its path replaced
  with "/v1/chat"; every other request passes through unchanged. The request
  is modified in place and nothing is returned.
  """
  if request.url.path != "/chat/completions":
    return
  request.url = request.url.copy_with(path="/v1/chat")

# Chat client used by the generation cell. The subscription key is sent as a
# request header, and every outgoing request passes through update_base_url.
# OPENAI_ENDPOINT must already be defined when this cell runs.
# NOTE(review): LLM_MODEL is not passed to the client here — confirm the
# gateway selects the model on its own.
llm = ChatOpenAI(
    base_url=OPENAI_ENDPOINT,
    api_key=False,
    default_headers={"Ocp-Apim-Subscription-Key": OPENAI_KEY},
    http_client=httpx.Client(
      event_hooks=dict(request=[update_base_url]),
    ),
  )

In [11]:
# Load the job-description checkpoint and prefix each description with its
# title (title, newline, description) so the prompt carries both.
df = pd.read_csv("../data/main-data/data-checkpoints/jd5.csv")
job_title_list = list(df["Job Title"])
job_description_list = [
  title + "\n" + description
  for title, description in zip(job_title_list, list(df["Job Description"]))
]

In [30]:
generated_resumes_list = []

# The instruction prompt does not depend on the job description, so it is
# built once instead of on every iteration.
# NOTE(review): the instructions travel as a HumanMessage and the job
# description as a SystemMessage, with the human message first — confirm this
# role assignment is what the gateway's model expects.
user_message = HumanMessage(content="""
    You are an AI assistant that helps create resumes for a given job description.
    Generate 2 resumes for each job description so that one resume is an almost perfect match, while the other resume is only slightly relevant. 
    Use a combination of skills, different industry/project work experience, education, and certifications to produce resume data.
    You may add some KPIs to make work experience realistic.
    Do not include any note or explanation of how you generate the resumes. 
  """)

# Generate one response (a pair of resumes) for EVERY job description. The
# test-set cell builds a frame from job_description_list and the parsed ground
# truths, which requires exactly one response per job description, in order.
for job_description in job_description_list:
  system_message = SystemMessage(content=f"""
    Here is the Job Description (Note that all required skills may not be present in resume and some nonrelevant details can be present). 
    The length of a resume should only be between 200 and 500 words. 
    {job_description}
  """)

  response = llm.invoke([user_message, system_message])
  generated_resumes_list.append(response)

In [131]:
generated_resumes = []
ground_truth_resume = []

# Headings the model prints around each resume; stripped before splitting.
label_pattern = re.compile(r"(?i)(Resume 1|Perfect Match|Almost Perfect Match|Slightly Relevant)")
# Marker separating the two resumes. Matched case-insensitively, like the
# labels above, so "RESUME 2" / "resume 2" are recognised as well.
separator_pattern = re.compile(r"(?i)Resume 2")

for resume_tuple in generated_resumes_list:
  content_str = resume_tuple.content
  # Split on the first marker only: a later mention of "Resume 2" inside the
  # text stays part of the second resume instead of breaking the unpacking.
  parts = separator_pattern.split(label_pattern.sub("", content_str), maxsplit=1)
  if len(parts) != 2:
    raise ValueError(
      "Could not find a 'Resume 2' marker in the generated response; "
      f"response starts with: {content_str[:200]!r}"
    )
  # Assumes the model emits the closely matching resume first — the prompt
  # asks for both but does not fix their order; verify on the raw responses.
  resume_relevant, resume_irrelevant = parts
  generated_resumes.append(resume_relevant)
  generated_resumes.append(resume_irrelevant)
  ground_truth_resume.append(resume_relevant)

In [132]:
# Each job description is paired positionally with its ground-truth resume, so
# both lists must have the same length. Check this up front to fail with an
# actionable message before anything is written to disk.
if len(job_description_list) != len(ground_truth_resume):
  raise ValueError(
    f"Cannot build the test set: {len(job_description_list)} job descriptions "
    f"but {len(ground_truth_resume)} ground-truth resumes. Generate and parse "
    "a resume pair for every job description first."
  )

testset_df = pd.DataFrame({"Job Description": job_description_list, "Ground Truth": ground_truth_resume})
testset_df.to_csv("../data/main-data/test-sets/testset-5.csv", index=False)

In [133]:
# Shuffle all generated resumes (fixed seed for repeatability) so matching and
# non-matching resumes are not stored in generation order, then persist them.
resumes_frame = pd.DataFrame({"Resume": generated_resumes})
generated_resumes_df = resumes_frame.sample(frac=1, random_state=13)
generated_resumes_df.to_csv(
  "../data/main-data/generated-resumes/synthetic-resume-5.csv",
  index=False,
)