# Generating synthetic Resumes

## Importing libraries

In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_nvidia_ai_endpoints import ChatNVIDIA
import asyncio
import time
import nest_asyncio
import json
from faker import Faker

import os
import pandas as pd

# Render every column of a DataFrame instead of truncating wide frames
pd.options.display.max_columns = None

In [2]:
import os
import sys
from pathlib import Path

# Project root = parent of the notebook's working directory. Kept as a plain
# string because later cells interpolate it into f-string file paths.
path = str(Path.cwd().parent)
print(path)
# Make project-local modules importable from this notebook.
sys.path.insert(1, path)

/Users/saideepbunny/Projects/Application_Ranking_System


## Reading the data

In [3]:
# The input is split into 3 sets. Run this notebook once per set, changing
# `set_number` each time; the split exists because of the NVIDIA API credit limitation.
set_number = 3

In [4]:
# Load the sampled job-description/skills CSV for the selected set.
jd_skills_csv = f"{path}/data/sampled_linkedin_jd_skills/sampled_jd_skills_set{set_number}.csv"
df = pd.read_csv(jd_skills_csv)
df

Unnamed: 0,job_link,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,job_summary,id,summary_len,skill_count
0,https://www.linkedin.com/jobs/view/master-sche...,2024-01-21 13:08:00.689444+00,t,t,f,Master Scheduler,Baltimore Aircoil Company,"Dayton, TN",2024-01-17,Tennessee,United States,Traffic Agent,Mid senior,Onsite,"Operations, Supply Chain, MRP systems, SAP ERP...","Baltimore Aircoil Company (BAC), an Amsted Ind...",866062,5864,77
1,https://www.linkedin.com/jobs/view/master-tech...,2024-01-19 09:45:09.215838+00,t,t,f,Master Technician,A. O. Smith Corporation,"Lebanon, TN",2024-01-12,Watertown,United States,Assembler,Mid senior,Onsite,"Engineering, Product Development, Innovation, ...",Company / Location Information\nWater technolo...,531893,4514,32
2,https://www.linkedin.com/jobs/view/material-ha...,2024-01-21 15:59:00.747387+00,t,t,f,Material Handler,IMI Critical Engineering,"Rancho Santa Margarita, CA",2024-01-16,Corona,United States,Route Supervisor,Associate,Onsite,"Forklift operation, Stacker operation, Metal c...","About IMI\nIMI plc, the specialist engineering...",927220,6292,80
3,https://www.linkedin.com/jobs/view/materials-m...,2024-01-21 01:30:05.765203+00,t,t,f,Materials Manager,Pregis,"Bensenville, IL",2024-01-16,Elgin,United States,Manager Procurement Services,Mid senior,Onsite,"Leadership, Talent management, 5S program, Pro...",We're Not Just in the Packaging Business - We'...,1195068,5323,42
4,https://www.linkedin.com/jobs/view/materials-p...,2024-01-19 13:06:03.847966+00,t,t,f,Materials Planner,Faraday Future,"Hanford, CA",2024-01-14,Selma,United States,Scheduler,Mid senior,Onsite,"SAP, MRP software, Excel, Word, PowerPoint, Ou...",The Company:\nFaraday Future (FF) is a Califor...,818707,5205,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,https://uk.linkedin.com/jobs/view/workshop-con...,2024-01-19 09:45:09.215838+00,t,t,f,Workshop Controller,Activate Group Limited,"Bury St. Edmunds, England, United Kingdom",2024-01-13,Ipswich,United Kingdom,Controller,Mid senior,Onsite,"Workshop Control, Body Shop Management, HighQu...",Job title: Workshop Controller\nDepartment: Ac...,321651,4703,32
496,https://www.linkedin.com/jobs/view/x-ray-techn...,2024-01-21 04:16:54.473867+00,t,t,f,X-Ray Technologist,Fast Pace Health,"Grant, AL",2024-01-17,Guntersville,United States,Ultrasound Technologist,Mid senior,Onsite,"XRay Technologist, HIPAA, Patient intake, Nurs...",Posting Title: X-Ray Technologist Overview:\nF...,1230454,5704,38
497,https://www.linkedin.com/jobs/view/xfinity-ret...,2024-01-19 23:17:04.320124+00,t,t,f,Xfinity Retail Sales Consultant,XFINITY Store by Comcast,"East Brunswick, NJ",2024-01-14,Long Branch,United States,Sales-Service Promoter,Mid senior,Onsite,"Customer service, Product demonstration, Probl...",Job Summary\nResponsible for assisting and con...,159258,4839,32
498,https://www.linkedin.com/jobs/view/yard-associ...,2024-01-19 09:45:09.215838+00,t,t,f,Yard Associate (SkillBridge),ARMOR Initiative,"Kansas City, MO",2024-01-13,Leavenworth,United States,Orderly,Associate,Onsite,"DoD SkillBridge Internship, Fulltime training,...",The ARMOR Initiative DoD SkillBridge Internshi...,774643,2277,30


In [5]:
# Number of distinct job ids in the loaded set (`id` is the merge key used later on).
df['id'].nunique()

500

## Selecting required columns

In [6]:
# Keep only the job fields that the resume-generation step reads.
required_columns = ['id', 'job_title', 'company', 'job_skills', 'job_summary']
data_df = df.loc[:, required_columns].copy()
data_df

Unnamed: 0,id,job_title,company,job_skills,job_summary
0,866062,Master Scheduler,Baltimore Aircoil Company,"Operations, Supply Chain, MRP systems, SAP ERP...","Baltimore Aircoil Company (BAC), an Amsted Ind..."
1,531893,Master Technician,A. O. Smith Corporation,"Engineering, Product Development, Innovation, ...",Company / Location Information\nWater technolo...
2,927220,Material Handler,IMI Critical Engineering,"Forklift operation, Stacker operation, Metal c...","About IMI\nIMI plc, the specialist engineering..."
3,1195068,Materials Manager,Pregis,"Leadership, Talent management, 5S program, Pro...",We're Not Just in the Packaging Business - We'...
4,818707,Materials Planner,Faraday Future,"SAP, MRP software, Excel, Word, PowerPoint, Ou...",The Company:\nFaraday Future (FF) is a Califor...
...,...,...,...,...,...
495,321651,Workshop Controller,Activate Group Limited,"Workshop Control, Body Shop Management, HighQu...",Job title: Workshop Controller\nDepartment: Ac...
496,1230454,X-Ray Technologist,Fast Pace Health,"XRay Technologist, HIPAA, Patient intake, Nurs...",Posting Title: X-Ray Technologist Overview:\nF...
497,159258,Xfinity Retail Sales Consultant,XFINITY Store by Comcast,"Customer service, Product demonstration, Probl...",Job Summary\nResponsible for assisting and con...
498,774643,Yard Associate (SkillBridge),ARMOR Initiative,"DoD SkillBridge Internship, Fulltime training,...",The ARMOR Initiative DoD SkillBridge Internshi...


## Defining prompt template

In [7]:
# Per-category guidance injected into the resume prompt through the
# {category_requirements} placeholder. Keys are the fit-category labels; values are
# markdown-formatted instructions describing what a resume of that category looks like.
# NOTE: the trailing double spaces inside the strings are markdown line breaks.
category_requirements = {
    "Complete Mismatch": """
    The resume is highly professional but entirely unsuitable for the job. There is no meaningful alignment between the candidate's background and the job requirements.

    - **Career Path:**  
      - The candidate has spent their career in an industry that is entirely unrelated to the job. 

    - **Skills:**  
      - There is no significant overlap between the candidate’s skills and the job requirements.  
      - The candidate specializes in an entirely different domain and lacks both fundamental and advanced job-related skills.  
      - Example: A mechanical engineer with expertise in CAD software applying for a data science role requiring Python and ML expertise.  

    - **Experience:**  
      - The candidate has extensive work experience, but all of it is in a completely different field.  
      - Past roles have no transferable experience applicable to the new position.  

    - **Education & Certifications:**  
      - The candidate has degrees and certifications that do not contribute to the job requirements.  
      - Their educational background and certifications are completely irrelevant to the job requirements.
      
    Be creative in choosing different background for the candidates that is irrelevant to the job requirements.
    """,
    
    "Underwhelming": """
    The resume has some relevant aspects but falls noticeably short of the job requirements. The candidate may show potential but lacks key qualifications or experience to perform effectively in the role. The resume must satisfy at least one of the following conditions, though it may meet multiple.

    - **Experience:**  
      - The candidate has experience, but it is 30-50% less than the required level.  
      - Applying for a mid-level role but has only entry-level experience or internship work.  
      - Experience is somewhat related but lacks depth or key responsibilities expected for the role.  
      - Example: A candidate applying for a data science position with only one year of experience in data entry and no hands-on machine learning projects.  

    - **Skills:**  
      - The candidate possesses 40-60% of the required skills but lacks proficiency in essential ones.  
      - Skills are at a basic level when the job requires advanced knowledge.  
      - Example: A software engineer applying for a full-stack role but only has frontend development experience and lacks backend/database knowledge.  

    - **Projects/Work:**  
      - The experience falls short of the Responsibilities mentioned in **Job Description**
      - The candidate has worked on projects, but they are limited in scope and do not demonstrate complex problem-solving.  
      - Example: A candidate applying for an AI role who has only completed beginner-level projects like basic data visualization but has no experience building predictive models or deploying solutions.  

    - **Education & Certifications:**  
      - The candidate has a degree that is somewhat related but lacks specialized coursework, certifications, or domain expertise required for the job.  
      - Example: An applicant with a general engineering degree applying for a cybersecurity role without security-specific training or certification.  
    """,
    
    "Good Fit": """
    The resume is a strong match for the job and meets all key expectations. The candidate is well-qualified and aligns well with the role requirements.

    - **Experience:**  
      - The candidate has experience that meets or slightly exceeds the required level.  
      - A progressive career trajectory is evident, showing growth in responsibility and expertise.  
      - Example: A candidate applying for a mid-level data science role with 3+ years of experience in building machine learning models and optimizing data pipelines.  

    - **Skills:**  
      - 95-100% alignment with job requirements.  
      - The candidate has demonstrated real-world application of skills, rather than just listing them.  
      - Example: A full-stack developer listing React, Node.js, and PostgreSQL as key technologies and showing their application in past work.  

    - **Projects/Work:**  
      - The experience must be closer to the Responsibilities mentioned in **Job Description**.
      - The candidate presents projects using an **Action-Problem-Result (APR)** format:  
        - Performed [advanced strategic action] to resolve [complex systemic problem] resulting in [exceptional measurable outcomes with broad impact]. Use some kind of number as a metric for measurable outcome.
      - Example: *Engineered an end-to-end claim complexity scoring ML pipeline, negating ~3 minutes of manual classification per claim, improving processing time by 20%.*  
      - Build 4-5 bullet points for work experience.

    - **Education & Certifications:**  
      - The candidate's educational background is an exact match for the job, including relevant degrees and certifications.  
      - Example: A software engineer applying for a backend development role with a Computer Science degree and AWS certification.  
    """,
    
    "Overqualified": """
    The resume significantly exceeds job requirements, indicating that the candidate may be too advanced for the role.

    - **Experience:**  
      - The candidate has 3-5+ years more experience than required.  
      - Holding leadership or advanced roles (e.g., senior manager, lead engineer) while applying for non-leadership positions.  
      - Example: A data scientist with 10 years of experience and a track record of leading AI teams applying for a data analyst position.  

    - **Skills:**  
      - The candidate possesses every required skill and additional high-level expertise beyond what the job demands.  
      - Example: A candidate applying for a software engineering role listing expertise in not only Python, Java, and C++, but also niche skills like CUDA programming for GPU acceleration.  

    - **Projects/Work:**  
      - The experience must be completely similar to the Responsibilities mentioned in **Job Description**.
      - The candidate presents projects using an **Action-Problem-Result (APR)** format:  
        - Performed [advanced strategic action] to resolve [complex systemic problem] resulting in [exceptional measurable outcomes with broad impact]. Use some kind of number as a metric for measurable outcome.
      - Example: *Revised the existing claim assignment process to automatically assign claims classified by scoring engine to agents by prioritizing highly complex claims, achieving 100% automation.* 
      - Build 6 bullet points for work experience. 

    - **Education & Certifications:**  
      - The candidate holds degrees and certifications beyond what is required for the role.  
      - Example: A PhD in Artificial Intelligence applying for a mid-level software engineering position that only requires a Master's degree.  

    - **Career Progression:**  
      - The candidate has a track record of rapid career growth, indicating they may be better suited for a senior role rather than the one they are applying for.  
      - Example: Someone who went from junior developer to senior software architect in five years applying for a mid-level role.  
    """
}


# Fit categories to generate for every job; each entry must be a key of
# `category_requirements`.
categories = [
    "Complete Mismatch",
    "Underwhelming",
    "Good Fit",
    "Overqualified",
]

In [8]:
# Resume generation prompt (originally labelled as being for LLaMA models; it is
# model-agnostic text and is turned into a ChatPromptTemplate in a later cell).
# Placeholders filled per request: {name}, {company}, {job_role}, {jd}, {skills},
# {fit_category}, {category_requirements}.

resume_template = """
You are an expert resume writer with specialized knowledge in talent acquisition and hiring practices helping {name} to apply for a job. Your task is to generate a tailored, content-rich resume based on the provided inputs while strictly adhering to the specified fit category constraints. Below are the details of the job that the candidate will be applying to:

## **Job Details**:
{name} is applying for the company {company}. Below are the details for the job that the candidate will be applying to:
- **Role:** {job_role}
- **Job Description:** {jd}
- **Required Skills:** {skills}

**NOTE**: {name} will not have access to Required skills at the time of applying.

## **Fit Category:** {fit_category}
## **Category Requirements:**  
{category_requirements}

## **Instructions**:
1. **Resume Objective**  
   - Generate a professional resume that precisely aligns with the specified fit category ({fit_category}).
   - Ensure the resume maintains industry standards and meets professional expectations.
   - Structure the resume to be at least **600 words** with clearly defined sections.

2. **Mandatory Sections**  
   - **Education**: University, degree title, major, graduation year, GPA.
   - **Skills**: Comma separated list of skills that include technical skills, soft skills, technologies, and other skills. List at least 15 and at most 40. Try not to exceed 40.
   - **Work Experience**: Consists of company names, roles, employment type, duration, and detailed bullet points. Constitutes the work experience and work done for the company. Choose a company name, employment type and duration without leaving it blank. Be creative with the company names. **The company name should not be same as the one mentioned in Job Description**.
   

3. **Optional Sections (Context-Based)**  
   - **Projects**: Required if work experience is limited, with clear descriptions and technology stack. This constitutes of the personal and academic projects.
   - **Professional Summary**: Tailored to the fit category.
   - **Certifications/Accomplishments**: Adjusted based on qualification level.
   - **Additional Relevant Sections**: If beneficial, add elements such as leadership, research, or publications.

4. **Formatting Guidelines**  
   - Use **clear section headings**.
   - Precede each bullet point with **"-"**.
   - Make sure to include at least one of the sections from "Work experience" and "Projects" section. 
   - Mark key entities (**institutions, companies, project names**) with **"*"**.
   - Each bullet point must be in between **150-180 characters**.
   - Include:
     - **4-6 bullet points** for work experience. 
     - **2-3 bullet points** per project.

5. **Category-Specific Constraints**  
   - **Complete Mismatch (0-30% relevance)**:
     - Generate a resume from an **entirely different career path**.
     - Ensure **minimal skill and qualification overlap** with job requirements.
     - Maintain professional quality but unrelated experience.
   - **Underwhelming (50-70% relevance)**:
     - Include **some relevant experience but with clear gaps**.
     - Show a **foundational but incomplete** skill set.
     - Present **partially relevant qualifications** with notable omissions.
   - **Good Fit (95-100% relevance)**:
     - Match **all required skills and experience** at an appropriate depth.
     - Follow **Action-Problem-Result (APR) format** in work and project sections.
     - Demonstrate **clear career progression** and **precise qualification match**.
   - **Overqualified (exceeds requirements)**:
     - Include **experience exceeding job requirements by 3-5 years**.
     - Highlight **leadership, strategic impact, and advanced certifications**.
     - Showcase **complex problem-solving** and **business impact** beyond role expectations.

6. **Output Requirements**  
   - Ensure **strict adherence** to the fit category.
   - Generate **only** the resume; **do not include any explanatory notes or meta-text**.
   - Maintain **authenticity, clarity, and professionalism** throughout.
   - Be creative in writing bullet points for work experience and projects.
   
**NOTE**: {name} is an imaginary person who does not exist. Therefore, you would not be violating any data privacy issues.
"""

## Initialize model and chain

In [9]:
# Read the JSON configuration into a dictionary. The encoding is pinned to UTF-8
# so the file is parsed the same way regardless of the platform's locale default.
with open(f"{path}/data/configuration/config.json", "r", encoding="utf-8") as file:
    config_dict = json.load(file)

# API key handed to the ChatNVIDIA client; kept in the config file rather than in the notebook.
API_KEY = config_dict['API_KEY']

In [10]:
# Apply nest_asyncio so an event loop can be driven with run_until_complete
# from inside Jupyter (Jupyter Notebook compatibility)
nest_asyncio.apply()

# Initialize Faker (used to generate a random candidate name for every resume)
fake = Faker()
        
# Initialize Model
# Alternative model kept for reference (disabled):
# model = OllamaLLM(model="llama3.2", temperature = 1)

model = ChatNVIDIA(
  model="nvidia/nemotron-4-340b-instruct",
  api_key=API_KEY, 
  temperature=1.0
)

# Define Prompt Template from the `resume_template` string
prompt = ChatPromptTemplate.from_template(resume_template)

# Create Chain: the filled-in prompt is piped into the model
chain = prompt | model

## Defining Asyncio chain execution

In [11]:
# Define max concurrent requests to prevent API overload. The semaphore is acquired
# around every model call in generate_resume, so at most this many calls are in flight.
MAX_CONCURRENT_REQUESTS = 25  # Adjust based on API limits
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

async def generate_resume(job_id, company, job_role, job_description, job_skills, category,
                          max_attempts=3, retry_delay=2.0):
    """Generate one synthetic resume for a job / fit-category pair.

    The call is wrapped in the module-level ``semaphore`` so the number of
    in-flight requests stays bounded. Because a single failed request would
    otherwise propagate through ``asyncio.gather`` and abort the whole run,
    failed calls are retried with exponential backoff before giving up.

    Args:
        job_id: Identifier of the job posting; echoed back in the result.
        company: Company name inserted into the prompt.
        job_role: Job title inserted into the prompt.
        job_description: Job description text inserted into the prompt.
        job_skills: Required-skills text inserted into the prompt.
        category: Fit category; must be a key of ``category_requirements``.
        max_attempts: Total number of tries per resume (must be >= 1).
        retry_delay: Base delay in seconds; doubled after every failed attempt.

    Returns:
        dict with keys ``"id"``, ``"category"`` and ``"resume_text"``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        KeyError: If ``category`` is not present in ``category_requirements``.
        Exception: The error of the last attempt is re-raised once all
            attempts are exhausted.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    # Build the prompt inputs once so every retry uses the same candidate name.
    data = {
        "name": fake.name(),
        "company": company,
        "job_role": job_role,
        "jd": job_description,
        "skills": job_skills,
        "fit_category": category,
        "category_requirements": category_requirements[category]
    }

    for attempt in range(1, max_attempts + 1):
        try:
            async with semaphore:  # Prevents exceeding concurrent request limits
                response = await chain.ainvoke(data)
            return {"id": job_id, "category": category, "resume_text": response.content}
        except Exception:
            if attempt == max_attempts:
                raise  # Out of retries: surface the original error to the caller
            # Back off outside the semaphore so the slot is free for other requests.
            await asyncio.sleep(retry_delay * 2 ** (attempt - 1))


async def generate_resumes_for_id(job_id, company, job_role, job_description, job_skills):
    """Create one resume per entry in ``categories`` for a single job, concurrently.

    Returns the list produced by ``asyncio.gather``: one ``generate_resume``
    result per category, in the same order as ``categories``.
    """
    pending = []
    for fit_category in categories:
        pending.append(
            generate_resume(job_id, company, job_role, job_description, job_skills, fit_category)
        )
    # Run all category requests for this job side by side.
    return await asyncio.gather(*pending)

async def batch_generate_resumes(df, batch_size=50):
    """Generate resumes for every row of ``df``, ``batch_size`` rows at a time.

    Each row must provide ``id``, ``company``, ``job_title``, ``job_summary`` and
    ``job_skills``. The rows of one batch are processed concurrently; the next
    batch starts only after the current one has finished. A progress line is
    printed after every batch.

    Returns:
        A flat list with every result returned by ``generate_resumes_for_id``.
    """
    all_results = []
    for start in range(0, len(df), batch_size):
        # One coroutine per job row in the current slice.
        pending = []
        for _, row in df.iloc[start:start + batch_size].iterrows():
            pending.append(
                generate_resumes_for_id(
                    row["id"], row["company"], row["job_title"], row["job_summary"], row["job_skills"]
                )
            )

        batch_results = await asyncio.gather(*pending)

        # Each job yields a list of resumes; append them individually to keep the output flat.
        for per_job_results in batch_results:
            all_results.extend(per_job_results)

        print(f"✅ Processed {min(start + batch_size, len(df))}/{len(df)} records...")

    return all_results


## Generating data

In [12]:
# ------------------ 🚀 Run Optimized Async Processing ------------------ #
# Drive the async batch pipeline to completion from this synchronous cell and time it.
start_time = time.time()

loop = asyncio.get_event_loop()
generation_job = batch_generate_resumes(data_df, batch_size=50)  # Adjust batch size as needed
resumes = loop.run_until_complete(generation_job)

end_time = time.time()
print(f"\n⚡ Async Execution Time: {end_time - start_time:.2f} seconds")
# Timing noted from an earlier run:
# ⚡ Async Execution Time: 7998.52 seconds

✅ Processed 50/500 records...
✅ Processed 100/500 records...
✅ Processed 150/500 records...
✅ Processed 200/500 records...
✅ Processed 250/500 records...
✅ Processed 300/500 records...
✅ Processed 350/500 records...
✅ Processed 400/500 records...
✅ Processed 450/500 records...
✅ Processed 500/500 records...

⚡ Async Execution Time: 16421.81 seconds


In [13]:
# Convert the generated results (dicts with id, category, resume_text) to a DataFrame;
# the merge with the job data happens in the following cell.
resume_df = pd.DataFrame(resumes)
resume_df

Unnamed: 0,id,category,resume_text
0,866062,Complete Mismatch,# *John Miller*\n\n**Professional Summary**\n\...
1,866062,Underwhelming,# James Casey\n\n## Professional Summary\n\nHi...
2,866062,Good Fit,# Rachel Vargas\n\n## Professional Summary\n\n...
3,866062,Overqualified,# Susan Bradley\n\n**Professional Summary**\n\...
4,531893,Complete Mismatch,# Sean Johnson\n\n## Professional Summary\n\nD...
...,...,...,...
1995,774643,Overqualified,# Jonathan Martinez\n\n## Professional Summary...
1996,700821,Complete Mismatch,**Lori Townsend**\n\n**Professional Summary**\...
1997,700821,Underwhelming,**KATRINA LOPEZ**\n*Yoga Enthusiast & Fitness ...
1998,700821,Good Fit,# Alexis Cruz\n\n## Professional Summary\n\nDe...


In [14]:
# Merge the job fields back onto every generated resume (how="right" keeps one row per
# resume). validate="one_to_many" makes pandas raise if `id` is duplicated in data_df,
# instead of silently multiplying the resume rows.
df_expanded = data_df.merge(resume_df, on="id", how="right", validate="one_to_many")
df_expanded

Unnamed: 0,id,job_title,company,job_skills,job_summary,category,resume_text
0,866062,Master Scheduler,Baltimore Aircoil Company,"Operations, Supply Chain, MRP systems, SAP ERP...","Baltimore Aircoil Company (BAC), an Amsted Ind...",Complete Mismatch,# *John Miller*\n\n**Professional Summary**\n\...
1,866062,Master Scheduler,Baltimore Aircoil Company,"Operations, Supply Chain, MRP systems, SAP ERP...","Baltimore Aircoil Company (BAC), an Amsted Ind...",Underwhelming,# James Casey\n\n## Professional Summary\n\nHi...
2,866062,Master Scheduler,Baltimore Aircoil Company,"Operations, Supply Chain, MRP systems, SAP ERP...","Baltimore Aircoil Company (BAC), an Amsted Ind...",Good Fit,# Rachel Vargas\n\n## Professional Summary\n\n...
3,866062,Master Scheduler,Baltimore Aircoil Company,"Operations, Supply Chain, MRP systems, SAP ERP...","Baltimore Aircoil Company (BAC), an Amsted Ind...",Overqualified,# Susan Bradley\n\n**Professional Summary**\n\...
4,531893,Master Technician,A. O. Smith Corporation,"Engineering, Product Development, Innovation, ...",Company / Location Information\nWater technolo...,Complete Mismatch,# Sean Johnson\n\n## Professional Summary\n\nD...
...,...,...,...,...,...,...,...
1995,774643,Yard Associate (SkillBridge),ARMOR Initiative,"DoD SkillBridge Internship, Fulltime training,...",The ARMOR Initiative DoD SkillBridge Internshi...,Overqualified,# Jonathan Martinez\n\n## Professional Summary...
1996,700821,Yoga Instructor,Active Wellness,"Yoga, Pranayama, Handson direction, Safe and e...",Job Description\nJob Title: Yoga Instructor\nP...,Complete Mismatch,**Lori Townsend**\n\n**Professional Summary**\...
1997,700821,Yoga Instructor,Active Wellness,"Yoga, Pranayama, Handson direction, Safe and e...",Job Description\nJob Title: Yoga Instructor\nP...,Underwhelming,**KATRINA LOPEZ**\n*Yoga Enthusiast & Fitness ...
1998,700821,Yoga Instructor,Active Wellness,"Yoga, Pranayama, Handson direction, Safe and e...",Job Description\nJob Title: Yoga Instructor\nP...,Good Fit,# Alexis Cruz\n\n## Professional Summary\n\nDe...


## Writing the data

In [15]:
# For each input record, one output record must be generated per fit category.
expected_rows = len(categories) * df.shape[0]
assert df_expanded.shape[0] == expected_rows, (
    f"Expected {expected_rows} rows ({len(categories)} per job), got {df_expanded.shape[0]}"
)

In [16]:
# Persist the job/resume pairs for this set. The output folder is created first so a
# long generation run is not lost to a missing directory.
output_path = Path(path) / "data" / "synthetic_data" / f"synthetic_jd_resume_set{set_number}.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
df_expanded.to_csv(output_path, index=False)