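# Generating synthetic resumes

This notebook prompts an LLM to write four synthetic resumes (Complete Mismatch, Underwhelming, Good Fit, Overqualified) for every sampled LinkedIn job description and saves the job/resume pairs as a CSV file.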

## Importing libraries

In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain_nvidia_ai_endpoints import ChatNVIDIA
import asyncio
import time
import nest_asyncio
import json
from faker import Faker
import random

import os
import pandas as pd

# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [2]:
# Project root directory. Set the APPLICATION_RANKING_SYSTEM_ROOT environment variable
# to run the notebook on another machine without editing this cell; when the variable
# is not set, the original location is used.
path = os.environ.get(
    "APPLICATION_RANKING_SYSTEM_ROOT",
    "/Users/saideepbunny/Projects/Application_Ranking_System",
)

## Reading the data

In [3]:
# The sampled job descriptions are split into 2 sets, and each set is processed in
# 2 iterations, so this notebook is run four times in total with the values below changed.
# The work is split up this way because of the NVIDIA API credit limitation.
set_number = 2
iteration_number = 2

In [4]:
# Load the sampled job descriptions for the selected set
df = pd.read_csv(
    filepath_or_buffer=f"{path}/data/sampled_linkedin_jd_skills/sampled_data_v2/sampled_jd_resume_set{set_number}.csv"
)
df

Unnamed: 0,job_link,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,job_summary,id,summary_len,skill_count
0,https://www.linkedin.com/jobs/view/research-sc...,2024-01-19 09:45:09.215838+00,t,t,f,Research Scientist,Massachusetts General Hospital,"Boston, MA",2024-01-12,Ipswich,United States,Chemist,Mid senior,Onsite,"Markov models, Monte Carlo simulation, Compart...",General Summary/Overview Statement\nThe Medica...,504467,6900,22
1,https://www.linkedin.com/jobs/view/research-sc...,2024-01-19 09:45:09.215838+00,t,t,f,Research Scientist,University of Utah,"Salt Lake City, UT",2024-01-12,Layton,United States,Biologist,Mid senior,Onsite,"Research projects, Scientific design, Research...",Details\nOpen Date\n05/25/2023\nRequisition Nu...,511568,6836,41
2,https://ca.linkedin.com/jobs/view/research-sci...,2024-01-19 09:45:09.215838+00,t,t,f,Research Scientist,Carbon Engineering,"Squamish, British Columbia, Canada",2024-01-12,Sarnia-Clearwater,Canada,Biologist,Mid senior,Onsite,"Chemistry, Material Science, Physics, Analytic...",Grow your career. Be the difference.\nLeading ...,292876,6623,28
3,https://www.linkedin.com/jobs/view/research-sc...,2024-01-21 05:37:01.337526+00,t,t,f,Research Scientist,Accenture Federal Services,"Annapolis Junction, MD",2024-01-14,Bowie,United States,Mechanical Research Engineer,Mid senior,Onsite,"AI, Machine Learning, Natural Language Process...","At Accenture Federal Services, nothing matters...",1246246,6288,23
4,https://www.linkedin.com/jobs/view/research-sc...,2024-01-19 09:45:09.215838+00,t,t,f,Research Scientist,Sylvatex (SVX),"Alameda, CA",2024-01-12,Concord,United States,Biologist,Mid senior,Onsite,"Chemistry, Material Science, Physics, Chemical...",Research Scientist Job Description\nSylvatex (...,295224,6156,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,https://www.linkedin.com/jobs/view/test-engine...,2024-01-19 20:05:43.91119+00,t,t,f,Test Engineer,Stoke Space,"Moses Lake, WA",2024-01-14,Ephrata,United States,Agricultural-Research Engineer,Mid senior,Onsite,"Fluid systems, Mechanical systems, Instrumenta...",A thriving economy in space is needed to make ...,14991,4365,15
246,https://www.linkedin.com/jobs/view/test-engine...,2024-01-19 09:45:09.215838+00,t,t,f,Test Engineer,Akkodis,"Tempe, AZ",2024-01-12,Canyon,United States,Agricultural-Research Engineer,Associate,Onsite,"Aerospace, Mechanical test environment, Mechan...",Akkodis is seeking a Test Engineer for a posit...,189702,3999,33
247,https://www.linkedin.com/jobs/view/test-engine...,2024-01-19 09:45:09.215838+00,t,t,f,Test Engineer,nFocus Solutions®,"Orlando, FL",2024-01-13,Avondale,United States,Computer Operator,Mid senior,Onsite,"Agile Software Development, System Testing, Te...",Role Summary\nWork within an Agile Software De...,16993,3581,28
248,https://www.linkedin.com/jobs/view/test-engine...,2024-01-19 09:45:09.215838+00,t,t,f,Test Engineer,Micross Components,"Los Alamitos, CA",2024-01-12,Pasadena,United States,Agricultural-Research Engineer,Associate,Onsite,"Semiconductor Testing, Automated Test Equipmen...","Job Summary:\nPerforms LAT testing, builds bur...",568816,3510,40


In [5]:
# Number of distinct job ids in the loaded set
df.loc[:, "id"].nunique()

250

## Selecting required columns

In [6]:
# Keep only the columns that feed the prompt; copy so later changes do not touch df
data_df = df.loc[:, ['id', 'job_title', 'company', 'job_skills', 'job_summary']].copy()
data_df

Unnamed: 0,id,job_title,company,job_skills,job_summary
0,504467,Research Scientist,Massachusetts General Hospital,"Markov models, Monte Carlo simulation, Compart...",General Summary/Overview Statement\nThe Medica...
1,511568,Research Scientist,University of Utah,"Research projects, Scientific design, Research...",Details\nOpen Date\n05/25/2023\nRequisition Nu...
2,292876,Research Scientist,Carbon Engineering,"Chemistry, Material Science, Physics, Analytic...",Grow your career. Be the difference.\nLeading ...
3,1246246,Research Scientist,Accenture Federal Services,"AI, Machine Learning, Natural Language Process...","At Accenture Federal Services, nothing matters..."
4,295224,Research Scientist,Sylvatex (SVX),"Chemistry, Material Science, Physics, Chemical...",Research Scientist Job Description\nSylvatex (...
...,...,...,...,...,...
245,14991,Test Engineer,Stoke Space,"Fluid systems, Mechanical systems, Instrumenta...",A thriving economy in space is needed to make ...
246,189702,Test Engineer,Akkodis,"Aerospace, Mechanical test environment, Mechan...",Akkodis is seeking a Test Engineer for a posit...
247,16993,Test Engineer,nFocus Solutions®,"Agile Software Development, System Testing, Te...",Role Summary\nWork within an Agile Software De...
248,568816,Test Engineer,Micross Components,"Semiconductor Testing, Automated Test Equipmen...","Job Summary:\nPerforms LAT testing, builds bur..."


## Defining prompt template

In [7]:
# Maps each fit category name to the requirement text that is substituted into the
# {category_requirements} placeholder of the resume prompt.
category_requirements = {
    # Professional resume with no meaningful alignment to the job
    "Complete Mismatch": """
The resume is highly professional but entirely unsuitable for the job. There is no meaningful alignment between the candidate's background and the job requirements. The resume must contain 500-700 words in total.

- **Career Path:**  
  - The candidate has spent their career in an industry that is entirely unrelated to the job.  
  - The candidate has worked with technologies completely unrelated to the job.
  - Create a logical but irrelevant career progression with 2-3 positions showing growth in the wrong field.

- **Skills:**  
  - Core Skills: Show expertise in a completely different domain with maximum 10% overlap with required job skills.
  - Technical Skills: Focus on technologies from an unrelated field or outdated/irrelevant technologies.
  - Additional Skills: Include transferable skills but avoid those specifically mentioned in the job description.

- **Work or Project Experience:**
  - Generate 3-4 bullet points for each work experience.
  - Generate 2-3 bullet points for each project experience.
  - Use actual metrics and quantifiable results but in completely unrelated domains.
  - Analyze the job description to understand any minimum work experience or skill experience required for the job. Make sure to miss all of those requirements.

- **Education & Certifications:**  
  - The candidate has degrees that do not contribute to the job requirements.
  - The candidate has certifications (if any) that do not contribute to the job requirement.
  - Education should be chronologically consistent with work experience.
""",

    # Related background that falls short of the job requirements
    "Underwhelming": """
The resume is highly professional but falls noticeably short of the job requirements. The candidate may show potential but lacks key qualifications or experience to perform effectively in the role. The resume must contain 500-600 words in total.

- **Career Path:**  
  - The candidate has experience in a somewhat related industry but lacks specific job-domain experience.
  - The candidate has worked with some related technologies but lacks experience with critical ones.
  - Show a career trajectory that suggests interest in transitioning to the job responsibilities, but insufficient specific experience.

- **Skills:**  
  - Core Skills: Include 2-3 highly relevant skills but miss several critical ones.
  - Technical Skills: Show 40-60% overlap with required job skills, with gaps in key areas.
  - Additional Skills: Include relevant soft skills that partially compensate for technical gaps.

- **Work or Project Experience:**
  - Generate 3-4 bullet points for each work experience.
  - Generate 2-3 bullet points for each project experience.
  - Focus on somewhat relevant but insufficient experiences.
  - Include 1-2 projects or responsibilities that demonstrate potential but not proven expertise.
  - Analyze the job description to understand any minimum work experience or skill experience required for the job. Make sure to miss some of those requirements.

- **Education & Certifications:**  
  - The candidate can satisfy the educational requirements.
  - The candidate might lack any required certifications.
  - Include 1-2 relevant online courses or training to suggest self-improvement efforts.
""",

    # Strong match that meets all key expectations
    "Good Fit": """
The resume is highly professional and is a strong match for the job and meets all key expectations. The candidate is well-qualified and aligns well with the role requirements. The resume must contain 500-600 words in total.

- **Career Path:**  
  - The candidate has experience in the same or highly related industry as the job requires.
  - The candidate has worked with most or all of the technologies mentioned in the job requirements.
  - Show a logical progression of roles with increasing responsibility in relevant domains.

- **Skills:**  
  - Core Skills: Demonstrate 90-100% match with critical job requirements.
  - Technical Skills: Show comprehensive coverage of technical requirements with appropriate proficiency levels.
  - Additional Skills: Include complementary skills that enhance value (e.g., relevant soft skills, domain knowledge).

- **Work or Project Experience:**
  - Generate 4-5 bullet points for each work experience.
  - Generate 3-4 bullet points for each project experience.
  - Include relevant experiences that directly apply to the job requirements.
  - For each role, include at least 2 achievements with specific metrics that demonstrate success.
  - Analyze the job description to understand any minimum work experience or skill experience required for the job. Make sure to match each of those requirements.
  
- **Education & Certifications:**  
  - The candidate's educational background is an exact match for the job requirements.
  - The candidate has relevant certifications that directly contribute to job performance.
  - Include any specialized training or continuing education relevant to the role.
""",

    # Background that significantly exceeds the job requirements
    "Overqualified": """
The resume is highly professional and significantly exceeds job requirements, indicating that the candidate may be too advanced for the role. The resume must contain at least 600-800 words in total.

- **Career Path:**  
  - The candidate has extensive experience in the same industry, exceeding the required experience by 40-100%.
  - The candidate has mastered all technologies mentioned in the job requirements plus additional advanced ones.
  - Include leadership roles and strategic responsibilities beyond the scope of the target position.

- **Skills:**  
  - Core Skills: Demonstrate expert-level mastery of all required skills plus additional advanced competencies.
  - Technical Skills: Show expertise across broader technology ecosystem relevant to the role.
  - Additional Skills: Include leadership, mentoring, and strategic planning abilities beyond the role requirements.

- **Work or Project Experience:**
  - Generate 5-6 bullet points for each work experience.
  - Generate 4-5 bullet points for each project experience.
  - Highlight advanced expertise, leadership, and strategic impact.
  - Include significant achievements with impressive metrics that exceed typical expectations for the role.
  - Document experience mentoring, managing, or leading initiatives beyond the scope of the target role.
  - Analyze the job description to understand any minimum work experience or skill experience required for the job. Make sure to exceed each of those requirements by at least 40%.

- **Education & Certifications:**  
  - The candidate holds degrees beyond what is required for the role (higher level or additional specialized degrees).
  - The candidate possesses advanced or numerous certifications beyond what the job requires.
  - Include publications, speaking engagements, or industry recognition where relevant.
  - Document advanced training or specialized education beyond job requirements.
"""
}

# Category names in generation order, taken from the mapping above so the two cannot drift apart
categories = list(category_requirements)
categories

['Complete Mismatch', 'Underwhelming', 'Good Fit', 'Overqualified']

In [8]:
# Prompt sent to the model for every resume. The placeholders filled in per request are:
# {company}, {name}, {job_role}, {jd}, {skills}, {fit_category}, {category_requirements}
# and {resume_format}.
# NOTE(review): the "# Chronological, Functional, or Hybrid" text on the Resume Format line
# sits inside the string, so it is part of the prompt text rather than a Python comment.
resume_template = """
You are an expert resume writer with specialized knowledge in talent acquisition and hiring practices at {company}. You are tasked to create a highly professional resume that can be used by Human Resources as a reference to categorize applicants as **{fit_category}**. Your task is to generate a tailored, content-rich resume based on the provided inputs while strictly adhering to the specified fit category constraints. Below are the details of the job that the applicants will be applying to:

## **Job Details**:
Assume {name} is applying for the company {company}. Below are the details for the job that the candidate will be applying to:
- **Role:** {job_role}
- **Job Description:** {jd}
- **Required Skills:** {skills}

## **Fit Category:** {fit_category}
## **Category Requirements:**  
{category_requirements}

## **Resume Format:** {resume_format} # Chronological, Functional, or Hybrid

## **Instructions**:
1. **Resume Objective**  
   - Generate a highly professional resume that precisely aligns with the specified fit category ({fit_category}).
   - Ensure the resume maintains industry standards and meets professional expectations.
   - Structure the resume to contain clearly defined sections.

2. **Candidate Profile**
   - **Contact Information**: Generate realistic but fictional phone, email, LinkedIn URL.
   - **Professional Summary**: 2-4 sentences highlighting career focus, expertise level, and key strengths aligned with the fit category.

3. **Mandatory Sections**  
   - **Education**: University name, degree title, major, graduation year, GPA if applicable. For higher education levels, ensure chronological consistency with work experience.
   - **Skills**: Comma separated set of skills. Skills must contain 
       - 5-10 primary skills with highest proficiency
       - 10-20 technical competencies with details on proficiency level
       - 5-10 complementary abilities, particularly soft skills

4. **Optional Sections**  
   - **Work Experience**: 
       - Ignore this section if the job is expecting entry level candidates who are just out of college.
       - Generate company names, roles, employment type, duration showing logical career progression.
       - Ensure role seniority aligns with experience level and fit category.
       - Company sectors should be consistent with career trajectory (avoid random industry jumps unless specified).
       - Bullet points must follow APR structure (Action-Project-Result) with appropriate metrics.
       - For "Overqualified" and "Good Fit" categories, show progression in responsibilities.
       
   - **Projects**: 
       - Compensate work experience with Project work if the candidate has no prior work experience.
       - Include relevant project names (internal, academic, or personal).
       - Specify technology stack relevant to the time period of the project.
       - For technical roles, include GitHub/portfolio links when appropriate.
       - Ensure project complexity scales with fit category.

   - **Certifications**: Include industry-relevant certifications with appropriate dates.
   - **Publications/Research**: For academic or research-intensive roles where applicable.
   - **Professional Associations**: For industry-specific positions.
   - **Awards/Recognition**: Scale according to fit category.

5. **Bullet Point Constraints**
   - A typical resume bullet point format starts with a strong action verb, describes the specific task or project you undertook, and then highlights the quantifiable result or impact you achieved.
   - Bullet points follows the "Action + Project/Problem + Result" (APR) structure, keeping each bullet point concise and focused on accomplishments.
        - *Action Verb*: Begin with a powerful action verb that clearly describes what you did (e.g., "developed," "managed," "implemented," "analyzed"). 
        - *Specific Details*: Briefly explain the project, task, or responsibility you were involved in. 
        - *Quantifiable Result*: Include numbers, percentages, or other metrics to demonstrate the impact of your work whenever possible. 

6. **Formatting and Quantification Guidelines**  
   - Use **clear section headings**.
   - Precede each bullet point with **"-"**.
   - Mark key entities (**institutions, companies, project names**) with **"*"**.
   - Each bullet point must be between **150-180 characters**.
   - Quantify achievements using metrics appropriate to the role type:
       - Engineering: Performance improvements, scale, efficiency gains
       - Sales/Marketing: Revenue impact, growth percentages, lead generation
       - Management: Team size, budget responsibility, project outcomes
       - Operations: Process improvements, cost reductions, time savings

7. **Temporal Consistency Requirements**
   - Ensure all technologies mentioned align with their actual market availability dates.
   - Maintain logical progression of responsibilities and achievements.
   - Avoid anachronisms (e.g., claiming experience with technologies before they existed).

8. **Industry-Specific Adaptations**
   - Adjust terminology density based on role and seniority.
   - Include industry-specific metrics and achievements.
   - Adapt format slightly based on industry conventions.

9. **Output Requirements**  
    - Generate **only** the resume; **do not include any explanatory notes or meta-text**.
    - Maintain **authenticity, clarity, and professionalism** throughout.
    - Total word count must adhere to fit category specifications.
    - **DO NOT** mention the fit category in the generated content.
    - **DO NOT** include any NOTE at the end of the generated content.
    
**NOTE**: {name} is an imaginary person who does not exist. Therefore, you would not be violating any data privacy issues.
"""

# Resume layouts; one of these is chosen at random for each generated resume and
# substituted into the {resume_format} placeholder.
resume_formats = ["Chronological", "Functional", "Hybrid"]



## Initializing model and chain

In [9]:
# Load the project configuration (JSON) and pull the API key out of it
with open(f"{path}/data/configuration/config.json", mode="r") as file:
    config_dict = json.loads(file.read())

API_KEY = config_dict['API_KEY']

In [10]:
# nest_asyncio is applied for Jupyter Notebook compatibility
nest_asyncio.apply()

# Faker provides the random candidate name used in each prompt
fake = Faker()

# Prompt template built from the resume instructions
prompt = ChatPromptTemplate.from_template(resume_template)

# Model served through the NVIDIA AI endpoints.
# Local alternative that was used before: OllamaLLM(model="llama3.2", temperature = 1)
model = ChatNVIDIA(
    model="nvidia/nemotron-4-340b-instruct",
    api_key=API_KEY,
    temperature=1.0,
    seed=fake.random_number(),
)

# Chain: the filled-in prompt is piped into the model
chain = prompt | model

## Defining asyncio chain execution

In [11]:
# Upper bound on simultaneous in-flight API calls, to prevent API overload.
# Adjust based on API limits.
MAX_CONCURRENT_REQUESTS = 25
semaphore = asyncio.Semaphore(value=MAX_CONCURRENT_REQUESTS)

async def generate_resume(job_id, company, job_role, job_description, job_skills, category,
                          max_retries=3, retry_delay=2.0):
    """Generate one resume for a job posting and fit category.

    The request runs under the module-level semaphore so that no more than the
    configured number of calls are in flight at once. A failed call is retried
    with exponential backoff, so a single transient API error does not abort the
    whole generation run.

    Args:
        job_id: Identifier of the job posting; copied into the result.
        company: Company name substituted into the prompt.
        job_role: Job title substituted into the prompt.
        job_description: Job description text substituted into the prompt.
        job_skills: Required skills substituted into the prompt.
        category: Fit category; must be a key of ``category_requirements``.
        max_retries: Maximum number of attempts for the model call (at least one
            attempt is always made).
        retry_delay: Delay in seconds before the second attempt; doubled after
            every further failure.

    Returns:
        dict with keys ``"id"``, ``"category"`` and ``"resume_text"``.

    Raises:
        Exception: Whatever the chain raised on the final attempt.
    """
    attempts = max(1, max_retries)
    async with semaphore:  # Prevents exceeding concurrent request limits
        data = {
            "name": fake.name(),
            "company": company,
            "job_role": job_role,
            "jd": job_description,
            "skills": job_skills,
            "fit_category": category,
            "category_requirements": category_requirements[category],
            "resume_format": random.choice(resume_formats),
        }
        for attempt in range(1, attempts + 1):
            try:
                response = await chain.ainvoke(data)
                break
            except Exception:
                if attempt == attempts:
                    raise
                # The semaphore slot is kept while waiting, so a struggling API
                # also sees fewer concurrent requests.
                await asyncio.sleep(retry_delay * 2 ** (attempt - 1))

        return {"id": job_id, "category": category, "resume_text": response.content}


async def generate_resumes_for_id(job_id, company, job_role, job_description, job_skills):
    """Create one resume per fit category for a single job posting, concurrently.

    Returns the list produced by ``asyncio.gather``: one ``generate_resume``
    result for each entry of ``categories``, in that order.
    """
    pending = []
    for fit_category in categories:
        pending.append(
            generate_resume(job_id, company, job_role, job_description, job_skills, fit_category)
        )
    # Run all category requests for this job in parallel
    results = await asyncio.gather(*pending)
    return results

async def batch_generate_resumes(df, batch_size=50):
    """Generate resumes for every row of ``df``, working through it in chunks.

    Each chunk of ``batch_size`` rows is submitted concurrently; the next chunk
    starts only after the current one has finished. The rows must provide the
    "id", "company", "job_title", "job_summary" and "job_skills" columns.

    Returns a flat list containing every resume record, in row order.
    """
    total_rows = len(df)
    collected = []
    for offset in range(0, total_rows, batch_size):
        chunk = df.iloc[offset:offset + batch_size]

        pending = []
        for _, record in chunk.iterrows():
            pending.append(
                generate_resumes_for_id(
                    record["id"],
                    record["company"],
                    record["job_title"],
                    record["job_summary"],
                    record["job_skills"],
                )
            )

        # Wait for the whole chunk, then flatten the per-job lists into one list
        per_job_results = await asyncio.gather(*pending)
        for job_resumes in per_job_results:
            collected.extend(job_resumes)

        print(f"- Processed {min(offset + batch_size, total_rows)}/{total_rows} records...")

    return collected


## Generating data

In [12]:
# ------------------ Run Optimized Async Processing ------------------ #
# Time the whole asynchronous generation run
start_time = time.time()

# Drive the batch coroutine to completion on the current event loop
# (batch size can be adjusted as needed)
loop = asyncio.get_event_loop()
generation_job = batch_generate_resumes(data_df, batch_size=50)
resumes = loop.run_until_complete(generation_job)

end_time = time.time()
print(f"\n⚡ Async Execution Time: {end_time - start_time:.2f} seconds")
# Timing noted previously for reference:
# ⚡ Async Execution Time: 7998.52 seconds

- Processed 50/250 records...
- Processed 100/250 records...
- Processed 150/250 records...
- Processed 200/250 records...
- Processed 250/250 records...

⚡ Async Execution Time: 7460.01 seconds


In [13]:
# Convert results to DataFrame and append to original dataset
resume_df = pd.DataFrame(resumes)
resume_df

Unnamed: 0,id,category,resume_text
0,504467,Complete Mismatch,**Franklin Sanchez DDS**\n\n*Phone:* 555-555-5...
1,504467,Underwhelming,"**KRYSTAL JACKSON**\n*Boston, MA* | 555-555-55..."
2,504467,Good Fit,"**WILLIAM BROWN**\n*Boston, MA | (555) 555-555..."
3,504467,Overqualified,"# Nicholas Bush\n\n*Boston, MA* | (555) 555-12..."
4,511568,Complete Mismatch,"# Nicholas Patterson\n\n*Salt Lake City, UT* *..."
...,...,...,...
995,568816,Overqualified,# Darren Roberts\n\n*Email*: darren.roberts@em...
996,452512,Complete Mismatch,**Lisa Wright**\n*+1 (111) 111-1111* *|* *lisa...
997,452512,Underwhelming,"# Daniel Meza\n\n*Greensboro, NC* *·* *(555) 5..."
998,452512,Good Fit,"# William Torres\n\n*Greensboro, NC* *|* *will..."


In [14]:
# Attach the job posting columns to every generated resume (all resume rows are kept)
df_expanded = pd.merge(data_df, resume_df, how="right", on="id")
df_expanded

Unnamed: 0,id,job_title,company,job_skills,job_summary,category,resume_text
0,504467,Research Scientist,Massachusetts General Hospital,"Markov models, Monte Carlo simulation, Compart...",General Summary/Overview Statement\nThe Medica...,Complete Mismatch,**Franklin Sanchez DDS**\n\n*Phone:* 555-555-5...
1,504467,Research Scientist,Massachusetts General Hospital,"Markov models, Monte Carlo simulation, Compart...",General Summary/Overview Statement\nThe Medica...,Underwhelming,"**KRYSTAL JACKSON**\n*Boston, MA* | 555-555-55..."
2,504467,Research Scientist,Massachusetts General Hospital,"Markov models, Monte Carlo simulation, Compart...",General Summary/Overview Statement\nThe Medica...,Good Fit,"**WILLIAM BROWN**\n*Boston, MA | (555) 555-555..."
3,504467,Research Scientist,Massachusetts General Hospital,"Markov models, Monte Carlo simulation, Compart...",General Summary/Overview Statement\nThe Medica...,Overqualified,"# Nicholas Bush\n\n*Boston, MA* | (555) 555-12..."
4,511568,Research Scientist,University of Utah,"Research projects, Scientific design, Research...",Details\nOpen Date\n05/25/2023\nRequisition Nu...,Complete Mismatch,"# Nicholas Patterson\n\n*Salt Lake City, UT* *..."
...,...,...,...,...,...,...,...
995,568816,Test Engineer,Micross Components,"Semiconductor Testing, Automated Test Equipmen...","Job Summary:\nPerforms LAT testing, builds bur...",Overqualified,# Darren Roberts\n\n*Email*: darren.roberts@em...
996,452512,Test Engineer,SEGULA Technologies,"Data acquisition, Data processing, Mechanical ...",Company Description\nMUST be authorized to wor...,Complete Mismatch,**Lisa Wright**\n*+1 (111) 111-1111* *|* *lisa...
997,452512,Test Engineer,SEGULA Technologies,"Data acquisition, Data processing, Mechanical ...",Company Description\nMUST be authorized to wor...,Underwhelming,"# Daniel Meza\n\n*Greensboro, NC* *·* *(555) 5..."
998,452512,Test Engineer,SEGULA Technologies,"Data acquisition, Data processing, Mechanical ...",Company Description\nMUST be authorized to wor...,Good Fit,"# William Torres\n\n*Greensboro, NC* *|* *will..."


## Writing the data

In [15]:
# For each input record, one output record must be generated for every fit category.
# The expected count is derived from `categories` instead of a hard-coded 4, and from
# `data_df`, the frame that was actually passed to the generator.
expected_rows = len(categories) * len(data_df)
assert len(df_expanded) == expected_rows, (
    f"Expected {expected_rows} resumes ({len(categories)} per job), got {len(df_expanded)}"
)

In [16]:
# Create the destination folder if it is missing, so a long generation run is not
# lost at write time because of a missing directory.
output_dir = f"{path}/data/synthetic_data/synthetic_data_v2"
os.makedirs(output_dir, exist_ok=True)
df_expanded.to_csv(f"{output_dir}/synthetic_jd_resume_set{set_number}_{iteration_number}.csv", index=False)