In [4]:
import pandas as pd
import re
import numpy as np
import ast
import asyncio
import os

In [5]:
# Load the raw resume dataset from the local data folder and preview the first rows.
resumes = pd.read_csv("./data/resume_data.csv")
resumes.head()

Unnamed: 0,address,career_objective,skills,educational_institution_name,degree_names,passing_years,educational_results,result_types,major_field_of_studies,professional_company_names,...,online_links,issue_dates,expiry_dates,﻿job_position_name,educationaL_requirements,experiencere_requirement,age_requirement,responsibilities.1,skills_required,matched_score
0,,Big data analytics working and database wareho...,"['Big Data', 'Hadoop', 'Hive', 'Python', 'Mapr...",['The Amity School of Engineering & Technology...,['B.Tech'],['2019'],['N/A'],[None],['Electronics'],['Coca-COla'],...,,,,Senior Software Engineer,B.Sc in Computer Science & Engineering from a ...,At least 1 year,,Technical Support\nTroubleshooting\nCollaborat...,,0.85
1,,Fresher looking to join as a data analyst and ...,"['Data Analysis', 'Data Analytics', 'Business ...","['Delhi University - Hansraj College', 'Delhi ...","['B.Sc (Maths)', 'M.Sc (Science) (Statistics)']","['2015', '2018']","['N/A', 'N/A']","['N/A', 'N/A']","['Mathematics', 'Statistics']",['BIB Consultancy'],...,,,,Machine Learning (ML) Engineer,M.Sc in Computer Science & Engineering or in a...,At least 5 year(s),,Machine Learning Leadership\nCross-Functional ...,,0.75
2,,,"['Software Development', 'Machine Learning', '...","['Birla Institute of Technology (BIT), Ranchi']",['B.Tech'],['2018'],['N/A'],['N/A'],['Electronics/Telecommunication'],['Axis Bank Limited'],...,,,,"Executive/ Senior Executive- Trade Marketing, ...",Master of Business Administration (MBA),At least 3 years,,"Trade Marketing Executive\nBrand Visibility, S...",Brand Promotion\nCampaign Management\nField Su...,0.416667
3,,To obtain a position in a fast-paced business ...,"['accounts payables', 'accounts receivables', ...","['Martinez Adult Education, Business Training ...",['Computer Applications Specialist Certificate...,['2008'],[None],[None],['Computer Applications'],"['Company Name ï¼ City , State', 'Company Name...",...,,,,Business Development Executive,Bachelor/Honors,1 to 3 years,Age 22 to 30 years,Apparel Sourcing\nQuality Garment Sourcing\nRe...,Fast typing skill\nIELTSInternet browsing & on...,0.76
4,,Professional accountant with an outstanding wo...,"['Analytical reasoning', 'Compliance testing k...",['Kent State University'],['Bachelor of Business Administration'],[None],['3.84'],[None],['Accounting'],"['Company Name', 'Company Name', 'Company Name...",...,[None],[None],"['February 15, 2021']",Senior iOS Engineer,Bachelor of Science (BSc) in Computer Science,At least 4 years,,iOS Lifecycle\nRequirement Analysis\nNative Fr...,iOS\niOS App Developer\niOS Application Develo...,0.65


### Clean job related fields and build experience column

In [6]:
# Swap NaN for None in the job-history columns so that later cells can
# test for missing values with a plain `is None` check.
resumes = resumes.replace(
    {
        column: {np.nan: None}
        for column in ("end_dates", "start_dates", "related_skils_in_job")
    }
)

Convert string representations of lists into actual Python lists

In [7]:
def convert_string_of_list_to_list(serialized_value: str):
    """Parse a stringified Python literal (typically a list) into the real object.

    Args:
        serialized_value: Text such as ``"['Nov 2019', 'Till Date']"``.
            ``None`` and float NaN (pandas' missing-value marker) are treated
            as "no data". A value that is already a ``list`` is returned
            unchanged, so re-running the conversion on an already-converted
            column does not fail.

    Returns:
        The evaluated literal, or ``[None]`` when the value is missing.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when the
            text is not a valid Python literal.
    """
    if serialized_value is None:
        return [None]
    # Already parsed (e.g. the conversion cell was executed a second time).
    if isinstance(serialized_value, list):
        return serialized_value
    # NaN is the only float that is not equal to itself.
    if isinstance(serialized_value, float) and serialized_value != serialized_value:
        return [None]
    return ast.literal_eval(serialized_value)

In [8]:
# Parse the stringified lists in each job-history column into Python lists.
for list_column in ("end_dates", "start_dates", "related_skils_in_job"):
    resumes[list_column] = resumes[list_column].apply(convert_string_of_list_to_list)

#### Get Max end date

In [9]:
def get_max_end_yr(dates):
    """Return the latest year mentioned across ``dates``, or 0 when none is found.

    Only the first four-digit run of each entry is considered; ``None``
    entries and entries without a four-digit run are ignored.
    """
    found_years = []
    for entry in dates:
        if entry is None:
            continue
        four_digit_runs = re.findall(r'[\d]{4}', entry)
        if four_digit_runs:
            found_years.append(int(four_digit_runs[0]))

    return max(found_years, default=0)

        
# Latest end year per resume (0 when no end date contains a year), then the
# overall maximum as a quick sanity check.
resumes.loc[:, "end_yr"] = resumes["end_dates"].apply(get_max_end_yr)
print(max(resumes["end_yr"]))

2023


In [10]:
# Spot-check rows where no end year was found (end_yr == 0) to see what the raw date values look like.
resumes.loc[resumes.end_yr == 0,["start_dates","end_dates"]].sample(5)

Unnamed: 0,start_dates,end_dates
7117,[May 2019],[N/A]
6004,[Jan 2021],[Till Date]
1691,[Feb 2020],[Ongoing]
2340,[Apr 2019],[Till Date]
1236,[None],[None]


#### Build experience column

Build an experience column that maps each skill to its years of experience. This will help in defining the experience across skills.

In [11]:
import itertools

_YEAR_PATTERN = re.compile(r'\d{4}')
_ONGOING_END_MARKERS = frozenset({'till date', 'current', 'present', 'ongoing', 'n/a'})


def _first_year(date_text):
    """Return the first four-digit run in ``date_text`` as an int, or None if absent."""
    if not isinstance(date_text, str):
        return None
    match = _YEAR_PATTERN.search(date_text)
    return int(match.group()) if match else None


def create_experience(row, ongoing_end_year=2025):
    """Sum the years of experience per skill across all jobs of one resume.

    Args:
        row: Object exposing ``start_dates``, ``end_dates`` and
            ``related_skils_in_job`` as parallel sequences (one entry per job),
            e.g. a DataFrame row.
        ongoing_end_year: Year substituted for end dates that mark a job as
            still running ('till date', 'current', 'present', 'ongoing',
            'n/a'; case-insensitive, surrounding whitespace ignored).

    Returns:
        dict mapping each skill to its total years (plain ``int`` values).
        A job contributes ``end year - start year``; when either year cannot
        be determined the job contributes 0. Jobs whose skills entry is not a
        list are skipped. Returns ``{}`` when no job lists any skill.
    """
    totals = {}
    for start_date, end_date, related_skills in zip(row.start_dates, row.end_dates, row.related_skils_in_job):
        if not isinstance(related_skills, list):
            continue

        start_year = _first_year(start_date)
        if isinstance(end_date, str) and end_date.strip().lower() in _ONGOING_END_MARKERS:
            end_year = ongoing_end_year
        else:
            end_year = _first_year(end_date)

        # Best effort: an unparseable start or end date counts as zero years.
        if start_year is None or end_year is None:
            years = 0
        else:
            years = end_year - start_year

        # De-duplicate so a skill repeated within one job is counted once.
        for skill in set(related_skills):
            totals[skill] = totals.get(skill, 0) + years

    return totals
        
    

In [12]:
# Smoke-test create_experience on the first few rows; `.loc[:5]` is label-based, so label 5 is included.
resumes.loc[:5][["start_dates","end_dates","related_skils_in_job"]].apply(lambda x: create_experience(x), axis = 1)

0                                      {'Big Data': 6}
1    {'Business Analysis': 6, 'Data Analysis': 6, '...
2    {'Unified Payment Interface': 7, 'Risk Predict...
3    {'accounts payables': 4.0, 'accounts receivabl...
4    {'sales and use tax audits': 9.0, 'financial r...
5    {'Juniper Net-screen': 7.0, 'Mac OS and IOS': ...
dtype: object

In [13]:
# Store the per-skill experience dict returned by create_experience for every resume.
resumes.loc[:,"experience_in_yrs"] = resumes.apply(lambda x: create_experience(x), axis = 1)

In [14]:
# Inspect one random row to sanity-check the derived experience against its source columns.
resumes.sample(1)[["start_dates","end_dates","related_skils_in_job","experience_in_yrs"]]

Unnamed: 0,start_dates,end_dates,related_skils_in_job,experience_in_yrs
7831,[Apr 2019],[till date],"[[Machine Learning, Natural Language Processing]]","{'Natural Language Processing': 6, 'Machine Le..."


### Extract values as per schema

In [15]:
from model import ResumeFields
from langchain.chat_models import init_chat_model
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import pickle

In [23]:
# Chat model handle: "gemini-2.0-flash-001" through the google_vertexai provider, with temperature 0.3.
llm = init_chat_model("gemini-2.0-flash-001", model_provider="google_vertexai", temperature = 0.3)

In [24]:
# System message for the extraction prompt: states the assistant's role, lists the
# input fields it will receive, and gives the extraction instructions.
system_prompt = """
You are a highly skilled AI assistant specializing in analyzing candidate resumes and extracting key fields. 
Your goal is to meticulously identify and categorize the essential skills, qualifications, experience, personality traits.

**Your Task:**

Given a job resume fields as input, extract the required fields based on schema into a structured format. Be comprehensive and specific, avoiding vague or generic terms.

**Input schema:**
1. career_objective: Career objective
2. skills: List of Skills candidate poses
3. educational_institution_name: List of Education institute name
4. degree_names: List of Degrees.  
5. major_field_of_studies: List of Major field of studies
6. responsibilities
7. experience_in_yrs: Experience in years in dictionary. Feel free to deduce this from career objective if not present

**Instructions:**

1.  **Read the resume fields carefully and thoroughly.** 
2.  **Focus on explicitly stated requirements.** 
3.  **Be specific and detailed.** Avoid generalizations.
4.  **Maintain accuracy.**  
5.  **Conciseness is important,** but don't sacrifice accuracy or completeness.**
"""

In [25]:
# Two-message chat prompt: the system instructions, then the resume supplied as `text`.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{text}"),
])

# Pipe the prompt into the model and request output structured as ResumeFields.
runnable = prompt | llm.with_structured_output(schema=ResumeFields)



Key '$defs' is not supported in schema, ignoring


In [None]:
import asyncio

required_columns = ["career_objective", "skills", "educational_institution_name",
                    "degree_names", "major_field_of_studies", "responsibilities",
                    "experience_in_yrs"]

# Create output directory for candidates
candidates_resumes_folder = "./outputs/candidates"
if not os.path.exists(candidates_resumes_folder):
    os.makedirs(candidates_resumes_folder)

req_per_min = 1000
time_per_req = 60 // req_per_min
async def extract_fields(index, row):
    try:
        text = row.to_dict()    
        fields = await runnable.ainvoke({"text": (
            f"{text}"
            "Extract all the relevant fields from the text given"
        )})
    
        with open(f'{candidates_resumes_folder}/{index}.pkl', 'wb') as f:
            pickle.dump(fields, f)
    except Exception as ex:
        print(index, ex)

bg_tasks = []
for index, row in resumes.loc[:][required_columns].iterrows():
    bg_tasks.append(asyncio.create_task(extract_fields(index,row)))
    await asyncio.sleep(time_per_req)

await asyncio.gather(*bg_tasks)
    

In [34]:
from rich import print as rprint
# Compare one resume's input fields (plus its raw dates) with the structured output saved for it.
index = 5600
rprint(resumes.loc[index][required_columns+["start_dates","end_dates"]].to_dict())
# NOTE: pickle.load can execute arbitrary code; only load files that were written
# to candidates_resumes_folder by this notebook, never pickles from an untrusted source.
with open(f'{candidates_resumes_folder}/{index}.pkl', 'rb') as f:
    tmp = pickle.load(f)
rprint(tmp)