### Embedding Vectors

In [36]:
import ast
from langchain.sql_database import SQLDatabase

def get_resumes_from_db(db_connection_str, query, info, field_name):
    db = SQLDatabase.from_uri(db_connection_str)
    results = ast.literal_eval(db.run(query, fetch='all'))
    
    for result in results:
        deets_list = [{result[1].split('-->')[0]:result[1].split('-->')[1]}]
        for col in result[2:]:
            if col is not None:
                deets_list.append({col.split('-->')[0]:col.split('-->')[1]})
        
        if result[0] not in info.keys():
            info[result[0]] = {field_name:deets_list}
        else:
            info[result[0]][field_name] = deets_list
    
    return info

def get_resume_info():
    info = dict()
    db_connection_str = "mysql://root:@127.0.0.1/resume"
    
    # Personal Info
    personal_query = '''
    SELECT 
        id,
        CONCAT('name-->', GROUP_CONCAT(name ORDER BY id)) AS name,
        CONCAT('email-->', GROUP_CONCAT(email ORDER BY id)) AS email,
        CONCAT('phone_number-->', GROUP_CONCAT(phone_number ORDER BY id)) AS phone_number,
        CONCAT('address-->', GROUP_CONCAT(address ORDER BY id)) AS address,
        CONCAT('linkedin_url-->', GROUP_CONCAT(linkedin_url ORDER BY id)) AS linkedin_url
    FROM 
        personal_information
    GROUP BY 
        id;
    '''
    info = get_resumes_from_db(db_connection_str, personal_query, info, "Personal Information")
    
    # Education
    education_query = '''
    SELECT 
        personal_information_id,
        CONCAT('degree_courses-->', GROUP_CONCAT(degree_course ORDER BY id)) AS degree_courses,
        CONCAT('fields_of_study-->', GROUP_CONCAT(field_of_study ORDER BY id)) AS fields_of_study,
        CONCAT('institutes-->', GROUP_CONCAT(institute ORDER BY id)) AS institutes,
        CONCAT('marks_percentages_gpas-->', GROUP_CONCAT(marks_percentage_gpa ORDER BY id)) AS marks_percentages_gpas
    FROM 
        education_details
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, education_query, info, "Education Details")
    
    # Certification
    certification_query = '''
    SELECT 
        personal_information_id,
        CONCAT('certification_title-->', GROUP_CONCAT(certification_title ORDER BY id)) AS certification_title,
        CONCAT('date_of_issue-->', GROUP_CONCAT(date_of_issue ORDER BY id)) AS date_of_issue,
        CONCAT('issuing_organization-->', GROUP_CONCAT(issuing_organization ORDER BY id)) AS issuing_organization
    FROM 
        certification_details
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, certification_query, info, "Certifications")

    # Achievements
    achi_query = '''
    SELECT 
        personal_information_id,
        CONCAT('achievement_description-->', GROUP_CONCAT(achievement_description ORDER BY id)) AS achievement_description
    FROM 
        achievements
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, achi_query, info, "Achievements")
    
    # Languages
    language_query = '''
    SELECT 
        personal_information_id,
        CONCAT('language-->', GROUP_CONCAT(language ORDER BY id)) AS language,
        CONCAT('proficiency_level-->', GROUP_CONCAT(proficiency_level ORDER BY id)) AS proficiency_level
    FROM 
        language_competencies
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, language_query, info, "Languages")
    
    # Projects
    project_query = '''
    SELECT 
        personal_information_id,
        CONCAT('project_name-->', GROUP_CONCAT(project_name ORDER BY id)) AS project_name,
        CONCAT('description-->', GROUP_CONCAT(description ORDER BY id)) AS description
    FROM 
        project_details
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, project_query, info, "Projects")
    
    # Skills
    skill_query = '''
    SELECT 
        personal_information_id,
        CONCAT('skill-->', GROUP_CONCAT(skill ORDER BY id)) AS skill
    FROM 
        skills
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, skill_query, info, "Skills")
    
    # Work Experience
    we_query = '''
    SELECT 
        personal_information_id,
        CONCAT('job_title-->', GROUP_CONCAT(job_title ORDER BY id)) AS job_title,
        CONCAT('company_name-->', GROUP_CONCAT(company_name ORDER BY id)) AS company_name,
        CONCAT('description-->', GROUP_CONCAT(description ORDER BY id)) AS description
    FROM 
        work_experience
    GROUP BY 
        personal_information_id;
    '''
    info = get_resumes_from_db(db_connection_str, we_query, info, "Work Experience")
    
    return info

resume_info = get_resume_info()
resume_info

{1: {'Personal Information': [{'name': 'Viges D'},
   {'email': 'dhananjeyanvigesh@gmail.com'},
   {'phone_number': '8778403686'},
   {'address': 'Marathalli, Kundalahalli Gate, Bengalore, India'},
   {'linkedin_url': 'https://linkedin.com/in/vigesh-d-329715272'}],
  'Education Details': [{'degree_courses': "BCA - Bachelor's in Computer Application,HIGHER SECONDARY,SSLC"},
   {'fields_of_study': 'None,None,None'},
   {'institutes': 'Islamiah College (Autonomous) - ThiruvalluvarUniversity,Bethel Matric Hr Sec School,Bethel Matric Hr Sec School'},
   {'marks_percentages_gpas': '7.5 CGPA,50%,76.5%'}],
  'Certifications': [{'certification_title': 'HTML5 and CSS3 basic to advance course'},
   {'issuing_organization': 'Udemy'}],
  'Languages': [{'language': 'English,Tamil'},
   {'proficiency_level': 'Professional Working Proﬁciency,Native or Bilingual Proﬁciency'}],
  'Projects': [{'project_name': 'Static WebPage,Animated WebPage,Landing and Animated Web Page'},
   {'description': 'Created a

### Sample Dataset

In [138]:
import re
import string
import pandas as pd
import numpy as np

df = pd.read_csv("UpdatedResumeDataSet.csv")
df = df.drop_duplicates().reset_index(drop=True)

invalid_char = ['â¢', 'âª', 'â', 'ï', 'ï', 'ï·',
                'Â', 'Ã¼', 'Ã¼Â', 'Ã±', 'ÃÂ', '·', '*']

# Taking off invalid chars from resume
def clean_resume(text):
    text = re.sub('[^\x00-\x7f]', ' ', text)
    text = re.sub('\r', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(('[%s]' % re.escape(string.punctuation)), ' ', text)
    return text

df['Processed Resume'] = df['Resume'].apply(lambda x: clean_resume(x))
df.drop('Resume', axis=1, inplace=True)

categories, resume_info = [], []

for i in df.iterrows():
    categories.append(i[1]['Category'])
    resume_info.append(i[1]['Processed Resume'])
    
len(categories), len(resume_info)

(166, 166)

In [139]:
df

Unnamed: 0,Category,Processed Resume
0,Data Science,Skills Programming Languages Python pandas...
1,Data Science,Education Details May 2013 to May 2017 B E UIT...
2,Data Science,Areas of Interest Deep Learning Control Syste...
3,Data Science,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,Data Science,Education Details MCA YMCAUST Faridabad Hary...
...,...,...
161,Testing,Computer Skills Proficient in MS office Word...
162,Testing,Willingness to accept the challenges Positiv...
163,Testing,PERSONAL SKILLS Quick learner Eagerness to le...
164,Testing,COMPUTER SKILLS SOFTWARE KNOWLEDGE MS Power ...


In [134]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embeddings(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

def compare_documents(resume, job_embedding):
    resume_embedding = get_embeddings(resume, tokenizer, model)
    
    similarity = cosine_similarity(resume_embedding.numpy(), job_embedding.numpy())
    return similarity[0][0]

job_description_text = open(r"S:\resume_parsing\job_descriptions\Prof.-CS-Sitare-University.txt", encoding='utf-8').read()
job_embedding = get_embeddings(job_description_text, tokenizer, model)

similarity_scores = dict()
for idx, resume in enumerate(resume_info):
    similarity_scores[idx] = compare_documents(resume, job_embedding)
    print(f"Resume Index: {idx}\nCategory: {categories[idx]}\nSemantic similarity: {similarity_scores[idx]:.4f}\n\n")

Resume Index: 0
Category: Data Science
Semantic similarity: 0.9151


Resume Index: 1
Category: Data Science
Semantic similarity: 0.9202


Resume Index: 2
Category: Data Science
Semantic similarity: 0.9322


Resume Index: 3
Category: Data Science
Semantic similarity: 0.9112


Resume Index: 4
Category: Data Science
Semantic similarity: 0.8708


Resume Index: 5
Category: Data Science
Semantic similarity: 0.9165


Resume Index: 6
Category: Data Science
Semantic similarity: 0.9210


Resume Index: 7
Category: Data Science
Semantic similarity: 0.9352


Resume Index: 8
Category: Data Science
Semantic similarity: 0.9413


Resume Index: 9
Category: Data Science
Semantic similarity: 0.9409


Resume Index: 10
Category: HR
Semantic similarity: 0.8973


Resume Index: 11
Category: HR
Semantic similarity: 0.9278


Resume Index: 12
Category: HR
Semantic similarity: 0.8886


Resume Index: 13
Category: HR
Semantic similarity: 0.8999


Resume Index: 14
Category: HR
Semantic similarity: 0.8951


Resume Ind

Resume Index: 116
Category: Network Security Engineer
Semantic similarity: 0.9181


Resume Index: 117
Category: Network Security Engineer
Semantic similarity: 0.9028


Resume Index: 118
Category: Network Security Engineer
Semantic similarity: 0.9518


Resume Index: 119
Category: Network Security Engineer
Semantic similarity: 0.9068


Resume Index: 120
Category: Network Security Engineer
Semantic similarity: 0.8946


Resume Index: 121
Category: PMO
Semantic similarity: 0.9421


Resume Index: 122
Category: PMO
Semantic similarity: 0.9347


Resume Index: 123
Category: PMO
Semantic similarity: 0.9303


Resume Index: 124
Category: Database
Semantic similarity: 0.9186


Resume Index: 125
Category: Database
Semantic similarity: 0.9098


Resume Index: 126
Category: Database
Semantic similarity: 0.9152


Resume Index: 127
Category: Database
Semantic similarity: 0.8997


Resume Index: 128
Category: Database
Semantic similarity: 0.7772


Resume Index: 129
Category: Database
Semantic similarity: 0

### Using Semantic Search

In [176]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("multi-qa-mpnet-base-cos-v1")

job_description_text = open(r"S:\resume_parsing\job_descriptions\Prof.-CS-Sitare-University.txt", encoding='utf-8').read()

query_embedding = model.encode("Given the job description, find the best resume for the job. Job description: " + job_description_text)
passage_embeddings = model.encode([
    '''
    {1: {'Personal Information': [{'name': 'Viges D'},
   {'email': 'dhananjeyanvigesh@gmail.com'},
   {'phone_number': '8778403686'},
   {'address': 'Marathalli, Kundalahalli Gate, Bengalore, India'},
   {'linkedin_url': 'https://linkedin.com/in/vigesh-d-329715272'}],
  'Education Details': [{'degree_courses': "BCA - Bachelor's in Computer Application,HIGHER SECONDARY,SSLC"},
   {'fields_of_study': 'None,None,None'},
   {'institutes': 'Islamiah College (Autonomous) - ThiruvalluvarUniversity,Bethel Matric Hr Sec School,Bethel Matric Hr Sec School'},
   {'marks_percentages_gpas': '7.5 CGPA,50%,76.5%'}],
  'Certifications': [{'certification_title': 'HTML5 and CSS3 basic to advance course'},
   {'issuing_organization': 'Udemy'}],
  'Languages': [{'language': 'English,Tamil'},
   {'proficiency_level': 'Professional Working Proﬁciency,Native or Bilingual Proﬁciency'}],
  'Projects': [{'project_name': 'Static WebPage,Animated WebPage,Landing and Animated Web Page'},
   {'description': 'Created an static webpage using HTML and CSS. I have successfully built\r\nresponsive and visually appealing webpages. With a focus on clean code\r\nand attention to detail, I strive to deliver user-friendly experiences.,Created a user-friendly animated webpage using HTML and CSS. With...'}],
  'Skills': [{'skill': 'HTML5,CSS3,JavaScript,BootStrap,ReactJS,Redux,Context API,GIT,GITHUB,VS Code,StackBlitz,Netlify,Animation Librarys'}]}}''',
    '''{2: {'Personal Information': [{'name': 'Swastik Banerjee'},
   {'email': 'swastikbanerjee2001@gmail.com'},
   {'phone_number': '+91-9330412996'},
   {'address': 'None'},
   {'linkedin_url': 'https://www.linkedin.com/in/swastikbanerjee'}],
  'Education Details': [{'degree_courses': 'Master of Science - Artificial Intelligence & Machine Learning,Bachelor of Science - Computer Science,Class 12 - WBCHSE,Class 10 - WBBSE'},
   {'fields_of_study': 'None,None,None,None'},
   {'institutes': 'Christ (Deemed to be University), Bengaluru, India,St. Xavier’s College (Autonomous), Kolkata, India,Patha Bhavan High School, Kolkata, India,Patha Bhavan High School, Kolkata, India'},
   {'marks_percentages_gpas': '3.97 / 4 (GPA),82.4%,94.4%,91%'}],
  'Achievements': [{'achievement_description': 'IEEE RAICS 2024 Presenter, Novel efficient methodology for Image Classification was presented by my team in this flagship conference on 17th of May at MACE, Kerala.,Intel oneAPI Hackathon 2024 Winner, Secured victory among 190 competitors, engineering a groundbreaking...'}],
  'Languages': [{'language': 'English,Bengali,Hindi'},
   {'proficiency_level': 'Professional proficiency,Native proficiency,Working proficiency'}],
  'Projects': [{'project_name': 'PREACH Application – Multimodal Inputs (Video/Audio/Text) to PowerPoint Presentations,mysticML Library Development – Python Library for Automated Data Preprocessing,Traffic Flow Optimization Solution – Dynamic Approach to Optimizing Traffic at Crossroads,Early Disease Detection &...'},
   {'description': '- Utilized LLMs, NLP, and CV techniques to convert video/audio/text inputs into PowerPoint presentations, offering a 30% more interactive and engaging presentation experience.\r\n\r\n- Won 1st place and 350$ prize money at Intel oneAPI Hackathon 2024.,- Converts raw data into...'}],
  'Skills': [{'skill': 'Python,Computer Vision,NLP,GenAI,LLM,RAG,Hugging Face,Transformers,AWS,SQL,Tableau,Flask,ETL,Java,C,C++,HTML,CSS,JavaScript,PHP,Exceptional Attention to Detail,Strong Leadership,Time Management,High Emotional Quotient,Effective Communicator,Networking Skills'}],
  'Work Experience': [{'job_title': 'ML Engineer - Intern,Cloud Engineer - Intern'},
   {'company_name': 'Hackveda,SkillVertex'},
   {'description': 'Remote, Intern,Remote, Intern'}]}}
    '''
])

similarity = model.similarity(query_embedding, passage_embeddings)

In [None]:
df['Similarity'] = similarity.numpy()[0]
df.loc[df.Similarity.sort_values()[-10:].index]

In [177]:
similarity

tensor([[0.2328, 0.2430]])

### Looking at the top resumes

In [135]:
from itertools import islice

def take(n, iterable):
    """Return the first n items of the iterable as a list."""
    return list(islice(iterable, n))

for i in take(50, dict(sorted(similarity_scores.items(), key=lambda item: item[1], reverse=True)).items()):
    print(i[0], categories[i[0]])

87 Automation Testing
118 Network Security Engineer
49 Sales
54 Health and fitness
32 Arts
85 SAP Developer
50 Health and fitness
91 Automation Testing
28 Advocate
121 PMO
99 Operations Manager
146 ETL Developer
8 Data Science
24 Advocate
64 Java Developer
9 Data Science
162 Testing
134 Database
73 Java Developer
36 Web Designing
55 Health and fitness
144 ETL Developer
104 Python Developer
160 Testing
155 Blockchain
20 Advocate
79 Business Analyst
83 SAP Developer
7 Data Science
94 Electrical Engineering
31 Arts
122 PMO
78 Business Analyst
56 Civil Engineer
92 Automation Testing
38 Web Designing
43 Mechanical Engineer
103 Python Developer
41 Mechanical Engineer
102 Operations Manager
2 Data Science
39 Web Designing
76 Business Analyst
137 Hadoop
66 Java Developer
96 Electrical Engineering
47 Sales
19 HR
75 Business Analyst
58 Civil Engineer


In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import transformers

# Parsing resume and job description
resume_text = open(r"S:\resume_parsing\Resume-Manager\Resume_Manager\uploads\resume.txt", encoding='utf-8').read()
job_description_text = open(r"S:\resume_parsing\job_descriptions\Prof.-CS-Sitare-University.txt", encoding='utf-8').read()

# BERT
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-cased')
model = transformers.AutoModel.from_pretrained('bert-base-cased')

def get_embeddings(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

resume_embedding = get_embeddings(resume_text)
job_embedding = get_embeddings(job_description_text)
bert_similarity_score = cosine_similarity(resume_embedding.detach().numpy(), job_embedding.detach().numpy())[0][0]

print(f"BERT Similarity Score: {bert_similarity_score}")

BERT Similarity Score: 0.9052064418792725


### LangChain Embeddings

In [8]:
import json

job_description = '''
{'education': ['M.Tech./M.S.',
  'Ph.D. in Computer Science or a closely related discipline'],
 'achievements': [],
 'certifications': [],
 'languages': [],
 'projects': [],
 'skills': ['Machine Learning', 'Systems', 'Security', 'HCI'],
 'work_experience': ['Over 2 years of teaching or industry experience in a relevant field']}
'''

job_description = json.loads(job_description)
job_description_text = job_description.get('requirements', '')
job_description_tokens = preprocess_text(job_description_text)

process_text(job_description_tokens)

JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 2 (char 2)

In [5]:
import json
import re
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Helper function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    return tokens

# Function to extract keywords using LangChain
def extract_keywords(text, model="text-embedding-ada-002"):
    embeddings = OpenAIEmbeddings(model=model)
    vector_store = FAISS.from_texts([text], embeddings)
    return vector_store.get_nearest_neighbors(text, k=10)

# Function to calculate keyword matching score
def keyword_matching(resume_tokens, job_description_tokens):
    common_keywords = set(resume_tokens) & set(job_description_tokens)
    return len(common_keywords) / len(set(job_description_tokens))

# Function to calculate experience alignment score
def experience_alignment(resume, job_description):
    resume_experience = resume.get('experience', [])
    job_exp_pattern = re.compile(r'(\d+)\s+years?\s+of\s+experience')
    job_exp_match = job_exp_pattern.search(job_description)
    
    if job_exp_match:
        required_experience = int(job_exp_match.group(1))
        relevant_experience = 0
        
        for exp in resume_experience:
            if exp.get('years', 0) >= required_experience:
                relevant_experience += 1
        
        return relevant_experience / len(resume_experience) if resume_experience else 0
    return 0

# Function to calculate skill matching score
def skill_matching(resume_skills, job_description):
    job_skills = re.findall(r'\b(skill|requirement|qualifications?):? (.*?)\b', job_description, re.IGNORECASE)
    job_skills = [skill[1].strip().lower() for skill in job_skills]
    common_skills = set(resume_skills) & set(job_skills)
    return len(common_skills) / len(job_skills) if job_skills else 0

# Function to score the resume
def score_resume(resume_json, job_description):
    resume = json.loads(resume_json)
    resume_text = resume.get('summary', '') + ' ' + ' '.join([exp.get('description', '') for exp in resume.get('experience', [])])
    resume_tokens = preprocess_text(resume_text)
    
    job_description = json.loads(job_description)
    job_description_text = job_description.get('requirements', '')
    job_description_tokens = preprocess_text(job_description_text)

    keyword_score = keyword_matching(resume_tokens, job_description_tokens)
    experience_score = experience_alignment(resume, job_description)
    skill_score = skill_matching(resume.get('skills', []), job_description)

    final_score = (0.4 * keyword_score) + (0.3 * experience_score) + (0.3 * skill_score)
    return final_score

resume_json = '''
{1: {'Personal Information': [{'name': 'Viges D'},
   {'email': 'dhananjeyanvigesh@gmail.com'},
   {'phone_number': '8778403686'},
   {'address': 'Marathalli, Kundalahalli Gate, Bengalore, India'},
   {'linkedin_url': 'https://linkedin.com/in/vigesh-d-329715272'}],
  'Education Details': [{'degree_courses': "BCA - Bachelor's in Computer Application,HIGHER SECONDARY,SSLC"},
   {'fields_of_study': 'None,None,None'},
   {'institutes': 'Islamiah College (Autonomous) - ThiruvalluvarUniversity,Bethel Matric Hr Sec School,Bethel Matric Hr Sec School'},
   {'marks_percentages_gpas': '7.5 CGPA,50%,76.5%'}],
  'Certifications': [{'certification_title': 'HTML5 and CSS3 basic to advance course'},
   {'issuing_organization': 'Udemy'}],
  'Languages': [{'language': 'English,Tamil'},
   {'proficiency_level': 'Professional Working Proﬁciency,Native or Bilingual Proﬁciency'}],
  'Projects': [{'project_name': 'Static WebPage,Animated WebPage,Landing and Animated Web Page'},
   {'description': 'Created an static webpage using HTML and CSS. I have successfully built\r\nresponsive and visually appealing webpages. With a focus on clean code\r\nand attention to detail, I strive to deliver user-friendly experiences.,Created a user-friendly animated webpage using HTML and CSS. With...'}],
  'Skills': [{'skill': 'HTML5,CSS3,JavaScript,BootStrap,ReactJS,Redux,Context API,GIT,GITHUB,VS Code,StackBlitz,Netlify,Animation Librarys'}]}}
'''

job_description = '''
{'requirements': ['M.Tech./M.S. or Ph.D. in Computer Science or a closely related discipline from a reputed institute or university.',
  'Expertise in Machine Learning, Systems, Security, HCI.'],
 'education': [],
 'achievements': [],
 'certifications': [],
 'languages': [],
 'projects': [],
 'skills': [],
 'work_experience': ['Over 2 years of teaching or industry experience in a relevant field.']}
'''