## Step 1: Data Collection & Preprocessing

### Goal: 
Extract and preprocess resume and job description data.

In [1]:
#extract text from resumes
from pdfminer.high_level import extract_text
def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)
pdf_text = extract_text_from_pdf("C:/Users/rahul/OneDrive/Desktop/rahul-bastia_cvpdf.pdf")
# print(pdf_text)


In [2]:
from docx import Document
def extract_text_from_docx(docx_path):
    doc = Document(docx_path)
    return "\t".join([para.text for para in doc.paragraphs])
docx_text = extract_text_from_docx("C:/Users/rahul/OneDrive/Desktop/rahul-bastia_cv.docx")
# print(docx_text)

# Text clening and Lemmatization

In [3]:
import spacy
import re
from nltk.corpus import stopwords

# Load NLP model
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces

    # Tokenization and Lemmatization
    doc = nlp(text)
    cleaned_text = " ".join([token.lemma_ for token in doc if token.text not in stopwords.words("english")])

    return cleaned_text

cleaned_resume_text = preprocess_text(docx_text)
print(cleaned_resume_text)


rahul bastia leetcode rahulbastia email rahul bastia00 gmail com linkedin rahulbastia phone 91 6371480952 github rahulbastia00 experience hewlett packard enterprise jan 2025 software engineering virtual intern 25 remote write proposal restful web service manage list employee build web server application java spring boot accept respond http request well support upload json datum develop run set unit test assess java spring boot application performance walmart global tech jan 2025 advanced software engineering virtual intern 25 remote solved challenge technical project various walmart team develop novel java heap datum structure shipping department demonstrate strong problem solve algorithmic skill design uml class diagram er diagram datum processing database system showcasing proficiency software design principle project job board backend node js mongodb integration mongodb express js node js code build job portal secure jwt authentication restful api use node js express mongodb enable 

# Set Up MongoDB Connection

In [4]:
from pymongo import MongoClient

#connect to mongodb
client = MongoClient("mongodb://localhost:27017/")
db = client["resume_database"]
collection = db["resumes"]

# store processed resume
resume_data = {"name": "Candidate 1", "resume_text" : cleaned_resume_text}
collection.insert_one(resume_data)


InsertOneResult(ObjectId('679baac0651a6dc1a5e406ce'), acknowledged=True)

## Step 2: NLP-Based Resume Parsing

### Goal: 
Extract key information from resumes, such as skills, education, experience, and job titles, using Named Entity Recognition (NER) and embeddings.

In [14]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

# It's better to store your API key in an environment variable
load_dotenv()
groq_api_key = os.getenv("GROQ_API_KEY") # Store your API key in an environment variable for security

llm = ChatGroq(
    model_name="llama-3.3-70b-versatile",
    temperature=0,
    groq_api_key=groq_api_key
)

# Make sure pdf_text is defined and contains the resume text
pdf_text = extract_text_from_pdf("C:/Users/rahul/OneDrive/Desktop/rahul-bastia_cvpdf.pdf")
# Replace with the actual text extracted from the resume

# Invoke the model to extract skills, experience, job titles, and education
response = llm.invoke(f"""
Here is the given text extracted from a resume: {pdf_text} 
Please extract the following information:
- Skills
- Experience
- Job Titles
- Education
### Format the response as follows: ###
Skills: [skill1, skill2, ...]
Experience: [experience1, experience2, ...]
Job Titles: [job_title1, job_title2, ...]
Education: [degree1, degree2, ...]
### No Preambles ###
""")

# Initialize empty strings to store skills, experience, job titles, and education
skills_string = ""
experience_string = ""
job_titles_string = ""
education_string = ""

# Check if the response is valid
if response and hasattr(response, 'content'):
    # Assuming the response content is structured as requested
    response_content = response.content.strip()
    
    # Split the response into lines
    lines = response_content.split('\n')
    
    # Extract skills, experience, job titles, and education from the response
    for line in lines:
        if line.startswith("Skills:"):
            skills_string = line.replace("Skills:", "").strip()
        elif line.startswith("Experience:"):
            experience_string = line.replace("Experience:", "").strip()
        elif line.startswith("Job Titles:"):
            job_titles_string = line.replace("Job Titles:", "").strip()
        elif line.startswith("Education:"):
            education_string = line.replace("Education:", "").strip()
else:
    print("Error: Invalid response")

# Now you can access the extracted information as strings
print(f"Extracted Skills: {skills_string}")
print(f"Extracted Experience: {experience_string}")
print(f"Extracted Job Titles: {job_titles_string}")
print(f"Extracted Education: {education_string}")

Extracted Skills: Python, Java, C/C++, JavaScript, SQL, Node.js, React.js, MySQL, MongoDB, GitHub, Docker, Lang chain, Linux, Data Structure, Algorithms, Operating System, Computer Networks, Database Management System, Computer Architecture, Object-Oriented Programming, Machine Learning, DevOps, Communication, Team Work, Problem Solving, Time Management
Extracted Experience: Software Engineering Virtual Intern at Hewlett Packard Enterprise, Advanced Software Engineering Virtual Intern at Walmart Global Tech
Extracted Job Titles: Software Engineering Virtual Intern, Advanced Software Engineering Virtual Intern
Extracted Education: Bachelor of Technology at Gandhi Institute For Technology (GIFT)


In [7]:
print(job_titles_string)

Software Engineering Virtual Intern, Advanced Software Engineering Virtual Intern


# Improve Skill Extraction with Predefined Skill List
* We will match extracted text with a predefined skill database.

In [8]:
import json

# Load predefined skill set (example list)
predefined_skills_string = {"Python", "Machine Learning", "Deep Learning", "Java", "Docker", "Kubernetes", "React", "Node.js", "GitHub", "Algorithms"}


def extract_skills_string(skills_string):
    words = set(skills_string.split())
    matched_skills_string = predefined_skills_string.intersection(words)
    return list(matched_skills_string)

extracted_skills = extract_skills_string(skills_string)
print("Extracted skills_string:", extracted_skills)

Extracted skills_string: []


# Convert Resumes to Embeddings for Better Matching
* To compare resume skills with job descriptions, we use sentence-transformers. 

In [9]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_resume_embedding(text):
    return model.encode(text)

resume_embedding = get_resume_embedding(skills_string)
print("Resume embedding shape:", resume_embedding.shape)


Resume embedding shape: (384,)


## Store Parsed Resumes in MongoDB
** We will store structured resume information in MongoDB.

In [10]:
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017/")
db = client["resume_database"]
collection = db["parsed_resume"]

resume_data = {
    "name": "rahul",
    "education": education_string,
    "experience": experience_string,
    "skills": extracted_skills,
    "job_title": job_titles_string,
    "embedding": resume_embedding.tolist()
}
collection.insert_one(resume_data)
print("Parsed resume stored in MongoDB!")

Parsed resume stored in MongoDB!


## Step 3: Candidate Shortlisting Using Machine Learning

### Goal: 
Rank resumes based on job relevance using machine learning (ML). We will:

- ✅ Convert job descriptions and resumes into embeddings
- ✅ Compute similarity scores
- ✅ Rank candidates based on job fit

In [11]:
# %pip install scikit-learn sentence-transformers numpy pandas
# Convert Job Descriptions to Embeddings
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

def get_job_embedding(job_description):
    return model.encode(job_description)

job_description = "We are looking for a Software Engineer with expertise in Python, Machine Learning, and Deep Learning."
job_embedding = model.encode(job_description)

print("job embadding shape:", job_embedding.shape)


job embadding shape: (384,)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from pymongo import MongoClient

# Connecting to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["resume_database"]
collection = db["parsed_resume"]

# Retrieving Resumes
resumes = list(collection.find())

# Check if resumes are retrieved
if not resumes:
    print("No resumes found in the database.")
else:
    print(f"Found {len(resumes)} resumes.")

# Ensure job_embedding is defined
job_description = "We are looking for a Software Engineer with expertise in Python, Machine Learning, and Deep Learning."
job_embedding = model.encode(job_description)

# Computing Similarity Scores
ranked_candidates = []
for resume in resumes:
    if "embedding" not in resume or resume["embedding"] is None:
        print(f"Skipping {resume.get('name', 'Unknown')}: No embedding found.")
        continue

    resume_embedding = np.array(resume["embedding"]).reshape(1, -1)
    job_embedding = np.array(job_embedding).reshape(1, -1)

    # Fix typo in cosine_similarity function
    similarity_score = cosine_similarity(resume_embedding, job_embedding)[0][0]
    ranked_candidates.append((resume["name"], similarity_score))

# Sort candidates by highest similarity
ranked_candidates.sort(key=lambda x: x[1], reverse=True)

# Display ranked candidates
if not ranked_candidates:
    print("No candidates matched the job description.")
else:
    for rank, (name, score) in enumerate(ranked_candidates, 1):
        print(f"{rank}. {name} - Score: {score:.4f}")


Found 2 resumes.
1. rahul - Score: 0.3239
2. rahul - Score: 0.3239


In [13]:
#  Store Shortlisting Results in MongoDB
# Store ranking results in MongoDB
shortlist_collection = db["shortlisted_candidates"]

for rank, (name, score) in enumerate(ranked_candidates, 1):
    shortlist_collection.insert_one({"rank": rank, "name": name, "score": score})

print("Shortlisted candidates stored in MongoDB!")


Shortlisted candidates stored in MongoDB!


## Step 4: Automated Feedback Generation Using LLMs

### Goal: 
Generate personalized selection/rejection emails for candidates based on their ranking.

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("EMAIL_API")

In [3]:
#  Retrieve Shortlisted Candidates from MongoDB
# connect to mongodb
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017/")
db = client["resume_database"]
shortlist_collection = db["shortlisted_candidates"]

# Fetch ranked candidates
ranked_candidates = list(shortlist_collection.find())

if not ranked_candidates:
    print("No shortlisted candidates found.")
else:
    print(f"Found {len(ranked_candidates)} shortlisted candidates.")

Found 7 shortlisted candidates.


In [8]:
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv

load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

llm = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0,
    groq_api_key=GROQ_API_KEY
)

def generate_feedback(name, score, threshold=0.1):
    if score >= threshold:
        status = "Selected"
        prompt = f"""
        Write a professional and encouraging selection email to {name}. 
        Congratulate them on their selection and mention that their skills closely match the job requirements.
        Encourage them to prepare for the next interview round.
        """
    else:
        status = "Rejected"
        prompt = f"""
        Write a professional rejection email to {name}. 
        Thank them for applying and mention that while their skills are valuable, they were not the best fit for this role.
        Encourage them to apply for future opportunities.
        """

    response = llm.invoke(prompt)
    email_content = response.content
    return status, email_content

# Generate feedback for each candidate
for candidate in ranked_candidates:
    name, score = candidate["name"], candidate["score"]
    status, email_content = generate_feedback(name, score)
    
    # Store feedback in MongoDB
    shortlist_collection.update_one(
        {"name": name},
        {"$set": {"status": status, "feedback": email_content}}
    )
    
    print(f"Feedback generated for {name} ({status}).")

Feedback generated for rahul (Selected).
Feedback generated for rahul (Selected).
Feedback generated for rahul (Selected).
Feedback generated for rahul (Selected).
Feedback generated for rahul (Selected).
Feedback generated for rahul (Selected).
Feedback generated for rahul (Selected).
