In [92]:
from PyPDF2 import PdfReader
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
import os



In [None]:
# Read the Groq credential from the environment so a real key never has to be
# committed with the notebook. The original placeholder is kept as a fallback
# so the cell still runs (and can still be edited by hand) when the variable
# is unset or empty.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or "your-api-key"

In [94]:
# Groq-hosted chat model (temperature 0) shared by every LLM call in this notebook.
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0, groq_api_key=GROQ_API_KEY)

# Sentence-embedding model used to vectorise both the job summary and the candidates.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [95]:
# Raw text of the job posting that candidates are matched against. It is passed
# verbatim to the LLM summarisation prompt, and the resulting summary is what
# gets embedded as the query vector.
job_description = '''

🧾 Job Title: Junior Machine Learning Engineer
Location: Remote / Hybrid
Experience Level: 0.5 – 2 years
Company: AIWorks Tech Solutions

💼 Job Description:
We are looking for a motivated Junior Machine Learning Engineer to join our AI team. You'll work closely with data scientists and software developers to help build, train, and deploy ML models for real-world use cases in NLP, computer vision, and predictive analytics.

🔧 Responsibilities:
Assist in preprocessing and cleaning data for training models

Implement basic ML/DL models using frameworks like TensorFlow, PyTorch, or Scikit-learn

Work on NLP tasks such as sentiment analysis, keyword extraction, or summarization

Build and maintain API endpoints using Flask or FastAPI

Collaborate with the DevOps team to deploy models into production

Participate in daily standups, code reviews, and team discussions

✅ Requirements:
Strong understanding of Python and core ML libraries

Knowledge of deep learning concepts and model evaluation

Experience with Flask, TensorFlow, or PyTorch

Basic familiarity with NLP or Computer Vision

Good communication and problem-solving skills

🪄 Bonus if you have:
Personal or academic ML projects (GitHub links appreciated)

Exposure to Docker, Streamlit, or FastAPI

Familiarity with cloud platforms (AWS, GCP, Azure)

'''

In [96]:
from langchain_core.prompts import PromptTemplate

# Prompt that asks the LLM to condense the posting into a single sentence
# covering skills, experience, responsibilities, bonus items and tools.
prompt_extraction = PromptTemplate.from_template(
    """ ###The below content is the job description for a certain role :
                                                 {job_description}
                                                 ###Instructions:
                                                 Summarize this job description to only include required skills , experience and responsibilities,bonus and relevant tools in one sentence.
                                                 """
)

# Pipe the filled-in prompt straight into the chat model.
chain_extract = prompt_extraction | llm
job_summary = chain_extract.invoke(dict(job_description=job_description))

# Embed the summary text; this vector is the query used against the candidate store.
job_vector = model.encode(job_summary.content).tolist()

In [None]:
json_parser = JsonOutputParser()
client = chromadb.PersistentClient(path="vectorstore")

# Create (or reopen) the persistent collection holding one record per candidate.
collection = client.get_or_create_collection(name="candidates")
pdf_folder = 'pdf-folder-path'

# Spellings the LLM may use for "not mentioned" (the prompt asks for "NA").
_MISSING_MARKERS = {"", "na", "n/a", "none", "null"}


def _as_text(value):
    """Return `value` as a stripped string, or "NA" when it is None or blank.

    Chroma metadata values must be scalars, so anything the LLM returns
    (numbers, nested objects, ...) is flattened to text here.
    """
    if value is None:
        return "NA"
    return str(value).strip() or "NA"


def _as_list(value):
    """Normalise an LLM field to a list of non-empty strings.

    Accepts a JSON array, a comma-separated string, or a missing marker such
    as "NA"/None (which becomes an empty list). Without this, joining the
    string "NA" would produce "N, A".
    """
    if value is None:
        return []
    if isinstance(value, str):
        items = value.split(",")
    elif isinstance(value, (list, tuple)):
        items = value
    else:
        items = [value]
    cleaned = (str(item).strip() for item in items)
    return [item for item in cleaned if item.lower() not in _MISSING_MARKERS]


# The prompt and chain do not depend on the individual resume, so build them
# once instead of on every loop iteration. The key names listed in the prompt
# are exactly the ones read from the parsed JSON below.
prompt_extraction = PromptTemplate.from_template(""" ###The below content is the resume of a candidate:
                                                 {text}
                                                 ###Instructions:
                                                 I want my output in a json format with exactly these keys: name, skills, total_experience, description, phone_number, email_id, relevant_project_links.
                                                 skills and relevant_project_links must be json arrays of strings.
                                                 Don't add any preamble.
                                                 If anything out of this is not mentioned then mention as NA
                                                 """)
chain_extract = prompt_extraction | llm

for pdf in os.listdir(pdf_folder):
    # Ignore anything in the folder that is not a PDF (sub-folders, .DS_Store, ...).
    if not pdf.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(pdf_folder, pdf)
    reader = PdfReader(pdf_path)

    # Guard against pages for which no text is returned.
    text = "".join((page.extract_text() or "") for page in reader.pages)
    if not text.strip():
        continue

    res = chain_extract.invoke({"text": text})
    candidate = json_parser.parse(res.content)
    if not isinstance(candidate, dict):
        # The model returned valid JSON that is not an object; nothing usable.
        print(f"Skipping {pdf}: LLM output is not a JSON object")
        continue

    name = _as_text(candidate.get("name"))
    email_id = _as_text(candidate.get("email_id"))
    phone_number = _as_text(candidate.get("phone_number"))
    experience = _as_text(candidate.get("total_experience"))
    description = _as_text(candidate.get("description"))
    skills = ", ".join(_as_list(candidate.get("skills")))
    project_links = ", ".join(_as_list(candidate.get("relevant_project_links")))

    # Text that represents the candidate in embedding space.
    text_to_embed = f"{name}. Skills: {skills}. Experience: {experience}. {description}. Projects: {project_links}"
    embedding = model.encode(text_to_embed).tolist()

    # The email is the record id; fall back to the file name when the resume
    # has no email so that such candidates do not all collide on "NA".
    candidate_id = pdf if email_id.lower() in _MISSING_MARKERS else email_id

    # upsert (rather than add) keeps this cell idempotent: re-running it
    # against the persistent store updates records instead of duplicating ids.
    collection.upsert(
        documents=[text_to_embed],
        embeddings=[embedding],
        metadatas=[{
            "name": name,
            "email_id": email_id,
            "phone_number": phone_number,
            "skills": skills,
            "experience": experience,
            "description": description,
            "project_links": project_links,
        }],
        ids=[candidate_id],
    )



In [None]:
# Retrieve the candidates whose embeddings are closest to the job summary.
n_results = 3
results = collection.query(
    query_embeddings=[job_vector],
    n_results=n_results,
)

# One query embedding was sent, so the hits live in the first inner list.
# Iterate over what actually came back instead of range(n_results): indexing
# by a fixed count raises IndexError when fewer hits are returned than asked for.
matched_ids = results['ids'][0]
matched_metadatas = results['metadatas'][0]

for rank, (candidate_id, meta) in enumerate(zip(matched_ids, matched_metadatas), start=1):
    print(f"""{rank}:
    Name        : {meta['name']}
    Email       : {meta.get('email_id', candidate_id)}
    Experience  : {meta['experience']}
    Skills      : {meta['skills']}
    Projects    : {meta['project_links']}
    Description : {meta['description']}
    """)
