In [1]:
# pip install langchain-community langchain-chroma langchain-text-splitters sentence-transformers pypdf mistralai langchain faiss-cpu python-dotenv

In [2]:
import os
from dotenv import load_dotenv
from mistralai import Mistral
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the .env file so the os.getenv("MISTRAL_KEY") lookup in the next cell can find the key.
load_dotenv()

True

In [4]:
# Pull the Mistral credential from the environment; abort right away when it is absent.
api_key = os.getenv("MISTRAL_KEY")

if api_key:
    print('Key fetched')
else:
    raise ValueError("Missing MISTRAL_KEY in .env")

Key fetched


In [5]:
# Mistral API client, authenticated with the key read from MISTRAL_KEY.
client = Mistral(api_key=api_key)
# Model name passed as `model=` to the client.chat.complete calls further down.
MODEL = "mistral-small-latest"

In [6]:
# --------- Load PDFs ----------
def load_pdf(path):
    """Read the PDF at ``path`` with ``PyPDFLoader`` and return the loaded documents.

    Args:
        path: Location of the PDF file, handed unchanged to ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` produces. Other code in this
        notebook reads ``page_content`` from each returned item.
    """
    return PyPDFLoader(path).load()

In [None]:
def rag_impl(resume_docs, jd_docs, min_match=60, chunk_size=500, chunk_overlap=100, top_k=10):
    """Score a resume against a job description and, if it passes, generate interview questions.

    The match score is computed on the full concatenated texts. Only when the
    score reaches ``min_match`` are the documents chunked, embedded and indexed
    so that ``generate_questions`` can retrieve context from them.

    Args:
        resume_docs: List of loaded resume documents; each item must expose ``page_content``.
        jd_docs: List of loaded job-description documents; same shape as ``resume_docs``.
        min_match: Minimum match percentage required to continue (default 60).
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter`` (default 500).
        chunk_overlap: ``chunk_overlap`` given to the splitter (default 100).
        top_k: Number of chunks the retriever returns per query (default 10).

    Returns:
        The text returned by ``generate_questions``, or ``None`` when the match
        score is below ``min_match``.
    """
    # 1. Direct comparison on the full texts (more accurate than RAG for scoring)
    full_resume_text = "\n".join(d.page_content for d in resume_docs)
    full_jd_text = "\n".join(d.page_content for d in jd_docs)

    print("‚è≥ Analyzing match...")
    match_pct = get_match_percentage(full_resume_text, full_jd_text)
    print(f"\nüéØ Resume‚ÄìJD Match: {match_pct}%")

    if match_pct < min_match:
        print(f"‚ùå Match below {min_match}%. Candidate rejected.")
        return None

    # 2. RAG only for deep question generation
    print("‚úÖ Match confirmed. Indexing documents for questions...")
    documents = resume_docs + jd_docs
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    # A larger k gives the question prompt more context to draw from
    retriever = vectorstore.as_retriever(search_kwargs={"k": top_k})

    questions = generate_questions(retriever)
    print("\n--- Generated Interview Questions ---")
    print(questions)
    # Hand the questions back so callers can reuse them instead of only seeing the printout
    return questions

def get_match_percentage(resume_text, jd_text):
    """Ask the chat model for a resume/job-description match score.

    Args:
        resume_text: Full text of the resume, embedded verbatim in the prompt.
        jd_text: Full text of the job description, embedded verbatim in the prompt.

    Returns:
        The score as a float clamped to the range 0.0-100.0. Returns 0.0 when
        the reply is not text, contains no number, or contains more than one
        number (ambiguous replies are treated as "no match" rather than guessed).
    """
    import re  # local import so this cell does not depend on an extra top-level import

    # Clean up the prompt to handle full texts
    prompt = f"""
    You are a professional ATS (Applicant Tracking System).
    
    TASK:
    1. Analyze the RESUME and JOB DESCRIPTION provided below.
    2. Calculate a match percentage (0-100) based on:
       - Technical Skills & Tools
       - Years of Experience
       - Project Relevance
    
    RESUME:
    ---
    {resume_text}
    ---

    JOB DESCRIPTION:
    ---
    {jd_text}
    ---

    OUTPUT INSTRUCTIONS:
    Output ONLY the numerical value between 0 and 100. Do not include words or symbols.
    """
    response = client.chat.complete(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )

    raw = response.choices[0].message.content
    if not isinstance(raw, str):
        # No plain-text reply to parse: score it as no match instead of crashing on .strip()
        return 0.0

    cleaned = raw.strip().replace('%', '')
    try:
        score = float(cleaned)
    except ValueError:
        # The model added words despite the instructions. Accept the reply only
        # if it contains exactly one number, so the score is never a guess.
        numbers = re.findall(r"\d+(?:\.\d+)?", cleaned)
        if len(numbers) != 1:
            return 0.0
        score = float(numbers[0])

    # Keep the result inside the 0-100 range the prompt asks for
    return min(100.0, max(0.0, score))

In [8]:
# --------- Match Percentage Prompt ----------
# Prompt template with a single {context} placeholder.
# NOTE(review): nothing visible in this notebook references MATCH_PROMPT;
# get_match_percentage builds its own inline prompt. Confirm whether this is still needed.
MATCH_PROMPT = """
You are an ATS system.

Given the CONTEXT below (resume + job description):
1. Calculate percentage match between resume and JD.
2. Consider skills, experience, tools, projects.
3. Output ONLY a number between 0 and 100.

CONTEXT:
{context}
"""

In [9]:
# --------- Question Generation Prompt ----------
# Prompt template filled by generate_questions via .format(context=...).
# The four requested groups are numbered 1-4 so the model sees one unambiguous list.
QUESTION_PROMPT = """
You are a technical interviewer.

Using the CONTEXT:
- Job description requirements
- Skills mentioned in resume
- Projects done by candidate

Generate:
1. 5 technical questions on the job description
2. 3 project-based questions on the projects done by candidate
3. 2 skill-based questions on the skills mentioned in resume
4. 2 scenario-based questions based on the job description

CONTEXT:
{context}
"""

In [10]:
def generate_questions(retriever):
    """Generate interview questions from context fetched through ``retriever``.

    Args:
        retriever: Object whose ``invoke(query)`` returns items exposing ``page_content``.

    Returns:
        The ``content`` of the first choice in the chat model's reply to
        ``QUESTION_PROMPT`` filled with the retrieved text.
    """
    retrieved = retriever.invoke("skills projects requirements")
    # One retrieved chunk per line forms the CONTEXT section of the prompt
    context = "\n".join(doc.page_content for doc in retrieved)
    user_message = {"role": "user", "content": QUESTION_PROMPT.format(context=context)}

    completion = client.chat.complete(model=MODEL, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [13]:
# --------- Pipeline ----------
# Loads the resume and job-description PDFs, then runs the match + question pipeline.
if __name__ == "__main__":

    print('in main')

    # PDF locations. The defaults are the original local files; set RESUME_PDF_PATH /
    # JD_PDF_PATH (in the environment or in .env) to run on another machine without editing code.
    resume_path = os.getenv(
        "RESUME_PDF_PATH",
        r'C:\Users\Admin\Downloads\Git_Clone\AI-Tools\AI_Interview\data\Rajat__Sharma_AI_ML.pdf',
    )
    jd_path = os.getenv(
        "JD_PDF_PATH",
        r'C:\Users\Admin\Downloads\Git_Clone\AI-Tools\AI_Interview\data\JD_ML.pdf',
    )

    # Loading pdf files
    resume_docs = load_pdf(resume_path)
    jd_docs = load_pdf(jd_path)

    print('Documents fetched!')

    # Announce the long-running step before it starts rather than after it has finished
    print("\nProcessing...\n")

    # Calling
    rag_impl(resume_docs, jd_docs)

in main
Documents fetched!
‚è≥ Analyzing match...

üéØ Resume‚ÄìJD Match: 85.0%
‚úÖ Match confirmed. Indexing documents for questions...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 387.71it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m



--- Generated Interview Questions ---
Here are the tailored interview questions based on the provided context:

### **1. 5 Technical Questions (Job Description Focused)**
1. **NLP/ML Models**: Can you explain how you would design a machine learning system for summarizing unstructured data into a human-readable format? What challenges might arise, and how would you address them?
2. **Data Processing**: How do you handle data cleaning and feature engineering when dealing with billions of data points? What tools or techniques do you use?
3. **Model Training & Optimization**: How do you ensure optimal performance in your ML models? Walk us through your approach to training, retraining, and fine-tuning.
4. **Knowledge Extraction**: What techniques or algorithms would you use for knowledge extraction from unstructured data? Have you worked with RAGs or Agentic AI in this context?
5. **Data Visualization**: How do you present complex AI/ML results to non-technical stakeholders? What tools (e