In [None]:
# %pip install langchain langchain-google-genai langchain-openai langchain-community python-dotenv langchain-text-splitters pypdf faiss-cpu langchain-classic

##### This tutorial uses the OpenAI chat model and embeddings. If you want to use the Google Gemini model and embeddings instead, please see gemini_rag.ipynb.

In [None]:
import os


In [None]:
# Set up your OpenAI API key.
# Get your API key from https://platform.openai.com/settings/organization/api-keys
#
# The key is taken from the OPENAI_API_KEY environment variable when it is already
# set; otherwise you are prompted for it (input is not echoed). This keeps the
# secret out of the saved notebook instead of hardcoding it in a cell.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI API key: ")

#### Models: This is the "Brain." It handles the conversation and generates the final human-like response.

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the final answer (gpt-4o-mini, temperature 0.5).
model = ChatOpenAI(model="gpt-4o-mini", temperature=0.5)

#### Embeddings: This is the "Translator." Computers don’t understand words; they understand numbers. Embeddings turn your text into long lists of numbers (vectors) so the computer can calculate how "similar" two sentences are.

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used to turn text into vectors.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

#### Loaders: This is the "Data Entry" step. It simply opens your file (i.e., resume.pdf) and reads the text so the code can process it. It returns the content as documents: if your PDF has 5 pages, it gives you 5 Documents.

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Document loader for the resume PDF.
loader = PyPDFLoader("resume.pdf")
# Read the file into a list of Document objects.
documents = loader.load()

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-24T10:47:32+00:00', 'author': '', 'keywords': '', 'moddate': '2025-12-24T10:47:32+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Namilakonda Rahul rahulnamilakonda100@gmail.com\nGithub: github.com/rahulnamilakonda Hyderabad, India\nLinkedIn: linkedin.com/in/namilakondarahul/\nProfessional Summary\n• Junior Machine Learning Engineer with 2+ years of experience in Flutter development and hands-on skills in Python,\nFast API, SQL, NOSQL, GCP (CLOUD) and Data Preprocessing . Worked in a fast-paced startup environment at\n90Heal, contributing to AI-driven solutions for student mental health.\nExperience\n• 90Heal (Still I Rise Tech Pvt Ltd) - AI-Driven Mental Health + Ed-Tech Start

#### Text-Splitters: Imagine trying to find a specific quote in a 500-page book. It’s easier if the book is broken into small, labeled snippets. We "chunk" the resume into pieces of up to 500 characters (with a 100-character overlap between neighbouring chunks) so the AI can pinpoint the exact section it needs without getting overwhelmed. Our PDF of n pages is now broken down into chunks, and each resulting document contains at most 500 characters.

In [103]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of up to 500 characters,
# with 100 characters of overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
chunks = splitter.split_documents(documents)
chunks  # display the resulting chunks

[Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-24T10:47:32+00:00', 'author': '', 'keywords': '', 'moddate': '2025-12-24T10:47:32+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='Namilakonda Rahul rahulnamilakonda100@gmail.com\nGithub: github.com/rahulnamilakonda Hyderabad, India\nLinkedIn: linkedin.com/in/namilakondarahul/\nProfessional Summary\n• Junior Machine Learning Engineer with 2+ years of experience in Flutter development and hands-on skills in Python,\nFast API, SQL, NOSQL, GCP (CLOUD) and Data Preprocessing . Worked in a fast-paced startup environment at\n90Heal, contributing to AI-driven solutions for student mental health.\nExperience'),
 Document(metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyp

#### Vector Store: It stores those numeric "translations" (embeddings) of your resume chunks. Because it's stored as numbers, the system can search through thousands of pages in milliseconds to find the most relevant information. Your query to this vector store is also converted to an embedding, and a similarity search is performed on the stored embeddings to get the documents related to your query.

In [None]:
from langchain_community.vectorstores import FAISS

# Build a FAISS vector store from the chunks, embedding them with `embeddings`.
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

#### Retriever: This converts your query to an embedding, performs a similarity search over all the vectors stored in the vector store, and returns the top 5 most related documents.

In [79]:
# Similarity-search retriever over the vector store, returning k=5 documents per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Try the retriever on a sample question and display the documents it returns.
retriever.invoke("What is my experience")

[Document(id='f2762a3e-dde5-424e-a2b0-1e956d482f93', metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-24T10:47:32+00:00', 'author': '', 'keywords': '', 'moddate': '2025-12-24T10:47:32+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='for personalized mental wellness recommendations using Gen AI, improving accuracy by 30%.\n◦ Cross-Platform App Development: Architected and developed a cross-platform mobile app using Flutter, connected\nto Firebase and GCP backend services, enabling personalized wellness tracking for 10,000+ students.\n◦ Applied Psychology and Behavioral Science : Integrated applied and behavioral psychology models within the'),
 Document(id='3ff7d4f2-bcfb-42c3-9b2d-8957b590c33c', metadata={'producer': 'pdfTeX-1.40.26', 'c

#### Prompt

In [93]:
from langchain_core.prompts import ChatPromptTemplate


# Chat prompt definition as a list of (role, template) pairs:
#   - "system": the assistant's instructions, ending with the {context} placeholder
#   - "human":  the user's question, supplied through the {input} placeholder
# {context} and {input} are template variables filled in when the prompt is invoked.
messages = [("system",
            """
            You are an AI Resume Assistant.

            Your task:
            - Answer questions about the candidate ONLY using the provided resume content.
            - Behave like a recruiter reviewing the resume.

            Guidelines:
            - Do not add skills, experience, or projects that are not explicitly mentioned.
            - If something is missing, clearly state that it is not present in the resume.
            - Prefer bullet points for skills, projects, and experience summaries.
            - Be factual, concise, and professional.

            Resume Context:
            {context}
            """
        ),
        ("human","""{input}""")
        ]

# Build the reusable chat prompt template from the message pairs above.
prompt = ChatPromptTemplate.from_messages(messages)


#### Output Parser: We use an output parser to parse the model output, since the raw model response is very verbose and contains a lot of information that an end user doesn't need.

In [65]:
from langchain_core.output_parsers import StrOutputParser

# String output parser instance, kept in `parser`.
parser = StrOutputParser()

#### Connecting all the above components, we have now built the RAG system, where we can ask questions about our PDF.

In [96]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains.retrieval import create_retrieval_chain

# Document-combining chain: formats the retrieved documents into the prompt's
# {context} slot, calls the model, and parses the reply with `parser`.
# Passing `parser` explicitly wires in the output parser this notebook creates
# earlier; it is the same parser type the chain falls back to by default, so the
# answer is still returned as a plain string.
combine_docs_chain = create_stuff_documents_chain(model, prompt, output_parser=parser)

# Retrieval chain: looks up documents for the "input" question with `retriever`,
# then runs them through combine_docs_chain. The result is a dict that includes
# the "input", the retrieved "context" documents, and the generated "answer".
rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

In [104]:
# Sample questions you can ask:
#   - Summarize the candidate's profile in 5 bullet points.
#   - What are the candidate's strongest technical skills?
#   - Does the candidate have backend development experience?
#   - Is the candidate suitable for a Data Scientist role?
#   - What kind of roles is this candidate best suited for?
#   - Give me top 5 interview questions which may be asked from my resume based on my working experience?

# The question goes in under the "input" key; the full result dict is displayed.
question = "Give me top 5 interview questions which may be asked from my resume based on my working experience"
response = rag_chain.invoke({"input": question})
response

{'input': 'Give me top 5 interview questions which may be asked from my resume based on my working experience',
 'context': [Document(id='d7806e94-9d1a-4ea1-8e12-2d490a9dddb5', metadata={'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-12-24T10:47:32+00:00', 'author': '', 'keywords': '', 'moddate': '2025-12-24T10:47:32+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content='application using Gen AI, Chain of Thought (CoT) reasoning, and Zero-shot prompting. Introduced audio-based chat\nfeatures for better user interaction.\n◦ Cloud Infrastructure and Data Pipelines : Optimized Google Cloud Platform (GCP) resources and BigQuery for\ndata analytics, reducing cloud costs by 25% and improving data efficiency.\n◦ Collaborative Leadership: Managed a team compris

In [105]:
# Show only the generated answer from the result dict.
response["answer"]

'Based on your working experience at 90Heal, here are 5 potential interview questions:\n\n1.  "You led the integration of a Retrieval-Augmented Generation (RAG) pipeline for personalized mental wellness recommendations. Can you describe the process of implementing this using Gen AI, Chain of Thought (CoT) reasoning, and Zero-shot prompting, and how it improved accuracy by 30%?"\n2.  "At 90Heal, you optimized Google Cloud Platform (GCP) resources and BigQuery, reducing cloud costs by 25%. Could you elaborate on the specific strategies and changes you implemented to achieve this cost reduction and improve data efficiency?"\n3.  "You managed a team comprising developers and psychologists. How did you foster collaboration, conduct code reviews, and implement agile methodologies to ensure continuous improvement and timely delivery of projects within such a diverse team?"\n4.  "Can you tell me about the introduction of audio-based chat features at 90Heal? What was the motivation behind this 