In [2]:
%%capture --no-stderr
%pip install --quiet -U langchain langchain_community tiktoken langchain-nomic "nomic[local]" langchain-ollama scikit-learn langgraph tavily-python bs4


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
!ollama pull deepseek-r1:1.5b

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest 
pulling aabd4debf0c8...   0% ▕                ▏    0 B/1.1 GB                  [?25h[?25l[2K[1G[A[2K[1Gpulling 

**Defining the Model *deepseek-r1:1.5b***

In [2]:
from langchain_ollama import ChatOllama

local_llm = "deepseek-r1:1.5b"
llm = ChatOllama(model=local_llm, temperature=0)
llm_json_mode = ChatOllama(model=local_llm, temperature=0, format="json")


**Function for Loading the JSON file and created VectorBase**

In [4]:
import json
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings

# Load the JSON file
with open("vectorstore.json", "r", encoding="utf-8") as f:
    vector_data = json.load(f)

# Reconstruct the vector store
vectorstore = SKLearnVectorStore.from_texts(
    texts=vector_data["documents"],
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
    metadatas=vector_data["metadata"],
)

print("Vector store reloaded from JSON!")


Embedding texts: 100%|██████████| 16/16 [00:13<00:00,  1.20inputs/s]

Vector store reloaded from JSON!





**Creating Vectorbase from the Scraped Data**

In [None]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader  # Load PDFs
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_nomic.embeddings import NomicEmbeddings


pdf_files = [f for f in os.listdir() if f.endswith(".pdf")]
txt_files = [f for f in os.listdir() if f.endswith(".txt")]
docs_list = []

# Load PDFs
for pdf_path in pdf_files:
    loader = PyMuPDFLoader(pdf_path)
    docs_list.extend(loader.load())  # Append loaded documents from PDFs

# Load TXT files
for txt_path in txt_files:
    loader = TextLoader(txt_path)
    docs_list.extend(loader.load())


# # Step 1: Load all PDFs in the current directory
# pdf_files = [f for f in os.listdir() if f.endswith(".pdf")]
# docs_list = []

# for pdf_path in pdf_files:
#     loader = PyMuPDFLoader(pdf_path)
#     docs_list.extend(loader.load())  # Append loaded documents

# Step 2: Split documents
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1024, chunk_overlap=200
)
doc_splits = text_splitter.split_documents(docs_list)

# Step 3: Add to vectorDB
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local"),
)

# Step 4: Create retriever (ensure k does not exceed available documents)
retriever = vectorstore.as_retriever(k=min(2, len(doc_splits)))

print(f"Loaded {len(doc_splits)} document chunks from {len(pdf_files)} PDFs.")



Embedding texts: 100%|██████████| 16/16 [00:11<00:00,  1.45inputs/s]

Loaded 16 document chunks from 2 PDFs.





**Vectorbase created**

In [5]:
retriever = vectorstore.as_retriever(k=3)

*Tavily API integration*

In [6]:
import os
import getpass
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set API key from environment or prompt user
def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"Enter value for {var}: ")

# Ensure API key is set
_set_env("TAVILY_API_KEY")

# Set parallelism setting for tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = "true"


In [7]:
### Router
import json
from langchain_core.messages import HumanMessage, SystemMessage

# Prompt
# router_instructions = """You are an expert at routing a user question to a vectorstore or web search.

# The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.

# Use the vectorstore for questions on these topics. For all else, and especially for current events, use web-search.

# Return JSON with single key, datasource, that is 'websearch' or 'vectorstore' depending on the question."""

router_instructions = """
You are an expert router for directing user queries.

The vectorstore contains documents related to:
- job role
- person qualification required
- skills

Use 'vectorstore' if the query is related to any of these topics.
For everything else (including current events), return 'websearch'.

### IMPORTANT ###
Your response MUST be a **valid JSON object** with exactly one key:
{
  "datasource": "vectorstore"
}
or
{
  "datasource": "websearch"
}

Do NOT include explanations, additional keys, or any other information.
"""


# Test router
test_web_search = llm_json_mode.invoke(
    [SystemMessage(content=router_instructions)]
    + [
        HumanMessage(
            content="What are the required skills for this job role?"
        )
    ]
)
# test_web_search_2 = llm_json_mode.invoke(
#     [SystemMessage(content=router_instructions)]
#     + [HumanMessage(content="What are the models released today for llama3.2?")]
# )
# test_vector_store = llm_json_mode.invoke(
#     [SystemMessage(content=router_instructions)]
#     + [HumanMessage(content="What are the types of agent memory?")]
# )
print(
    json.loads(test_web_search.content)
    # json.loads(test_web_search_2.content),
    # json.loads(test_vector_store.content),
)


{'datasource': 'vectorstore'}


**Assessing the goodness of Retrieved Document**

In [16]:
### Retrieval Grader

# Doc grader instructions
doc_grader_instructions = """You are a grader assessing relevance of a retrieved document to a user question.

If the document contains keyword(s) or semantic meaning related to the question, grade it as relevant."""

# Grader prompt
doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

This carefully and objectively assess whether the document contains at least some information that is relevant to the question.

Return JSON with single key, binary_score, that is 'yes' or 'no' score to indicate whether the document contains at least some information that is relevant to the question."""

# Test
question = "What is the job description for tescra ACHNET company?"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content
doc_grader_prompt_formatted = doc_grader_prompt.format(
    document=doc_txt, question=question
)
result = llm_json_mode.invoke(
    [SystemMessage(content=doc_grader_instructions)]
    + [HumanMessage(content=doc_grader_prompt_formatted)]
)
json.loads(result.content)

Embedding texts: 100%|██████████| 1/1 [00:00<00:00, 12.00inputs/s]


{'key': 'relevant', 'score': 1}

**Integration and routing of "VectorBase" and "Web Search"**

In [43]:
import requests
from langchain.schema import HumanMessage, SystemMessage
import json


# RAG Prompt
rag_prompt = """You are an assistant for question-answering tasks.

Here is the context to use to answer the question:

{context}

Think carefully about the above context.

Now, review the user question:

{question}

Provide an answer to this question using only the above context.

Use three sentences maximum and keep the answer concise.

Answer:"""

# Function to search using TAVIly API
def tavili_search(query):
    url = "https://api.tavily.com/v1/search"  # Replace with TAVIly's actual search endpoint
    headers = {
        "Authorization": f"Bearer {os.getenv('TAVILY_API_KEY')}",
        "Content-Type": "application/json"
    }
    params = {
        "query": query,
        "num_results": 5  # You can adjust the number of results you want
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200:
        results = response.json()
        return results['data']  # Assuming the response returns a 'data' key containing search results
    else:
        return f"Error: {response.status_code} - {response.text}"


# Function to format retrieved documents
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Function to check if retrieved docs are relevant
# def is_relevant(docs, question):
#     doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

#     Carefully and objectively assess whether the document contains at least some information that is relevant to the question.

#     ### IMPORTANT ###
#     Your response MUST be a **valid JSON object** with exactly one key:
#     {{
#       "binary_score": "Yes"
#     }}
#     or
#     {{
#       "binary_score": "No"
#     }}

#     Do NOT include explanations, additional keys, or any other information.
#     """
#     #doc_grader_prompt = """Here is the document retrieved from your search: \n\n{document}\n\nThe user has asked the following question: \n\n{question}\n\nDoes this document contain relevant information to answer the question? Please respond with either 'Yes' or 'No'."""


#     docs_txt = format_docs(docs)
#     doc_grader_prompt_formatted = doc_grader_prompt.format(document=docs_txt, question=question)

#     grading_response = llm_json_mode.invoke(
#         [SystemMessage(content="You are a grader assessing relevance of a retrieved document.")]
#         + [HumanMessage(content=doc_grader_prompt_formatted)]
#     )
#     print(f"Grading response: {grading_response.content}")
#     try:
#         grading_result = json.loads(grading_response.content)
#         return grading_result.get("binary_score", "No") == "Yes"
#     except json.JSONDecodeError:
#         return False  # If the model fails to return valid JSON, assume irrelevance.

def is_relevant(docs, question):
    doc_grader_prompt = """Here is the retrieved document: \n\n {document} \n\n Here is the user question: \n\n {question}. 

    Carefully and objectively assess whether the document contains **specific information** that is directly relevant to the question. 

    ### IMPORTANT ###
    - The document must explicitly mention or provide information related to the question.
    - If the document does not contain any information related to the question, return "No".
    - Your response MUST be a **valid JSON object** with exactly one key:
    {{
      "binary_score": "Yes"
    }}
    or
    {{
      "binary_score": "No"
    }}

    Do NOT include explanations, additional keys, or any other information.
    """

    docs_txt = format_docs(docs)
    doc_grader_prompt_formatted = doc_grader_prompt.format(document=docs_txt, question=question)

    # Debug: Print the formatted prompt
    print("Formatted Prompt:", doc_grader_prompt_formatted)

    grading_response = llm_json_mode.invoke(
        [SystemMessage(content="You are a grader assessing relevance of a retrieved document.")]
        + [HumanMessage(content=doc_grader_prompt_formatted)]
    )

    # Debug: Print the raw response from the model
    print("Raw Grading Response:", grading_response.content)

    try:
        grading_result = json.loads(grading_response.content)
        # Debug: Print the parsed JSON
        print("Parsed JSON:", grading_result)

        # Ensure the key exists and has a valid value
        if "binary_score" in grading_result and grading_result["binary_score"] in ["Yes", "No"]:
            # Additional check: If the document does not contain the question keywords, force "No"
            question_keywords = ["viral kohli", "IIT KGP campus"]
            if not any(keyword.lower() in docs_txt.lower() for keyword in question_keywords):
                print("Question keywords not found in document. Forcing 'No'.")
                return False
            return grading_result["binary_score"] == "Yes"
        else:
            print("Invalid JSON structure or missing 'binary_score' key.")
            return False
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        return False  # If the model fails to return valid JSON, assume irrelevance.  # If the model fails to return valid JSON, assume irrelevance.

# Main Logic for Routing
question = "what is the qualification requirement for tescra achnet software role?"
docs = retriever.invoke(question)



if is_relevant(docs, question):
    docs_txt = format_docs(docs)
    rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
    generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
    answer = generation.content
else:
    print("No relevant context found. Searching the web using TAVIly...")
    tavili_results = tavili_search(question)  # Use TAVIly search
    answer = tavili_results  # Use TAVIly results as fallback

print(answer)


Embedding texts:   0%|          | 0/1 [00:00<?, ?inputs/s]

Embedding texts: 100%|██████████| 1/1 [00:00<00:00,  7.89inputs/s]


Formatted Prompt: Here is the retrieved document: 

 Detailed Job Description 
Job Profile 
• Job Designation: Software Engineer 
• Place of Posting: Bangalore/ Remote 
ACHNET, the world's leading AI-driven marketplace that fosters professional growth and 
collaboration, and is made with love by IITians, also we have been loved by IITians as we have been 
hiring them with the utmost preference in building our employee ecosystems.  
We are seeking skilled and motivated Software Development Engineers (SDEs) to join our agile 
team and contribute to the development of robust and feature-rich web applications. As an SDE at 
ACHNET, you will be a crucial part of our mission to create exceptional user experiences and drive 
innovation in technology. 
Role and Responsibilities:  
As an SDE focusing on web application development, you will be responsible for the entire software 
development lifecycle, from requirements analysis to deployment. You will work collaboratively 
within an agile team

In [32]:
docs_txt = format_docs(docs)
print(f"Formatted Document: {docs_txt}")


Formatted Document: Detailed Job Description 
Job Profile 
• Job Designation: Software Engineer 
• Place of Posting: Bangalore/ Remote 
ACHNET, the world's leading AI-driven marketplace that fosters professional growth and 
collaboration, and is made with love by IITians, also we have been loved by IITians as we have been 
hiring them with the utmost preference in building our employee ecosystems.  
We are seeking skilled and motivated Software Development Engineers (SDEs) to join our agile 
team and contribute to the development of robust and feature-rich web applications. As an SDE at 
ACHNET, you will be a crucial part of our mission to create exceptional user experiences and drive 
innovation in technology. 
Role and Responsibilities:  
As an SDE focusing on web application development, you will be responsible for the entire software 
development lifecycle, from requirements analysis to deployment. You will work collaboratively 
within an agile team, often as an individual contribu

In [34]:
print(is_relevant(docs, question))

False


**Function to Generate response only using the context**

In [50]:
### Generate

# Prompt
rag_prompt = """You are an assistant for question-answering tasks. 

Here is the context to use to answer the question:

{context} 

Think carefully about the above context. 

Now, review the user question:

{question}

Provide an answer to this questions using only the above context. 

Use three sentences maximum and keep the answer concise.

Answer:"""

# rag_prompt = """You are an assistant for question-answering tasks.

# Here is the context to use to answer the question:

# {context}

# Now, review the user question:

# {question}

# Answer the question using only the context provided above.

# ### IMPORTANT ###
# If the context does not provide any relevant information to answer the question, strictly respond with "I don't have this knowledge in my database."
# If the context does provide relevant information, answer concisely in three sentences or less.

# Answer:"""



# Post-processing
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# def process_answer(answer):
#     # Check if the answer contains the specific phrase "I don't have this knowledge in my database."
#     if "I don't have this knowledge in my database" in answer:
#         return answer.strip()  # Just return the answer as is
#     else:
#         # If the response is not relevant, we can reformat it to meet the desired output
#         return "I don't have this knowledge in my database"
    
# Test
#question = "Is there a cricket player virat kohli at IIT KGP campus?"
question = "Is there any qualification criteria for tescra achnet software profile?"
docs = retriever.invoke(question)
docs_txt = format_docs(docs)
rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
print(generation.content)

# #Test
# #question = "Is there a cricket player virat kohli at IIT KGP campus?"
# question = "Is there any qualification criteria for tescra achnet software profile?"
# docs = retriever.invoke(question)
# docs_txt = format_docs(docs)
# rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
# generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])

# # Process and print the final answer
# final_answer = process_answer(generation.content)
# print(final_answer)

Embedding texts: 100%|██████████| 1/1 [00:00<00:00,  4.63inputs/s]


<think>
Okay, so I need to figure out if there are any qualification criteria for the ACHNET Software Profile. Let me look through the provided context carefully.

The context is a detailed job description and role responsibilities section from ACHNET. It lists the job profile, including the role as an SDE focusing on web application development. The key points here are about the required skills and qualifications.

Looking at the "Required Skills and Qualifications" section, it mentions several things: strong understanding of data structures, knowledge of machine learning models like YOLO, LLM, NER, experience in object-oriented programming (OOP), proficiency in CSS, HTML, JavaScript for front-end, and experience with NoSQL databases like Redis or MongoDB is a plus.

So, the qualification criteria include specific technical skills and some exposure to certain technologies. The user didn't ask about qualifications but specifically asked if there are any criteria related to Tescra ACHNE

In [12]:
### Hallucination Grader

# Hallucination grader instructions
hallucination_grader_instructions = """

You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:

(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 

(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Score:

A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader prompt
hallucination_grader_prompt = """FACTS: \n\n {documents} \n\n STUDENT ANSWER: {generation}. 

Return JSON with two two keys, binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER is grounded in the FACTS. And a key, explanation, that contains an explanation of the score."""

# Test using documents and generation from above
hallucination_grader_prompt_formatted = hallucination_grader_prompt.format(
    documents=docs_txt, generation=generation.content
)
result = llm_json_mode.invoke(
    [SystemMessage(content=hallucination_grader_instructions)]
    + [HumanMessage(content=hallucination_grader_prompt_formatted)]
)
json.loads(result.content)

{'score': 'yes',
 'key': 'Ensure the STUDENT ANSWER is grounded in the FACTS.',
 'explanation': "The context clearly states that the job requires a Bachelor's degree with a Master's preferred. The student answer aligns with this requirement, ensuring it meets all criteria."}

In [None]:
# ### Answer Grader

# # Answer grader instructions
# answer_grader_instructions = """You are a teacher grading a quiz. 

# You will be given a QUESTION and a STUDENT ANSWER. 

# Here is the grade criteria to follow:

# (1) The STUDENT ANSWER helps to answer the QUESTION

# Score:

# A score of yes means that the student's answer meets all of the criteria. This is the highest (best) score. 

# The student can receive a score of yes if the answer contains extra information that is not explicitly asked for in the question.

# A score of no means that the student's answer does not meet all of the criteria. This is the lowest possible score you can give.

# Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

# Avoid simply stating the correct answer at the outset."""

# # Grader prompt
# answer_grader_prompt = """QUESTION: \n\n {question} \n\n STUDENT ANSWER: {generation}. 

# Return JSON with two two keys, binary_score is 'yes' or 'no' score to indicate whether the STUDENT ANSWER meets the criteria. And a key, explanation, that contains an explanation of the score."""

# # Test
# question = "What are the traits of a good man?"
# answer = "The Llama 3.2 models released today include two vision models: Llama 3.2 11B Vision Instruct and Llama 3.2 90B Vision Instruct, which are available on Azure AI Model Catalog via managed compute. These models are part of Meta's first foray into multimodal AI and rival closed models like Anthropic's Claude 3 Haiku and OpenAI's GPT-4o mini in visual reasoning. They replace the older text-only Llama 3.1 models."

# # Test using question and generation from above
# answer_grader_prompt_formatted = answer_grader_prompt.format(
#     question=question, generation=answer
# )
# result = llm_json_mode.invoke(
#     [SystemMessage(content=answer_grader_instructions)]
#     + [HumanMessage(content=answer_grader_prompt_formatted)]
# )
# json.loads(result.content)

{'binary_score': 'yes', 'explanation': '...'}

**Fucntion to extract and save the embedding in a vectorbase.json file**

In [1]:
import json

# Extract vectors and document metadata
vector_data = {
    "documents": [doc.page_content for doc in doc_splits],
    "metadata": [doc.metadata for doc in doc_splits],
}

# Save as JSON
with open("vectorstore.json", "w", encoding="utf-8") as f:
    json.dump(vector_data, f, ensure_ascii=False, indent=4)

print("Vector store saved as JSON!")


NameError: name 'doc_splits' is not defined