In [1]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser

# Set up the LangChain Gemini model
model = ChatGoogleGenerativeAI(
    api_key="AIzaSyA3xxNyX9FdTugUqnN7940fcMNZQjQZ2EY",
    model="gemini-1.5-flash" 
)

# Create the parser
parser = StrOutputParser()

# Correct chaining with LangChain
chain = model | parser

# Invoke the chain with the query
response = chain.invoke("what is rag")

print(response)

RAG stands for **Retrieval Augmented Generation**.  It's a technique used in large language models (LLMs) to improve their performance and accuracy by allowing them to access and process external information during generation.  Instead of relying solely on the knowledge embedded within their training data, RAG-enhanced LLMs can retrieve relevant documents or information from a knowledge base or other external sources, incorporating that information into their responses.  This makes them less prone to hallucinations (fabricating information) and more capable of handling factual questions and tasks requiring up-to-date information.


In [2]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
chain = prompt | model | parser


In [3]:
# Read the original file with UTF-8 encoding
with open('ExtractedText/qns.txt', 'r', encoding='utf-8') as qns:
    content = qns.read()

# Save the content to a new file with UTF-8 encoding
with open('context.txt', 'w', encoding='utf-8') as context:
    context.write(content)

# Read the new file with UTF-8 encoding
with open("context.txt", 'r', encoding='utf-8') as file:
    context = file.read()

# Display the first 100 characters
print(context[:100])




Top 50 Data Structures Interview Questions And Answers PDF Data Structure Interview Questions for Fr


In [4]:
try:
    response = chain.invoke({
        "context": context,
        "question": "What is a data structure"
    })
    print(response)
except Exception as e:
    print(f"Error occurred: {e}")

A data structure is a mechanical way to organize, align, and manipulate data as per requirements.  It deals with different datasets and how well they are aligned to ensure that data can be organized and accessed efficiently.  Data organization determines how a program performs, and data dependency and relationships between datasets play a crucial role.


In [5]:
from langchain_community.document_loaders import TextLoader

# Specify utf-8 encoding while loading the file
loader = TextLoader("context.txt", encoding="utf-8")

# Load the document properly
text_documents = loader.load()

# Print the loaded document(s)
print(text_documents)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents = text_splitter.split_documents(text_documents)


[Document(metadata={'source': 'context.txt'}, page_content='Top 50 Data Structures Interview Questions And Answers PDF Data Structure Interview Questions for Freshers 1. What is a data structure? A \ndata structure is a mechanical way to organise, align, and manipulate data as per requirements. It is not restricted to putting data in a table but \ndeals with different datasets and how well they are aligned. The aim is to ensure that data can be organised and accessed efficiently. Data \norganisation determines how a program performs. Moreover, data dependency and relationships between two or more datasets play a crucial \nrole in data structures. While designing code, we need to pay utmost attention to how data is structured because incorrectly structured or \ninefficiently stored data can hamper the overall performance of the code. 2. What are the applications of data structures? Data structures are \napplied across multiple industries and domains as algorithms are the primary require

In [6]:
from langchain_community.document_loaders import TextLoader

# Specify utf-8 encoding while loading the file
loader = TextLoader("ExtractedText/ans.txt", encoding="utf-8")

# Load the document properly
text_documents = loader.load()

# Print the loaded document(s)
print(text_documents)
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
documents_ans = text_splitter.split_documents(text_documents)

[Document(metadata={'source': 'ExtractedText/ans.txt'}, page_content='13. What is FIFO? \nHariOm \n')]


In [7]:
# import pinecone
# from langchain.vectorstores import Pinecone
# from langchain.embeddings import HuggingFaceEmbeddings
# hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# embeddings = hf_embeddings.embed_documents([doc.page_content for doc in documents])
# # Initialize Pinecone
# pc = pinecone.Pinecone(api_key="pcsk_5jvG4s_GPWUrsc8cBhH4gXoUW2RfP9T1qsuQkksmyr8585oTjLJbdG4CmcjCM67fppTWNG", environment="us-east-1")

# index_name = "qns-rag"
# if index_name not in pc.list_indexes().names():
#     pc.create_index(name=index_name, metric="cosine", dimension=len(embeddings[0]))

# index = pc.Index(index_name)

# # Upsert embeddings into Pinecone
# vectors = [(str(i), emb, {"text": documents[i].page_content}) for i, emb in enumerate(embeddings)]
# index.upsert(vectors=vectors)

# # Correct Pinecone VectorStore initialization
# pinecone_vectorstore = Pinecone(index=index, embedding=hf_embeddings, text_key="text")

# print(f"Documents upserted to Pinecone index '{index_name}' successfully.")


In [8]:
# from langchain_core.runnables import RunnableParallel, RunnablePassthrough
# chain = (
#     {"context": pinecone_vectorstore.as_retriever(), "question": RunnablePassthrough()}
#     | prompt
#     | model
#     | parser
# )

# # Invoke the chain with the query
# chain.invoke("Are linked lists linear or non-linear data structures?")

In [9]:
import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings
import time

# Initialize HuggingFace embeddings
hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Embed context and user chunks
context_embeddings = hf_embeddings.embed_documents([doc.page_content for doc in documents])
user_embeddings = hf_embeddings.embed_documents([doc.page_content for doc in documents_ans])

# Initialize Pinecone
pc = pinecone.Pinecone(api_key="pcsk_5jvG4s_GPWUrsc8cBhH4gXoUW2RfP9T1qsuQkksmyr8585oTjLJbdG4CmcjCM67fppTWNG", 
                        environment="us-east-1")

# Create index if not available
index_name = "qns-rag"
if index_name not in pc.list_indexes().names():
    pc.create_index(name=index_name, metric="cosine", dimension=len(context_embeddings[0]))

index = pc.Index(index_name)

# Prepare vectors for upsert
context_vectors = [(f"context_{i}", emb, {"text": documents[i].page_content}) for i, emb in enumerate(context_embeddings)]
user_vectors = [(f"user_{i}", emb, {"text": documents_ans[i].page_content}) for i, emb in enumerate(user_embeddings)]

# Upsert context and user answers
index.upsert(vectors=context_vectors, namespace="context")
index.upsert(vectors=user_vectors, namespace="user")

# Wait for data to be ready in Pinecone
def wait_for_index_ready(index, namespace="context"):
    while True:
        stats = index.describe_index_stats()
        if stats['namespaces'].get(namespace, {}).get('vector_count', 0) > 0:
            break
        time.sleep(1)

wait_for_index_ready(index, namespace="context")
wait_for_index_ready(index, namespace="user")

print("Documents upserted to Pinecone successfully and ready to query!")



  hf_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")





Documents upserted to Pinecone successfully and ready to query!


In [10]:
import os
from langchain.vectorstores import Pinecone

# Create directory for output if it doesn't exist
OUTPUT_DIR = "Op"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Retrieve matches and classify answers
def classify_answers(documents_ans, user_embeddings, index, threshold=0.85):
    correct_answers = []
    incorrect_answers = []

    for i, emb in enumerate(user_embeddings):
        # Search for the most similar chunk in context
        results = index.query(vector=emb, top_k=1, namespace="context", include_metadata=True)

        # Get best match and its similarity score
        if results["matches"]:
            best_match = results["matches"][0]
            score = best_match["score"]

            # Check similarity and classify the answer
            if score >= threshold:
                correct_answers.append({
                    "user_answer": documents_ans[i].page_content,
                    "correct_answer": best_match["metadata"]["text"],
                    "similarity": score
                })
            else:
                incorrect_answers.append({
                    "user_answer": documents_ans[i].page_content,
                    "correct_answer": best_match["metadata"]["text"],
                    "similarity": score
                })

    return correct_answers, incorrect_answers


# Classify answers as correct or incorrect
correct_answers, incorrect_answers = classify_answers(documents_ans, user_embeddings, index)

# Define file path to save the output
op_filename = "op.txt"
op_path = os.path.join(OUTPUT_DIR, op_filename)

# Prepare output in plain text format
output_lines = []

# Add correct answers to output
if correct_answers:
    output_lines.append("✅ Correct Answers:\n")
    for ans in correct_answers:
        output_lines.append(f"User Answer: {ans['user_answer']}\n")
        output_lines.append(f"Correct Answer: {ans['correct_answer']}\n")
        output_lines.append(f"Similarity Score: {ans['similarity']:.2f}\n")
        output_lines.append("-" * 50 + "\n")

# Add incorrect answers to output
if incorrect_answers:
    output_lines.append("\n❌ Incorrect Answers:\n")
    for ans in incorrect_answers:
        output_lines.append(f"User Answer: {ans['user_answer']}\n")
        output_lines.append(f"Correct Answer: {ans['correct_answer']}\n")
        output_lines.append(f"Similarity Score: {ans['similarity']:.2f}\n")
        output_lines.append("-" * 50 + "\n")

# Write output to the file as plain text
with open(op_path, "w", encoding="utf-8") as op_file:
    op_file.writelines(output_lines)

print(f"✅ Output saved successfully in '{op_path}'")

# Print correct answers
if correct_answers:
    print("✅ Correct Answers:")
    for ans in correct_answers:
        print(f"\nUser Answer: {ans['user_answer']}\n")
        print(f"Correct Answer: {ans['correct_answer']}\n")
        print(f"Similarity Score: {ans['similarity']:.2f}\n")

# Print incorrect answers
if incorrect_answers:
    print("\n❌ Incorrect Answers:")
    for ans in incorrect_answers:
        print(f"\nUser Answer: {ans['user_answer']}\n")
        print(f"Correct Answer: {ans['correct_answer']}\n")
        print(f"Similarity Score: {ans['similarity']:.2f}\n")





✅ Output saved successfully in 'Op\op.txt'

❌ Incorrect Answers:

User Answer: 13. What is FIFO? 
HariOm

Correct Answer: the previous node. This enables traversal in both directions. Some of the examples of a doubly-linked list are:   Browser cache with back-
forward visited pages   Song playlist with next and previous buttons   Undo and redo functionality 13. What is FIFO? FIFO stands for First in, 
First out order, representing the way and order in which data is accessed. The data element stored first in the list is the first entity to be removed

Similarity Score: 0.48



In [11]:
import pinecone

# Initialize Pinecone
pc = pinecone.Pinecone(api_key="pcsk_5jvG4s_GPWUrsc8cBhH4gXoUW2RfP9T1qsuQkksmyr8585oTjLJbdG4CmcjCM67fppTWNG", environment="us-east-1")

# Name of your index
index_name = "qns-rag"

# Connect to the index
index = pc.Index(index_name)

# Delete all data from the index
index.delete(delete_all=True, namespace="context")
index.delete(delete_all=True, namespace="user")
# index.delete(delete_all=True, namespace="")
print(f"All data from index '{index_name}' has been deleted successfully!")

All data from index 'qns-rag' has been deleted successfully!
