In [2]:
import os
import faiss
import pickle
import google.generativeai as genai
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
google_api_key = os.getenv("GOOGLE_API_KEY")

In [5]:
llm=ChatGoogleGenerativeAI(model="gemini-2.0-flash",temperature=0.7)

In [6]:
# Collect every PDF in the docs folder and load its pages into `documents`.
docs_folder='C:/Users/saifu/Desktop/rag_project1/docs'
# sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
# document order (and anything built from it) reproducible between runs.
# lower(): also pick up files whose extension is written as ".PDF".
pdf_files = sorted(
    os.path.join(docs_folder, f)
    for f in os.listdir(docs_folder)
    if f.lower().endswith('.pdf')
)
documents=[]

for pdf in pdf_files:
    loader=PyPDFLoader(pdf)
    # load() returns a list of Document objects for this PDF; append them all.
    documents.extend(loader.load())

In [7]:
len(documents)

420

In [8]:
# Preview only the first few loaded pages; displaying the whole list floods the
# notebook output with every Document's text and metadata.
documents[:3]

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf', 'total_pages': 420, 'page': 0, 'page_label': '1'}, page_content='Software Process Modeling\nSLIIT  - Faculty of Computing\nSoftware Process Model\n2021\nIntroduction'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf', 'total_pages': 420, 'page': 1, 'page_label': '2'}, page_content='SLIIT  - Faculty of Computing\nIT1060\nSession outcomes\n•Introduction to Module\n•SPM- Introduction\nSLIIT 2'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf', 'total_pages': 420, 'page': 2, 'page_label': '3'}, page_content='SLIIT  - Faculty of Computing\nIT1060\nAcademic Integrity Policy\n• Are you aware that following are not accepted in \nSLIIT???\n• Plagiar

In [9]:
doc_splitter = RecursiveCharacterTextSplitter(chunk_size=520,chunk_overlap=20)

In [10]:
doc_chunks=doc_splitter.split_documents(documents)

In [11]:
len(doc_chunks)

461

In [12]:
embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [13]:
vectorindex_gemini=FAISS.from_documents(doc_chunks,embedding)

In [14]:
vectorindex_gemini

<langchain_community.vectorstores.faiss.FAISS at 0x29b5d2ae6c0>

In [15]:
vectorindex_gemini.save_local('vectorindex_gemini_01')

In [16]:
# Reload the FAISS index from the 'vectorindex_gemini_01' folder using the same embedding model.
# SECURITY: allow_dangerous_deserialization=True opts in to loading pickled data from that
# folder. Only do this for an index you created yourself; never point it at an index
# obtained from an untrusted source.
vectorindex_load=FAISS.load_local('vectorindex_gemini_01',embedding,allow_dangerous_deserialization=True)

In [17]:
chain=RetrievalQAWithSourcesChain.from_llm(llm=llm,retriever=vectorindex_load.as_retriever())

In [18]:
chain



In [19]:
import langchain

In [20]:
query="what are the Traditional Approaches of Life Cycle Models"

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain and emits a
# deprecation warning; invoke() is the supported entry point and accepts the same
# input dict and return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

  chain({'question':query},return_only_outputs=True)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what are the Traditional Approaches of Life Cycle Models"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "SLIIT  - Faculty of Computing\nIT 1060\nLife Cycle Model (Process Model)\n• A software life cycle (process) model:\n• is a descriptive and diagrammatic model \nof the life cycle of a software product;\n• identifies all the activities and phases \nnecessary for software development;\n• establishes a precedence ordering among \nthe different activities.\n• Life cycle models encourage systematic \nand disciplined software development.\nSLIIT 11",
      "question": "what

{'answer': 'The traditional approaches of Life Cycle Models are:\n1. Waterfall Model\n2. Incremental Model\n3. Prototyping Model\n4. Spiral Model\n5. Unified Process\n',
 'sources': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf'}

In [21]:
query="what is a fork?"

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain;
# invoke() is the supported entry point and accepts the same input dict and
# return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is a fork?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nFork\n• A fork is when a single flow of control splits into\ntwo or more parallel (concurrent) flows of\ncontrol.\n• Represents a split in the flow of control.",
      "question": "what is a fork?"
    },
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nForks & Joins\n• Forks and joins are used to showing activities that\ncan occur at the same time (in parallel).\n– this does not mean that the activi

{'answer': 'A fork is when a single flow of control splits into two or more parallel (concurrent) flows of control, representing a split in the flow of control. Unlike a branch point, the control flows down all forked paths.\n',
 'sources': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf'}

In [22]:
query="what is Partitioning"

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain;
# invoke() is the supported entry point and accepts the same input dict and
# return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "what is Partitioning"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nPartitioning\n• An activity partition is an activity group for\nactions that have some common characteristic.\n• Partitions often correspond to organizational\nunits or business actors in a business model.",
      "question": "what is Partitioning"
    },
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nEquivalence partitions\n• In equivalence-partitioning technique we need \nto test only one co

{'answer': 'Partitioning can refer to:\n\n*   An activity group for actions that have some common characteristic, often corresponding to organizational units or business actors in a business model. (',
 'sources': 'C:/Users/saifu/Desktop/rag\\_project1/docs\\\\SPM - all.pdf)'}

In [23]:
import requests
import pytesseract
from PIL import Image
from io import BytesIO

In [64]:
# def extract_text_from_image(image_path):
#     respose=requests.get(image_path)
#     img=Image.open(BytesIO(respose.content))
#     extracted_text=pytesseract.image_to_string(img)
#     return extracted_text.strip()

In [65]:
# The function above only accepts an image URL, so support for a local image path is needed as well.

In [66]:
# response.content: Gets the binary content of the image.
# BytesIO(response.content): Converts the binary data into a file-like object.
# Image.open(...): Opens the image using PIL (Python Imaging Library).
# extracted_text = pytesseract.image_to_string(img)
# .strip(): Removes leading and trailing whitespaces from the extracted text.

In [24]:
# Tell pytesseract where the Tesseract executable is installed on this machine.
# NOTE(review): the literal is a raw string (r"...") AND uses doubled backslashes, so the
# stored value contains literal "\\" pairs. This presumably works only because Windows
# tolerates repeated path separators — confirm, or use either r"C:\..." or "C:\\..." but not both.
pytesseract.pytesseract.tesseract_cmd = r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

In [26]:
# New function that accepts either an image URL or a local file path

def extract_text_from_image(image_source):
    """Run Tesseract OCR on an image given as an HTTP(S) URL or a local file path.

    Args:
        image_source: Either a URL starting with ``http://`` / ``https://`` or the
            path of an existing local image file.

    Returns:
        The recognised text with leading and trailing whitespace removed.

    Raises:
        ValueError: If ``image_source`` is neither an HTTP(S) URL nor an existing path.
        requests.HTTPError: If the image download answers with an error status.
    """
    # Match full URL schemes so a local file whose name merely starts with
    # "http" is not mistaken for a URL.
    if image_source.startswith(('http://', 'https://')):
        # timeout: do not let a stalled download hang the kernel forever.
        response = requests.get(image_source, timeout=30)
        # Fail on 4xx/5xx here instead of handing an error page to PIL.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
    elif os.path.exists(image_source):
        img = Image.open(image_source)
    else:
        raise ValueError("Invalid image source!")

    # Close the image (and any underlying file handle) once OCR is done.
    with img:
        extracted_text = pytesseract.image_to_string(img)
    return extracted_text.strip()

In [27]:
image_path="C:/Users/saifu/OneDrive/Pictures/Screenshots 1/Screenshot 2025-02-17 103629.png"
image_url="https://storage.googleapis.com/ticket_book_sb_saif/Screenshot%202025-02-17%20103629.png"

In [28]:
extracted_text=extract_text_from_image(image_path)

In [29]:
# Use the OCR result as the question.
query=extracted_text

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain;
# invoke() is the supported entry point and accepts the same input dict and
# return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Give me the explanation to\n\npartitioning with example."
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nEquivalence partitioning \nExample \n• Example of a function which takes a parameter \n“month”.\n• The valid range for the month is 1 to 12, \nrepresenting January to December. This valid \nrange is called a partition.\n• In this example there are two further partitions \nof invalid ranges.\nx < 1 1 ≤ x ≤ 12 12 < x",
      "question": "Give me the explanation to\n\npartitioning with example."
    },
 

{'answer': 'Equivalence partitioning is a testing technique where the input data is divided into partitions, and only one condition from each partition is tested, assuming all conditions in a partition are treated the same by the software. This helps reduce the number of tests needed.\n\nFor example, if a function takes a "month" parameter with a valid range of 1 to 12, there are three partitions: x < 1, 1 ≤ x ≤ 12, and 12 < x. Test cases would be chosen to test each partition, such as -2, 5, and 17.\n',
 'sources': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf'}

In [30]:
image_url="https://storage.googleapis.com/ticket_book_sb_saif/Screenshot%202025-02-17%20103629.png"
extracted_text=extract_text_from_image(image_url)

In [31]:
# Use the OCR result (image fetched by URL) as the question.
query=extracted_text

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain;
# invoke() is the supported entry point and accepts the same input dict and
# return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Give me the explanation to\n\npartitioning with example."
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nEquivalence partitioning \nExample \n• Example of a function which takes a parameter \n“month”.\n• The valid range for the month is 1 to 12, \nrepresenting January to December. This valid \nrange is called a partition.\n• In this example there are two further partitions \nof invalid ranges.\nx < 1 1 ≤ x ≤ 12 12 < x",
      "question": "Give me the explanation to\n\npartitioning with example."
    },
 

{'answer': 'Equivalence partitioning is a testing technique where you divide the input data into partitions. You only need to test one condition from each partition because it is assumed that all conditions in one partition will be treated the same way by the software. For example, if a function takes a "month" parameter with a valid range of 1 to 12, there are three partitions: x < 1, 1 ≤ x ≤ 12, and 12 < x. Test cases are chosen so that each partition would be tested (e.g., -2, 5, and 17).\n',
 'sources': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf'}

In [33]:
# Handwritten sample image. The backslash before "WhatsApp" is escaped: "\W" is an
# invalid escape sequence and makes Python emit a warning for this line. The
# resulting path string is unchanged.
image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\\WhatsApp Image 2025-02-17 at 12.10.24_6de15b34.jpg"
extracted_text=extract_text_from_image(image_path)
extracted_text

  image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\WhatsApp Image 2025-02-17 at 12.10.24_6de15b34.jpg"


'What is boundawy value avaligis 2'

In [None]:
# Problem: Tesseract (used above) does not perform well on handwritten text,
# so try one of the following instead:
# 1) LSTM-based OCR - --oem 1
# 2) DL - easyocr
# 3) GoogleVision API

In [25]:
# 1 --- --oem 1 --psm 6
def extract_handwritten_text(image_path, config=r'--oem 1 --psm 6'):
    """Run Tesseract OCR on a local image using a custom engine configuration.

    Args:
        image_path: Path of the image file to open.
        config: Command-line options passed through to Tesseract. The default,
            ``--oem 1 --psm 6``, selects OCR engine mode 1 (the LSTM-based engine)
            and page-segmentation mode 6.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    # The context manager closes the image file handle as soon as OCR is done.
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img, config=config)
    return text.strip()

In [26]:
# First handwritten sample. The backslash before "WhatsApp" is escaped: "\W" is an
# invalid escape sequence and makes Python emit a warning for this line. The
# resulting path string is unchanged.
image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\\WhatsApp Image 2025-02-17 at 12.10.23_4f201181.jpg"
extract_text=extract_handwritten_text(image_path)
extract_text

  image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\WhatsApp Image 2025-02-17 at 12.10.23_4f201181.jpg"


'What js partitioning ?'

In [27]:
# Second handwritten sample. The backslash before "WhatsApp" is escaped: "\W" is an
# invalid escape sequence and makes Python emit a warning for this line. The
# resulting path string is unchanged.
image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\\WhatsApp Image 2025-02-17 at 12.10.24_6de15b34.jpg"
extract_text=extract_handwritten_text(image_path)
extract_text

  image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\WhatsApp Image 2025-02-17 at 12.10.24_6de15b34.jpg"


'© hab is boamdey value analysts 7'

In [None]:
# So `--oem 1 --psm 6` does not perform well on handwritten text either.

In [None]:
#2 easyocr
# import easyocr
# reader = easyocr.Reader(['en'])
# text = reader.readtext(image_path, detail=0)
# print("Extracted Text:", " ".join(text))


: 

EasyOCR took too long to run, so use the Google Vision API instead.

In [28]:
#Google Vision API
import os
import requests
from google.cloud import vision
from google.cloud.vision_v1 import types
from PIL import Image
from io import BytesIO


In [None]:
# Point the Google Cloud client libraries at the service-account key file.
# setdefault(): keep a value already provided by the environment (or loaded from
# .env) and fall back to this machine-specific path only when none is set.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "C:\\Users\\saifu\\Desktop\\rag_project1\\09c1a.json")

In [30]:
client = vision.ImageAnnotatorClient()

In [49]:
def extract_text_from_image(image_source):
    """Extract text from an image with the Google Cloud Vision text-detection API.

    Note: this re-uses the name of the earlier Tesseract-based function in this
    notebook, so running this cell replaces that definition in the kernel.

    Args:
        image_source: Either a URL starting with ``http://`` / ``https://`` or the
            path of an existing local image file.

    Returns:
        The description of the first text annotation with surrounding whitespace
        removed, or ``None`` when the API returns no text annotations.

    Raises:
        ValueError: If ``image_source`` is neither an HTTP(S) URL nor an existing path.
        requests.HTTPError: If the image download answers with an error status.
        RuntimeError: If the Vision API reports an error for the image.
    """
    # Match full URL schemes so a local file whose name merely starts with
    # "http" is not mistaken for a URL.
    if image_source.startswith(('http://', 'https://')):
        # timeout: do not let a stalled download hang the kernel forever.
        download = requests.get(image_source, timeout=30)
        # Fail on 4xx/5xx here instead of sending an error page to the API.
        download.raise_for_status()
        content = download.content
    elif os.path.exists(image_source):
        with open(image_source, 'rb') as image_file:
            content = image_file.read()
    else:
        raise ValueError("Invalid image source!")

    response = client.text_detection(image=vision.Image(content=content))
    # The API reports per-image failures inside the response instead of raising,
    # so surface them rather than silently returning None.
    if response.error.message:
        raise RuntimeError(f"Vision API error: {response.error.message}")

    texts = response.text_annotations
    if texts:
        return texts[0].description.strip()
    return None

In [50]:
# Handwritten sample image. The backslash before "WhatsApp" is escaped: "\W" is an
# invalid escape sequence and makes Python emit a warning for this line. The
# resulting path string is unchanged.
image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\\WhatsApp Image 2025-02-17 at 12.10.24_6de15b34.jpg"
extract_text=extract_text_from_image(image_path)
extract_text

  image_path="C:\\Users\\saifu\\Desktop\\rag_project1\\img_hand\WhatsApp Image 2025-02-17 at 12.10.24_6de15b34.jpg"


'what is boundary value analysis?'

So from now on I am going to use the Google Vision API; compared with the first option (Tesseract), it gives better results.

In [None]:
image_path="https://storage.googleapis.com/ticket_book_sb_saif/WhatsApp%20Image%202025-02-17%20at%2012.10.23_4f201181.jpg"
extract_text=extract_text_from_image(image_path)


In [52]:
# Use the OCR result as the question.
query=extract_text

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain;
# invoke() is the supported entry point and accepts the same input dict and
# return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "What is partitioning?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nPartitioning\n• An activity partition is an activity group for\nactions that have some common characteristic.\n• Partitions often correspond to organizational\nunits or business actors in a business model.",
      "question": "What is partitioning?"
    },
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nEquivalence partitioning \nExample \n• Example of a function which takes a parameter \n“mon

{'answer': 'Partitioning can refer to activity partitioning, which is an activity group for actions that have some common characteristic (',
 'sources': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf). It can also refer to equivalence partitioning, where a valid range is called a partition, for example, the valid range for the month is 1 to 12 (representing January to December). In equivalence partitioning, you only need to test one condition from each partition because you assume that all the conditions in one partition will be treated the same way by the software ('}

In [None]:
image_path="https://storage.googleapis.com/ticket_book_sb_saif/00007.png"
extract_text=extract_text_from_image(image_path)

In [54]:
# Use the OCR result as the question.
query=extract_text

# Print LangChain's intermediate chain/LLM traces for this run.
langchain.debug=True

# Calling the chain object directly (chain(...)) is deprecated in LangChain;
# invoke() is the supported entry point and accepts the same input dict and
# return_only_outputs flag.
chain.invoke({'question':query},return_only_outputs=True)

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "Briefly explin What is Boundary\nvalue analysis with real world\nexample"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\n• Write test cases for input box accepting \nnumbers between 1 and 1000 using Boundary \nvalue analysis\nBoundary Value Analysis\nExample",
      "question": "Briefly explin What is Boundary\nvalue analysis with real world\nexample"
    },
    {
      "context": "IT1060 - Software Process Modeling\nSLIIT  - Faculty of Computing\nBoundary value analysis\n• Equivalence partitioning is no

{'answer': 'Boundary Value Analysis focuses on testing values at the edges of input boundaries. For example, when testing an input box that accepts numbers between 1 and 1000, test cases should include the boundary values (1 and 1000), values just below the boundaries (0 and 999), and values just above the boundaries (2 and 1001).\n',
 'sources': 'C:/Users/saifu/Desktop/rag_project1/docs\\SPM - all.pdf'}