In [1]:
# Install the document-loading, LangChain and Postgres/pgvector packages.
# The explicit %pip magic installs into the running kernel's environment and
# does not depend on IPython's automagic setting being enabled.
%pip install -U unstructured pypdf langchain langchain-community langchain-huggingface psycopg2-binary pgvector

Collecting unstructured
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting langchain
  Using cached langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-huggingface
  Using cached langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting pgvector
  Downloading pgvector-0.4.0-py3-none-any.whl.metadata (17 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Colle

In [2]:
# Use the %pip magic rather than a `!pip` shell call so the package is
# installed into the environment the running kernel actually uses.
%pip install -U python-docx

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
Installing collected packages: python-docx
Successfully installed python-docx-1.1.2


In [3]:
# import things from langchain
# import os for managing file paths
import os

# import document_loaders for loading text and PDFs
from langchain.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader

#import text_splitter for splitting large texts into smaller chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# import embeddings for converting text into numerical vectors
from langchain.embeddings import HuggingFaceEmbeddings

#import vectorstore for storing and retrieving embeddings
from langchain.vectorstores.pgvector import PGVector

In [34]:
# Load document function
def load_document(file_path):
    """
    Read a .txt, .pdf, or .docx file through the matching LangChain loader.

    Args:
        file_path (str): Location of the file to load. The extension is
            compared case-insensitively to pick the loader.

    Returns:
        Whatever the selected loader's ``load()`` call returns.

    Raises:
        ValueError: If the extension is not .txt, .pdf, or .docx.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Factories are lambdas so that only the loader matching the extension
    # is ever constructed.
    loader_factories = {
        '.txt': lambda: TextLoader(file_path, encoding='utf-8'),
        '.pdf': lambda: PyPDFLoader(file_path),
        '.docx': lambda: UnstructuredWordDocumentLoader(file_path),
    }

    make_loader = loader_factories.get(suffix)
    if make_loader is None:
        raise ValueError(f"Unsupported file format: {suffix}")
    return make_loader().load()

# Split document function
def split_document(document, chunk_size=500, chunk_overlap=100):
    """
    Break loaded documents into smaller, overlapping chunks.

    Args:
        document: Documents to split; handed unchanged to ``split_documents``.
        chunk_size (int): Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_size``.
        chunk_overlap (int): Forwarded to RecursiveCharacterTextSplitter as
            ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(document)

# Set up PGVector store from document chunks using HuggingFace embeddings
def create_pgvector_store(
    chunks,
    connection_string,
    collection_name="srs_documents",
    model_name="sentence-transformers/all-mpnet-base-v2",
    pre_delete_collection=False,
):
    """
    Create a PGVector store from document chunks using HuggingFace embeddings.

    Args:
        chunks: Document chunks to embed and store; passed to
            ``PGVector.from_documents`` as ``documents``.
        connection_string (str): Database connection string passed to PGVector.
        collection_name (str): Name of the PGVector collection to write into.
        model_name (str): Embedding model handed to ``HuggingFaceEmbeddings``.
            Defaults to the model this function previously hard-coded.
        pre_delete_collection (bool): Forwarded to ``PGVector.from_documents``.
            Pass True when re-running ingestion so the same chunks are not
            appended to the collection a second time. Defaults to False,
            which keeps the earlier append-only behaviour.

    Returns:
        The PGVector store returned by ``PGVector.from_documents``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    vector_store = PGVector.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=collection_name,
        connection_string=connection_string,
        pre_delete_collection=pre_delete_collection,
    )
    return vector_store


if __name__ == "__main__":
    # Path to your document
    document_path = "./data/Python SRS.docx"

    try:
        # Step 1. Load the document using file path
        document = load_document(document_path)
        print(f"Document loaded successfully: {document_path}")

        # Step 2. Split the document into smaller chunks
        chunks = split_document(document)
        print(f"Document split into {len(chunks)} chunks")

        # Step 3. Set up PgVector for storing and retrieving embeddings.
        # Prefer the PGVECTOR_CONNECTION_STRING environment variable so the
        # database credentials do not have to live in the notebook.
        # NOTE(review): the fallback literal still embeds a username and
        # password; remove it once the environment variable is set wherever
        # this notebook runs.
        connection_string = os.getenv(
            "PGVECTOR_CONNECTION_STRING",
            "postgresql+psycopg2://Rupantar:1234@localhost/genairagdb",
        )
        vector_store = create_pgvector_store(chunks, connection_string)
        print("PgVector store created successfully", vector_store)

    except Exception as e:
        print(f"Error occured : {e}")
        # Re-raise so the cell fails visibly. The retrieval cell further down
        # reads `vector_store`; swallowing the error here would leave that
        # name undefined or stale from an earlier run.
        raise

Document loaded successfully: ./data/Python SRS.docx
Document split into 14 chunks
PgVector store created successfully <langchain_community.vectorstores.pgvector.PGVector object at 0x0000019551223B50>


  store = cls(


In [7]:
# Install sentence-transformers into the kernel's environment. The explicit
# %pip magic does not depend on IPython's automagic setting being enabled.
%pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [None]:
# # Retrieving the embeddings from the pgvector

# from langchain.vectorstores.pgvector import PGVector
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.schema import Document

# # Example SRS documents (replace with your actual data)
# srs_documents = [
#     "Functional requirement 1: The system shall...",
#     "Functional requirement 2: The user shall be able to...",
#     # Add more SRS document chunks as needed
# ]

# # Initialize the 768-dim embedding model
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# # Define your new collection name (to separate from previous one)
# collection_name = "srs_chunks_768"

# # Initialize PGVector vectorstore
# vectorstore = PGVector.from_documents(
#     documents=[Document(page_content=chunk) for chunk in srs_documents],
#     embedding=embedding_model,
#     collection_name=collection_name,
#     connection_string="postgresql+psycopg2://Rupantar:1234@localhost:5432/genairagdb"
# )

# # Create a retriever to fetch relevant SRS chunks
# retriever = vectorstore.as_retriever()

# # Retrieve documents based on a query
# docs = retriever.get_relevant_documents('Extract all functional requirements')

# # Print preview of each document (first 500 characters)
# for i, doc in enumerate(docs):
#     print(f"--- Document {i+1} ---")
#     print(doc.page_content[:500])
#     print()


  store = cls(


--- Document 1 ---
Functional requirement 1: The system shall...

--- Document 2 ---
Functional requirement 2: The user shall be able to...



In [26]:
# Install the Groq client into the kernel's environment. The explicit %pip
# magic does not depend on IPython's automagic setting being enabled.
%pip install groq

Note: you may need to restart the kernel to use updated packages.


In [24]:
# Install LangChain, python-dotenv and the LangChain Groq integration into the
# kernel's environment. The explicit %pip magic does not depend on IPython's
# automagic setting being enabled.
%pip install langchain python-dotenv langchain-groq


Note: you may need to restart the kernel to use updated packages.


In [None]:
# from groq import Groq
# from dotenv import load_dotenv
# import os

# # Load environment variables from .env file
# load_dotenv()

# # Access the API key
# api_key = os.getenv("GROQ_API_KEY")


# # Initialize the client with your API key
# llm = Groq(api_key=api_key)

# # Create a function to invoke LLaMA-3
# def query_llama3(prompt):
#     response = llm.chat.completions.create(
#         model="llama3-8b-8192",  # or "llama3-70b-8192" if you prefer
#         messages=[
#             {"role": "user", "content": prompt}
#         ]
#     )
#     return response.choices[0].message.content

# def extract_functional_requirements(chunks):
#     prompt_template = '''Extract all functional requirements from the following software specification chunk:

# {srs_chunk}

# Return the result in JSON format with the following fields:
# - id
# - description
# - module (if any)'''

#     results = []
#     for chunk in chunks:
#         response = query_llama3(prompt_template.format(srs_chunk=chunk))
#         results.append(response)
#     return results

In [38]:
# Code Generated by Sidekick is for learning and experimentation purposes only. 

# 1. Imports
# from langchain.retrievers import VectorStoreRetriever
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq

# Pull settings from a local .env file into the process environment
load_dotenv()

# 2. Expose the vector store through the retriever interface, passing k=5
#    as the search option
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=5),
)

# 3. Define the prompt template for our question-answering chain.
#    {context} is the only placeholder; it is declared as the sole input
#    variable when the PromptTemplate is built from this string further down.
#    The doubled braces {{ }} are escaped so that they render as literal
#    braces in the JSON example instead of being parsed as placeholders.
template = """
You are an assistant that extracts the functional requirements from an SRS document of a project.
Your task is to extract the required points from the given context.
Given the context (pulled from the SRS):

{context}

Please extract and return **only** the following requirements in JSON format with these keys:
- "endpoints": list of objects {{ "path": "", "method": "", "params":[...], "description": "" }}
- "logic": description of the system's business rules and computations
- "schema": description of tables, relationships, and constraints
- "auth": description of authentication and authorization mechanisms
 """

# template = """
#       You are an assistant that extracts backend development requirements from an SRS document.

#       Given the following context from the document:

#       {context}

#       Extract the following in JSON format:

#       {
#         "endpoints": [
#           {
#             "path": "",
#             "method": "",
#             "description": "",
#             "params": []
#           }
#         ],
#         "logic": "Summary of the core business logic and workflows.",
#         "schema": "Describe the database tables, fields, and relationships.",
#         "auth": "Explain the login, JWT usage, roles (user, manager), and access control."
#       }

#       Only return JSON. Do not add any explanation.

# """


# Build the prompt; "context" is the only variable the template consumes.
prompt = PromptTemplate(input_variables=["context"], template=template)

# 4. Configure the Groq-hosted chat model.
groqLLM = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=os.getenv("GROQ_API_KEY"),  # None when the variable is not set
)

# 5. Create the retrieval chain with the retriever and the prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=groqLLM,  # Use Groq LLM
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt}
)

# 6. Run the chain on the document.
# NOTE(review): the template has no {question} placeholder, so this query
# presumably only drives retrieval and is not shown to the model - confirm.
query = "Extract the functional requirements from the document for upcoming project."

# Chain.run() is deprecated in LangChain; invoke() is its replacement.
# RetrievalQA reads the question from the "query" key and returns the
# answer text under the "result" key.
response = qa_chain.invoke({"query": query})["result"]

# Print the output
print(response)


Based on the provided context, I was unable to extract the required points as the context seems to be incomplete and repetitive. However, I can provide a possible interpretation of the given information.

Assuming the context is referring to the Leave Management System (LMS) and Pods, here's a possible extraction of the requirements in JSON format:

```json
{
  "endpoints": [
    {
      "path": "/expense",
      "method": "POST",
      "params": ["employee_id", "expense_amount", "supporting_documents"],
      "description": "Submit expense with valid supporting documents"
    },
    {
      "path": "/expense",
      "method": "GET",
      "params": ["employee_id"],
      "description": "Retrieve expense history for a specific employee"
    }
  ],
  "logic": "The system should allow employees to submit expenses after completing training modules. The system should validate the submitted expenses and ensure they have valid supporting documents. The system should also allow managers to vi