In [2]:
# Scratch cell: prints a throwaway string (appears to be a kernel smoke test).
print('jjj')


jjj


In [46]:
from langchain_community.llms import Ollama  
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber
import re

class PDFRAGSystem:
    """Retrieval-augmented question answering over the text of one PDF.

    Intended call order: ``process_pdf`` -> ``create_vector_store`` ->
    ``create_qa_chain``. The returned chain is queried with
    ``chain.invoke({"query": ...})``.
    """

    def __init__(self):
        """Set up the text splitter, the embedding model and the LLM."""
        # 500-character chunks with a 100-character overlap, measured by len().
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.llm = Ollama(model="llama2")

    def process_pdf(self, file_path):
        """Extract, clean and chunk the text of a PDF.

        Args:
            file_path: Path of the PDF, handed to ``pdfplumber.open``.

        Returns:
            The chunks returned by ``self.text_splitter.split_text``.

        Raises:
            RuntimeError: If opening, extracting or splitting fails. The
                original exception is attached as ``__cause__``.
        """
        try:
            with pdfplumber.open(file_path) as pdf:
                # `or ""` guards against pages for which extract_text()
                # returns a falsy value (e.g. no extractable text).
                pages = [
                    self._clean_text(page.extract_text() or "")
                    for page in pdf.pages
                ]
            return self.text_splitter.split_text("\n".join(pages))
        except Exception as e:
            # Chain explicitly so the root cause stays in the traceback.
            raise RuntimeError(f"PDF processing failed: {str(e)}") from e

    def _clean_text(self, text):
        """Return *text* reduced to ASCII with normalized whitespace.

        Every run of non-ASCII characters is replaced by a single space, so
        text in non-Latin scripts is discarded. Whitespace runs are then
        collapsed to one space and the ends are stripped.
        """
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return re.sub(r'\s+', ' ', ascii_only).strip()

    def create_vector_store(self, chunks):
        """Embed *chunks* and index them in a FAISS vector store.

        Args:
            chunks: Text chunks, as returned by ``process_pdf``.

        Returns:
            The store built by ``FAISS.from_texts``.

        Raises:
            ValueError: If *chunks* is empty.
        """
        # Fail early with a clear message: an empty list would otherwise
        # error out inside the FAISS wrapper (NOTE(review): observed as an
        # IndexError in langchain_community; confirm for your version).
        if not chunks:
            raise ValueError("No text chunks to index; the document produced no extractable text.")
        return FAISS.from_texts(chunks, self.embeddings)

    def create_qa_chain(self, vector_store):
        """Build a "stuff" RetrievalQA chain over *vector_store*.

        The prompt places the retrieved context first, then the question,
        and ends with ``Answer:`` for the model to complete.

        Args:
            vector_store: Store whose ``as_retriever()`` supplies context.

        Returns:
            The chain produced by ``RetrievalQA.from_chain_type``.
        """
        prompt_template = PromptTemplate(
            template="Context: {context}\n\nQuestion: {question}\nAnswer:",
            input_variables=["context", "question"]
        )
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever(),
            chain_type_kwargs={"prompt": prompt_template}
        )

# Example usage: index one PDF and ask a single question about it.
if __name__ == "__main__":
    rag_system = PDFRAGSystem()
    # Hard-coded local Windows path; point this at a PDF that exists on your machine.
    pdf_path = "C:\\Users\\Holisol\\Downloads\\Warehousing & Delivery Agreement - EKKO.pdf"

    # Extract the PDF text and split it into chunks
    chunks = rag_system.process_pdf(pdf_path)
    
    # Embed the chunks into a FAISS vector store
    vector_store = rag_system.create_vector_store(chunks)

    # Build the retrieval QA chain on top of the store
    qa_chain = rag_system.create_qa_chain(vector_store)

    # Ask a question; the chain receives it under the "query" key
    question = "Validity period of agreement?"
    result = qa_chain.invoke({"query": question})
    
    # The answer text is read from the "result" key of the chain output
    print(f"Answer: {result['result']}")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Answer: Based on the provided text, the validity period of the agreement is not explicitly mentioned. However, there are some clues that suggest the validity period may be defined in the agreement.

Clause 7.4 states that the parties must comply with statutory laws as may be applicable, which suggests that the agreement has a limited duration. Additionally, clause 19.1 provides that all notices or communications related to the agreement must be made in writing and sent by courier, registered mail, or hand delivery, which implies that the agreement has a defined lifespan.

However, without further information or context, it is impossible to determine the exact validity period of the agreement with certainty. It may be necessary to review the full agreement or consult with legal counsel to determine the specific terms and conditions of the agreement related to its validity period.


In [17]:
import logging
import time
import asyncio

logging.basicConfig(level=logging.INFO)

class PDFRAGSystem:
    """PDF question-answering pipeline with timing logs.

    The pipeline methods are coroutines so callers can ``await`` them, but
    none of them awaits anything: each one does its work synchronously when
    awaited.
    """

    def __init__(self):
        """Set up the text splitter, the embedding model and the LLM."""
        # 300-character chunks with a 50-character overlap.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=50
        )
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.llm = Ollama(model="llama2")

    async def process_pdf(self, file_path):
        """Extract, clean and chunk the text of a PDF, logging the duration.

        Args:
            file_path: Path of the PDF, handed to ``pdfplumber.open``.

        Returns:
            The chunks returned by ``self.text_splitter.split_text``.

        Raises:
            RuntimeError: If opening, extracting or splitting fails. The
                original exception is attached as ``__cause__``.
        """
        logging.info("Processing PDF...")
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the system clock is adjusted mid-run.
        started = time.perf_counter()
        try:
            with pdfplumber.open(file_path) as pdf:
                # `or ""` guards against pages for which extract_text()
                # returns a falsy value (e.g. no extractable text).
                pages = [
                    self._clean_text(page.extract_text() or "")
                    for page in pdf.pages
                ]
            texts = self.text_splitter.split_text("\n".join(pages))
            logging.info(
                "PDF processed in %.2f sec, %d chunks created.",
                time.perf_counter() - started,
                len(texts),
            )
            return texts
        except Exception as e:
            # Chain explicitly so the root cause stays in the traceback.
            raise RuntimeError(f"PDF processing failed: {str(e)}") from e

    def _clean_text(self, text):
        """Return *text* reduced to ASCII with normalized whitespace.

        Every run of non-ASCII characters is replaced by a single space, so
        text in non-Latin scripts is discarded. Whitespace runs are then
        collapsed to one space and the ends are stripped.
        """
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return re.sub(r'\s+', ' ', ascii_only).strip()

    async def create_vector_store(self, chunks):
        """Embed *chunks* into a FAISS vector store, logging the duration.

        Args:
            chunks: Text chunks, as returned by ``process_pdf``.

        Returns:
            The store built by ``FAISS.from_texts``.

        Raises:
            ValueError: If *chunks* is empty.
        """
        # Fail early with a clear message: an empty list would otherwise
        # error out inside the FAISS wrapper (NOTE(review): observed as an
        # IndexError in langchain_community; confirm for your version).
        if not chunks:
            raise ValueError("No text chunks to index; the document produced no extractable text.")
        logging.info("Creating FAISS vector store...")
        started = time.perf_counter()
        vector_store = FAISS.from_texts(chunks, self.embeddings)
        logging.info("Vector store created in %.2f sec.", time.perf_counter() - started)
        return vector_store

    async def create_qa_chain(self, vector_store):
        """Build a "stuff" RetrievalQA chain over *vector_store*.

        The prompt places the retrieved context first, then the question,
        and ends with ``Answer:`` for the model to complete.

        Args:
            vector_store: Store whose ``as_retriever()`` supplies context.

        Returns:
            The chain produced by ``RetrievalQA.from_chain_type``.
        """
        prompt_template = PromptTemplate(
            template="Context: {context}\n\nQuestion: {question}\nAnswer:",
            input_variables=["context", "question"]
        )
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever(),
            chain_type_kwargs={"prompt": prompt_template}
        )

async def main():
    """Run the PDF question-answering pipeline once and print the answer.

    Builds the chunks, the vector store and the QA chain in sequence, asks a
    single hard-coded question, logs how long the answer took and prints it.
    """
    system = PDFRAGSystem()
    source_pdf = "C:\\Users\\Holisol\\Downloads\\Agreement - Warehouse Services (Holisol Logistics).pdf"

    # Each stage consumes the previous stage's output:
    # PDF text chunks -> FAISS store -> retrieval QA chain.
    pieces = await system.process_pdf(source_pdf)
    store = await system.create_vector_store(pieces)
    chain = await system.create_qa_chain(store)

    query_text = "Is there any unlawful statement according to the supply chain industry in Hindi?"
    logging.info("Asking question...")

    # Time only the chain invocation (retrieval plus generation).
    asked_at = time.time()
    response = chain.invoke({"query": query_text})
    logging.info(f"Answer generated in {time.time() - asked_at:.2f} sec.")

    answer = response['result']
    print(f"Answer: {answer}")

# Entry point: drive the async pipeline to completion on a fresh event loop.
# NOTE(review): asyncio.run() cannot be called while another event loop is
# running in the same thread; if this environment already runs one, use
# `await main()` instead — confirm for your setup.
if __name__ == "__main__":
    asyncio.run(main())


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:root:Processing PDF...
INFO:root:PDF processed in 4.99 sec, 522 chunks created.
INFO:root:Creating FAISS vector store...
INFO:root:Vector store created in 6.80 sec.
INFO:root:Asking question...
INFO:root:Answer generated in 247.59 sec.


Answer: Yes, there are several unlawful statements according to the supply chain industry in Hindi. Here are some examples:

1. निर्माता के पदार्इं सुधारना (Nirmata ke padaariin sudharnaa) - This means that the supplier is not responsible for any damage or loss caused to the goods during storage or transportation, even if it was due to the supplier's negligence.
2. मालिक के सम्झावे (Malike ke samjave) - This means that the owner of the goods has the right to inspect and audit the supplier's facilities and operations at any time.
3. पदार्इं सुधारना (Padaariin sudharnaa) - This means that the supplier is not responsible for any damage or loss caused to the goods during storage or transportation, even if it was due to the supplier's negligence.
4. कार्यांतरण की सुझाव (Kaaryaaantaranaa ke sujhaava) - This means that the supplier must provide a detailed report of all activities related to the goods, including storage and transportation.
5. सुधारना के लिये (Sudharnaa ke liye) - This means th

In [54]:
from langchain_community.llms import Ollama  
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pytesseract  # OCR library
from PIL import Image  # For handling images
import re
import os

# Set the path to the Tesseract executable (if needed)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

class ImageRAGSystem:
    """Retrieval-augmented question answering over text OCR'd from an image.

    Intended call order: ``process_image`` -> ``create_vector_store`` ->
    ``create_qa_chain``. The returned chain is queried with
    ``chain.invoke({"query": ...})``.
    """

    def __init__(self):
        """Set up the text splitter, the embedding model and the LLM."""
        # 500-character chunks with a 100-character overlap, measured by len().
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Ollama-served model tagged "llama3.1:8b".
        self.llm = Ollama(model="llama3.1:8b")

    def process_image(self, image_path):
        """OCR an image, then clean and chunk the recognized text.

        Args:
            image_path: Path of the image, handed to ``PIL.Image.open``.

        Returns:
            The chunks returned by ``self.text_splitter.split_text``.

        Raises:
            RuntimeError: If opening, OCR or splitting fails. The original
                exception is attached as ``__cause__``.
        """
        try:
            # The context manager closes the image's file handle as soon as
            # OCR is done instead of leaving it to garbage collection.
            with Image.open(image_path) as image:
                raw_text = pytesseract.image_to_string(image)
            return self.text_splitter.split_text(self._clean_text(raw_text))
        except Exception as e:
            # Chain explicitly so the root cause stays in the traceback.
            raise RuntimeError(f"Image processing failed: {str(e)}") from e

    def _clean_text(self, text):
        """Return *text* reduced to ASCII with normalized whitespace.

        Every run of non-ASCII characters is replaced by a single space, so
        text in non-Latin scripts is discarded. Whitespace runs are then
        collapsed to one space and the ends are stripped.
        """
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return re.sub(r'\s+', ' ', ascii_only).strip()

    def create_vector_store(self, chunks):
        """Embed *chunks* and index them in a FAISS vector store.

        Args:
            chunks: Text chunks, as returned by ``process_image``.

        Returns:
            The store built by ``FAISS.from_texts``.

        Raises:
            ValueError: If *chunks* is empty (e.g. OCR found no text).
        """
        # Fail early with a clear message: an empty list would otherwise
        # error out inside the FAISS wrapper (NOTE(review): observed as an
        # IndexError in langchain_community; confirm for your version).
        if not chunks:
            raise ValueError("No text chunks to index; the image produced no extractable text.")
        return FAISS.from_texts(chunks, self.embeddings)

    def create_qa_chain(self, vector_store):
        """Build a "stuff" RetrievalQA chain over *vector_store*.

        The prompt places the retrieved context first, then the question,
        and ends with ``Answer:`` for the model to complete.

        Args:
            vector_store: Store whose ``as_retriever()`` supplies context.

        Returns:
            The chain produced by ``RetrievalQA.from_chain_type``.
        """
        prompt_template = PromptTemplate(
            template="Context: {context}\n\nQuestion: {question}\nAnswer:",
            input_variables=["context", "question"]
        )
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever(),
            chain_type_kwargs={"prompt": prompt_template}
        )

# Example usage: OCR one image, index its text and ask a single question.
if __name__ == "__main__":
    rag_system = ImageRAGSystem()
    image_path = "C:\\Users\\Holisol\\Downloads\\Screenshot 2025-02-06 231024.jpg" # Hard-coded local Windows path; point this at an image that exists on your machine

    # OCR the image and split the recognized text into chunks
    chunks = rag_system.process_image(image_path)
    
    # Embed the chunks into a FAISS vector store
    vector_store = rag_system.create_vector_store(chunks)

    # Build the retrieval QA chain on top of the store
    qa_chain = rag_system.create_qa_chain(vector_store)

    # Ask a question; the chain receives it under the "query" key
    question = "summarize this documents?"
    result = qa_chain.invoke({"query": question})
    
    # The answer text is read from the "result" key of the chain output
    print(f"Answer: {result['result']}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Answer: This document explains why a specific Udemy course is valuable and unique. Here's a summary:

* The course covers the entire SAP Supply Chain process, not just one module.
* It integrates topics from multiple SAP modules (SD, MM, IM, WM, LE) to provide a comprehensive understanding of how SAP Supply Chain works.
* The course takes a business-first approach, explaining how processes should work and then showing how they are configured in SAP R/3 across all relevant modules.
* The instructor has 25 years of experience as a SAP management consultant, giving them expertise to share.


In [36]:
from langchain_community.llms import Ollama  
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd  # For handling Excel/CSV data
import re

class ExcelRAGSystem:
    """Retrieval-augmented question answering over an Excel sheet.

    Intended call order: ``process_excel`` -> ``create_vector_store`` ->
    ``create_qa_chain``. The returned chain is queried with
    ``chain.invoke({"query": ...})``.
    """

    def __init__(self):
        """Set up the text splitter, the embedding model and the LLM."""
        # 500-character chunks with a 100-character overlap, measured by len().
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=100,
            length_function=len
        )
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.llm = Ollama(model="llama2")

    def process_excel(self, file_path):
        """Load an Excel file and turn its table into text chunks.

        The DataFrame is rendered with ``to_string(index=False)`` and then
        passed through ``_clean_text``, which collapses all whitespace. Row
        and column boundaries therefore survive only as single spaces.

        Args:
            file_path: Path of the workbook, handed to ``pd.read_excel``.

        Returns:
            The chunks returned by ``self.text_splitter.split_text``.

        Raises:
            RuntimeError: If reading, rendering or splitting fails. The
                original exception is attached as ``__cause__``.
        """
        try:
            frame = pd.read_excel(file_path)
            rendered = self._clean_text(frame.to_string(index=False))
            return self.text_splitter.split_text(rendered)
        except Exception as e:
            # Chain explicitly so the root cause stays in the traceback.
            raise RuntimeError(f"Excel processing failed: {str(e)}") from e

    def _clean_text(self, text):
        """Return *text* reduced to ASCII with normalized whitespace.

        Every run of non-ASCII characters is replaced by a single space, so
        text in non-Latin scripts is discarded. Whitespace runs are then
        collapsed to one space and the ends are stripped.
        """
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
        return re.sub(r'\s+', ' ', ascii_only).strip()

    def create_vector_store(self, chunks):
        """Embed *chunks* and index them in a FAISS vector store.

        Args:
            chunks: Text chunks, as returned by ``process_excel``.

        Returns:
            The store built by ``FAISS.from_texts``.

        Raises:
            ValueError: If *chunks* is empty.
        """
        # Fail early with a clear message: an empty list would otherwise
        # error out inside the FAISS wrapper (NOTE(review): observed as an
        # IndexError in langchain_community; confirm for your version).
        if not chunks:
            raise ValueError("No text chunks to index; the spreadsheet produced no extractable text.")
        return FAISS.from_texts(chunks, self.embeddings)

    def create_qa_chain(self, vector_store):
        """Build a "stuff" RetrievalQA chain over *vector_store*.

        The prompt places the retrieved context first, then the question,
        and ends with ``Answer:`` for the model to complete.

        Args:
            vector_store: Store whose ``as_retriever()`` supplies context.

        Returns:
            The chain produced by ``RetrievalQA.from_chain_type``.
        """
        prompt_template = PromptTemplate(
            template="Context: {context}\n\nQuestion: {question}\nAnswer:",
            input_variables=["context", "question"]
        )
        return RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=vector_store.as_retriever(),
            chain_type_kwargs={"prompt": prompt_template}
        )

# Example usage: load one Excel file, index its contents and ask a single question.
if __name__ == "__main__":
    rag_system = ExcelRAGSystem()
    excel_path = "C:\\Users\\Holisol\\Downloads\\USANA_Tauru_orderRe.xlsx" # Hard-coded local Windows path; point this at a workbook that exists on your machine

    # Render the sheet as text and split it into chunks
    chunks = rag_system.process_excel(excel_path)
    
    # Embed the chunks into a FAISS vector store
    vector_store = rag_system.create_vector_store(chunks)

    # Build the retrieval QA chain on top of the store
    qa_chain = rag_system.create_qa_chain(vector_store)

    # Ask a question; the chain receives it under the "query" key.
    # NOTE(review): the pipeline prints only the text in result['result'];
    # nothing in this cell renders a graph, so any chart the answer
    # mentions is not produced by this code.
    question = "give summary of dataset with graph"
    result = qa_chain.invoke({"query": question})
    
    # The answer text is read from the "result" key of the chain output
    print(f"Answer: {result['result']}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Answer: 
Based on the data provided, here is a summary of the orders placed by customers through the Usana API:

* Total number of orders: 8
* Number of customers: 4
* Total value of orders: $2,394.79
* Average order value: $315.46
* Most popular product category: NUTRACEUTICALS (5 orders)
* Least popular product category: ACCESSORIES (1 order)

In addition, a bar graph is provided below to visualize the distribution of order values:

![Order Value Distribution](https://i.imgur.com/Mu8K7V2.png)

The graph shows that the majority of orders have an average value of around $300, with a long tail of orders towards higher and lower values.


In [91]:
    from langchain_community.llms import Ollama  
    from langchain.chains import RetrievalQA
    from langchain.prompts import PromptTemplate
    from langchain_community.vectorstores import FAISS
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    import pdfplumber  # For PDF processing
    import pytesseract  # For OCR (image processing)
    from PIL import Image  # For handling images
    import pandas as pd  # For Excel processing
    import re
    import os

    # Set the path to the Tesseract executable (if needed)
    # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    class MultiFileRAGSystem:
        """Retrieval-augmented question answering over a PDF, image or table.

        ``process_file`` picks a loader from the file extension. The
        resulting chunks feed ``create_vector_store`` and then
        ``create_qa_chain``, whose chain is queried with
        ``chain.invoke({"query": ...})``.
        """

        def __init__(self):
            """Set up the text splitter, the embedding model and the LLM."""
            # 500-character chunks with a 100-character overlap, measured by len().
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=100,
                length_function=len
            )
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            # Ollama-served model tagged "llama3.1:8b".
            self.llm = Ollama(model="llama3.1:8b")

        def process_file(self, file_path):
            """Chunk *file_path* with the loader matching its extension.

            The match is case-insensitive: ``.pdf`` goes to ``process_pdf``,
            ``.jpg``/``.jpeg``/``.png`` to ``process_image`` and
            ``.xlsx``/``.xls``/``.csv`` to ``process_excel``.

            Args:
                file_path: Path of the file to load.

            Returns:
                The text chunks produced by the selected loader.

            Raises:
                ValueError: If the extension is not one of the above.
                RuntimeError: If the selected loader fails.
            """
            # Lower-case once; the original path is still what gets opened.
            lowered = file_path.lower()
            if lowered.endswith('.pdf'):
                return self.process_pdf(file_path)
            if lowered.endswith(('.jpg', '.jpeg', '.png')):
                return self.process_image(file_path)
            if lowered.endswith(('.xlsx', '.xls', '.csv')):
                return self.process_excel(file_path)
            raise ValueError("Unsupported file type. Please upload a PDF, image (JPG/PNG), or Excel file.")

        def process_pdf(self, file_path):
            """Extract, clean and chunk the text of a PDF.

            Args:
                file_path: Path of the PDF, handed to ``pdfplumber.open``.

            Returns:
                The chunks returned by ``self.text_splitter.split_text``.

            Raises:
                RuntimeError: If opening, extracting or splitting fails. The
                    original exception is attached as ``__cause__``.
            """
            try:
                with pdfplumber.open(file_path) as pdf:
                    # `or ""` guards against pages for which extract_text()
                    # returns a falsy value (e.g. no extractable text).
                    pages = [
                        self._clean_text(page.extract_text() or "")
                        for page in pdf.pages
                    ]
                return self.text_splitter.split_text("\n".join(pages))
            except Exception as e:
                # Chain explicitly so the root cause stays in the traceback.
                raise RuntimeError(f"PDF processing failed: {str(e)}") from e

        def process_image(self, file_path):
            """OCR an image, then clean and chunk the recognized text.

            Args:
                file_path: Path of the image, handed to ``PIL.Image.open``.

            Returns:
                The chunks returned by ``self.text_splitter.split_text``.

            Raises:
                RuntimeError: If opening, OCR or splitting fails. The
                    original exception is attached as ``__cause__``.
            """
            try:
                # The context manager closes the image's file handle as soon
                # as OCR is done instead of leaving it to garbage collection.
                with Image.open(file_path) as image:
                    raw_text = pytesseract.image_to_string(image)
                return self.text_splitter.split_text(self._clean_text(raw_text))
            except Exception as e:
                raise RuntimeError(f"Image processing failed: {str(e)}") from e

        def process_excel(self, file_path):
            """Load a CSV or Excel file and turn its table into text chunks.

            Paths ending in ``.csv`` (case-insensitive) are read with
            ``pd.read_csv``; everything else goes to ``pd.read_excel``. The
            DataFrame is rendered with ``to_string(index=False)`` and then
            whitespace-normalized, so row and column boundaries survive only
            as single spaces.

            Args:
                file_path: Path of the table file.

            Returns:
                The chunks returned by ``self.text_splitter.split_text``.

            Raises:
                RuntimeError: If reading, rendering or splitting fails. The
                    original exception is attached as ``__cause__``.
            """
            try:
                if file_path.lower().endswith('.csv'):
                    frame = pd.read_csv(file_path)
                else:
                    frame = pd.read_excel(file_path)
                rendered = self._clean_text(frame.to_string(index=False))
                return self.text_splitter.split_text(rendered)
            except Exception as e:
                raise RuntimeError(f"Excel processing failed: {str(e)}") from e

        def _clean_text(self, text):
            """Return *text* reduced to ASCII with normalized whitespace.

            Every run of non-ASCII characters is replaced by a single space,
            so text in non-Latin scripts is discarded. Whitespace runs are
            then collapsed to one space and the ends are stripped.
            """
            ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
            return re.sub(r'\s+', ' ', ascii_only).strip()

        def create_vector_store(self, chunks):
            """Embed *chunks* and index them in a FAISS vector store.

            Args:
                chunks: Text chunks, as returned by ``process_file``.

            Returns:
                The store built by ``FAISS.from_texts``.

            Raises:
                ValueError: If *chunks* is empty (e.g. OCR found no text).
            """
            # Fail early with a clear message: an empty list would otherwise
            # error out inside the FAISS wrapper (NOTE(review): observed as
            # an IndexError in langchain_community; confirm for your version).
            if not chunks:
                raise ValueError("No text chunks to index; the file produced no extractable text.")
            return FAISS.from_texts(chunks, self.embeddings)

        def create_qa_chain(self, vector_store):
            """Build a "stuff" RetrievalQA chain over *vector_store*.

            The prompt places the retrieved context first, then the
            question, and ends with ``Answer:`` for the model to complete.

            Args:
                vector_store: Store whose ``as_retriever()`` supplies context.

            Returns:
                The chain produced by ``RetrievalQA.from_chain_type``.
            """
            prompt_template = PromptTemplate(
                template="Context: {context}\n\nQuestion: {question}\nAnswer:",
                input_variables=["context", "question"]
            )
            return RetrievalQA.from_chain_type(
                llm=self.llm,
                chain_type="stuff",
                retriever=vector_store.as_retriever(),
                chain_type_kwargs={"prompt": prompt_template}
            )

    # Example usage: load one file of any supported type and ask a single question.
    if __name__ == "__main__":
        rag_system = MultiFileRAGSystem()
        file_path = "C:\\Users\\Holisol\\Downloads\\Screenshot 2025-02-08 013548.jpg" # Hard-coded local Windows path; replace with your file path

        # Any failure in the pipeline below is caught and printed rather than raised
        try:
            # Pick the loader from the file extension and split the text into chunks
            chunks = rag_system.process_file(file_path)
            
            # Embed the chunks into a FAISS vector store
            vector_store = rag_system.create_vector_store(chunks)

            # Build the retrieval QA chain on top of the store
            qa_chain = rag_system.create_qa_chain(vector_store)

            # Ask a question; the chain receives it under the "query" key
            question = "please summarize this law in three lines only "
            result = qa_chain.invoke({"query": question})
            
            # The answer text is read from the "result" key of the chain output
            print(f"Answer: {result['result']}")
        except Exception as e:
            print(f"Error: {str(e)}")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2


Answer: Unfortunately, the provided text is partially encoded and contains many errors, making it difficult to decipher. However, I will attempt to provide a summary of the content in three lines:

The document appears to be discussing online review processes and obtaining legal documents, possibly related to divorce or separation.

The author mentions the need for real ID documents and possibly navigating complex procedures for acquiring them.

A brief mention is made of needing help with paperwork, referencing pawn shops, and potentially dealing with legal issues.


: 