In [3]:
import os

from dotenv import find_dotenv,load_dotenv

env_path = find_dotenv()
load_dotenv(env_path)

DB_key = os.getenv("DB_key")
print(DB_key)

DB-key


In [20]:
from agentic_chunker import AgenticChunker
from ..raw import imoprt_data

# https://arxiv.org/pdf/2312.06648.pdf

from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub

obj = hub.pull("wfh/proposal-indexing")
llm = OllamaLLM(model="llama3.2:1b")
runnable = obj | llm

class Sentences(BaseModel):
    sentences: List[str]
    
# Extraction
extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)
def get_propositions(text):
    runnable_output = runnable.invoke({
    	"input": text
    }).content
    propositions = extraction_chain.invoke(runnable_output)["text"][0].sentences
    return propositions

text = load_pdf("R22_syllabus.pdf")
    
paragraphs = text.split("\n\n")
text_propositions = []
for i, para in enumerate(paragraphs[:5]):
    propositions = get_propositions(para)
    text_propositions.extend(propositions)
    print (f"Done with {i}")

print (f"You have {len(text_propositions)} propositions")
print(text_propositions[:10])

ac = AgenticChunker("llama3.2:1b")
ac.add_propositions(text_propositions)
print(ac.pretty_print_chunks())
chunks = ac.get_chunks(get_type='list_of_strings')
print(chunks)
documents = [Document(page_content=chunk, metadata={"source": "local"}) for chunk in chunks]
rag(documents, "agentic-chunks")

ImportError: attempted relative import with no known parent package

In [4]:
from pathlib import Path
print(Path.cwd())

/home/shasank/shasank/collage_project/RAG-chatBot/data


In [2]:
from chunkers import chunk_pdf

docs = chunk_pdf("R22_syllabus.pdf")
docs

[Document(metadata={'source': 'R22_syllabus.pdf', 'page': 0, 'page_label': '1'}, page_content='GAYATRI  VIDYA  PARISHAD  COLLEGE  FOR  DEGREE  AND  P.G.  COURSES  (A) \nRUSHIKONDA,  VISAKHAPATANAM  530045  | website:  www.gvpcdpgc.edu.in  \n(Approved  by A.I.C.T.E  | Affiliated  to Andhra  University  | An ISO 9001:2015  Certified  Institute)  \nENGINEERING  AND  TECHNOLOGY  PROGRAM                            \nDEPARTMENT  OF  COMPUTER  SCIENCE  AND  ENGINEERING  - AI&ML \n B.\n Tech\n Computer\n Science  and\n Engineering\n with  AI\n &\n ML\n (R-22\n Regulation )\n \n I\n Year\n –\n I\n Semester\n \n \n \n \nCourse  \ncode   \nCategory   \nCourse Title  Hours per \nweek   \nInternal \nMarks   \nExternal \nMarks   \nTotal \nMarks   \nCredits  \nL T P  \n \nCSM1101   \nBS Engineering Mathematics -I (Partial  \nDifferentiation, Multiple Integrals, \nFourier Series and Applications)   \n3  \n0  \n0  \n30  \n70  \n100  \n3'),
 Document(metadata={'source': 'R22_syllabus.pdf', 'page': 0, 'p

In [8]:
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import find_dotenv,load_dotenv
from langchain_ollama import OllamaEmbeddings
from langchain_pinecone import PineconeVectorStore
from chunkers import chunk_pdf

import time

env_path = find_dotenv()
load_dotenv(env_path)
Pinecone_key = os.getenv("Pinecone_key")
embedding_model = "bge-m3:latest"
index_name = "rag-db"
pc = Pinecone(api_key=Pinecone_key)
index = pc.Index(index_name)
print(index)
embeddings = OllamaEmbeddings(model=embedding_model)

  from tqdm.autonotebook import tqdm


<pinecone.data.index.Index object at 0x7f0f120d3770>


In [9]:
print(docs)

[Document(metadata={'source': 'R22_syllabus.pdf', 'page': 0, 'page_label': '1'}, page_content='GAYATRI  VIDYA  PARISHAD  COLLEGE  FOR  DEGREE  AND  P.G.  COURSES  (A) \nRUSHIKONDA,  VISAKHAPATANAM  530045  | website:  www.gvpcdpgc.edu.in  \n(Approved  by A.I.C.T.E  | Affiliated  to Andhra  University  | An ISO 9001:2015  Certified  Institute)  \nENGINEERING  AND  TECHNOLOGY  PROGRAM                            \nDEPARTMENT  OF  COMPUTER  SCIENCE  AND  ENGINEERING  - AI&ML \n B.\n Tech\n Computer\n Science  and\n Engineering\n with  AI\n &\n ML\n (R-22\n Regulation )\n \n I\n Year\n –\n I\n Semester\n \n \n \n \nCourse  \ncode   \nCategory   \nCourse Title  Hours per \nweek   \nInternal \nMarks   \nExternal \nMarks   \nTotal \nMarks   \nCredits  \nL T P  \n \nCSM1101   \nBS Engineering Mathematics -I (Partial  \nDifferentiation, Multiple Integrals, \nFourier Series and Applications)   \n3  \n0  \n0  \n30  \n70  \n100  \n3'), Document(metadata={'source': 'R22_syllabus.pdf', 'page': 0, 'pa

In [10]:

vector_store = PineconeVectorStore(index=index, embedding=embeddings)  # Pass the index object

doc_ids = vector_store.add_documents(documents=docs)
print(doc_ids)


['7ef8227e-bf8c-48fa-b823-87b2230d00d1', 'a15e6d2f-6ac8-4528-ae7b-0fa522a9b3db', 'a8468375-b34c-430f-81c8-32ab2bb7d272', 'e7a9cfdb-9c62-43ef-aa52-57ecf4a8b25e', '1a75607d-f20f-417e-8753-4d33f706a0cd', '566f0d41-cdf1-4cfd-b0aa-756214b35aaa', '4b77983e-0d8d-4fad-a220-df3774f45eac', '47066c2e-d8dd-4977-8c24-be7caa2aa7a5', '988d334e-2b06-46aa-b9e4-b9d8c63ab3c6', 'e16d12af-36fc-475a-95fd-b011f388aa72', 'b58da05f-ce8d-4ae7-8aa9-12a1e52f6ab3', '25b05e74-d377-437d-83e8-d2d9ca0abbeb', '9e780260-9048-4f60-9978-05e90c078268', '33a24fcd-d89f-4129-a549-409f610ba8fd', 'b3199de6-f1df-4fcd-9cf5-7552a957a558', 'c0f027d1-1712-4919-935a-ea5306ca0ed1', '5c91e168-ba38-4020-8c91-7fc7310a0229', '1edde300-c479-4ee9-88d3-24cee05e94d8', '6ede7e77-d05e-458f-9d6a-d47c50b54291', 'a25203c0-a19f-4aba-b590-26b2393063f7', 'dc3c571c-1485-45bd-9f6e-6cfd82c1b479', '436a2052-d362-48e7-b26e-1700d6051736', '9fd6d1ba-08a8-45be-9614-cdd04036dd33', '534f01c1-bfb0-46da-9b16-983becbb7b49', '3f77ee5d-b299-4dae-bde2-ac56286927b6',

In [1]:
from chunkers import chunk_pdf

docs = chunk_pdf("../data/Hands on Machine Learning with Scikit Learn and TensorFlow.pdf")

Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implement

In [2]:
Test_docs = chunk_pdf("../data/2-2 MOOCs,.pdf")

In [3]:
Test_docs

[Document(metadata={'source': '../data/2-2 MOOCs,.pdf', 'page': 0, 'page_label': '1'}, page_content='Dear Students,  \n \nIn 2-2 all of you must complete one NPTEL course of 8 weeks / 12  weeks as per your \nacademic regulations.  \n \nOnce you join the course in the course tab you see the mentor tab. Click the mentor tab, you \nwill see a list of faculties . You can choose any one of the faculty in the list.  \n \nAfter successful completion in your certificate internal marks for 25 will be given by NPTEL \nwill be multiplied by 2 and total 50 marks will be given as your internal exam and out of 75 \nmarks given by nptel will be reduced to 50 marks and will be given as your internal marks. \nTotal 100 marks.  \n \nOn successful completion of the course,  you have to submit an NPTEL certificate verified by \nSPOC of our college (N.V. Siva Krishna, Assistant Professor in Physics, BS&H Department)'),
 Document(metadata={'source': '../data/2-2 MOOCs,.pdf', 'page': 0, 'page_label': '1'}, p

In [None]:
from DB import uplode_to_pinecone


uplode_to_pinecone(index_name="test-db",docs=Test_docs,namespace="test_space",dimensions=1024)

True

In [1]:
from DB import deleat_by_name

deleat_by_name(index_name="test-db",namespace="test_space")

  from tqdm.autonotebook import tqdm


In [1]:
from chunkers import chunk_pdf

docs = chunk_pdf("../data/Hands on Machine Learning with Scikit Learn and TensorFlow.pdf")

Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implemented yet
Advanced encoding /UniGB-UTF16-H not implement

In [2]:
tittle = docs[0].metadata["source"]
namespace = tittle[8:]
namespace

'Hands on Machine Learning with Scikit Learn and TensorFlow.pdf'

In [4]:
from DB import upload_to_pinecone
upload_to_pinecone(index_name="main-db",docs=docs,namespace=namespace,dimensions=1024)

True

In [None]:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import streamlit as st
from chat import chat_with_ollama
from data_processing.DB import upload_to_pinecone
from data_processing.chunkers import chunk_pdf
from langchain_community.document_loaders.pdf import PyPDFLoader

# Initialize session state for chat history
if "messages" not in st.session_state:
    st.session_state.messages = []

# Sidebar for PDF upload and configuration
with st.sidebar:
    st.header("Upload PDF")
    index_name = st.text_input("Index Name", "main-db")
    namespace = st.text_input("Namespace", "default")
    dimensions = st.number_input("Dimensions", min_value=1, value=768)
    embedding_model = st.text_input("Embedding Model", "bge-m3:latest")
    
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    
    if uploaded_file is not None:
        with st.spinner("Processing PDF..."):
            # Save the uploaded file to a temporary location
            with open(os.path.join("/tmp", uploaded_file.name), "wb") as f:
                f.write(uploaded_file.getbuffer())
            
            pdf_path = os.path.join("/tmp", uploaded_file.name)
            docs = chunk_pdf(pdf_path)
            
            success = upload_to_pinecone(index_name, docs, namespace, dimensions, embedding_model)
            if success:
                st.success("PDF uploaded to Pinecone successfully!")
            else:
                st.error("Failed to upload PDF to Pinecone.")

# Main chat interface
st.title("Chat with Ollama")
model_name = st.text_input("Model Name", "llama3.2:1b")

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat input
if prompt := st.chat_input("What would you like to know about the document?"):
    # Add user message to chat history
    st.session_state.messages.append({"role": "user", "content": prompt})
    
    # Display user message
    with st.chat_message("user"):
        st.markdown(prompt)
    
    # Generate and display assistant response
    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            response = chat_with_ollama(model_name, prompt)
            st.markdown(response)
            
    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": response})