In [None]:
! pip install cohere pinecone-client langchain


In [70]:
import getpass
import os

import cohere
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Credentials are never stored in the notebook: each key is taken from the
# environment, and when it is missing the user is prompted for it (input hidden).
# The value is written back to os.environ because the code below reads it from there.
# NOTE: any key that was ever committed in this cell should be rotated.
for _key_name in ('COHERE_API_KEY', 'PINECONE_API_KEY'):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(_key_name + ': ')

# Single definition of the region, reused for the env var and the index spec
PINECONE_REGION = 'us-east-1'  # Adjust according to your Pinecone environment
os.environ['PINECONE_ENV'] = PINECONE_REGION

# Initialize Cohere client
cohere_api_key = os.environ['COHERE_API_KEY']
cohere_client = cohere.Client(cohere_api_key)

# Initialize Pinecone
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# Create the Pinecone index if it doesn't exist
index_name = "rag-qa2"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1024,  # must equal the embedding model's vector size (TODO confirm for the model in use)
        metric='cosine',  # other supported metrics: euclidean or dotproduct
        spec=ServerlessSpec(cloud='aws', region=PINECONE_REGION)  # Update the spec as necessary
    )

In [77]:
# Helpers for reading a PDF and splitting it into chunks
def read_data_from_doc(file_path):
    """Load the PDF at ``file_path`` with ``PyPDFLoader`` and return what ``load()`` yields."""
    return PyPDFLoader(file_path).load()

def make_chunks(docs, chunk_len=1000, chunk_overlap=200):
    """Split ``docs`` into overlapping chunks.

    The splitter is built with ``from_tiktoken_encoder``, with ``chunk_len`` passed
    as ``chunk_size`` and ``chunk_overlap`` passed through unchanged.
    NOTE(review): sizes are presumably counted in tiktoken tokens; confirm that the
    embedding model accepts inputs of this length.

    Returns whatever ``split_documents`` produces for ``docs``.
    """
    splitter_options = {"chunk_size": chunk_len, "chunk_overlap": chunk_overlap}
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)
    return splitter.split_documents(docs)

# Load the PDF and split it into chunks.
# 'dataset.pdf' is a relative path, so it is resolved against the kernel's working directory.
data = read_data_from_doc('dataset.pdf')
splits = make_chunks(data)  # relies on make_chunks' default chunk_len / chunk_overlap

In [78]:
# Embedding model handed to the indexing step.
# No API key is passed explicitly; presumably it is read from the COHERE_API_KEY
# environment variable (verify against the installed langchain version).
# NOTE(review): the model's vector size has to match the dimension the Pinecone index was created with; confirm.
embeddings = CohereEmbeddings(model="embed-english-v3.0", user_agent="my-app-v1")


In [79]:
def index_documents_in_pinecone(chunks, embeddings):
    """Build a vector store from ``chunks`` via ``LangchainPinecone.from_documents``.

    ``chunks`` and ``embeddings`` are forwarded positionally; the target index is
    the module-level ``index_name``. The object returned by ``from_documents`` is
    handed back to the caller.
    """
    target = {"index_name": index_name}
    return LangchainPinecone.from_documents(chunks, embeddings, **target)

In [80]:
# Index the document chunks in Pinecone and keep the resulting vector store for querying.
# NOTE(review): re-running this cell presumably inserts the same chunks again; confirm before re-executing.
vectorstore = index_documents_in_pinecone(splits, embeddings)

In [81]:
def query_pinecone(prompt, vectorstore):
    """Return the cleaned text of the first document retrieved for ``prompt``.

    Args:
        prompt: Query text handed to the retriever.
        vectorstore: Object exposing ``as_retriever()``; the retriever it returns
            must provide ``get_relevant_documents(prompt)``.

    Returns:
        The ``page_content`` of the first retrieved document, with non-breaking
        spaces replaced by regular spaces and surrounding whitespace stripped,
        or a fallback message when nothing is retrieved.
    """
    retriever = vectorstore.as_retriever()
    result = retriever.get_relevant_documents(prompt)

    if not result:
        return "Sorry, I couldn't find an answer."

    # Only the first hit is used; any further results are ignored.
    result_text = result[0].page_content
    # The non-breaking space is spelled as an escape so the cleanup is visible in
    # the source; replacing a plain space with a plain space would do nothing.
    return result_text.replace("\xa0", " ").strip()

In [None]:
# Run a sample question against the vector store and show the first retrieved chunk
query = "How does the proposed Intention-Enhanced Batch Generation method improve similar question generation for customer service chatbots, and what advantages does it offer over traditional approaches?"
answer = query_pinecone(query, vectorstore)
print(answer)  # Prints the result of the query