In [2]:
pip install pinecone-client


Collecting pinecone-client
  Downloading pinecone_client-3.0.2-py3-none-any.whl.metadata (12 kB)
Collecting certifi>=2019.11.17 (from pinecone-client)
  Using cached certifi-2024.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting urllib3>=1.26.0 (from pinecone-client)
  Using cached urllib3-2.2.0-py3-none-any.whl.metadata (6.4 kB)
Downloading pinecone_client-3.0.2-py3-none-any.whl (201 kB)
   ---------------------------------------- 0.0/201.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/201.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/201.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/201.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/201.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/201.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/201.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/201.4 kB ? eta -:--:--
   ------ ------------------------

In [None]:
import PyPDF2
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import pinecone

# Function to extract text from PDF and break it into chunks
def extract_chunks_from_pdf(pdf_file, chunk_size=200):
    """Read a PDF and return its text as a list of sentence-group chunks.

    Each page is split into sentences on its own, and every run of up to
    ``chunk_size`` consecutive sentences of that page is joined with single
    spaces into one chunk, so a chunk never spans two pages.

    Args:
        pdf_file: Path of the PDF file to open.
        chunk_size: Maximum number of sentences per chunk; must be >= 1.

    Returns:
        list[str]: The chunks in page order. Pages that yield no text
        contribute no chunks.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # Validate before touching the file: a step of 0 makes range() raise only
    # once a page is being processed, and a negative step would silently
    # produce no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    chunks = []
    with open(pdf_file, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # Guard against a falsy extraction result (e.g. an image-only
            # page) so sent_tokenize() is only ever handed real text.
            text = page.extract_text() or ""
            if not text.strip():
                continue
            sentences = sent_tokenize(text)
            chunks.extend(
                ' '.join(sentences[start:start + chunk_size])
                for start in range(0, len(sentences), chunk_size)
            )
    return chunks


from sklearn.feature_extraction.text import TfidfVectorizer

# Name of the Pinecone index shared by the store and query functions.
# Pinecone index names may only contain lowercase letters, digits and '-',
# so the underscore form "text_chunks_index" cannot be used.
_INDEX_NAME = "text-chunks-index"
# Vector length sent to Pinecone; the index is expected to have been created
# with this dimension.
_VECTOR_DIMENSION = 300
# Number of vectors sent per upsert request, to keep each request small.
_UPSERT_BATCH_SIZE = 100
# TF-IDF vectorizer fitted by store_chunks_in_pinecone(). Questions have to be
# embedded with the same fitted vocabulary, otherwise the question vector and
# the stored vectors do not share a coordinate system.
_fitted_vectorizer = None


def _to_fixed_length(sparse_row):
    """Densify one TF-IDF row and zero-pad it to ``_VECTOR_DIMENSION`` floats."""
    values = sparse_row.toarray().ravel().tolist()
    return values + [0.0] * (_VECTOR_DIMENSION - len(values))


def store_chunks_in_pinecone(chunks, api_key):
    """Embed text chunks with TF-IDF and upsert them into the Pinecone index.

    A ``TfidfVectorizer`` limited to ``_VECTOR_DIMENSION`` features is fitted
    on ``chunks``. Each chunk is stored under its list position (as a string
    id) with the chunk text attached as ``metadata["text"]``, so a query can
    return the text without needing the original list. The fitted vectorizer
    is kept at module level for ``find_top_relevant_chunks``.

    The index named ``_INDEX_NAME`` is assumed to exist already; this
    function does not create it.
    NOTE(review): confirm it was created with dimension 300 and the cosine
    metric, matching the values the original code passed.

    Args:
        chunks: Iterable of text chunks to index.
        api_key: Pinecone API key.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    global _fitted_vectorizer

    chunks = list(chunks)
    if not chunks:
        raise ValueError("chunks must contain at least one text chunk")

    # max_features caps the vocabulary so vectors never exceed the index size.
    vectorizer = TfidfVectorizer(max_features=_VECTOR_DIMENSION)
    tfidf_matrix = vectorizer.fit_transform(chunks)

    records = []
    for position, chunk in enumerate(chunks):
        values = _to_fixed_length(tfidf_matrix[position])
        if not any(values):
            # Cosine similarity is undefined for an all-zero vector, so a
            # chunk with no retained vocabulary term is not indexed.
            continue
        records.append(
            {"id": str(position), "values": values, "metadata": {"text": chunk}}
        )

    index = pinecone.Pinecone(api_key=api_key).Index(_INDEX_NAME)
    for start in range(0, len(records), _UPSERT_BATCH_SIZE):
        index.upsert(vectors=records[start:start + _UPSERT_BATCH_SIZE])

    # Publish the vectorizer only after the upload succeeded.
    _fitted_vectorizer = vectorizer


# Function to process a question and find top relevant chunks
def find_top_relevant_chunks(question, api_key):
    """Return the text of the stored chunks most similar to ``question``.

    The question is embedded with the vectorizer fitted by
    ``store_chunks_in_pinecone`` and the index is asked for its 3 nearest
    vectors; the chunk text is read from each match's metadata.

    Args:
        question: The question text to search for.
        api_key: Pinecone API key.

    Returns:
        list[str]: Up to 3 chunk texts, in the order Pinecone returns the
        matches. Empty if the question shares no term with the fitted
        vocabulary.

    Raises:
        RuntimeError: If ``store_chunks_in_pinecone`` has not been run in
            this session, so no fitted vectorizer is available.
    """
    if _fitted_vectorizer is None:
        raise RuntimeError(
            "No fitted vectorizer available; call store_chunks_in_pinecone() first"
        )

    question_values = _to_fixed_length(_fitted_vectorizer.transform([question])[0])
    if not any(question_values):
        # Nothing in the question is in the vocabulary: no meaningful match.
        return []

    index = pinecone.Pinecone(api_key=api_key).Index(_INDEX_NAME)
    response = index.query(vector=question_values, top_k=3, include_metadata=True)
    return [match.metadata["text"] for match in response.matches]

# Main function
def main():
    """Index ``chatbot.pdf`` in Pinecone, then answer questions interactively.

    The PDF is chunked and stored once. Afterwards each question typed at the
    prompt prints the chunks returned by ``find_top_relevant_chunks`` until
    the user types ``exit``.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable; if it is unset or empty the user is prompted for it without
    echo, so the key never has to be written into the notebook.
    """
    import getpass
    import os

    # Extract chunks from PDF
    pdf_file = "chatbot.pdf"
    chunks = extract_chunks_from_pdf(pdf_file)

    # Store chunks in Pinecone
    api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
    store_chunks_in_pinecone(chunks, api_key)

    # Process questions
    while True:
        question = input("Ask me a question (or type 'exit' to quit): ")
        # Tolerate stray whitespace around the exit command.
        if question.strip().lower() == 'exit':
            break
        relevant_chunks = find_top_relevant_chunks(question, api_key)
        print("Top 3 relevant chunks:")
        for chunk in relevant_chunks:
            print(chunk)
        print()

if __name__ == "__main__":
    main()
