In [None]:
!pip install cohere pinecone-client PyPDF2 gradio

Collecting cohere
  Downloading cohere-5.11.0-py3-none-any.whl.metadata (3.4 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting boto3<2.0.0,>=1.34.0 (from cohere)
  Downloading boto3-1.35.38-py3-none-any.whl.metadata (6.7 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx>=0.21.2 (from cohere)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting parameterized<0.10.0,>=0.9.0 (from cohere)
  Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting sagemaker<3.0.0,>=2.232.1 (from cohere)
  Downloading s

In [None]:
import cohere
import pinecone
import PyPDF2
import gradio as gr
import os
from pinecone import Pinecone, ServerlessSpec

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [None]:
# Credentials are read from the environment so they never end up in the
# notebook file or its history. Set COHERE_API_KEY and PINECONE_API_KEY before
# running this cell; a missing variable raises KeyError naming it.
# NOTE: any key that was previously pasted into this cell should be treated as
# compromised and rotated in the provider's dashboard.

# Initialize Cohere
cohere_client = cohere.Client(os.environ['COHERE_API_KEY'])

# Initialize Pinecone client
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# Define index name and dimension
index_name = 'qa-bot'

# Check if the index exists, create it if not
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors returned by cohere_client.embed;
        # changing the embedding model means recreating the index.
        dimension=4096,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'  # Set the appropriate region
        )
    )

# Connect to the index
index = pc.Index(index_name)

In [None]:
# Function to read and extract text from PDF
def upload_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The PDF to read, passed straight to ``PyPDF2.PdfReader``
            (the UI in this notebook supplies a file path).

    Returns:
        str: The text of all pages in order, each page followed by a
        newline. A page with no extractable text contributes only its
        newline.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # `or ""` keeps a page whose extraction yields None (e.g. a scanned,
    # image-only page) from raising TypeError on concatenation.
    return "".join((page.extract_text() or "") + "\n" for page in pdf_reader.pages)

In [None]:
# Function to ingest PDF text into Pinecone
def ingest_pdf(file):
    """Chunk a PDF's text, embed the chunks with Cohere and store them in Pinecone.

    The text is split on blank lines; whitespace-only chunks are dropped
    because they carry no content to embed. Each stored vector gets the
    chunk's position (as a string) for its id and the chunk text under the
    ``"text"`` metadata key. Ids restart at "0" for every PDF, so a new
    ingestion overwrites vectors with the same ids from an earlier one.

    Args:
        file: The PDF to ingest, forwarded to ``upload_pdf``.

    Returns:
        str: A human-readable status message.
    """
    pdf_text = upload_pdf(file)

    # Split text into chunks and discard the empty ones.
    documents = [chunk.strip() for chunk in pdf_text.split('\n\n')]
    documents = [chunk for chunk in documents if chunk]
    if not documents:
        return "No extractable text was found in the PDF; nothing was ingested."

    # Cohere's embed endpoint limits the number of texts per request (96 per
    # its API reference), so long documents are embedded in slices.
    embed_batch_size = 96
    # Upserts are grouped to avoid one network round-trip per vector, but kept
    # small because high-dimensional vectors make large request bodies and
    # Pinecone caps the request size.
    upsert_batch_size = 16

    for start in range(0, len(documents), embed_batch_size):
        batch = documents[start:start + embed_batch_size]
        embeddings = cohere_client.embed(texts=batch).embeddings
        vectors = [
            (str(start + offset), embedding, {"text": text})
            for offset, (text, embedding) in enumerate(zip(batch, embeddings))
        ]
        for lo in range(0, len(vectors), upsert_batch_size):
            index.upsert(vectors[lo:lo + upsert_batch_size])

    return "PDF content has been successfully ingested into Pinecone."

In [None]:
# Retrieve relevant documents based on a query
def retrieve_relevant_docs(query, top_k=5):
    """Return the metadata of the stored chunks most similar to a query.

    Args:
        query: The user's question; it is embedded with ``cohere_client``.
        top_k: Maximum number of matches to request from the index
            (defaults to 5, the value previously hard-coded).

    Returns:
        list: The ``metadata`` entry of each match, in the order the index
        returned them.
    """
    query_vector = cohere_client.embed(texts=[query]).embeddings[0]
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        # Only the metadata is used below, so the stored vectors are not
        # requested; this keeps the response small.
        include_values=False,
        include_metadata=True
    )
    return [match['metadata'] for match in response['matches']]

In [None]:
# Generate answer based on the retrieved documents
def generate_answer(query, retrieved_docs, model='command-r-plus', max_tokens=100, temperature=0.5):
    """Ask the Cohere model to answer a question, using retrieved text as context.

    Args:
        query: The user's question.
        retrieved_docs: Metadata dicts of the retrieved chunks. Entries that
            are empty or have no ``'text'`` value are ignored instead of
            raising ``KeyError``.
        model: Cohere model name passed to ``generate``.
        max_tokens: Upper bound on the length of the generated answer.
        temperature: Sampling temperature passed to ``generate``.

    Returns:
        str: The first generation's text with surrounding whitespace removed.
    """
    passages = [doc['text'] for doc in (retrieved_docs or []) if doc and doc.get('text')]

    if passages:
        context = ' '.join(passages)
        prompt = f"Context: {context}\n\nQuestion: {query}\n\nPlease provide a direct answer based on the context above:"
    else:
        # No usable context: fall back to the model's own knowledge.
        prompt = f"Question: {query}\n\nPlease provide a direct answer based on your knowledge."

    response = cohere_client.generate(
        model=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature
    )
    return response.generations[0].text.strip()

In [None]:
# Combine document retrieval and answer generation
def qa_bot(query, pdf_file=None):
    """Answer a question, optionally grounding it in an uploaded PDF.

    Args:
        query: The user's question.
        pdf_file: PDF to ingest before answering, or None to answer from
            the model's general knowledge.

    Returns:
        tuple: ``(pdf_status, answer)`` where ``pdf_status`` is the ingestion
        message, or None when no PDF was supplied.
    """
    # Without a PDF there is nothing to ingest or retrieve.
    if pdf_file is None:
        return None, generate_answer(query, [])

    # Ingest first so the freshly stored chunks are available to retrieval.
    ingestion_status = ingest_pdf(pdf_file)
    matching_docs = retrieve_relevant_docs(query)
    return ingestion_status, generate_answer(query, matching_docs)

In [None]:
import gradio as gr

# Function to handle chatbot interaction
def chatbot_interface(query, pdf_file=None, chat_history=None):
    """Handle one chat turn for the Gradio UI.

    Args:
        query: The question typed by the user.
        pdf_file: Optional PDF to ingest before answering.
        chat_history: List of ``(question, answer)`` tuples for the session.
            It is appended to in place so that a caller holding the list
            sees the new turn. When omitted, a fresh list is created for
            this call (a shared ``[]`` default would leak history between
            unrelated calls).

    Returns:
        tuple: ``(chat_history, "", pdf_status)`` - the updated history, an
        empty string used to clear the question box, and the PDF ingestion
        status (None when no PDF was supplied).
    """
    if chat_history is None:
        chat_history = []

    # qa_bot already branches on whether a PDF was supplied.
    pdf_status, answer = qa_bot(query, pdf_file)

    # Append the new question-answer pair to the chat history
    chat_history.append((query, answer))

    return chat_history, "", pdf_status

# Gradio interface setup with custom CSS.
# The CSS rule targets elements carrying the "small-upload-btn" class, which is
# assigned to the PDF uploader below.
with gr.Blocks(css=".small-upload-btn .btn {padding: 2px 8px;}") as ui:
    # Set up the header
    gr.Markdown("<h1 align='center'>QA Chatbot</h1>")

    # Display the chat history
    chatbot_output = gr.Chatbot(label="Chat History")

    # Input row: question text box and PDF uploader side by side
    with gr.Row():
        # Question input box
        question_input = gr.Textbox(
            placeholder="Ask me any question",
            label="Your Question",
            show_label=False,
        )

        # PDF upload control; type="filepath" means the handler receives a path
        pdf_uploader = gr.File(label="", elem_classes="small-upload-btn", type="filepath", show_label=False)

    # Submit button, declared after the input row
    submit_btn = gr.Button(value="Submit", variant="primary")

    # Non-visual state component holding the chat history list (initially empty)
    chat_history_state = gr.State([])

    # Read-only status box for PDF ingestion
    pdf_status_box = gr.Textbox(label="PDF Status", placeholder="Upload a PDF to ingest it.", interactive=False)

    # Wire the button to the handler. The handler's three return values map, in
    # order, to: the chat display, the question box (cleared with ""), and the
    # PDF status box.
    # NOTE(review): chat_history_state is an input but not an output, so the
    # history only persists because chatbot_interface appends to the list in
    # place - presumably Gradio hands the handler the same list object on each
    # call; confirm before changing the handler to return a new list.
    submit_btn.click(
        fn=chatbot_interface,
        inputs=[question_input, pdf_uploader, chat_history_state],
        outputs=[chatbot_output, question_input, pdf_status_box]
    )

# Launch the Gradio interface
ui.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d98afaf7485fc203b8.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


