In [2]:
!pip install pinecone-client
!pip install transformers datasets
!pip install cohere
!pip install torch




In [3]:
pip install --upgrade pinecone-client



In [4]:
pip install pinecone-client datasets transformers torch cohere



In [5]:
pip install --upgrade cohere



In [6]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [24]:
# RAG QA Bot Notebook


import os
from getpass import getpass

import cohere
import PyPDF2
import torch
from google.colab import files
from pinecone import Pinecone
from transformers import AutoTokenizer, AutoModel

# Configuration
# Secrets are read from environment variables and, when those are unset,
# requested through a no-echo prompt, so they are never saved in the notebook.
# NOTE: the API keys that were previously hardcoded in this cell have been
# exposed and should be revoked and replaced with newly issued ones.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")
PINECONE_INDEX_NAME = "qa-bot-index"
EMBEDDING_MODEL_NAME = "distilbert-base-uncased"

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(PINECONE_INDEX_NAME)

# Initialize tokenizer and model (both must come from the same checkpoint)
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# Initialize Cohere client
co = cohere.Client(COHERE_API_KEY)

def embed_text(text):
    """Turn ``text`` into an embedding using the module-level tokenizer and model.

    The input is tokenized as PyTorch tensors (truncated to at most 512
    tokens), run through ``model`` with gradient tracking disabled, and the
    last hidden state is averaged over dimension 1. Size-1 dimensions are
    squeezed away before the tensor is converted to nested Python lists, so a
    single string yields a flat list of floats.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().tolist()

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at ``file_path``.

    Pages are visited in order through ``PyPDF2.PdfReader``; the text of each
    page is followed by a newline.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Extract inside the ``with`` block so the file is still open while
        # the pages are being read.
        return "".join(page.extract_text() + "\n" for page in pdf_pages)

def split_text(text, chunk_size=1000, overlap=0):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters each chunk shares with the previous one.
            The default of 0 produces disjoint chunks. A small overlap keeps
            sentences that straddle a chunk boundary retrievable as a whole.

    Returns:
        A list of chunks in document order; an empty list for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    if not text:
        return []

    step = chunk_size - overlap
    # Stop once a chunk reaches the end of the text, so the tail is not
    # emitted again as a chunk that lies entirely inside the previous one.
    last_start_bound = max(len(text) - overlap, 1)
    return [text[start:start + chunk_size] for start in range(0, last_start_bound, step)]

def upload_and_process_pdf(batch_size=50):
    """Prompt for a PDF upload, embed its text and store the chunks in Pinecone.

    The first uploaded file is read from the current working directory, its
    text is split with ``split_text``, and every non-blank chunk is embedded
    with ``embed_text`` and upserted into the module-level ``index`` together
    with its text as metadata.

    Args:
        batch_size: Number of vectors sent per upsert request. Batching avoids
            one network round trip per chunk.

    Returns:
        The list of uploaded chunks, or ``None`` when nothing was uploaded,
        the PDF contained no extractable text, or an error occurred (the
        error is printed rather than raised).
    """
    print("Please upload your PDF file.")
    try:
        uploaded = files.upload()
        if not uploaded:
            print("No file was uploaded.")
            return None

        # Only the first uploaded file is processed.
        file_name = next(iter(uploaded))
        file_path = os.path.join(os.getcwd(), file_name)
        print(f"Processing {file_name}...")

        text = extract_text_from_pdf(file_path)
        # Whitespace-only chunks (e.g. from pages with no extractable text)
        # carry no information, so they are neither embedded nor stored.
        chunks = [chunk for chunk in split_text(text) if chunk.strip()]
        if not chunks:
            print("No extractable text was found in the PDF.")
            return None

        print("Embedding and uploading chunks to Pinecone...")
        # NOTE(review): ids are "doc_<n>" regardless of the file, so a later
        # upload overwrites the first n entries and leaves any higher-numbered
        # entries of an earlier, longer document in the index.
        pending = []
        for i, chunk in enumerate(chunks):
            pending.append((f"doc_{i}", embed_text(chunk), {"text": chunk}))
            if len(pending) >= batch_size:
                index.upsert(vectors=pending)
                pending = []
        if pending:
            # Flush the final, partially filled batch.
            index.upsert(vectors=pending)

        print(f"Uploaded {len(chunks)} chunks to Pinecone.")
        return chunks
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller stop.
        print(f"An error occurred: {str(e)}")
        return None

def generate_answer(query, context):
    """Ask the Cohere ``command`` model to answer ``query`` given ``context``.

    The prompt places the context first, then the question, and ends with an
    ``Answer:`` cue. Returns the first generation's text with surrounding
    whitespace removed, or ``None`` if the request fails (the error is
    printed, not raised).
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    request_options = {
        "model": "command",
        "prompt": prompt,
        "max_tokens": 500,
        "temperature": 0.7,
        "stop_sequences": ["Human:", "Context:"],
    }
    try:
        completion = co.generate(**request_options)
        first_generation = completion.generations[0]
        return first_generation.text.strip()
    except Exception as err:
        print(f"An error occurred while generating the answer: {str(err)}")
        return None

def _answer_query(query, top_k=3):
    """Retrieve the ``top_k`` best-matching chunks for ``query`` and print an answer."""
    query_embedding = embed_text(query)
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    if not results['matches']:
        print("No relevant information found.")
        return

    context = " ".join(match['metadata']['text'] for match in results['matches'])
    answer = generate_answer(query, context)
    # generate_answer already reports failures itself; avoid "Answer: None".
    if answer is not None:
        print(f"Answer: {answer}")


def main():
    """Run the QA bot: ingest a PDF, answer example queries, then take questions.

    Stops immediately if no chunks were uploaded. The interactive loop ends
    when the user types ``quit`` (case-insensitive); blank input is ignored.
    """
    chunks = upload_and_process_pdf()
    if not chunks:
        return

    print("\nExample queries:")
    example_queries = [
        "What is the main topic of the document?",
        "Summarise the document in detail.",
        "What are the key points and takeways?"
    ]

    for query in example_queries:
        print(f"\nQuery: {query}")
        _answer_query(query)

    print("\nNow you can ask your own questions:")
    while True:
        query = input("\nEnter your question (or 'quit' to exit): ").strip()
        if query.lower() == 'quit':
            break
        if not query:
            # Nothing to embed or search for; ask again.
            continue

        _answer_query(query)

# Start the bot only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Please upload your PDF file.


Saving EcoCoin Research Paper.pdf to EcoCoin Research Paper (1).pdf
Processing EcoCoin Research Paper (1).pdf...
Embedding and uploading chunks to Pinecone...
Uploaded 25 chunks to Pinecone.

Example queries:

Query: What is the main topic of the document?
Answer: The main topic of the document appears to be discussing face detection, recognition, and analysis using AI and Machine Learning algorithms, specifically in the context of a student rewards system. It highlights the importance of confidence level, similarity index, and face ID in accurately detecting and recognizing faces for various face-related tasks. The document also provides insights into how a Lambda function can be integrated with other services like S3 and DynamoDB to store and retrieve data respectively, in order to perform these face analysis tasks and update balances as a reward for certain behaviors captured on camera.

Query: Summarise the document in detail.
Answer: The text describes the interaction with DynamoD