In [11]:
!pip install sentence_transformers
!pip install datasets
!pip install pinecone-client
!pip install cohere



In [12]:
import json
import os
import warnings

import cohere
import numpy as np
import requests
from datasets import Dataset, load_dataset
from IPython.display import display
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Credentials are read from the environment so they are never stored in the
# notebook or its saved outputs. Set COHERE_API_KEY and PINECONE_API_KEY before
# starting the kernel; a missing variable fails right here with a KeyError
# instead of surfacing later as an authentication error.
COHERE_API_KEY = os.environ['COHERE_API_KEY']
PINECONE_API_KEY = os.environ['PINECONE_API_KEY']


# Model Initialization
# A single shared encoder: the same model must embed both the corpus and the
# queries so that their vectors live in the same space.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)


# Document reading and preprocessing
# Load a dataset and embed the text field using a sentence-transformer model
# includes chunking logic to handle long documents.
def load_and_embed_dataset(
        dataset_name: str = 'squad_v2',
        split: str = 'train',
        model: SentenceTransformer = None,
        text_field: str = 'context',
        rec_num: int = 1000,
        chunk_size: int = 100
) -> tuple:
    """Load a dataset, split its text field into fixed-size chunks and embed them.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        split: Dataset split to load.
        model: Encoder used to embed the chunks. When ``None`` a
            ``SentenceTransformer(EMBEDDING_MODEL)`` is created on demand
            (building it in the signature would load a model at definition
            time even when the caller supplies one).
        text_field: Column holding the text to chunk and embed.
        rec_num: Number of leading records of the split to process.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A ``(dataset, embeddings)`` tuple. ``dataset`` is a ``Dataset`` with a
        single ``text_field`` column containing one row per chunk, and
        ``embeddings`` is whatever ``model.encode`` returns for those chunks,
        so row ``i`` of ``dataset`` is the text behind ``embeddings[i]``.
        Callers pair the two positionally, which is why the chunk-level
        dataset is returned rather than the raw records.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero step makes range() fail and a negative one silently yields
        # no chunks at all; reject both up front.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    print("Loading and embedding the dataset")

    if model is None:
        model = SentenceTransformer(EMBEDDING_MODEL)

    # Load the dataset
    raw_dataset = load_dataset(dataset_name, split=split)

    # Process and chunk the text field
    chunks = []
    for text in raw_dataset[text_field][:rec_num]:
        # max(..., 1) keeps an empty text as a single empty chunk, matching
        # how texts no longer than chunk_size are kept whole.
        for start in range(0, max(len(text), 1), chunk_size):
            chunks.append(text[start:start + chunk_size])

    # Embed chunks
    embeddings = model.encode(chunks)

    # One row per chunk, in the same order as the embeddings.
    chunk_dataset = Dataset.from_dict({text_field: chunks})

    print("Done!")
    return chunk_dataset, embeddings


# Build the corpus: take the first 1000 records and embed them in
# 100-character chunks with the shared encoder.
DATASET_NAME = 'squad_v2'
dataset, embeddings = load_and_embed_dataset(
    DATASET_NAME, model=model, rec_num=1000, chunk_size=100
)
# Kept at module level because the index dimension is read from it below.
shape = embeddings.shape


# A Pinecone index is created if it doesn't already exist, using cosine similarity.
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """Connect to Pinecone and make sure ``index_name`` exists.

    The index is created as a serverless index (AWS, us-east-1) only when no
    index with that name is listed for the account; an existing index is left
    untouched.

    Args:
        index_name: Name of the index to look up or create.
        dimension: Vector dimension used when the index has to be created.
        metric: Similarity metric used when the index has to be created.

    Returns:
        The ``Pinecone`` client, authenticated with ``PINECONE_API_KEY``.

    Raises:
        Exception: Any error raised by the Pinecone client is printed and then
            re-raised. Returning ``None`` instead would only postpone the
            failure to the first use of the client.
    """
    print("Creating a Pinecone index...")
    try:
        pc = Pinecone(api_key=PINECONE_API_KEY)
        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
        if index_name not in existing_indexes:
            pc.create_index(
                name=index_name,
                dimension=dimension,
                metric=metric,
                spec=ServerlessSpec(
                    cloud="aws",
                    region="us-east-1"
                )
            )
    except Exception as e:
        print(f"Error creating Pinecone index: {e}")
        raise
    print("Done!")
    return pc

# The index dimension is taken from the second axis of the embedding matrix.
INDEX_NAME = 'squad-index'
pc = create_pinecone_index(INDEX_NAME, shape[1])


# Upsert vectors to a pinecone index
# Embedding generation and insertion into Pinecone VectorDB
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'context',
        batch_size: int = 128
):
    """Upsert embeddings and their source text into a Pinecone index.

    Vector ``i`` is stored under id ``str(i)`` with metadata
    ``{text_field: dataset[text_field][i]}``, so ``dataset[text_field]`` must
    list the texts in the same order as the rows of ``embeddings``.

    Args:
        index: Index handle exposing ``upsert(vectors=...)`` (the object
            returned by ``pc.Index(...)``, not the ``Pinecone`` client).
        embeddings: 2-D array with one row per vector.
        dataset: Mapping-like object; ``dataset[text_field]`` yields the texts.
        text_field: Column to read the texts from, also used as metadata key.
        batch_size: Number of vectors sent per upsert call.

    Returns:
        The same ``index`` that was passed in.

    Raises:
        ValueError: If there are fewer texts than vectors (pairing them would
            silently drop vectors).
        Exception: Any other failure is printed and re-raised.
    """
    print("Upserting the embeddings to the Pinecone index...")
    try:
        n_vectors = embeddings.shape[0]

        # Only the first n_vectors texts can ever be paired with a vector, so
        # there is no point materialising metadata for the rest.
        texts = list(dataset[text_field][:n_vectors])
        if len(texts) < n_vectors:
            raise ValueError(
                f"{n_vectors} embeddings but only {len(texts)} '{text_field}' texts"
            )

        # create list of (id, vector, metadata) tuples to be upserted;
        # vectors are sent as plain lists of floats.
        to_upsert = [
            (str(i), vector.tolist(), {text_field: text})
            for i, (vector, text) in enumerate(zip(embeddings, texts))
        ]

        for start in tqdm(range(0, n_vectors, batch_size)):
            index.upsert(vectors=to_upsert[start:start + batch_size])
        return index
    except Exception as e:
        print(f"Error upserting vectors: {e}")
        raise


# Open a handle to the index and push the embeddings together with the text
# metadata read from `dataset`.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)


# Initialize the Cohere client; it is used below to generate the answers.
co = cohere.Client(api_key=COHERE_API_KEY)


# Retrieval of relevant documents.
# Generating answers to given questions using the retrieved documents
def retrieve_and_generate_answer(query, top_k=5):
    """Answer ``query`` from the ``top_k`` nearest passages in the index.

    The query is embedded with the module-level ``model``, matched against
    the module-level ``index``, and the retrieved 'context' metadata values
    are joined with spaces and sent to Cohere's chat endpoint ahead of the
    question.

    Returns the generated text. On any exception the error is printed and
    ``None`` is returned.
    """
    try:
        query_embedding = model.encode(query).tolist()
        hits = index.query(
            vector=query_embedding,
            top_k=top_k,
            include_metadata=True,
        )

        # Gather the stored passage text of every match, best match first.
        passages = []
        for hit in hits['matches']:
            passages.append(hit['metadata']['context'])
        combined_context = ' '.join(passages)

        # Ask Cohere, with the retrieved passages placed before the question.
        reply = co.chat(
            model='command-r-plus',
            message=f"{combined_context}\n\nQuestion: {query}\nAnswer:",
        )
        return reply.text
    except Exception as e:
        print(f"Error retrieving and generating answer: {e}")

Loading and embedding the dataset
Done!
Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


100%|██████████| 69/69 [00:39<00:00,  1.75it/s]


unanswerable questions

In [13]:
from transformers import pipeline

# Baseline to compare the RAG pipeline against: a Hugging Face
# "question-answering" pipeline loaded from a pre-trained checkpoint
# (DistilBERT distilled on SQuAD, going by the checkpoint name).
qa_pipeline = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

# get answers from the standard QA model
def get_standard_answer(question, context):
    """Return the baseline QA model's answer to ``question`` given ``context``.

    Runs the module-level ``qa_pipeline`` and returns the 'answer' entry of
    its result.
    """
    prediction = qa_pipeline(question=question, context=context)
    answer_text = prediction['answer']
    return answer_text

# get answers from the RAG pipeline
def get_rag_answer(query):
    """Return the RAG pipeline's answer to ``query``.

    Delegates to ``retrieve_and_generate_answer`` with its default settings
    and passes its result through unchanged.
    """
    rag_reply = retrieve_and_generate_answer(query)
    return rag_reply

# Example questions that the passage paired with them does not answer
unanswerable_questions = [
    "What is the capital of Atlantis?",
    "Who is the king of Wakanda?",
    "When was the first Martian colony established?",
]

# One passage per question, in the same order; only the baseline QA model
# reads these, the RAG pipeline retrieves its own context from the index.
contexts = [
    "Atlantis is a fictional island mentioned within an allegory on the hubris of nations in Plato's works.",
    "Wakanda is a fictional country located in Sub-Saharan Africa and home to the superhero Black Panther.",
    "Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System.",
]


# Run both systems on every question and print their answers side by side
for question, context in zip(unanswerable_questions, contexts):
    standard_answer = get_standard_answer(question, context)
    rag_answer = get_rag_answer(question)
    print(
        f"Question: {question}",
        f"Standard Model Answer: {standard_answer}",
        f"RAG Pipeline Answer: {rag_answer}\n",
        sep="\n",
    )

Question: What is the capital of Atlantis?
Standard Model Answer: Atlantis
RAG Pipeline Answer: I'm sorry, but I can't answer that question as Atlantis is a fictional place. 

Would you like to ask me something else?

Question: Who is the king of Wakanda?
Standard Model Answer: Black Panther
RAG Pipeline Answer: T'Challa is the king of Wakanda.

Question: When was the first Martian colony established?
Standard Model Answer: Mars
RAG Pipeline Answer: I'm sorry, but I cannot answer that question as it is unrelated to the provided text. The text is about the history of New York, and does not mention anything about Martian colonies.



LLM improvement

In [14]:
# Factual questions, each answered directly by the passage paired with it
questions = [
    "What is the tallest mountain in the world?",
    "What is the speed of light?",
    "Who was the first president of the United States?",
]

# One passage per question, in the same order; only the baseline QA model
# reads these, the RAG pipeline retrieves its own context from the index.
contexts = [
    "Mount Everest is the Earth's highest mountain above sea level, located in the Himalayas.",
    "The speed of light in a vacuum is approximately 299,792,458 meters per second.",
    "George Washington was the first president of the United States, serving from 1789 to 1797.",
]

# Run both systems on every question and print their answers side by side
for question, context in zip(questions, contexts):
    standard_answer = get_standard_answer(question, context)
    rag_answer = get_rag_answer(question)
    print(
        f"Question: {question}",
        f"Standard Model Answer: {standard_answer}",
        f"RAG Pipeline Answer: {rag_answer}\n",
        sep="\n",
    )


Question: What is the tallest mountain in the world?
Standard Model Answer: Mount Everest
RAG Pipeline Answer: Mount Everest is the tallest mountain in the world.

Question: What is the speed of light?
Standard Model Answer: 299,792,458 meters per second
RAG Pipeline Answer: Sorry, I cannot answer that question as it was not mentioned in the provided text. However, if you would like to know anything specific from the given context, feel free to ask!

Question: Who was the first president of the United States?
Standard Model Answer: George Washington
RAG Pipeline Answer: I'm sorry, but I cannot answer that question as it is unrelated to the provided text. However, I can tell you that George Washington was the first president of the United States, taking office in 1789.

