In [1]:
# Load all required Libraries
import pandas as pd
import transformers, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset

from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness
)

import tqdm as tqdm
from sentence_transformers import SentenceTransformer

import nltk, re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk import pos_tag

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)


  from .autonotebook import tqdm as notebook_tqdm


True

# Read Passages from the Datasets and Drop rows if they are NA or empty

In [2]:
passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

print(passages.shape)
passages.head()

(3200, 1)


Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."


In [3]:
passages.to_csv('/Users/ritvikhariharan/Desktop/Course Readings/NLX & LLM/rharihar_Assignment2-rag/data/passages.csv', index=False)

# Do EDA on the passage dataset
- You can try to find the maximum and minimum length of the passages before indexing (just a direction)

In [4]:
# Code for EDA
passages['passage_len'] = passages['passage'].str.len()

# Basic stats
print("Passages shape:", passages.shape)
print("\nColumn names:", passages.columns.tolist())
print("\nMissing values:")
print(passages.isnull().sum())

print("\nPassage length (characters):")
print("Min:", passages['passage_len'].min())
print("Max:", passages['passage_len'].max())
print("Mean:", passages['passage_len'].mean())
print("Median:", passages['passage_len'].median())

Passages shape: (3200, 2)

Column names: ['passage', 'passage_len']

Missing values:
passage        0
passage_len    0
dtype: int64

Passage length (characters):
Min: 1
Max: 2515
Mean: 389.848125
Median: 299.0


In [5]:
print("\nShortest passage:\n", passages.loc[passages['passage_len'].idxmin(), 'passage'])
print("\nLongest passage:\n", passages.loc[passages['passage_len'].idxmax(), 'passage'])


Shortest passage:
 |

Longest passage:
 As Ford approached his ninetieth year, he began to experience significant health problems associated with old age. He suffered two minor strokes at the 2000 Republican National Convention, but made a quick recovery.  Gerald Ford recovering after strokes. BBC, August 2, 2000.  Retrieved on December 31, 2006.  In January 2006, he spent 11 days at the Eisenhower Medical Center near his residence at Rancho Mirage, California, for treatment of pneumonia.  Former President Ford, 92, hospitalized with pneumonia. Associated Press, January 17, 2006. Retrieved on October 19, 2007.  On April 23, President George W. Bush visited Ford at his home in Rancho Mirage for a little over an hour. This was Ford's last public appearance and produced the last known public photos, video footage and voice recording. While vacationing in Vail, Colorado, he was hospitalized for two days in July, 2006 for shortness of breath.  Gerald Ford released from hospital. Associated

In [6]:
passages['passage_word_count'] = passages['passage'].str.split().apply(len)
print("\nPassage length (words):")
print("Min:", passages['passage_word_count'].min())
print("Max:", passages['passage_word_count'].max())
print("Mean:", passages['passage_word_count'].mean())
print("Median:", passages['passage_word_count'].median())


Passage length (words):
Min: 1
Max: 425
Mean: 62.10375
Median: 48.0


In [7]:
print("\nShortest passage (words):\n", passages.loc[passages['passage_word_count'].idxmin(), 'passage'])
print("\nLongest passage (words):\n", passages.loc[passages['passage_word_count'].idxmax(), 'passage'])


Shortest passage (words):
 125px

Longest passage (words):
 As Ford approached his ninetieth year, he began to experience significant health problems associated with old age. He suffered two minor strokes at the 2000 Republican National Convention, but made a quick recovery.  Gerald Ford recovering after strokes. BBC, August 2, 2000.  Retrieved on December 31, 2006.  In January 2006, he spent 11 days at the Eisenhower Medical Center near his residence at Rancho Mirage, California, for treatment of pneumonia.  Former President Ford, 92, hospitalized with pneumonia. Associated Press, January 17, 2006. Retrieved on October 19, 2007.  On April 23, President George W. Bush visited Ford at his home in Rancho Mirage for a little over an hour. This was Ford's last public appearance and produced the last known public photos, video footage and voice recording. While vacationing in Vail, Colorado, he was hospitalized for two days in July, 2006 for shortness of breath.  Gerald Ford released from 

In [8]:
print(passages['passage'].duplicated().sum(), "duplicate passages found.")

4 duplicate passages found.


In [9]:
duplicate_passages = passages[passages['passage'].duplicated(keep=False)]
duplicate_passages.value_counts('passage')

passage
; Government                                                                                3
* Pratt, H., "Nikola Tesla 1856 1943", Proceedings of the IRE, Vol. 44, September, 1956.    2
|-                                                                                          2
Name: count, dtype: int64

In [10]:
text = ' '.join(passages['passage'].tolist())

In [11]:
import nltk, re, string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("punkt_tab", quiet=True)

STOP = set(stopwords.words("english"))


tokens = word_tokenize(text)

# remove stopwords (+ optional: drop punctuation & numbers)
clean_tokens = [
    t for t in tokens
    if t.lower() not in STOP
    and t not in string.punctuation
    and not re.fullmatch(r"\d+(\.\d+)?", t)
]

In [12]:
from collections import Counter
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger_eng', quiet=True) 
nltk.download('universal_tagset', quiet=True)

penn_tags = [tag for _, tag in pos_tag(clean_tokens)]
univ_tags = [tag for _, tag in pos_tag(clean_tokens, tagset="universal")]

penn_counts = Counter(penn_tags)
univ_counts = Counter(univ_tags)

print("Universal POS (coarse):")
for tag, cnt in univ_counts.most_common():
    print(f"{tag:>6} : {cnt}")

print("\nPenn Treebank POS (fine):")
for tag, cnt in penn_counts.most_common():
    print(f"{tag:>6} : {cnt}")

Universal POS (coarse):
  NOUN : 68592
  VERB : 19359
   ADJ : 17610
   ADV : 5296
     . : 2185
   NUM : 2089
   ADP : 1642
   PRT : 1587
     X : 227
   DET : 150
  CONJ : 75
  PRON : 51

Penn Treebank POS (fine):
   NNP : 32917
    NN : 24221
    JJ : 16780
   NNS : 10826
   VBD : 7759
    RB : 5143
   VBG : 3251
   VBP : 2663
   VBN : 2233
    CD : 2089
   VBZ : 1663
    IN : 1642
   POS : 1526
    VB : 1139
    '' : 1054
    `` : 1047
    MD : 651
  NNPS : 628
   JJS : 494
   JJR : 336
    FW : 226
    DT : 144
   RBR : 140
     : : 79
    CC : 75
    RP : 61
   WP$ : 28
   PRP : 13
  PRP$ : 9
   WRB : 8
   WDT : 6
     $ : 5
   RBS : 5
    UH : 1
    WP : 1


# Tokenize Text and Generate Embeddings using Sentence Transformers

In [13]:
passages.head()

Unnamed: 0_level_0,passage,passage_len,passage_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Uruguay (official full name in ; pron. , Eas...",250,43
1,"It is bordered by Brazil to the north, by Arge...",349,62
2,Montevideo was founded by the Spanish in the e...,312,50
3,The economy is largely based in agriculture (m...,354,58
4,"According to Transparency International, Urugu...",217,30


In [14]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.corpus import wordnet
nltk.download("stopwords", quiet = True)
nltk.download("wordnet", quiet = True)
nltk.download("punkt", quiet = True)
nltk.download('averaged_perceptron_tagger', quiet = True)
lemmatizer = WordNetLemmatizer()
english_stopwords = set(nltk.corpus.stopwords.words('english'))

from nltk.tokenize import word_tokenize
def clean_text(text):
    """
    Clean the input string by converting it to lowercase, removing 's and apostrophe.
    
    args:
        text (str) : the input text
        
    return:
        str : the cleaned text
    """
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r"'s\b", '', text)
    text = text.replace("'","")
    text = text.strip()
    return text

def tokenize(cleaned_text):
    """
    Tokenize the input string.
    
    args:
        cleaned_text (str): the input text, output from clean_text
        
    return:
        List[str] : a list of tokens from the input text
    """
    token = word_tokenize(cleaned_text)
    tokens = []
    for t in token:
        t = re.split(r'[^a-zA-Z0-9]',t)
        tokens.extend(t)

    tokens = [token for token in tokens if len(token)>1]
    return tokens
    pass

def lemmatize(tokens, stopwords = {}):
    """
    Lemmatize each token in an input list of tokens
    
    args:
        tokens (List[str]) : a list of token, output from tokenize
    
    kwargs:
        stopwords (Set[str]) : the set of stopwords to exclude
    
    return:
        List[str] : a list of lemmatized and filtered tokens
    """
    temp = [nltk.pos_tag(word.split()) for word in tokens] 
    tem = []
    for t in temp:
        if t:
            t = list(t[0])
            if t[1].startswith('J'):
                t[1] = wordnet.ADJ
            elif t[1].startswith('V'):
                t[1] = 'v'
            elif t[1].startswith('R'):
                t[1] = 'r'  
            else:
                t[1] = 'n'
            tem.append(t)   

    #lemm = [lemmatizer.lemmatize(words[0],words[1]) for words in tem if ((words[0] not in stopwords) & (len(words[0]) > 1))]
    lemm = [lemmatizer.lemmatize(words[0],words[1]) for words in tem]
    lemm = [words for words in lemm if ((words not in stopwords) & (len(words) > 1))]
    return lemm

    pass

def preprocess_text(text, stopwords = english_stopwords):
    # do not modify this function
    cleaned_text = clean_text(text)
    tokens = tokenize(cleaned_text)
    # print(lemmatize(tokens, stopwords))
    return lemmatize(tokens, stopwords)

passages['processed_passage'] = passages['passage'].apply(lambda x: preprocess_text(x))

In [15]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode Text
passages["embedding"] = (
    passages["passage"]
    .fillna("")
    .astype(str)
    .apply(lambda x: embedding_model.encode(x, normalize_embeddings=True).astype("float32"))
)

passages.reset_index(inplace=True)
passages.rename(columns={"index": "id"}, inplace=True)
passages.head()

Unnamed: 0,id,passage,passage_len,passage_word_count,processed_passage,embedding
0,0,"Uruguay (official full name in ; pron. , Eas...",250,43,"[uruguay, official, full, name, pron, eastern,...","[0.0069853608, -0.06149814, -0.06683705, -0.00..."
1,1,"It is bordered by Brazil to the north, by Arge...",349,62,"[border, brazil, north, argentina, across, ban...","[0.12249575, -0.04751835, -0.07255241, -0.0070..."
2,2,Montevideo was founded by the Spanish in the e...,312,50,"[montevideo, found, spanish, early, 18th, cent...","[-0.03174129, -0.059810296, -0.061128285, -0.0..."
3,3,The economy is largely based in agriculture (m...,354,58,"[economy, largely, base, agriculture, make, 10...","[0.050179835, -0.049152985, 0.05018971, 0.0486..."
4,4,"According to Transparency International, Urugu...",217,30,"[accord, transparency, international, uruguay,...","[0.018462706, -0.0802608, -0.078614995, -0.034..."


In [16]:
dim = passages['embedding'][0].shape[0]
dim

384

# Create Milvus Client and Insert your Embeddings to your DB
- Make sure you define a schema for your collection (Points will be deducted if you fail to define a proper schema with ids, passage text, embedding)

In [17]:
# Define every column of your schema

id = passages['id'].tolist()               
passage = passages['passage'].tolist()            
embedding = passages['embedding'].tolist()         

In [18]:
schema = MilvusClient.create_schema()
schema.add_field(
    field_name="id",
    datatype=DataType.INT64,
    is_primary=True,
    auto_id=False,
)
schema.add_field(
    field_name="passage",
    datatype=DataType.VARCHAR,
    max_length=8000
)
schema.add_field(
    field_name="embedding",
    datatype=DataType.FLOAT_VECTOR,
    dim=dim
)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8000}}, {'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': False}

In [19]:
client = MilvusClient("rag_wikipedia_mini.db")

print(client)

# Create the Collection with Collection Name = "rag_mini". Make sure you define the schema variable while creating the collection
#collection = client.create_collection(collection_name="rag_mini", schema=schema)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<pymilvus.milvus_client.milvus_client.MilvusClient object at 0x30e52f0e0>


**Convert your Pandas Dataframe to a list of dictionaries**
- The Dictionary at least have 3 keys [id, passage, embedding]

In [20]:
rag_data = passages[['id', 'passage', 'embedding']].to_dict(orient='records')

In [21]:
# Code to insert the data to your DB
#client.create_collection(collection_name="rag_mini", schema=schema)
res = client.insert(collection_name="rag_mini", data=rag_data)

print(res)

{'insert_count': 3200, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,

- Do a Sanity Check on your database 

**Do not delete the below line during your submission**

In [22]:
print("Entity count:", client.get_collection_stats("rag_mini")["row_count"])
print("Collection schema:", client.describe_collection("rag_mini"))

Entity count: 12800
Collection schema: {'collection_name': 'rag_mini', 'auto_id': False, 'num_shards': 0, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'passage', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 8000}}, {'field_id': 102, 'name': 'embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'functions': [], 'aliases': [], 'collection_id': 0, 'consistency_level': 0, 'properties': {}, 'num_partitions': 0, 'enable_dynamic_field': False}


# Steps to Fetch Results
- Read the Question Dataset
- Clean the Question Dataset if necessary (Drop Questions with NaN etc.)
- Convert Each Query to a Vector Embedding (Use the same embedding model you used to embed your document)
- Try for a Single Question First
- Load Collection into Memory after creating Index for Search on your embedding field (This is an essential step before you can search in your db)
- Search and Fetch Top N Results

In [23]:
import pandas as pd

queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")
queries

Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832
...,...,...
1710,Was Wilson president of the American Political...,Yes
1711,Did he not cast his ballot for John M. Palmer ...,Yes
1712,Did Wilson not spend 1914 through the beginnin...,Yes
1713,"Was Wilson , a staunch opponent of antisemitis...",Yes


In [24]:
query = queries['question'][0]
print(query)                 # Your single query

query_embedding = embedding_model.encode(query, normalize_embeddings=True).astype("float32")   

print(query_embedding.shape)

Was Abraham Lincoln the sixteenth President of the United States?
(384,)


#### Create Index on the embedding column on your DB

In [25]:
index_params = MilvusClient.prepare_index_params()

# Add an index on the embedding field
index_params.add_index(
    field_name="embedding",
    index_type="IVF_FLAT",
    metric_type= "L2",
    params = {"nlist": 128,}
)

# Create the index
try:
    client.create_index("rag_mini", index_params=index_params)
    print("Index created successfully")
except Exception as e:
    print(f"Index creation result: {e}")

# Load collection into memory (required for search)
client.load_collection("rag_mini")
print("Collection loaded into memory")

Index created successfully
Collection loaded into memory


In [26]:
# Search the db with your query embedding
output_ = client.search(
    collection_name="rag_mini",
    data=[query_embedding],
    anns_field= "embedding",
    search_params={"metric_type": "L2", "params": {"nprobe": 10}},
    output_fields=["id", "passage"]
)

print(output_)

data: [[{'id': 288, 'distance': 0.5809625387191772, 'entity': {'id': 288, 'passage': 'Young Abraham Lincoln'}}, {'id': 278, 'distance': 0.8319282531738281, 'entity': {'id': 278, 'passage': 'Abraham Lincoln (February 12, 1809 â\x80\x93 April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhet

## Now get the Context 
- Initially use the first passage ONLY as your context
- In Later Experiments, you must try at least 2 different passage selection strategies (Top 3 / Top 5 / Top 10) and pass to your prompt

In [27]:
context = output_[0][1].entity.get("passage")
print(context)

Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of American politics at precisely the right time and place, and with precisely the

**Develop your Prompt**

In [28]:
system_prompt = f"You are a knowledgeable assistant. Use the provided context to answer the question accurately and concisely. If the context does not contain the answer, respond with 'I don't know'."

prompt = f"""{system_prompt} \n Context: {context}: \n Question: {query} """
print(prompt)

You are a knowledgeable assistant. Use the provided context to answer the question accurately and concisely. If the context does not contain the answer, respond with 'I don't know'. 
 Context: Abraham Lincoln (February 12, 1809 â April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that 'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetoric

# RAG Response for a Single Query

In [29]:
# Load the LLM Model you want to use
# Load model directly
#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

In [30]:
# Generate answer
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
 
outputs = model.generate(**inputs, max_new_tokens=50)

# Decode and extract answer.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Answer:", answer)

Answer: Yes


# Generate Responses for all the Queries in the Dataset

In [31]:
queries.head()

Unnamed: 0_level_0,question,answer
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes
2,Did Lincoln sign the National Banking Act of 1...,yes
4,Did his mother die of pneumonia?,no
6,How many long was Lincoln's formal education?,18 months
8,When did Lincoln begin his political career?,1832


In [32]:
# Your Code Here
import pandas as pd
from tqdm import tqdm  # For progress bar

queries = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")

# Create a new column for answers
queries['answer_generated'] = ''
queries['context'] = list([] for _ in range(len(queries)))

# Load model once (outside the loop for efficiency)
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Process each question
for idx in tqdm(range(len(queries)), desc="Processing queries"):
    query = queries['question'].iloc[idx]
    
    # Get query embedding
    query_embedding = embedding_model.encode(query, normalize_embeddings=True).astype("float32")   
    
    # Search the database
    output_ = client.search(
        collection_name="rag_mini",
        data=[query_embedding],
        anns_field="embedding",
        search_params={"metric_type": "L2", "params": {"nprobe": 10}},
        output_fields=["id", "passage"],
        limit=5
    )
    
    # Extract context from search results
    context = ""
    if output_ and len(output_[0]) > 0:
        # Get context from the top result
        context = output_[0][0].entity.get("passage", "")
        queries.iloc[idx, queries.columns.get_loc('context')].append(context)    
    # Create prompt
    system_prompt = "You are a knowledgeable assistant. Use the provided context to answer the question accurately and concisely. If the context does not contain the answer, respond with 'I don't know'."
    prompt = f"{system_prompt} \n Context: {context} \n Question: {query}"
    
    # Generate answer
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=50)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Save answer to dataframe
    queries.iloc[idx, queries.columns.get_loc('answer_generated')] = answer 

# Save the updated dataframe
queries.head()

Processing queries: 100%|██████████| 918/918 [03:48<00:00,  4.01it/s]


Unnamed: 0_level_0,question,answer,answer_generated,context
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Was Abraham Lincoln the sixteenth President of...,yes,Yes.,[Young Abraham Lincoln]
2,Did Lincoln sign the National Banking Act of 1...,yes,Yes,[Lincoln believed in the Whig theory of the pr...
4,Did his mother die of pneumonia?,no,I don't know,[An autopsy performed after his death revealed...
6,How many long was Lincoln's formal education?,18 months,18 months,[Lincoln's formal education consisted of about...
8,When did Lincoln begin his political career?,1832,1832,"[Lincoln began his political career in 1832, a..."


# Finding out the Basic QA Metrics (F1 score, EM score)

In [33]:
# Your code Here
#from sklearn.metrics import f1_score
import numpy as np

def compute_exact_match(predictions, references):
    """Calculate Exact Match score"""
    em_scores = []
    for pred, ref in zip(predictions, references):
        em_scores.append(1.0 if pred == ref else 0.0)
    return np.mean(em_scores)

def compute_f1(predictions, references):
    """Calculate F1 score"""
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()
        common = set(pred_tokens) & set(ref_tokens)
        num_common = len(common)
        
        if num_common == 0:
            f1_scores.append(0.0)
            continue
        
        precision = num_common / len(pred_tokens)
        recall = num_common / len(ref_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        f1_scores.append(f1)
    
    return np.mean(f1_scores)

predictions = queries['answer_generated'].tolist()
references = queries['answer'].tolist()

em_score = compute_exact_match(predictions, references)
f1_score_value = compute_f1(predictions, references)

print(f"Exact Match (EM) Score: {em_score:.4f}")
print(f"F1 Score: {f1_score_value:.4f}")


Exact Match (EM) Score: 0.1547
F1 Score: 0.2291


# Advanced Evaluation using RAGAs

In [34]:
data = {
    "question": queries['question'].tolist() ,                     # Question
    "answer": queries['answer_generated'].tolist() ,                       # Generated Answer
    "contexts": queries['context'].tolist() ,                     # Context you pass in. You can just use top-1 here
    "ground_truths": queries['answer'].tolist(),
    "reference": queries['answer'].tolist()
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [35]:
# # Pass the dataset above to the evaluate method in RAGAs
# # Your code here
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# from langchain_huggingface import HuggingFacePipeline, HuggingFaceEmbeddings

# tok = AutoTokenizer.from_pretrained("google/flan-t5-base")
# mdl = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# gen_pipe = pipeline(
#     task="text2text-generation",
#     model=mdl,
#     tokenizer=tok,
#     # set device_map="auto" if you have GPU; remove if CPU-only
#     device_map="auto",
#     max_new_tokens=256,
# )
# lc_llm = HuggingFacePipeline(pipeline=gen_pipe)

# # 2) Embeddings (Sentence-Transformers) via LangChain
# #    Use the model you already used for RAG: all-MiniLM-L6-v2
# emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# # 3) Wire into RAGAs
# #from ragas import settings
# # from ragas.llms import LangchainLLM
# # from ragas.embeddings import LangchainEmbeddings

# # llm_wrapper = LangchainLLM(lc_llm)
# # emb_wrapper = LangchainEmbeddings(emb)

# # 4) Choose metrics and evaluate
# from ragas import evaluate
# from ragas.metrics import (
#     answer_relevancy,
#     faithfulness,
#     context_precision,
#     context_recall,
#     # answer_correctness,      # enable if you also provide strong references
#     # context_relevancy,       # available in some versions
# )

# metrics = [
#     answer_relevancy,
#     faithfulness,
#     context_precision,
#     context_recall,
# ]

# # `dataset` must already exist from your previous cell:
# # columns required: question (str), answer (str),
# # contexts (list[str]), and ground_truths (list[str]) OR reference
# result = evaluate(dataset, metrics=metrics, llm=lc_llm, embeddings=emb)

# print("Aggregate scores:\n", result.scores)
# df_scores = result.to_pandas()
# df_scores.head()
