#### Develop a RAG system using the Stanford Question Answering Dataset (SQuAD).

- Without fine-tuning
- Link to data: https://rajpurkar.github.io/SQuAD-explorer/

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import datasets

import json
import os
import time

from openai import OpenAI
import tiktoken
# import seaborn as sns
from tenacity import retry, wait_exponential
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

tqdm.pandas()

from dotenv import load_dotenv, find_dotenv

import openai
# Load variables from a local .env file (if one exists) *before* building the
# client, so that an OPENAI_API_KEY defined there is actually picked up here
# instead of the placeholder fallback below.
load_dotenv(find_dotenv())
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>"))


In [2]:
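# One-time download of the SQuAD v2.0 train/dev splits (uncomment to run).
# NOTE(review): `..data` (no slash) is a literal directory name; it looks like
# a typo for `../data`, but it matches the path the loading cell reads from,
# so change both together if you fix it.
# !mkdir -p ..data/local_cache
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O ..data/local_cache/train.json
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O ..data/local_cache/dev.json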

In [3]:
def json_to_df(json_data):
    """Flatten a SQuAD-style JSON object into one DataFrame row per question.

    Parameters
    ----------
    json_data : dict
        Parsed JSON with a top-level ``'data'`` list. Each article has a
        ``'title'`` and ``'paragraphs'``; each paragraph has a ``'context'``
        and ``'qas'``; each qa has ``'question'``, ``'is_impossible'`` and
        ``'answers'`` (a list of dicts with a ``'text'`` key).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``question`` (whitespace-stripped), ``context``,
        ``is_impossible`` and ``answers`` (list of answer texts per question).
    """
    # One list per output column, filled in lock-step while walking the tree.
    columns = {
        'title': [],
        'question': [],
        'context': [],
        'is_impossible': [],
        'answers': [],
    }

    for article in json_data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                columns['question'].append(qa['question'].strip())
                columns['context'].append(paragraph['context'])
                columns['is_impossible'].append(qa['is_impossible'])
                columns['answers'].append([ans['text'] for ans in qa['answers']])
                columns['title'].append(title)

    return pd.DataFrame(columns)


# Read the SQuAD training split and flatten it to one row per question.
# The context manager closes the file handle deterministically, and the explicit
# UTF-8 encoding keeps non-ASCII text (e.g. 'Beyoncé') intact on platforms whose
# default text encoding is not UTF-8.
with open('..data/local_cache/train.json', encoding='utf-8') as train_file:
    df = json_to_df(json.load(train_file))




In [4]:
# Preview the first five rows of the flattened question/answer frame.
df.head()

Unnamed: 0,title,question,context,is_impossible,answers
0,Beyoncé,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,False,[in the late 1990s]
1,Beyoncé,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,False,[singing and dancing]
2,Beyoncé,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,False,[2003]
3,Beyoncé,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,False,"[Houston, Texas]"
4,Beyoncé,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,False,[late 1990s]


In [5]:
# Summarise column dtypes, non-null counts and memory usage of the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130319 entries, 0 to 130318
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   title          130319 non-null  object
 1   question       130319 non-null  object
 2   context        130319 non-null  object
 3   is_impossible  130319 non-null  bool  
 4   answers        130319 non-null  object
dtypes: bool(1), object(4)
memory usage: 4.1+ MB


In [6]:
# Number of distinct article titles in the training split.
df['title'].nunique()

442

In [7]:
# Full (untruncated) context paragraph of the first row.
df.iloc[0]['context']

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [8]:
# Article title of the first row.
df.iloc[0]['title']

'Beyoncé'

In [9]:
# Count the distinct context paragraphs that belong to the 'Beyoncé' article.
df.loc[df['title'] == 'Beyoncé', 'context'].nunique()

66

In [10]:
# List every distinct article title (used to pick a single topic to work with).
df['title'].unique()

array(['Beyoncé', 'Frédéric_Chopin',
       'Sino-Tibetan_relations_during_the_Ming_dynasty', 'IPod',
       'The_Legend_of_Zelda:_Twilight_Princess', 'Spectre_(2015_film)',
       '2008_Sichuan_earthquake', 'New_York_City',
       'To_Kill_a_Mockingbird', 'Solar_energy', 'Kanye_West', 'Buddhism',
       'American_Idol', 'Dog', '2008_Summer_Olympics_torch_relay',
       'Genome', 'Comprehensive_school', 'Republic_of_the_Congo',
       'Prime_minister', 'Institute_of_technology', 'Wayback_Machine',
       'Dutch_Republic', 'Symbiosis', 'Canadian_Armed_Forces',
       'Cardinal_(Catholicism)', 'Iranian_languages', 'Lighting',
       'Separation_of_powers_under_the_United_States_Constitution',
       'Architecture', 'Human_Development_Index', 'Southern_Europe',
       'BBC_Television', 'Arnold_Schwarzenegger', 'Plymouth', 'Heresy',
       'Warsaw_Pact', 'Materialism', 'Christian',
       'Sony_Music_Entertainment', 'Oklahoma_City', 'Hunter-gatherer',
       'United_Nations_Population_Fund

In [11]:
# Show all question rows whose article title is 'Dog'.
df[df['title']=='Dog']

Unnamed: 0,title,question,context,is_impossible,answers
6547,Dog,What is the three word Latin name for domestic...,The domestic dog (Canis lupus familiaris or Ca...,False,[Canis lupus familiaris]
6548,Dog,What is Canis familiaris?,The domestic dog (Canis lupus familiaris or Ca...,False,[domestic dog]
6549,Dog,How long has the domestic dog been selectively...,The domestic dog (Canis lupus familiaris or Ca...,False,[millennia]
6550,Dog,Along with various behaviors and physical attr...,The domestic dog (Canis lupus familiaris or Ca...,False,[sensory capabilities]
6551,Dog,What decade had significant studies of dog gen...,Although initially thought to have originated ...,False,[2010s]
...,...,...,...,...,...
6934,Dog,Dog cognition has been studied on what kind of...,"Although it is said that the ""dog is man's bes...",False,[pet dogs living in human homes.]
6935,Dog,What would wolves have gotten from living with...,"Wolves, and their dog descendants, would have ...",False,[significant benefits]
6936,Dog,What has likely led to human success?,The cohabitation of dogs and humans would have...,False,[the domestication of dogs]
6937,Dog,Studies that people are better off with dogs h...,The scientific evidence is mixed as to whether...,False,[poorly controlled]


In [12]:
# Count the distinct context paragraphs that belong to the 'Dog' article.
df.loc[df['title'] == 'Dog', 'context'].nunique()

75

In [13]:
# Restrict the working set to the questions of the 'Dog' article.
# .copy() makes df_qa an explicit, independent frame: columns are added to it
# further down, and doing that on a bare filter result triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
df_qa = df[df['title']=='Dog'].copy()

df_qa.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 6547 to 6938
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          392 non-null    object
 1   question       392 non-null    object
 2   context        392 non-null    object
 3   is_impossible  392 non-null    bool  
 4   answers        392 non-null    object
dtypes: bool(1), object(4)
memory usage: 15.7+ KB


In [14]:
# Print the raw answer list of the first 'Dog' question to inspect its shape.
for first_answers in df_qa['answers'].head(1):
    print(first_answers)

['Canis lupus familiaris']


In [15]:
def get_answer(answer_list):
    """Return the first gold answer of a question, or NaN when there is none.

    Parameters
    ----------
    answer_list : list of str
        Answer texts for one question; may be empty (SQuAD v2.0 questions
        flagged as impossible carry no answers).

    Returns
    -------
    str or float
        ``answer_list[0]`` when the list is non-empty, otherwise ``np.nan`` --
        the same convention the 'answer' column of ``df_qa`` uses.
    """
    # Guard the empty case instead of letting answer_list[0] raise IndexError.
    return answer_list[0] if len(answer_list) > 0 else np.nan

In [16]:
# Add the derived columns through assign(), which returns a new frame, rather
# than writing into df_qa column by column (in-place writes on a filtered frame
# are what pandas flags with SettingWithCopyWarning).
df_qa = df_qa.assign(
    # 0-based positional id, independent of the original SQuAD row index.
    id=range(len(df_qa)),
    # Fixed prompt instruction attached to every question.
    instruction='Answer the following questions based on the given context only. Do not use any external information. If the question cannot be answered based on the given context, return "it is impossible".',
    # 'answer' holds the first gold answer; NaN when the question has none.
    answer=df_qa['answers'].apply(lambda x: x[0] if len(x) > 0 else np.nan),
)


In [17]:
# Sanity check: number of questions left without an answer (NaN).
df_qa['answer'].isna().sum()

0

In [18]:
# Inspect the column names after adding the derived columns.
df_qa.columns

Index(['title', 'question', 'context', 'is_impossible', 'answers', 'id',
       'instruction', 'answer'],
      dtype='object')

In [19]:
# Keep only the columns needed downstream, in a fixed order; the raw 'answers'
# list column is dropped here.
ordered_columns = [
    'id',
    'title',
    'question',
    'context',
    'instruction',
    'answer',
    'is_impossible',
]
df_qa = df_qa[ordered_columns]

In [20]:
# Preview the first five rows of the reordered 'Dog' question frame.
df_qa.head()

Unnamed: 0,id,title,question,context,instruction,answer,is_impossible
6547,0,Dog,What is the three word Latin name for domestic...,The domestic dog (Canis lupus familiaris or Ca...,Answer the following questions based on the gi...,Canis lupus familiaris,False
6548,1,Dog,What is Canis familiaris?,The domestic dog (Canis lupus familiaris or Ca...,Answer the following questions based on the gi...,domestic dog,False
6549,2,Dog,How long has the domestic dog been selectively...,The domestic dog (Canis lupus familiaris or Ca...,Answer the following questions based on the gi...,millennia,False
6550,3,Dog,Along with various behaviors and physical attr...,The domestic dog (Canis lupus familiaris or Ca...,Answer the following questions based on the gi...,sensory capabilities,False
6551,4,Dog,What decade had significant studies of dog gen...,Although initially thought to have originated ...,Answer the following questions based on the gi...,2010s,False


In [21]:
from llama_index.core import Document

# Wrap every question in a llama_index Document: the question text is the
# document body, while id, instruction, answer and context travel as metadata.
# NOTE(review): llama_index includes metadata in the text sent to the embedding
# model unless keys are listed in `excluded_embed_metadata_keys`; confirm
# whether answer/context are meant to influence retrieval scores.
documents = []
# The row label is not needed; `_` also avoids binding a global named `index`.
for _, row in df_qa.iterrows():
    metadata = {
        "question_id": row.get("id"),
        "instruction": row.get("instruction"),
        "answer": row.get("answer"),
        "context": row.get("context"),
    }
    documents.append(
        Document(
            text=row.get("question"),
            metadata=metadata,
            # The Document field is `id_` (a string). A plain `id=` keyword is
            # not a Document field, so the intended id was silently dropped
            # and a random UUID used instead.
            id_=str(row.get("id")),
        )
    )


In [22]:
# Inspect the first document: its metadata dict, its text and its type.
documents[0].metadata, documents[0].text, type(documents[0])

({'question_id': 0,
  'instruction': 'Answer the following questions based on the given context only. Do not use any external information. If the question cannot be answered based on the given context, return "it is impossible".',
  'answer': 'Canis lupus familiaris',
  'context': 'The domestic dog (Canis lupus familiaris or Canis familiaris) is a domesticated canid which has been selectively bred for millennia for various behaviors, sensory capabilities, and physical attributes.'},
 'What is the three word Latin name for domesticated dogs?',
 llama_index.core.schema.Document)

In [23]:
# import requests
# 
# r = requests.get('https://gist.githubusercontent.com/aelbuni/a2b67f43af40f81b7da30c9ff79fb90a/raw/816bb5162679b5e6a22fa8b2b35ee0ac7fe2a3ea/multilingual_embedding.py')
# 
# with open('multilingual_embedding.py', 'w') as f:
#     f.write(r.text)
# 

In [24]:
# MultiLingualEmbedding is imported from a local multilingual_embedding.py
# module, which must be present next to the notebook.
from multilingual_embedding import MultiLingualEmbedding

# Name of the embedding model handed to the wrapper
# (presumably a Hugging Face model id -- confirm in multilingual_embedding.py).
model_name = "intfloat/multilingual-e5-small"
# Embedding model instance, configured with an embedding batch size of 10.
embed_model = MultiLingualEmbedding(embed_batch_size=10, model_name=model_name)

In [25]:
# Load variables from the .env file located by find_dotenv() into the process
# environment; the return value is discarded.
_ = load_dotenv(find_dotenv())
# Set the module-level key of the `openai` package. Unlike a .get() lookup,
# this raises KeyError when OPENAI_API_KEY is not defined.
openai.api_key = os.environ['OPENAI_API_KEY']


In [26]:
from qdrant_client import QdrantClient
from llama_index.core import ServiceContext, set_global_service_context

qdrant_client = QdrantClient(":memory:") # In-memory Qdrant instance (for testing); nothing is persisted

# Set up the multilingual-e5 model as the embedding model, with a chunk size of
# 5000 (presumably large enough that every question stays in a single chunk).
# NOTE(review): ServiceContext is deprecated in newer llama_index releases in
# favour of `Settings` -- confirm against the installed version.
service_context = ServiceContext.from_defaults(
        chunk_size=5000,
        embed_model=embed_model
    )

# Register it globally so later llama_index calls use it without passing it explicitly
set_global_service_context(service_context)

In [27]:
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Vector store backed by the Qdrant client created earlier; vectors go into the
# "SQuAD_dogs" collection, with a batch size of 10.
qdrant_vector_store = QdrantVectorStore(client=qdrant_client, collection_name="SQuAD_dogs", batch_size=10)

# Construct the storage context around that vector store so the index writes to it
storage_context = StorageContext.from_defaults(
    vector_store=qdrant_vector_store
)

In [28]:
from llama_index.core import VectorStoreIndex

# Build the vector index over the question documents, storing it through the
# Qdrant-backed storage context; show_progress displays progress bars.
index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context, show_progress=True)

Parsing nodes: 100%|██████████| 392/392 [00:00<00:00, 5091.99it/s]
Generating embeddings: 100%|██████████| 392/392 [00:07<00:00, 53.49it/s]


In [29]:
# Retrieve the two indexed questions most similar to a paraphrased query.
retriever = index.as_retriever(similarity_top_k=2)
original_question = "What alternative word can be used to mean domestic dog?"
nodes = retriever.retrieve(original_question)

# One record per retrieved node: the matched question, its stored answer and
# the similarity score reported by the retriever.
lst = []
for node in nodes:
    match_record = {
        'original_question': original_question,
        'new_question': node.text,
        'new_question_answer': node.metadata["answer"],
        'matching_score': node.score,
    }
    lst.append(match_record)

df_new = pd.DataFrame(lst)
df_new


Unnamed: 0,original_question,new_question,new_question_answer,matching_score
0,What alternative word can be used to mean dome...,What is the three word Latin name for domestic...,Canis lupus familiaris,0.885335
1,What alternative word can be used to mean dome...,Dog could also come from the original layer of...,Proto-Indo-European,0.876147
