#
# Load squad2.0 train .json onto df
#

In [1]:
import json, requests
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the SQuAD 2.0 JSON file
# URL of the SQuAD 2.0 dataset
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json'

# Fetch the content from the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Load the JSON content into a dictionary
    squad_data = response.json()

In [3]:
# Initialize lists to hold flattened data
data_list = []

# Use tqdm to add a progress bar to the outer loop
for article in tqdm(squad_data['data'], desc='Processing Articles'):
    title = article['title']
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            qid = qa['id']
            is_impossible = qa.get('is_impossible', False)
            answers = qa.get('answers', [])
            if not is_impossible:
                for answer in answers:
                    answer_text = answer['text']
                    answer_start = answer['answer_start']
                    data_list.append({
                        'title': title,
                        'context': context,
                        'question': question,
                        'id': qid,
                        'is_impossible': is_impossible,
                        'answer_text': answer_text,
                        'answer_start': answer_start
                    })
            else:
                # For unanswerable questions, append with empty answer fields
                data_list.append({
                    'title': title,
                    'context': context,
                    'question': question,
                    'id': qid,
                    'is_impossible': is_impossible,
                    'answer_text': '',
                    'answer_start': -1
                })

# Convert list of records to a DataFrame
df = pd.DataFrame(data_list)
#df.set_index("id", inplace=True)

# Display the DataFrame
print(df.head())

Processing Articles: 100%|██████████| 442/442 [00:00<00:00, 4314.76it/s]


     title                                            context  \
0  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
1  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
2  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
3  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   
4  Beyoncé  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...   

                                            question  \
0           When did Beyonce start becoming popular?   
1  What areas did Beyonce compete in when she was...   
2  When did Beyonce leave Destiny's Child and bec...   
3      In what city and state did Beyonce  grow up?    
4         In which decade did Beyonce become famous?   

                         id  is_impossible          answer_text  answer_start  
0  56be85543aeaaa14008c9063          False    in the late 1990s           269  
1  56be85543aeaaa14008c9065          False  singing and dancing           207  
2  56be85543aeaaa14008c9066     

In [4]:
df.sample(5)

Unnamed: 0,title,context,question,id,is_impossible,answer_text,answer_start
108343,Hyderabad,The establishment of Indian Drugs and Pharmace...,In what year was Indian Drugs and Pharmaceutic...,572f97da947a6a140053caa2,False,1961,102
63891,Yale_University,Yale has produced alumni distinguished in thei...,What Italian Prime Minister never attended Yale?,5ad3ed86604f3c001a3ff7b4,True,,-1
96373,Ottoman_Empire,Before the reforms of the 19th and 20th centur...,What was the main responsibility of the Ottoma...,572a3c5c3f37b31900478804,False,to defend and extend the land of the Muslims,726
41856,Franco-Prussian_War,"On 1 September 1870, the battle opened with th...",On which date did the battle begin with the Ar...,570d615efed7b91900d45f83,False,1 September 1870,3
27762,Middle_Ages,"In central and northern Italy and in Flanders,...",What sea were the Hanseatic cities located on?,56fb7c108ddada1400cd6457,False,Baltic,232


#
# Chunk Metadata via NLP Tagging
#

In [5]:
# Step 1: Install spaCy and download the model
!pip install spacy -q
# !pip install spacy[cuda117] -q
#!python -m spacy download en_core_web_sm

import spacy
print("CUDA available:", spacy.prefer_gpu())

[0mCUDA available: False


In [6]:
import spacy
import spacy.cli

def ensure_model_downloaded(model_name):
    try:
        # Attempt to load the model
        spacy.load(model_name)
        print(f"{model_name} is already downloaded.")
    except OSError:
        # If loading fails, download the model
        print(f"{model_name} not found. Downloading...")
        spacy.cli.download(model_name)
        print(f"{model_name} has been downloaded.")

# Call the function for the 'en_core_web_sm' model
ensure_model_downloaded('en_core_web_sm')

en_core_web_sm is already downloaded.


In [7]:
import spacy
from multiprocessing.pool import ThreadPool
from tqdm import tqdm

# Apply tqdm
tqdm.pandas()

# Load spaCy model
nlp = spacy.load('en_core_web_sm')
# Check if CUDA is available
print("CUDA available:", spacy.prefer_gpu())

# Function to extract nlp features from a text
def spacy_extract(text):
    doc = nlp(text)  # Process the text with spaCy
    
    # Extract entities
    entities = [ent.text for ent in doc.ents]  

    # Extract the subject
    subobjects = [token.text for token in doc if (not token.is_stop) and
                ((token.dep_ == 'nsubj')
               or (token.dep_ == 'iobj')
               or (token.dep_ == 'dobj'))]
    
    # Get lemmas for each token
    # lemmatized_words = [token.lemma_ for token in doc]  
    return {
        "entities": ' '.join(entities), 
        "subobjects": ' '.join(subobjects),
        #"lemmatext": ' '.join(lemmatized_words),
    }

# WWWW extraction function
def wwww_extract(doc):
    results = {
        "who": [],
        "what": [],
        "where": [],
        "when": [],
    }

    for sent in doc.sents:
        for token in sent:
            # Skip the token if stop word
            if token.is_stop:
                continue
            
            # Extract "Who"
            if token.dep_ == "nsubj":
                results["who"].append(token.text)
            if token.ent_type_ in ["PERSON", "ORG"]:
                results["who"].append(token.text)

            # Extract "What"
            if token.dep_ == "dobj" or token.pos_ == "NOUN":
                results["what"].append(token.text)

            # Extract "Where"
            if token.ent_type_ in ["GPE", "LOC", "FAC"]:
                results["where"].append(token.text)

            # Extract "When"
            if token.ent_type_ in ["DATE", "TIME"]:
                results["when"].append(token.text)
    return  {
                "who": ' '.join(results["who"]),
                "what": ' '.join(results["what"]),
                "where": ' '.join(results["where"]),
                "when": ' '.join(results["when"]),
            }

# Function to apply spacy_extract in parallel with tqdm
def parallel_apply(func, data):
    with ThreadPool(processes = 12) as pool:
        # Use tqdm to create a progress bar
        results = list(tqdm(pool.imap(func, data), total=len(data)))
    return results

def batch_apply(listoftexts = [""]):
    # Load the spaCy model
    nlp = spacy.load("en_core_web_sm")
    batch_size = 256
    results = []

    for doc in tqdm(nlp.pipe(listoftexts, batch_size=batch_size, n_process=8), total=len(listoftexts), desc="Processing"):
        results.append(wwww_extract(doc))

    return results

CUDA available: False


In [8]:
# Define data scope
nsample = 1000
samdf = df.sample(nsample, random_state=42)

# Apply spacy to the DataFrame column
#features = samdf['context'].progress_apply(spacy_extract)
#features = parallel_apply(spacy_extract, samdf['context'])
features = batch_apply(samdf['context'].tolist())
print(features[0])

Processing: 100%|██████████| 1000/1000 [00:04<00:00, 200.52it/s]

{'who': 'markets market crisis BNP Paribas Paribas', 'what': 'collapse institutions bailout banks governments stock markets areas housing market evictions foreclosures unemployment crisis role failure businesses consumer wealth trillions dollars downturn activity recession debt crisis phase crisis liquidity crisis withdrawals hedge funds evaporation liquidity', 'where': 'U.S.', 'when': 'August 9 , 2007'}





In [9]:
featuresdf = pd.json_normalize(features)

In [10]:
print(len(featuresdf))

1000


In [11]:
samdf = df.sample(nsample, random_state=42)
samdf = pd.concat([samdf.reset_index(drop=True), pd.json_normalize(features).reset_index(drop=True)], axis=1)
samdf.head(5)

Unnamed: 0,title,context,question,id,is_impossible,answer_text,answer_start,who,what,where,when
0,Financial_crisis_of_2007%E2%80%9308,It threatened the collapse of large financial ...,What year did the global recession that follow...,573257950fdd8d15006c69ee,False,2012,481,markets market crisis BNP Paribas Paribas,collapse institutions bailout banks government...,U.S.,"August 9 , 2007"
1,House_music,"But house was also being developed on Ibiza,[c...",what was a popular club in ibiza that started ...,5706b11d0eeca41400aa0d36,False,Amnesia,251,artists Balearic mix clubs clubs Ecstasy DJs T...,house house artists labels island time mid-198...,Amnesia UK Manchester London Southwark,mid-1980s late 1987
2,Mary_(mother_of_Jesus),Although Calvin and Huldrych Zwingli honored M...,In what century did Martin Luther honor Mary a...,5ad17e8d645df0001a2d1e38,True,,-1,Calvin Calvin Huldrych Zwingli Mary Martin Lut...,Mary century idea respect honor Catholics Mary...,Marian,16th century 16th century
3,Himachal_Pradesh,"Due to extreme variation in elevation, great v...",What is the climate like?,5709667eed30961900e840a1,False,varies from hot and subhumid tropical,115,variation Himachal climate state Lahaul Himach...,variation elevation variation conditions clima...,Dharamsala Spiti,"summer , winter Summer mid - April end June Wi..."
4,Elizabeth_II,The Queen addressed the United Nations for a s...,How many times has the Queen toured Canada?,5ad378fa604f3c001a3fe3a1,True,,-1,Queen United Nations Commonwealth Commonwealth...,Nations time capacity Queen realms moon anchor...,New York Canada Australia,2010 October 2011 1954


In [12]:
samdf['title'] = samdf['title'].str.replace('_', ' ', regex=False).str.split('%').str[0]
samdf["twwww"] = samdf.apply(lambda row: f"{row['title']}, {row['who']}, {row['what']}, {row['where']}, {row['when']}", axis=1)
samdf = samdf.drop(['who', 'what', 'where', 'when', 'title'], axis=1)
samdf.head(10)

Unnamed: 0,context,question,id,is_impossible,answer_text,answer_start,twwww
0,It threatened the collapse of large financial ...,What year did the global recession that follow...,573257950fdd8d15006c69ee,False,2012,481,"Financial crisis of 2007, markets market crisi..."
1,"But house was also being developed on Ibiza,[c...",what was a popular club in ibiza that started ...,5706b11d0eeca41400aa0d36,False,Amnesia,251,"House music, artists Balearic mix clubs clubs ..."
2,Although Calvin and Huldrych Zwingli honored M...,In what century did Martin Luther honor Mary a...,5ad17e8d645df0001a2d1e38,True,,-1,"Mary (mother of Jesus), Calvin Calvin Huldrych..."
3,"Due to extreme variation in elevation, great v...",What is the climate like?,5709667eed30961900e840a1,False,varies from hot and subhumid tropical,115,"Himachal Pradesh, variation Himachal climate s..."
4,The Queen addressed the United Nations for a s...,How many times has the Queen toured Canada?,5ad378fa604f3c001a3fe3a1,True,,-1,"Elizabeth II, Queen United Nations Commonwealt..."
5,The Constitution of Bermuda came into force on...,Has Bermuda's constitution ever been amended?,57294e7a6aef051400154c93,False,it was amended in 1989 and 2003,60,"Bermuda, Constitution head branch Senate Senat..."
6,"Florida is served by Amtrak, operating numerou...",What train line connects florida to the south?,5acda5f107355d001abf48c0,True,,-1,"Florida, Amtrak Amtrak stations Sanford Tampa ..."
7,"Programmes, such as the politically fuelled Gi...",What is the name of a show from Northern Irela...,56de818dcffd8e1900b4b99c,False,Patrick Kielty Almost Live,852,"BBC Television, Programmes BBC Northern Irelan..."
8,The two volume biography of Whitehead by Victo...,Why was every Nachlass left behind after Whit...,5ad3c964604f3c001a3ff096,True,,-1,"Alfred North Whitehead, biography Whitehead Vi..."
9,Pesticide use raises a number of environmental...,What concerns are raised when thinking about a...,5a7112630efcfe001a8afd20,True,,-1,"Pesticide, use % % drift pesticides Pesticides..."


In [13]:
len(samdf)

1000

Evaluation on LLM-RAG:

1. Retrieval - RAG retrieved the needed context? <br>
2. Relevance - LLM deciphered the relevant context? <br>
3. Accuracy - LLM answer contains the answer <br>
4. Precision - LLM answer does not contain irrelevant answer <br>







#
# Vectorize to Milvus
#

In [14]:
import torch, os
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize

# Define model names
splade_model_name = "naver/splade-cocondenser-ensembledistil"
mpnet_model_name = "sentence-transformers/all-mpnet-base-v2"

# Default cache directories (can be customized)
transformers_cache_dir = os.path.expanduser("~/.cache/huggingface/transformers")
sentence_transformers_cache_dir = os.path.expanduser("~/.cache/torch/sentence_transformers")

# Load SPLADE model
try:
    # Attempt to load the model from cache
    splade_tokenizer = AutoTokenizer.from_pretrained(transformers_cache_dir)
    splade_model = AutoModel.from_pretrained(transformers_cache_dir).to('cuda')
    print(f"Loaded {splade_model_name} from cache.")
except Exception as e:
    print(f"{splade_model_name} not found in cache. Downloading...")
    # Download and save the model
    splade_tokenizer = AutoTokenizer.from_pretrained(splade_model_name)
    splade_tokenizer.save_pretrained(transformers_cache_dir)
    splade_model = AutoModel.from_pretrained(splade_model_name)
    splade_model.save_pretrained(transformers_cache_dir)
    splade_model.to('cuda')

    # splade_tokenizer = AutoTokenizer.from_pretrained("naver/splade-cocondenser-ensembledistil")
    # splade_model = AutoModel.from_pretrained("naver/splade-cocondenser-ensembledistil").to('cuda')

# Load ALL-MPNET-BASE-V2 model
try:
    # Attempt to load the model from cache
    mpnet_model = SentenceTransformer(sentence_transformers_cache_dir)
    print(f"Loaded {mpnet_model_name} from cache.")
except Exception as e:
    print(f"{mpnet_model_name} not found in cache. Downloading...")
    # Download and save the model
    mpnet_model = SentenceTransformer(mpnet_model_name)
    mpnet_model.save(sentence_transformers_cache_dir)
    mpnet_model.to('cuda')

    #mpnet_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    #mpnet_model = mpnet_model.to('cuda')

Loaded naver/splade-cocondenser-ensembledistil from cache.
Loaded sentence-transformers/all-mpnet-base-v2 from cache.


In [15]:
# Vectorize using ALL-MPNET-BASE-V2 with tqdm progress bar
def vectorize_mpnet_batch(texts, BATCH_SIZE=1):
    embeddings = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Vectorizing with ALL-MPNET-BASE-V2"):
        batch_texts = texts[i:i+BATCH_SIZE]
        valid_texts = [text if isinstance(text, str) else "" for text in batch_texts]
        batch_embeddings = mpnet_model.encode(valid_texts, device='cuda', batch_size=BATCH_SIZE, normalize_embeddings=True)
        embeddings.extend(batch_embeddings)
    return embeddings

BATCH_SIZE = 256
mpnet_embeddings = vectorize_mpnet_batch(samdf['context'].tolist(), BATCH_SIZE)
samdf['mpnet'] = mpnet_embeddings

Vectorizing with ALL-MPNET-BASE-V2: 100%|██████████| 4/4 [00:14<00:00,  3.59s/it]


In [16]:
"""mpnet_model.to('cpu')
del mpnet_model
torch.cuda.empty_cache()"""

"mpnet_model.to('cpu')\ndel mpnet_model\ntorch.cuda.empty_cache()"

In [17]:
# Vectorize using SPLADE with tqdm for progress bar (on GPU)
def vectorize_splade_batch(texts, BATCH_SIZE=1):
    embeddings = []
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Vectorizing with SPLADE"):
        batch_texts = texts[i:i+BATCH_SIZE]
        valid_texts = [text if isinstance(text, str) else "" for text in batch_texts]

        # Tokenize the valid texts, applying padding and truncation as necessary, and ...
        # ... convert them into PyTorch tensors. Move tensors to GPU ('cuda').
        inputs = splade_tokenizer(valid_texts, padding=True, truncation=True, return_tensors='pt').to('cuda')
        with torch.no_grad(): # Disable gradient calculation
            # Pass the tokenized inputs through the SPLADE model to obtain outputs
            outputs = splade_model(**inputs) 

            # Compute the mean of the last hidden state across all tokens for each input text,
            # effectively pooling the output to get a single embedding vector per text.
            pooled_output = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

            # Normalize the pooled output to have a norm of 1 (L2 normalization)
            normalized_output = normalize(pooled_output, norm='l2')
            
            # Extend the embeddings list with the pooled output from this batch
            embeddings.extend(pooled_output)
    return embeddings

BATCH_SIZE = 256
splade_embeddings = vectorize_splade_batch(samdf['context'].tolist(), BATCH_SIZE)
samdf['splade'] = splade_embeddings

Vectorizing with SPLADE: 100%|██████████| 4/4 [00:12<00:00,  3.13s/it]


In [18]:
BATCH_SIZE = 256

samdf['splade_twwww'] = vectorize_splade_batch(samdf["twwww"].tolist(), BATCH_SIZE)
samdf['mpnet_twwww'] = vectorize_mpnet_batch(samdf["twwww"].tolist(), BATCH_SIZE)

#for tag in ['who', 'what', 'where', 'when']:
    #samdf['splade_'+tag] = vectorize_splade_batch(samdf[tag].tolist(), BATCH_SIZE)
    #samdf['mpnet_'+tag] = vectorize_mpnet_batch(samdf[tag].tolist(), BATCH_SIZE)

Vectorizing with SPLADE: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]
Vectorizing with ALL-MPNET-BASE-V2: 100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


In [19]:
"""splade_model.to('cpu')
del splade_model
torch.cuda.empty_cache()"""
for col in samdf.columns: 
    print(f'{col} : {samdf[col].dtypes} - {pd.api.types.infer_dtype(samdf[col])} - isstring({pd.api.types.is_string_dtype(samdf[col])})')
    print(isinstance(samdf[col].iloc[0], list) or isinstance(samdf[col].iloc[0], np.ndarray))

#samdf['splade'].iloc[0]

context : object - string - isstring(True)
False
question : object - string - isstring(True)
False
id : object - string - isstring(True)
False
is_impossible : bool - boolean - isstring(False)
False
answer_text : object - string - isstring(True)
False
answer_start : int64 - integer - isstring(False)
False
twwww : object - string - isstring(True)
False
mpnet : object - mixed - isstring(False)
True
splade : object - mixed - isstring(False)
True
splade_twwww : object - mixed - isstring(False)
True
mpnet_twwww : object - mixed - isstring(False)
True


In [20]:
max_context = samdf["context"].str.len().max()
print(max_context)

3706


In [21]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility 

# Connect to Milvus server
try:
    # Replace 'localhost' with the actual host if Milvus is running on a different machine
    connections.connect("default", host="milvus-host", port="19530")

    # Verify the connection
    if connections.has_connection("default"):
        print("Successfully connected to Milvus")
    else:
        print("Failed to connect to Milvus")
except Exception as e:
    print(f"MilvusException: {e}")

# Define schema for the collection
"""
fields = [
    FieldSchema(name="mindex", dtype=DataType.INT64, is_primary=True, auto_id=True), # milvus index as primary key
    FieldSchema(name="qid", dtype=DataType.VARCHAR, max_length=50), # dataset row and question id, not a primary key in milvus
    FieldSchema(name="mpnet_vector", dtype=DataType.FLOAT_VECTOR, dim=768),  # Dimension of ALL-MPNET-BASE-V2 vectors
    FieldSchema(name="splade_vector", dtype=DataType.FLOAT_VECTOR, dim=768),  # SPLADE vector dimension
    FieldSchema(name="context", dtype=DataType.VARCHAR, max_length=4096)  # SPLADE vector dimension
]
"""

# Extract FieldSchema for data_to_insert
field_schemas = [FieldSchema(name="mindex", dtype=DataType.INT64, is_primary=True, auto_id=True)]

for col in samdf.columns:
    if pd.api.types.is_integer_dtype(samdf[col]):
        field_schemas.append(FieldSchema(name = col,  dtype = DataType.INT64))
    elif pd.api.types.is_float_dtype(samdf[col]):
        field_schemas.append(FieldSchema(name = col,  dtype = DataType.FLOAT))
    elif pd.api.types.is_bool_dtype(samdf[col]):
        field_schemas.append(FieldSchema(name = col,  dtype = DataType.BOOL))
    elif (pd.api.types.is_string_dtype(samdf[col])) or (pd.api.types.infer_dtype(samdf[col]) == 'string'):
        field_schemas.append(FieldSchema(name=col, dtype=DataType.VARCHAR, 
                                         max_length=int(samdf[col].str.len().max() + 50), default_value='')) # Return max length for VARCHAR 
    elif ((pd.api.types.is_object_dtype(samdf[col]) or 
          (pd.api.types.infer_dtype(samdf[col]) == 'mixed')) and 
            (isinstance(samdf[col].iloc[0], list) or
            isinstance(samdf[col].iloc[0], np.ndarray))):
        field_schemas.append(FieldSchema(name= col, dtype=DataType.FLOAT_VECTOR, 
                                         dim=len(samdf[col].iloc[0]))) # Assuming all lists are of equal length
    else:
        print(f'Uknown datatype for column {col} : {samdf[col].dtypes}')

for field in field_schemas:
    print(field)

Successfully connected to Milvus
{'name': 'mindex', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}
{'name': 'context', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 3756}, 'default_value': string_data: ""
}
{'name': 'question', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 214}, 'default_value': string_data: ""
}
{'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 74}, 'default_value': string_data: ""
}
{'name': 'is_impossible', 'description': '', 'type': <DataType.BOOL: 1>}
{'name': 'answer_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 252}, 'default_value': string_data: ""
}
{'name': 'answer_start', 'description': '', 'type': <DataType.INT64: 5>}
{'name': 'twwww', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1615}, 'default_value': string_data: ""
}
{'name': 'mpnet', 'description': 

In [22]:
schema = CollectionSchema(field_schemas)

collection_name = "squad2"

# Check if the collection exists
if utility.has_collection(collection_name):
    print(f"Collection '{collection_name}' exists. Dropping it...")
    utility.drop_collection(collection_name)
    print(f"Collection '{collection_name}' has been dropped.")
else:
    print(f"Collection '{collection_name}' does not exist. Making it...")

try: 
    collection = Collection(name=collection_name, schema=schema)
    print(f"Collection '{collection_name}' has been created.")
except Exception as e:
    print(f"Failed to create collection: {e}")
    print(f"Collection name: {collection_name}")
    for i in range(len(schema.fields)):
        print(schema.fields[i])

# Create index on vector fields (optional but recommended for faster searches)
index_params = {"index_type": "HNSW", "metric_type": "COSINE", "params": {"M": 16, "efConstruction": 32}}

for emb in ["splade", "mpnet"]:
    collection.create_index(field_name=emb, index_params=index_params)
    collection.create_index(field_name=emb+"_twwww", index_params=index_params)
    #for tag in ['', '_who', '_what', '_where', '_when']:
    #    collection.create_index(field_name=emb+tag, index_params=index_params)
    #    print(f"created {emb}{tag}")

Collection 'squad2' exists. Dropping it...
Collection 'squad2' has been dropped.
Collection 'squad2' has been created.


In [23]:
# Prepare data for insertion
data_to_insert = []

for col in samdf.columns:
    data_to_insert.append(samdf[col].tolist())
    #print(f'{col} : {data_to_insert[-1][0]}')

print(len(data_to_insert))

# Insert data into Milvus collection
BATCH_SIZE = 500  # Adjust batch size as needed
total_vectors = len(data_to_insert[0])

for start in tqdm(range(0, total_vectors, BATCH_SIZE), desc="Inserting data into Milvus"):
    end = min(start + BATCH_SIZE, total_vectors)
    batch_data = [data_to_insert[i][start:end] for i in range(len(data_to_insert))]
    
    try:
        collection.insert(batch_data)
    except Exception as e:
        print(f"Error inserting batch from {start} to {end}: {e}")

# collection.insert(data_to_insert)

# Flush to ensure data is written to disk
collection.flush()

11


Inserting data into Milvus: 100%|██████████| 2/2 [00:00<00:00,  3.40it/s]


In [25]:
"""
TODO
1. Extract wwww from 1 question
2. Vectorize emb_wwww 
3. 10 Vector searches
4. RRF
5. F1 and MRR
"""

'\nTODO\n1. Extract wwww from 1 question\n2. Vectorize emb_wwww \n3. 10 Vector searches\n4. RRF\n5. F1 and MRR\n'

In [71]:
def search_collection(collection_name, keyword):
    # Extract the question
    wwww = batch_apply([keyword])
    
    # Generate embeddings for the keyword
    mpnet_emb = vectorize_mpnet_batch(keyword, 1)
    splade_emb = vectorize_splade_batch(keyword, 1)

    # Assuming you want to use SPLADE embedding for searching
    if not utility.has_collection(collection_name):
        print(f"Collection '{collection_name}' does not exist.")
    else:
        print(f"Collection '{collection_name}' is not loaded. Loading now...")
        collection = Collection(collection_name)
        collection.load()
        
        load_state = utility.load_state(collection_name)
        print(f"Load state for '{collection_name}': {load_state}")
        print(f"Schema : {collection.schema}")
        print(f"Num of Entries : {collection.num_entities}")
    print("#"*30)
    
    # Perform the search using the SPLADE embedding
    search_params = {"metric_type": "COSINE", "params": {"ef": 64, "nprobe": 10}}  # Adjust parameters as needed
    search_result = {}
    for emb in ["splade", "mpnet"]:
        for tag in ['', '_twwww']:
            search_result[emb+tag] = collection.search(
                data=splade_emb, 
                anns_field=emb+tag, # Field name where embeddings are stored
                param=search_params,
                output_fields=["mindex", "id", "context", "twwww"],
                limit=10,
                expr=None           # Optional: additional filtering conditions
            )
            # TODO: Print search summary
    
    return search_result

# Combine results using Reciprocal Rank Fusion (RRF)
from collections import defaultdict
def reciprocal_rank_fusion(results_dict, k=60):
    """
    Combine results from multiple searches using Reciprocal Rank Fusion (RRF).
    
    Args:
        results_dict (dict of lists): Each sublist contains search results for one field.
        k (int): Constant to control the impact of rank position.

    Returns:
        dict: Combined scores for each unique ID.
    """
    #combined_scores = {}
    combined_scores = defaultdict(int)  # Automatically initializes missing keys to 0
    
    for key, results in results_dict.items():
        #print(key)
        #print(results)
        for rank, result in enumerate(results[0]):  # Iterate over top-k results for each query
            doc_id = result.id
            score = 1 / (k + rank + 1)  # RRF formula
            
            #if doc_id not in combined_scores:
            #    combined_scores[doc_id] = score
            #else:
            combined_scores[doc_id] += score
    
    return dict(combined_scores)
    
def qid_exists(qid='', collection_name = "squad2"):
    # Define the value you are searching for and the field
    collection = Collection(collection_name)
    search_field = "id"
    search_value = qid
    
    # Construct the DSL query
    expr = f"{search_field} in ['{search_value}']"
    
    # Perform the search
    search_results = collection.query(
        expr=expr,
        output_fields=[search_field],  # Specify which fields to return
        limit=1  # Limit results to 1 for existence check
    )
    
    # Check if any results were returned
    if search_results:
        print(f"The value '{search_value}' exists in field '{search_field}'.")
    else:
        print(f"The value '{search_value}' does not exist in field '{search_field}'.")
    
    return search_results

from sklearn.metrics.pairwise import cosine_similarity
def calculate_similarity(string1, string2, embedding="splade"):
    if embedding == "splade":
        embedding1 = vectorize_splade_batch(string1)
        embedding2 = vectorize_splade_batch(string2)
    else:
        embedding1 = vectorize_mpnet_batch(string1)
        embedding2 = vectorize_mpnet_batch(string2)
    
    # Compute cosine similarity
    similarity_score = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity_score

def print_structure(data, indent=0):
    """Recursively print the structure of a nested data structure."""
    spacing = ' ' * indent
    if isinstance(data, dict):
        print(f"{spacing}{{")
        for key, value in data.items():
            print(f"{spacing}  '{key}': ", end='')
            print_structure(value, indent + 4)
        print(f"{spacing}}}")
    elif isinstance(data, list):
        print(f"{spacing}[")

        for item in data:
            print_structure(item, indent + 4)
        print(f"{spacing}]")
    else:
        print(f"{spacing}{type(data).__name__}")

In [69]:
question = samdf['question'].iloc[0]
qid = samdf['id'].iloc[0]
qcontext = samdf['context'].iloc[0]
qtwwww = samdf['twwww'].iloc[0]
print(question, qid, qtwwww)
print("=" * 20)

qidexists = qid_exists(qid)
print(f"QID exists? {qidexists}")
print("=" * 20)

print("SPLADE")
print(f"Context Cosine Similiarity: {calculate_similarity(question, qcontext)}")
print(f"Extraction Cosine Similiarity: {calculate_similarity(question, qtwwww)}")
print("=" * 20)

print("ALL MPNET BASE V2")
print(f"Context Cosine Similiarity: {calculate_similarity(question, qcontext, 'mpnet')}")
print(f"Extraction Cosine Similiarity: {calculate_similarity(question, qtwwww, 'mpnet')}")
print("=" * 20)

What year did the global recession that followed the financial crisis of 2007 end? 573257950fdd8d15006c69ee Financial crisis of 2007, markets market crisis BNP Paribas Paribas, collapse institutions bailout banks governments stock markets areas housing market evictions foreclosures unemployment crisis role failure businesses consumer wealth trillions dollars downturn activity recession debt crisis phase crisis liquidity crisis withdrawals hedge funds evaporation liquidity, U.S., August 9 , 2007
The value '573257950fdd8d15006c69ee' exists in field 'id'.
QID exists? data: ["{'id': '573257950fdd8d15006c69ee', 'mindex': 454303133564142207}"] 
SPLADE


Vectorizing with SPLADE: 100%|██████████| 82/82 [00:00<00:00, 108.24it/s]
Vectorizing with SPLADE: 100%|██████████| 773/773 [00:04<00:00, 169.94it/s]


Context Cosine Similiarity: 0.7960536479949951


Vectorizing with SPLADE: 100%|██████████| 82/82 [00:00<00:00, 166.16it/s]
Vectorizing with SPLADE: 100%|██████████| 391/391 [00:02<00:00, 161.95it/s]


Extraction Cosine Similiarity: 0.80927574634552
ALL MPNET BASE V2


Vectorizing with ALL-MPNET-BASE-V2: 100%|██████████| 82/82 [00:00<00:00, 106.39it/s]
Vectorizing with ALL-MPNET-BASE-V2: 100%|██████████| 773/773 [00:06<00:00, 111.30it/s]


Context Cosine Similiarity: 0.37077558040618896


Vectorizing with ALL-MPNET-BASE-V2: 100%|██████████| 82/82 [00:00<00:00, 98.01it/s] 
Vectorizing with ALL-MPNET-BASE-V2: 100%|██████████| 391/391 [00:03<00:00, 103.22it/s]

Extraction Cosine Similiarity: 0.4324513375759125





In [61]:
results = search_collection(collection_name, question)

Processing:   0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Collection 'squad2' is not loaded. Loading now...
Load state for 'squad2': Loaded
Schema : {'auto_id': True, 'description': '', 'fields': [{'name': 'mindex', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'context', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 3756}, 'default_value': string_data: ""
}, {'name': 'question', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 214}, 'default_value': string_data: ""
}, {'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 74}, 'default_value': string_data: ""
}, {'name': 'is_impossible', 'description': '', 'type': <DataType.BOOL: 1>}, {'name': 'answer_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 252}, 'default_value': string_data: ""
}, {'name': 'answer_start', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'twwww', 'description': '', 'type': <DataType.V

In [70]:



print(results['splade_twwww'][0][0].entity)
print("=" * 20)
print(results['splade_twwww'][0][0].entity.get('id'))
print("=" * 20)
print(results['splade_twwww'][0][0].entity.get('twwww'))

{   'mpnet': data: ['[\'id: 454303133564143134, distance: 0.1503101885318756, entity: {\\\'mindex\\\': 454303133564143134, \\\'id\\\': \\\'5a845b527cf838001a46a77a\\\', \\\'context\\\': "When the news arrived in England it caused an outcry. In response, a combined bounty of £1,000 was offered for Every\\\'s capture by the Privy Council and East India Company, leading to the first worldwide manhunt in recorded history. The plunder of Aurangzeb\\\'s treasure ship had serious consequences for the English East India Company. The furious Mughal Emperor Aurangzeb ordered Sidi Yaqub and Nawab Daud Khan to attack and close four of the company\\\'s factories in India and imprison their officers, who were almost lynched by a mob of angry Mughals, blaming them for their countryman\\\'s depredations, and threatened to put an end to all English trading in India. To appease Emperor Aurangzeb and particularly his Grand Vizier Asad Khan, Parliament exempted Every from all of the Acts of Grace (pardons

In [28]:
def printsearch(results=[], qid="", qcontext="", fullresult=False):
    for result in results:
        numkeys = 0
        truehit = 0
        for hit in result:
            entity_id = hit.entity.get("id")  #
            entity_text = hit.entity.get("context")  # Get original text (replace 'text' with your field name)
            distance = hit.distance  # Get distance of the result
            containskey = (entity_id in qid) or (qcontext in entity.entity.get("context"))
            truehit += containskey
            
            if fullresult: 
                print(f"ID: {entity_id}, Distance: {distance:.2f}, Has Key: {containskey}, \nText: {entity_text:.200}\n")
        #print(result)
    print(f"TrueHit : {truehit}")
    return None

# Apply RRF to combine results from both searches
combined_scores = reciprocal_rank_fusion(search_result)

# Sort combined scores in descending order of relevance
sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

print(f"Splade Search Results Question: {question}")
print("=" * 20)
#print(results['splade'])
printsearch(results['splade'], qid, qcontext)
print("=" * 20)
printsearch(results['mpnet'], qid, qcontext)
print("=" * 20)
printsearch(results['splade_twwww'], qid, qcontext)
print("=" * 20)
printsearch(results['mpnet_twwww'], qid, qcontext)

Splade Search Results Question: What year did the global recession that followed the financial crisis of 2007 end?
TrueHit : 0
TrueHit : 0
TrueHit : 0
TrueHit : 0


In [35]:
# Extract IDs and distances from results
ids = []
distances = []
contexts = []

searchfield = 'splade_twwww'
for result in results[searchfield]:
    for item in result:
        ids.append(item.id)
        distances.append(item.distance)
        contexts.append(item.context)

# Create a DataFrame from the results
df_results = pd.DataFrame({
    'ID': ids,
    'Distance': distances,
    'context': contexts
})
print(qid)
# Display the DataFrame
df_results.head(10)

573257950fdd8d15006c69ee


Unnamed: 0,ID,Distance,context
0,454303133564142281,0.759491,The exact number of speakers of Somali is unkn...
1,454303133564142912,0.733379,While Western Armenia still remained under Ott...
2,454303133564142459,0.72582,"Similar examples abound. Macedonian, although ..."
3,454303133564142770,0.707451,After the Estonian War of Independence in 1919...
4,454303133564142629,0.689828,Following capture and occupation by the United...
5,454303133564142329,0.687742,Section 6 of the Endangered Species Act provid...
6,454303133564142838,0.679507,"On December 25, 1991, following the collapse o..."
7,454303133564142336,0.678954,"In Canada, ""the 51st state"" is a phrase genera..."
8,454303133564142727,0.677137,Whilst it is often perceived as an optimal sol...
9,454303133564142228,0.67444,The Roman Catholic Church canon law also inclu...


In [29]:
nlpquestion = " ".join([v for _, v in wwww_extract(nlp(question)).items()])
nlpquestion

'year year recession crisis end  2007'

In [30]:
nlpresults = search_collection(collection_name, nlpquestion)

print(f"NLP Search Results Question: {question}")
print("=" * 20)
#print(results['splade'])
printsearch(nlpresults['splade'], qid, qcontext)
print("=" * 20)
printsearch(nlpresults['mpnet'], qid, qcontext)
print("=" * 20)
printsearch(nlpresults['splade_twwww'], qid, qcontext)
print("=" * 20)
printsearch(nlpresults['mpnet_twwww'], qid, qcontext)

Processing:   0%|          | 0/1 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 

Collection 'squad2' is not loaded. Loading now...
Load state for 'squad2': Loaded
Schema : {'auto_id': True, 'description': '', 'fields': [{'name': 'mindex', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'context', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 3756}, 'default_value': string_data: ""
}, {'name': 'question', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 214}, 'default_value': string_data: ""
}, {'name': 'id', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 74}, 'default_value': string_data: ""
}, {'name': 'is_impossible', 'description': '', 'type': <DataType.BOOL: 1>}, {'name': 'answer_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 252}, 'default_value': string_data: ""
}, {'name': 'answer_start', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'twwww', 'description': '', 'type': <DataType.V

In [31]:
print(f"ALL MPNET Search Results: {question}")
print("=" * 20)
print(mpnet_results[0][0])
print("=" * 20)
printsearch(mpnet_results, qid, qcontext)

ALL MPNET Search Results: What year did the global recession that followed the financial crisis of 2007 end?


NameError: name 'mpnet_results' is not defined

In [None]:
print(samdf['context'].iloc[0])
print("=" * 20)
print(fdf['who'])
print("=" * 20)
print(fdf['when'])
print("=" * 20)
print(fdf['where'])

In [None]:
# Apply spacy to the DataFrame column
features = samdf['question'].iloc[0:10].progress_apply(spacy_extract)
fdf = pd.json_normalize(features).iloc[0]
# Display the updated DataFrame with NER and Lemmatization results
print(fdf)
print("=" * 20)
print(samdf['question'].iloc[0])
print("=" * 20)
print(fdf['entities'])
print("=" * 20)
print(fdf['subobjects'])
print("=" * 20)
print(fdf['lemmatext'])

In [None]:
# TODO: save in .csv

# TODO: run official eval script 

# TODO: run visualization of official results 

#
# Multi-Ollama for Concurrent Invokes
#

In [None]:
import itertools
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate

import deh.settings as settings
import deh.guardrail as guardrail
from deh.utils import format_context_documents as format_docs
from deh.utils import retriever_with_scores, dedupulicate_contexts
from deh.prompts import (
    qa_eval_prompt_with_context_text,
    LLMEvalResult,
    rag_text_prompts,
    hyde_prompts,
)
import deh

In [None]:
# Set up load balancing between the two instances
nLLM = 4
LLM = [Ollama(
            base_url=settings.OLLAMA_HOST, model=settings.LLM_MODEL, verbose=True
        ) for i in range(nLLM)
      ] 

print(len(LLM))
print(LLM[-1])

# Create a round-robin iterator using itertools.cycle
round_robin_models = itertools.cycle(LLM)

def LLMinvoke(prompt):
    # Get the next model in the round-robin cycle
    selected_model = next(round_robin_models)
    
    # Invoke the selected model with the prompt
    response = selected_model.invoke(prompt)
    return response

In [None]:
row = 0
q = df.iloc[row]["question"]
c = df.iloc[row]["context"]
gt = df.iloc[row]["answer_text"]

# Initial LLM generation prompt:
qa_prompt = PromptTemplate(
    template=rag_text_prompts[0], input_variables=["question", "context"]
)

chain = qa_prompt | LLMinvoke
llmans = chain.invoke({"question": q, "context": c})

print(llmans)

In [None]:
print(q)
print(c)
print(gt)
print(chain)

In [None]:
import asyncio
from tqdm.asyncio import tqdm_asyncio
import nest_asyncio

nest_asyncio.apply()

# Assuming chain.invoke is an asynchronous function
async def invoke_chain(chain, q, c, semaphore, pbar):
    async with semaphore:
        pbar.update(1)
        return await chain.ainvoke({"question": q, "context": c})

async def main(df = None, LLM = None, n=50, clim=8):
    # input validation 
    if df.empty:
        raise ValueError("The DataFrame is empty")
    assert (LLM != None)
    
    # Initial LLM generation prompt:
    qa_prompt = PromptTemplate(
        template=rag_text_prompts[0], input_variables=["question", "context"]
    )
    chain = qa_prompt | LLMinvoke

    # Create a semaphore with a limit on concurrent tasks
    semaphore = asyncio.Semaphore(clim)

    # Create a tqdm progress bar
    samdf = df.sample(n, random_state=42)
    with tqdm(total=n, desc="Processing") as pbar:
        tasks = [invoke_chain(chain, row['question'], row['context'], semaphore, pbar) for _, row in samdf.iterrows()]
        results = await asyncio.gather(*tasks)

    return results

In [None]:
# Run the main function
results = await main(df, LLM, 100, 32)

In [None]:
# Print the results
for i in range(10): 
    print(str(i) + " : " + results[i])

In [None]:
ans_len = df["answer_text"].str.split().str.len()
print(ans_len.mean())
print(ans_len.std())