In [2]:
import pandas as pd
import os
import sys
import pinecone
from tqdm import tqdm
from typing import Any, List
import logging
import json


from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

from langchain_community.embeddings import HuggingFaceEmbeddings

import mlflow
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

cwd = os.getcwd()

sys.path.insert(0, os.path.abspath(os.path.join(cwd, '../assets')))
sys.path.insert(0, os.path.abspath(os.path.join(cwd, '../src')))
sys.path.insert(0, os.path.abspath(os.path.join(cwd, '../')))

from scripts.pinecone_func import pinecone_upsert


embed_model = HuggingFaceEmbeddings()

  from tqdm.autonotebook import tqdm
  warn_deprecated(


# Chunking

1. Why is it important?
Chunking is a critical step in the data processing pipeline, especially when dealing with large datasets or text corpora. It involves breaking down large pieces of data into smaller, manageable chunks. This process is essential for several reasons:

* LLMs have limited context window


**Chunking approaches:**
1. Token Splitting:
    * `text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=100, chunk_overlap=0
)
texts = text_splitter.split_text(state_of_the_union)`
2. SentenceSplitter
3. SemanticSplitter

**Sizes:**
[128, 256, 512, 1025]


2. What are the approaches?

* Experiment with different chunk size and different chunking approach
* Experiment with different embeddings
    * huggingfacembedding
    * ?
    * ?

* Experiment with different Retrievers:
    * Self query Retriever with NER?
    * Parent Retriever
    * Contextual Compression Retriever (Reranking)


## Creating subsets with embeddings

In the previous step (data_management), I exported the data from the WiWi Humboldt website. In this stage, I will create a random subset comprising one-third of the initial DataFrame. This subset will be used to conduct experiments with different chunk sizes: 128, 256, and 512.

By taking a random third of the DataFrame, I aim to balance between computational efficiency and the reliability of the results. This approach ensures the subset is large enough to produce stable results while being small enough to reduce computational costs and improve experiments' efficiency.

#ADD ABOUT RECALL, Precision, MAP and etc.

In [3]:
chunk_sizes = [128, 256, 512, 1024]
index_name = f"huggingface-embeddings-dim768-chunk{chunk_size}"

mlflow.set_tracking_uri("../assets/mlruns")
qa_df = pd.read_csv('../assets/csv/qa_df.csv', index_col=0)


NameError: name 'chunk_size' is not defined

In [4]:

import logging
from llama_index.core.node_parser import SentenceSplitter

logger = logging.getLogger(__name__)
def chunk_text_by_sentences(text, chunk_size=1024, chunk_overlap=200):
    """
    Splits the input text into chunks using LlamaIndex SentenceSplitter.

    Args:
        text (str): The input text to be chunked.
        chunk_size (int): The token chunk size for each chunk. Default is 1024.
        chunk_overlap (int): The token overlap of each chunk when splitting. Default is 200.
    
    Returns:
        List[str]: A list of text chunks.
    """
    splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_text(text)
    
    logger.debug(f"Created {len(chunks)} sentence-based chunks.")
    return chunks


def chunk_dataframe(data: pd.DataFrame, chunk_size: int, chunk_overlap: int) -> pd.DataFrame:
    """
    Chunks the text content of a DataFrame into smaller segments based on sentences using LlamaIndex.

    Args:
        data (pd.DataFrame): Input DataFrame containing at least 'id', 'url', 'last_updated', 
                             'html_content', and 'text' columns.
        chunk_size (int): The token chunk size for each chunk.
        chunk_overlap (int): The token overlap of each chunk when splitting.

    Returns:
        pd.DataFrame: A new DataFrame where each row represents a chunk of the original text.
    """
    logger.info(f"Starting to chunk dataframe with chunk size: {chunk_size} and overlap: {chunk_overlap}")
    new_rows = []
    for _, row in tqdm(data.iterrows(), total=len(data), desc=f"Chunking (chunk size: {chunk_size})"):
        try:
            chunks = chunk_text_by_sentences(row['text'], chunk_size, chunk_overlap)
            for i, chunk in enumerate(chunks):
                new_rows.append({
                    'unique_id': f"{row['id']}_{i+1}",
                    'url': row['url'],
                    'last_updated': row['last_updated'],
                    'html_content': row['html_content'],
                    'text': chunk,
                    'len': len(chunk),
                    'id': row['id']
                })
        except Exception as e:
            logger.error(f"Error processing row {row['id']}: {str(e)}")
    result = pd.DataFrame(new_rows)
    logger.info(f"Finished chunking. Created {len(result)} chunks from {len(data)} original rows.")
    return result


df = pd.read_csv('../assets/csv/data_subset.csv', index_col=0)
#remove all na and nan values from the text column
df = df.dropna(subset=['text'])

df_chunked = chunk_dataframe(df, 1024, 200)

Chunking (chunk size: 1024): 100%|██████████| 297/297 [00:00<00:00, 932.69it/s]


In [74]:
import time
from typing import List, Tuple
from collections import defaultdict
import mlflow

def convert_question_to_vector(query):
    try:
        query_vector = embed_model.embed_query(query)
        return query_vector
    except Exception as e:
        print(f"Error converting question to vector: {e}")
        return None

def calculate_mrr(question_id: str, general_ids: List[str]) -> Tuple[int, float]:
    if question_id in general_ids:
        rank = general_ids.index(question_id) + 1
        reciprocal_rank = 1 / rank
    else:
        rank = 0
        reciprocal_rank = 0
    return rank, reciprocal_rank

def calculate_hit_at_k(question_id: str, general_ids: List[str], k: int) -> int:
    return int(question_id in general_ids[:k])

def evaluate_retriever(qa_df, docsearch, convert_question_to_vector, k_values=[1, 3, 5]):
    total_mrr = 0
    total_retrieval_time = 0
    hit_at_k = {k: 0 for k in k_values}
    num_questions = len(qa_df)

    for _, row in qa_df.iterrows():
        question = row['question']
        question_id = row['id']

        # Measure retrieval time
        start_time = time.time()
        embed_question = convert_question_to_vector(question)
        search_results = docsearch.similarity_search_by_vector_with_score(embed_question, k=max(k_values))
        end_time = time.time()
        retrieval_time = end_time - start_time
        total_retrieval_time += retrieval_time

        # Extract general_ids from search results
        general_ids = [item[0].metadata['general_id'] for item in search_results]

        # Calculate MRR
        _, reciprocal_rank = calculate_mrr(question_id, general_ids)
        total_mrr += reciprocal_rank

        # Calculate Hit@K
        for k in k_values:
            hit_at_k[k] += calculate_hit_at_k(question_id, general_ids, k)

    # Calculate averages
    avg_mrr = total_mrr / num_questions
    avg_retrieval_time = total_retrieval_time / num_questions
    avg_hit_at_k = {k: hits / num_questions for k, hits in hit_at_k.items()}

    return {
        "MRR": avg_mrr,
        "Avg Retrieval Time": avg_retrieval_time,
        "HitatK": avg_hit_at_k
    }

def evaluate_all_chunk_sizes(qa_df, chunk_sizes, embedding_name,embedding_dimension, convert_question_to_vector, k_values=[1, 3, 5]):
    results = {}

    # Set up MLflow experiment
    mlflow.set_experiment("RAG_Chunk_Size_Evaluation")

    for chunk_size in chunk_sizes:
        index_name = f"{embedding_name}-dim{embedding_dimension}-chunk{chunk_size}"

        # Start a new MLflow run for each chunk size
        with mlflow.start_run(run_name=f"chunk_size_{chunk_size}"):
            # Log chunk size as a parameter
            mlflow.log_param("chunk_size", chunk_size)

            # Assuming you have a function to load the docsearch for a specific index
            docsearch = connect_pinecone(index_name)

            print(f"\nEvaluating chunk size: {chunk_size}")
            chunk_results = evaluate_retriever(qa_df, docsearch, convert_question_to_vector, k_values)
            results[chunk_size] = chunk_results

            # Log metrics
            mlflow.log_metric("MRR", chunk_results['MRR'])
            mlflow.log_metric("Avg_Retrieval_Time", chunk_results['Avg Retrieval Time'])
            for k, hit_rate in chunk_results['HitatK'].items():
                mlflow.log_metric(f"Hitat{k}", hit_rate)

            print(f"Average MRR: {chunk_results['MRR']:.4f}")
            print(f"Average Retrieval Time: {chunk_results['Avg Retrieval Time']:.4f} seconds")
            for k, hit_rate in chunk_results['HitatK'].items():
                print(f"Hit@{k}: {hit_rate:.4f}")

    return results

def print_comparison_and_best_sizes(all_results):
    # Comparing results across chunk sizes
    print("\nComparison across chunk sizes:")
    metrics = ["MRR", "Avg Retrieval Time"] + [f"Hit_{k}" for k in [1, 3, 5]]

    for metric in metrics:
        print(f"\n{metric}:")
        for chunk_size, results in all_results.items():
            if metric == "Avg Retrieval Time":
                value = results[metric]
            elif metric.startswith("Hit_"):
                k = int(metric.split("_")[1])
                value = results["HitatK"][k]
            else:
                value = results[metric]
            print(f"  Chunk size {chunk_size}: {value:.4f}")

    # Finding the best chunk size for each metric
    best_chunk_sizes = defaultdict(list)
    for metric in metrics:
        if metric == "Avg Retrieval Time":
            best_value = min(results[metric] for results in all_results.values())
        else:
            best_value = max(results[metric] if metric == "MRR" else results["HitatK"][int(metric.split("_")[1])] 
                             for results in all_results.values())

        for chunk_size, results in all_results.items():
            if metric == "Avg Retrieval Time":
                value = results[metric]
            elif metric.startswith("Hit_"):
                k = int(metric.split("_")[1])
                value = results["HitatK"][k]
            else:
                value = results[metric]

            if value == best_value:
                best_chunk_sizes[metric].append(chunk_size)

    print("\nBest chunk size(s) for each metric:")
    for metric, sizes in best_chunk_sizes.items():
        print(f"{metric}: {sizes}")


# Main execution
if __name__ == "__main__":
    # Assuming these are defined elsewhere in your code

    chunk_sizes = [128, 256, 512, 1024]
    all_results = evaluate_all_chunk_sizes(qa_df, chunk_sizes, 'huggingface-embeddings',768 ,convert_question_to_vector)
    print_comparison_and_best_sizes(all_results)


Evaluating chunk size: 128
Average MRR: 0.5265
Average Retrieval Time: 0.2224 seconds
Hit@1: 0.4545
Hit@3: 0.5909
Hit@5: 0.6364

Evaluating chunk size: 256
Average MRR: 0.4886
Average Retrieval Time: 0.1980 seconds
Hit@1: 0.4091
Hit@3: 0.5909
Hit@5: 0.6364

Evaluating chunk size: 512
Average MRR: 0.4144
Average Retrieval Time: 0.2293 seconds
Hit@1: 0.3182
Hit@3: 0.5455
Hit@5: 0.6364

Evaluating chunk size: 1024
Average MRR: 0.4409
Average Retrieval Time: 0.2211 seconds
Hit@1: 0.3636
Hit@3: 0.5455
Hit@5: 0.5909

Comparison across chunk sizes:

MRR:
  Chunk size 128: 0.5265
  Chunk size 256: 0.4886
  Chunk size 512: 0.4144
  Chunk size 1024: 0.4409

Avg Retrieval Time:
  Chunk size 128: 0.2224
  Chunk size 256: 0.1980
  Chunk size 512: 0.2293
  Chunk size 1024: 0.2211

Hit_1:
  Chunk size 128: 0.4545
  Chunk size 256: 0.4091
  Chunk size 512: 0.3182
  Chunk size 1024: 0.3636

Hit_3:
  Chunk size 128: 0.5909
  Chunk size 256: 0.5909
  Chunk size 512: 0.5455
  Chunk size 1024: 0.5455

Hit

If accuracy and comprehensive results are most important, go with 128.
If speed is the top priority, consider 512.
If you want a balance, 256 might be a good compromise, as it performs well in Hit@1 and has good retrieval time.

in general, smaller chunk shows better MRR, Hit 1 value. 

- Followed by the chunk 256, that wins in retriever time. Therefore, we cannot say for sure which chunk is more preferable.
- Therefore, we can try testing it with different embedding model! When it is not changing for two different embeddings, we can keep the best one or the easiest in implementation
- Then we will try different chunking approach. Right now we have token sized, we should try sentence and semantic once.
- Add reranking. 



## BM25 Retriever

In [69]:
from langchain.retrievers import BM25Retriever
from rank_bm25 import BM25Okapi
from typing import List, Dict

import pinecone
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever
from rank_bm25 import BM25Okapi

# Initialize Pinecone


# Set up the vector store
embeddings = HuggingFaceEmbeddings()
vectorstore = connect_pinecone(index_name="docs-chunk-512")

# Fetch documents from Pinecone
print("Fetching documents from Pinecone...")
docs = vectorstore.similarity_search("", k=10000)  # Fetch a large number of documents
texts = [doc.page_content for doc in docs]

print(f"Fetched {len(texts)} documents.")

# # Set up BM25 retriever
# print("Setting up BM25 retriever...")
# bm25 = BM25Okapi(texts)
# bm25_retriever = BM25Retriever(bm25=bm25, search_type="mmr")

# # Function to retrieve and print results
# def retrieve_and_print(query, k=5):
#     print(f"\nQuery: {query}")
#     results = bm25_retriever.get_relevant_documents(query)[:k]
#     for i, doc in enumerate(results, 1):
#         print(f"{i}. {doc.page_content[:200]}...")  # Print first 200 characters of each result

# # Example queries
# queries = [
#     "What is machine learning?",
#     "Explain the concept of neural networks",
#     "How does natural language processing work?",
# ]

# # Retrieve and print results for each query
# for query in queries:
#     retrieve_and_print(query)

# print("\nBM25 retrieval demo completed.")

Fetching documents from Pinecone...
Fetched 1916 documents.


In [70]:
docs

[Document(metadata={'general_id': '96d5199eb522caa9123d0f3af780d6c9'}, page_content='Click '),
 Document(metadata={'general_id': '660e1fc32861c88e02aab4ba72723f7a'}, page_content='eda'),
 Document(metadata={'general_id': '3752c384afd12d18b9314bfac478f8c5'}, page_content='Papers in peer reviewed journals 2018 Hartmann, W. and D. Klapper (2018). Super Bowl Ads. Marketing Science, Vol. 37, No. 1, 78-96. 2016 Guhl, D., Winkler von Mohrenfels, H., Abshagen, J. and Klapper, D. (2016). Measuring Marketing Success: Estimating the Effect of Social Media and TV Advertising on Brand Attention (Research Note). Marketing ZFP, Vol. 38, No. 1, 44-54. Zenetti, G. and Klapper, D. (2016). Advertising Effects Under Consumer Heterogeneity - The Moderating Role of Brand Experience, Advertising '),
 Document(metadata={'general_id': 'fe6ed8976dcec827f6934a9ce9a1a04d'}, page_content='codes)'),
 Document(metadata={'general_id': 'bf089ba737ce5fde94d5111009fda3e7'}, page_content='codes)'),
 Document(metadata={'g