In [1]:
import pandas as pd
import minsearch_xtra as minsearch
import os
# Read the OpenAI key from the environment; raises KeyError when the variable is unset.
# NOTE(review): the key is not referenced by any cell visible here - confirm it is still needed.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
from openai import OpenAI
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

# Retrieval Evaluation

In [2]:
# Sentence-transformers model used to embed documents here and queries in later cells.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

# Load the dataset
df = pd.read_csv("../data/stoic_zen_document.csv")
# Copy the DataFrame index into a leading 'id' column; evaluation matches results on this id.
df.insert(0, 'id', df.index)

# One dict per row; the embedding loop below adds a vector to each dict in place.
documents = df.to_dict(orient="records")
print("length of the documents:", len(documents))

# Embed "question + ' ' + answer" for every document, one encode() call per document.
for doc in tqdm(documents):
    question = doc['question']
    answer = doc['answer']
    qa = question + ' ' + answer

    doc['question_answer_vector'] = model.encode(qa)

# Index with text fields, the embedding field, and keyword fields
# ('ideology' is the key the search helpers pass in filter_dict).
index = minsearch.Index(
        text_fields=["category", "question", "answer"],
        vector_fields=['question_answer_vector'],
        keyword_fields=['id', "ideology"]
)

# Fit the index (left as the last expression so the cell displays the returned object)
index.fit(documents)

  return torch._C._cuda_getDeviceCount() > 0


length of the documents: 820


  0%|          | 0/820 [00:00<?, ?it/s]

<minsearch_xtra.Index at 0x7f5635bf8310>

In [3]:
# Ground-truth queries for retrieval evaluation. The evaluation cells read the
# 'id', 'question' and 'ideology' keys of each record.
df_questions = pd.read_csv("../data/ground_truth_retrieval.csv")
ground_truth = df_questions.to_dict(orient='records')

In [4]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans (one per returned result, in rank order)
            where ``True`` marks the expected document.

    Returns:
        A float between 0 and 1. Returns ``0.0`` for an empty
        ``relevance_total`` instead of raising ``ZeroDivisionError``.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    # Keep the membership test so the notion of a "hit" is unchanged.
    hits = sum(1 for line in relevance_total if True in line)
    return hits / n_queries

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    Each query contributes ``1 / rank`` of its FIRST relevant result
    (rank is 1-based) and ``0`` when no result is relevant.

    Args:
        relevance_total: sequence with one entry per query; each entry is a
            sequence of booleans (one per returned result, in rank order).

    Returns:
        A float between 0 and 1. Returns ``0.0`` for an empty
        ``relevance_total`` instead of raising ``ZeroDivisionError``.
    """
    n_queries = len(relevance_total)
    if n_queries == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # line with several True values would score more than 1.
                break

    return total_score / n_queries

def minsearch_search(query, ideology):
    """Baseline text search: top 5 documents for ``query`` within one ideology.

    Uses fixed boosts (question 3.0, category 0.5) and the notebook-level
    ``index``; results are filtered to documents whose 'ideology' matches.
    """
    return index.search(
        query=query,
        filter_dict={'ideology': ideology},
        boost_dict={'question': 3.0, 'category': 0.5},
        num_results=5,
    )

def evaluate(ground_truth, search_function):
    """Score ``search_function`` against the ground-truth records.

    For each record the search function is called with the whole record, and
    every returned document is flagged relevant when its 'id' equals the
    record's 'id'.

    Returns:
        dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_for(record):
        # One boolean per returned document, in the order they were returned.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [5]:
# Baseline: text search with the hard-coded boosts, scored on every ground-truth record.
evaluate(ground_truth, lambda q: minsearch_search(q['question'],q['ideology']))

  0%|          | 0/4100 [00:00<?, ?it/s]

{'hit_rate': 0.8043902439024391, 'mrr': 0.6790121951219511}

# Finding the best parameters

In [6]:
# The first 200 ground-truth rows form the validation split used for boost tuning.
df_validation = df_questions[:200]
gt_val = df_validation.to_dict(orient='records')

# Remaining rows.
# NOTE(review): df_test is not referenced by any cell visible here, and the split is
# positional (no shuffle) - confirm both are intended.
df_test = df_questions[200:]

In [7]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that MAXIMIZES ``objective_function``.

    Args:
        param_ranges: mapping of parameter name -> ``(min_val, max_val)``.
            When both bounds are ``int`` the value is drawn with ``randint``
            (both ends inclusive); otherwise it is drawn with ``uniform``.
        objective_function: callable that takes the sampled parameter dict
            and returns a score where higher is better.
        n_iterations: number of random parameter sets to evaluate.
        seed: optional seed for a private random generator, so a search can
            be reproduced. With ``None`` (the default) values are drawn from
            the global ``random`` module, exactly as before.

    Returns:
        ``(best_params, best_score)``. When ``n_iterations`` is 0 or negative
        nothing is evaluated and ``(None, float('-inf'))`` is returned.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing: any real score beats -inf

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Strict '>' keeps the earliest parameter set when scores tie.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [8]:
def minsearch_search(query, ideology, boost=None):
    """Text search returning the top 5 documents for ``query`` within one ideology.

    ``boost`` maps text-field names to weights; ``None`` means no boosts
    (an empty dict is passed to the index).
    """
    boost_dict = {} if boost is None else boost

    return index.search(
        query=query,
        filter_dict={'ideology': ideology},
        boost_dict=boost_dict,
        num_results=5,
    )

In [10]:
# Search space for the random search: a (min, max) float range per boosted text field.
param_ranges = {
    'question': (0.0, 5.0),
    'category': (0.0, 5.0)
}

def objective(boost_params):
    """Return the validation-set MRR obtained with the given boost weights."""
    metrics = evaluate(
        gt_val,
        lambda q: minsearch_search(q['question'], q['ideology'], boost_params),
    )
    return metrics['mrr']

# Random search over the boost ranges (20 samples), keeping the boosts with the highest validation MRR.
# NOTE(review): no random seed is set, so the reported best parameters change between runs.
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

({'question': 0.49765504059746224, 'category': 0.16098024633006858}, 0.812)

In [11]:
def minsearch_improved(query,ideology):
    """Text search with tuned boosts, returning the top 10 documents for one ideology.

    Boosts are fixed at question 1.22 and category 0.36.
    """
    tuned_boost = {'question': 1.22, 'category': 0.36}
    ideology_filter = {'ideology': ideology}

    return index.search(
        query=query,
        filter_dict=ideology_filter,
        boost_dict=tuned_boost,
        num_results=10,
    )

# Score the tuned-boost search on every ground-truth record.
# NOTE(review): minsearch_improved requests 10 results while the baseline search requested 5,
# so part of the hit-rate gain comes from the larger cutoff. ground_truth also contains the
# 200 validation rows used for tuning.
evaluate(ground_truth, lambda q: minsearch_improved(q['question'], q['ideology']))

  0%|          | 0/4100 [00:00<?, ?it/s]

{'hit_rate': 0.9185365853658537, 'mrr': 0.7592213511420829}

# Check MiniLM vector retrieval

In [12]:
def minsearch_search(query, ideology, vector_query=None):
    """Search with the tuned boosts, optionally passing a query embedding.

    Returns the top 10 documents for ``query`` restricted to ``ideology``;
    ``vector_query`` is forwarded to the index unchanged (``None`` by default).
    """
    search_kwargs = {
        'query': query,
        'filter_dict': {'ideology': ideology},
        'boost_dict': {'question': 1.22, 'category': 0.36},
        'num_results': 10,
        'vector_query': vector_query,
    }
    return index.search(**search_kwargs)

In [13]:
# Same text query as before, plus its MiniLM embedding passed as vector_query (one encode() call per record).
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['ideology'],model.encode(q['question'])))

  0%|          | 0/4100 [00:00<?, ?it/s]

{'hit_rate': 0.96, 'mrr': 0.8275280681378253}

# TfidfVectorizer hyperparameter optimizations

In [16]:
from sklearn.model_selection import ParameterGrid
import pandas as pd

# Grid search over the vectorizer parameters passed to the text index.
# NOTE(review): this cell rebinds the notebook-level names `df`, `documents`, `index` and
# `minsearch_search`. After it runs, `index` is the text-only index built for the LAST grid
# combination (not the best one) and the vector index built earlier is no longer reachable
# through `index`.

# Load the dataset (fresh copy: these records carry no 'question_answer_vector')
df = pd.read_csv("../data/stoic_zen_document.csv")
df.insert(0, 'id', df.index)

documents = df.to_dict(orient="records")
print("length of the documents:", len(documents))

# Define the hyperparameter grid (2 * 2 * 3 * 2 * 2 = 48 combinations)
param_grid = {
    'ngram_range': [(1, 1), (1, 2)],  # Unigrams only, or unigrams + bigrams (no trigrams)
    'stop_words': [None, 'english'],           # Include or exclude common stop words
    'max_df': [0.85, 0.9, 0.95],               # Maximum document frequency threshold
    'min_df': [1, 2],                        # Minimum document frequency threshold
    'max_features': [None, 5000]         # Maximum number of features (words) to consider
}

# Initialize best score and params (MRR is never negative, so 0 is a safe floor)
best_score = 0
best_params = None

# Iterate over each combination of hyperparameters
for vectorizer_params in ParameterGrid(param_grid):
    #print(f"Testing with params: {vectorizer_params}")

    # Initialize a text-only index (no vector_fields) with the current vectorizer params
    index = minsearch.Index(
        text_fields=["category", "question", "answer"],
        keyword_fields=['id', "ideology"],
        vectorizer_params=vectorizer_params
    )

    # Fit the index
    index.fit(documents)

    # Define a search function using the current index.
    # `index` is looked up when the function is called, so it must be used inside this iteration.
    def minsearch_search(query, ideology):
        boost = {
            'question': 1.22,
            'category': 0.36
        }
        results = index.search(
            query=query,
            filter_dict={'ideology': ideology},
            boost_dict=boost,
            num_results=10
        )
        return results

    # evaluate() returns a dict of metrics; MRR over the full ground truth is the selection score
    score = evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['ideology']))["mrr"]
    
    #print(f"Score: {score}")
    
    # Update best score and params if necessary (strict '>' keeps the first of tied combinations)
    if score > best_score:
        best_score = score
        best_params = vectorizer_params

print(f"Best score: {best_score} with params: {best_params}")

length of the documents: 820


  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

  0%|          | 0/4100 [00:00<?, ?it/s]

Best score: 0.7625507162214483 with params: {'max_df': 0.85, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'stop_words': 'english'}
