In [1]:
import pandas as pd

passages = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/passages.parquet/part.0.parquet")
test = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-bioasq/data/test.parquet/part.0.parquet")



In [2]:
test.head()

Unnamed: 0_level_0,question,answer,relevant_passage_ids
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Is Hirschsprung disease a mendelian or a multi...,"Coding sequence mutations in RET, GDNF, EDNRB,...","[20598273, 6650562, 15829955, 15617541, 230011..."
1,List signaling molecules (ligands) that intera...,The 7 known EGFR ligands are: epidermal growt...,"[23821377, 24323361, 23382875, 22247333, 23787..."
2,Is the protein Papilin secreted?,"Yes, papilin is a secreted protein","[21784067, 19297413, 15094122, 7515725, 332004..."
3,Are long non coding RNAs spliced?,Long non coding RNAs appear to be spliced thro...,"[22955974, 21622663, 22707570, 22955988, 24285..."
4,Is RANKL secreted from the cells?,Receptor activator of nuclear factor κB ligand...,"[22867712, 23827649, 21618594, 23835909, 24265..."


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

questions = test["question"].tolist()

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(questions)

n_clusters = 3

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

test['cluster'] = kmeans.labels_

# Print a few examples
for i in range(n_clusters):
    cluster_questions = test[test['cluster'] == i]['question'].tolist()
    print(f"\nCluster {i}:")
    print(cluster_questions[:3])  # Print first 3 questions in each cluster


Cluster 0:
['Is Hirschsprung disease a mendelian or a multifactorial disorder?', 'Is the protein Papilin secreted?', 'Are long non coding RNAs spliced?']

Cluster 1:
['List signaling molecules (ligands) that interact with the receptor EGFR?', 'List the human genes encoding for the dishevelled proteins?', 'List the endoscopic diagnoses that have been reported in children with autism']

Cluster 2:
['Is Alu hypomethylation associated with breast cancer?', 'List Hemolytic Uremic Syndrome Triad.', 'Is irritable bowel syndrome more common in women with endometriosis?']


In [4]:
passages.reset_index(inplace=True)
passages.head()

Unnamed: 0,id,passage
0,9797,New data on viruses isolated from patients wit...
1,11906,We describe an improved method for detecting d...
2,16083,We have studied the effects of curare on respo...
3,23188,Kinetic and electrophoretic properties of 230-...
4,23469,Male Wistar specific-pathogen-free rats aged 2...


In [5]:
test = test[test['relevant_passage_ids'].apply(lambda x: len(x) > 0 and all(id != 0 for id in x))]

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from scipy.stats import pearsonr
from tqdm.notebook import tqdm

def cluster_questions(questions, n_clusters=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(questions)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(X)

def calculate_dimension_correlations(question, answer, relevant_passages, vectorizer):
    question_vec = vectorizer.transform([question])
    passage_vecs = vectorizer.transform(relevant_passages)
    
    similarities = cosine_similarity(question_vec, passage_vecs)[0]
    
    correlations = []
    for dim in range(passage_vecs.shape[1]):
        dim_values = passage_vecs[:, dim].toarray().flatten()
        if np.std(dim_values) == 0:  # Skip constant dimensions
            correlations.append(0)
        else:
            corr, _ = pearsonr(dim_values, similarities)
            correlations.append(corr if not np.isnan(corr) else 0)
    
    return correlations

def calculate_cluster_weights(cluster_data, passages, vectorizer):
    all_correlations = []
    
    for _, row in tqdm(cluster_data.iterrows(), total=len(cluster_data), desc="Calculating correlations"):
        question = row['question']
        answer = row['answer']
        relevant_ids = eval(row['relevant_passage_ids'])
        relevant_passages = passages[passages['id'].isin(relevant_ids)]['passage']
        
        if len(relevant_passages) < 2:
            continue  # Skip if there's not enough data
        
        correlations = calculate_dimension_correlations(question, answer, relevant_passages, vectorizer)
        all_correlations.append(correlations)
    
    mean_correlations = np.mean(all_correlations, axis=0)
    weights = (mean_correlations - np.min(mean_correlations)) / (np.max(mean_correlations) - np.min(mean_correlations))
    return weights

def weighted_cosine_search(question, passages, weights, vectorizer, top_k=5):
    question_vec = vectorizer.transform([question])
    passage_vecs = vectorizer.transform(passages['passage'])
    
    weighted_passage_vecs = passage_vecs.multiply(weights)
    
    similarities = cosine_similarity(question_vec, weighted_passage_vecs)[0]
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    return passages['id'].iloc[top_indices].tolist()

def evaluate_search(predicted_ids, true_ids):
    correct = 0
    total = 0
    for pred, true in zip(predicted_ids, true_ids):
        correct += len(set(pred) & set(true))
        total += len(true)
    return correct / total

# Main process
n_clusters = 5
train_ratio = 0.8

print("Clustering questions...")
test['cluster'] = cluster_questions(test['question'], n_clusters)

print("Calculating weights for each cluster...")
cluster_weights = {}
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(passages['passage'])

for cluster in tqdm(range(n_clusters), desc="Processing clusters"):
    cluster_data = test[test['cluster'] == cluster]
    train_size = int(len(cluster_data) * train_ratio)
    
    train_data = cluster_data.iloc[:train_size]
    weights = calculate_cluster_weights(train_data, passages, vectorizer)
    cluster_weights[cluster] = weights

print("Performing weighted cosine search on test data...")
test_data = test.iloc[int(len(test) * train_ratio):]
weighted_results = []

for _, row in tqdm(test_data.iterrows(), total=len(test_data), desc="Weighted search"):
    question = row['question']
    cluster = row['cluster']
    weights = cluster_weights[cluster]
    result = weighted_cosine_search(question, passages, weights, vectorizer)
    weighted_results.append(result)

print("Performing normal cosine search on test data...")
normal_results = []
for question in tqdm(test_data['question'], desc="Normal search"):
    result = weighted_cosine_search(question, passages, np.ones(vectorizer.get_feature_names_out().shape[0]), vectorizer)
    normal_results.append(result)

print("Evaluating results...")
true_ids = test_data['relevant_passage_ids'].apply(eval).tolist()
weighted_accuracy = evaluate_search(weighted_results, true_ids)
normal_accuracy = evaluate_search(normal_results, true_ids)

print(f"Weighted Cosine Search Accuracy: {weighted_accuracy:.4f}")
print(f"Normal Cosine Search Accuracy: {normal_accuracy:.4f}")

Clustering questions...
Calculating weights for each cluster...


Processing clusters:   0%|          | 0/5 [00:00<?, ?it/s]

Calculating correlations:   0%|          | 0/2829 [00:00<?, ?it/s]

  corr, _ = pearsonr(dim_values, similarities)


Calculating correlations:   0%|          | 0/280 [00:00<?, ?it/s]

Calculating correlations:   0%|          | 0/352 [00:00<?, ?it/s]

Calculating correlations:   0%|          | 0/193 [00:00<?, ?it/s]

Calculating correlations:   0%|          | 0/119 [00:00<?, ?it/s]

Performing weighted cosine search on test data...


Weighted search:   0%|          | 0/944 [00:00<?, ?it/s]

Performing normal cosine search on test data...


Normal search:   0%|          | 0/944 [00:00<?, ?it/s]

Evaluating results...
Weighted Cosine Search Accuracy: 0.2420
Normal Cosine Search Accuracy: 0.2427


In [None]:
passages.columns