In [None]:
# Install the `beir` package, which provides the `beir.*` imports used in the next cell.
!pip install beir

In [1]:
import pandas as pd
import uuid
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from beir.retrieval import models
import yaml
import os
import json
import datetime
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import uuid
import random
from tqdm.notebook import tqdm

# Register tqdm with pandas so that `Series.progress_apply` (used in
# `EvaluateSBERTModels.get_qrels`) is available.
tqdm.pandas()

class EvaluateSBERTModels:
    
    def __init__(self, config_file_path):
        '''
        Load the YAML configuration and create the output directories for this run.

        Input:
            config_file_path: path to the YAML configuration file.

        Attributes set:
            config: parsed configuration.
            target_dir: base output directory; falls back to the current working
                directory when `target_dir` is missing, None or empty in the config.
            eval_input_base_path: <target_dir>/sbert_eval_<timestamp>/evaluator_input
            results_base_path: <target_dir>/sbert_eval_<timestamp>/results
        '''

        with open(config_file_path) as f:
            # NOTE(review): FullLoader is kept as-is; prefer yaml.safe_load if the
            # config file can ever come from an untrusted source.
            config = yaml.load(f, Loader=yaml.FullLoader)

        self.config = config

        self.target_dir = self.config.get('target_dir')

        if not self.target_dir:
            self.target_dir = os.getcwd()

        # Format the timestamp exactly once so that both sub-directories are
        # guaranteed to live under the same run directory, even when the clock
        # ticks over to the next second while this constructor is running.
        run_dir = os.path.join(
            self.target_dir,
            'sbert_eval_{}'.format(datetime.datetime.now().strftime("%Y%m%d%H%M%S")),
        )

        # evaluator inputs path
        self.eval_input_base_path = os.path.join(run_dir, 'evaluator_input')
        os.makedirs(self.eval_input_base_path, exist_ok=True)

        # create results base path
        self.results_base_path = os.path.join(run_dir, 'results')
        os.makedirs(self.results_base_path, exist_ok=True)
            
    def assign_ids(self, df):
        '''
        Function to assign unique ids for questions and answers.
        '''
        df['q_id'] = [str(uuid.uuid4()) for _ in range(len(df))]
        df['ans_id'] = [str(uuid.uuid4()) for _ in range(len(df))]
        
        return df
    
    def compute_threshold(self, df, threshold_sample_fraction = 0.30):
        '''
        Dynamically compute threshold value based on embeddings.
        
        Formula:
        
        threshold = min(cosine_sim) + ((max(cosine_sim) - min(cosine_sim))/3)
        
        Inputs:
            df: input dataframe with embeddings.
            question_col: column name of the questions.
            answer_col: column name of the answers.
            threshold_sample_fraction: fraction of datapoints to consider for computing threshold.
        Outputs:
            threshold: computed threshold.
        '''
        
        frac_df = df.sample(frac=threshold_sample_fraction)
        
        frac_df['cosine_sim'] = frac_df.apply(lambda row: self.compute_cosine_similarity(row['question_embeddings'], row['answer_embeddings']), axis=1)
        
        similarity_scores = frac_df['cosine_sim'].tolist()
        
        min_score = min(similarity_scores)
        max_score = max(similarity_scores)
        
        print('Min: {}'.format(min_score))
        print('Max: {}'.format(max_score))
        
        threshold = min_score + ((max_score - min_score)/3)
        
        print('Threshold: {}'.format(threshold))
        
        return threshold
        

    def compute_cosine_similarity(self, embeddings1, embeddings2):
        '''
        Return the cosine similarity between two embeddings as a single value.

        Each embedding is reshaped into a one-row matrix before being passed to
        `cosine_similarity`; the only cell of the resulting matrix is returned.
        '''
        first_row = embeddings1.reshape(1, -1)
        second_row = embeddings2.reshape(1, -1)

        similarity_matrix = cosine_similarity(first_row, second_row)

        return similarity_matrix[0, 0]

    def get_qrels(self, df, model_id, question_column, answer_column, negative_samples=False, negative_sample_size=10):
        '''
        Function that is used to generate qrels for evaluator input.

        Every row contributes one positive line (its own question/answer pair,
        score 1). When `negative_samples` is set, each row may additionally
        contribute one negative line (score 0) that points at the answer of a
        different question.

        Input:
            df: input dataframe with `q_id` and `ans_id` columns. When
                `negative_samples` is set, `question_embeddings` and
                `answer_embeddings` columns are added to it in place.
            model_id: model id for sentence transformers (only loaded when
                `negative_samples` is set).
            question_column: name of the column containing questions.
            answer_column: name of the column containing answers.
            negative_samples: flag to include negative samples.
            negative_sample_size: maximum number of other rows drawn as the
                candidate pool for each negative sample.

        Output:
            qrels: list of tab-separated lines (header first) for input to the evaluator.
        '''
        qrels = ['query-id\tcorpus-id\tscore']

        threshold = 0.0

        if negative_samples:
            # The model is only needed to embed text for negative sampling, so
            # it is not loaded at all when negative sampling is disabled.
            model = SentenceTransformer(model_id)

            # Compute embeddings for questions and answers
            df['question_embeddings'] = df[question_column].progress_apply(lambda x: model.encode(x))
            df['answer_embeddings'] = df[answer_column].progress_apply(lambda x: model.encode(x))

            # dynamically compute threshold
            threshold = self.compute_threshold(df = df, threshold_sample_fraction = self.config['threshold_sample_fraction'])

        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            # Positive sample (answer to the question)
            qrels.append('{}\t{}\t1'.format(row['q_id'], row['ans_id']))

            if negative_samples:
                # Negative sample (random answer from another question)
                negative = self._pick_negative(df, row, threshold, negative_sample_size)
                if negative is not None:
                    qrels.append('{}\t{}\t0'.format(row['q_id'], negative['ans_id']))

        return qrels

    def _pick_negative(self, df, row, threshold, negative_sample_size):
        '''
        Pick a negative answer row for `row`, or None when no other row exists.

        A random pool of at most `negative_sample_size` other rows is drawn and
        scanned in its (random) order; the first row whose answer similarity is
        below `threshold` is returned. If no pool member is below the threshold,
        the least similar one is returned instead, so the search always ends.
        '''
        other_rows = df[df['q_id'] != row['q_id']]

        # Asking sample() for more rows than exist raises, so cap the pool size.
        pool_size = min(negative_sample_size, len(other_rows))
        if pool_size <= 0:
            return None

        # sample() without replacement returns the rows in random order.
        candidates = other_rows.sample(pool_size)

        fallback = None
        fallback_similarity = None

        for _, candidate in candidates.iterrows():
            similarity = self.compute_cosine_similarity(row['answer_embeddings'], candidate['answer_embeddings'])
            if similarity < threshold:
                return candidate
            if fallback is None or similarity < fallback_similarity:
                fallback = candidate
                fallback_similarity = similarity

        return fallback
    
    def create_data_for_evaluator(self):
        '''
        Function to convert input data to BEIR data loader compatible format.

        Reads the CSV at `config['data_path']`, assigns ids to every row and
        writes three files into `self.eval_input_base_path`:
            corpus.jsonl: one JSON object per answer (`_id`, `text`, `title`).
            queries.jsonl: one JSON object per question (`_id`, `text`).
            qrels.tsv: the lines produced by `get_qrels`.
        None of the files ends with a trailing newline.
        '''

        assert len(self.config['data_path'])>0, "Data path cannot be empty."

        question_column = self.config['question_column']
        answer_column = self.config['answer_column']

        # load data
        data_df = pd.read_csv(self.config['data_path'])

        # Assign unique ids to questions and answers
        data_df = self.assign_ids(data_df)

        corpus = []
        queries = []

        for _, item in data_df.iterrows():
            corpus.append({'_id': item['ans_id'], 'text': item[answer_column], 'title': ""})
            queries.append({'_id': item['q_id'], 'text': item[question_column]})

        negative_sampler_model_id = self.config['negative_sampler_model_id']

        if negative_sampler_model_id == "random":
            # select random model_id for generating embeddings
            negative_sampler_model_id = random.choice(self.config['models'])

        qrels = self.get_qrels(
            df=data_df,
            model_id=negative_sampler_model_id,
            question_column=question_column,
            answer_column=answer_column,
            negative_samples=self.config['negative_samples'],
            negative_sample_size=self.config['negative_sample_size'],
        )

        # write corpus and queries; joining with '\n' puts a newline between
        # records but not after the last one, and each file is sized by its own
        # record list rather than by the corpus length.
        for file_name, records in (('corpus.jsonl', corpus), ('queries.jsonl', queries)):
            with open(os.path.join(self.eval_input_base_path, file_name), 'w') as f:
                f.write('\n'.join(json.dumps(record) for record in records))

        # write qrels
        with open(os.path.join(self.eval_input_base_path, 'qrels.tsv'), 'w') as f:
            f.write('\n'.join(qrels))
            
    
    def load_data_for_evaluator(self):
        '''
        Read corpus.jsonl, queries.jsonl and qrels.tsv from
        `self.eval_input_base_path` through `GenericDataLoader.load_custom()`
        and return the resulting (corpus, queries, qrels) triple.
        '''
        input_dir = self.eval_input_base_path

        loader = GenericDataLoader(
            corpus_file=os.path.join(input_dir, 'corpus.jsonl'),
            query_file=os.path.join(input_dir, 'queries.jsonl'),
            qrels_file=os.path.join(input_dir, 'qrels.tsv'),
        )

        corpus, queries, qrels = loader.load_custom()

        return corpus, queries, qrels
        
    
    def evaluate_model(self, model, corpus, queries, qrels):
        '''
        Function to evaluate a SBERT model.

        Input:
            model: model_path or model id.
            corpus: corpus as returned by `load_data_for_evaluator`.
            queries: queries as returned by `load_data_for_evaluator`.
            qrels: relevance judgements as returned by `load_data_for_evaluator`.

        Config keys read (both optional):
            batch_size: batch size for input; 64 when missing or None.
            score_function: distance measure ('dot' or 'cos_sim'); "dot" when missing or None.

        Output:
            ndcg: Normalized Discounted cumulative gain scores for a given model.
            _map: Mean average precision scores for a given model.
            recall: Recall scores for a given model.
            precision: Precision scores for a given model.
        '''

        # .get() lets the defaults below apply when a key is left out of the
        # config, not only when it is present with a null value.
        batch_size = self.config.get('batch_size')
        if batch_size is None:
            batch_size = 64

        score_function = self.config.get('score_function')
        if score_function is None:
            score_function = "dot"

        dense_model = DRES(models.SentenceBERT(model), batch_size=batch_size)
        retriever = EvaluateRetrieval(dense_model, score_function=score_function)
        results = retriever.retrieve(corpus, queries)

        ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

        return ndcg, _map, recall, precision


    def run_evaluator(self):
        '''
        End-to-end driver: build the evaluator input files, load them back,
        score every model listed in `config['models']` and dump one JSON file
        per metric into `self.results_base_path`.
        '''

        assert len(self.config['models'])>0, "Evaluator requires 1 or more models."

        # One list of per-model score dicts per output file, in write order.
        results_by_file = {
            'NDGC.json': [],
            'MAP.json': [],
            'RECALL.json': [],
            'PRECISION.json': [],
        }

        # Prepare the evaluator input on disk, then read it back.
        self.create_data_for_evaluator()
        corpus, queries, qrels = self.load_data_for_evaluator()

        # Score each model and tag every metric dict with the model it belongs to.
        for model in self.config['models']:
            ndcg, _map, recall, precision = self.evaluate_model(model, corpus, queries, qrels)

            scored = (
                ('NDGC.json', ndcg),
                ('MAP.json', _map),
                ('RECALL.json', recall),
                ('PRECISION.json', precision),
            )
            for file_name, metric_scores in scored:
                metric_scores['Model'] = model
                results_by_file[file_name].append(metric_scores)

        for file_name, metric_results in results_by_file.items():
            with open(os.path.join(self.results_base_path, file_name), 'w') as f:
                f.write(json.dumps(metric_results))
        

  from tqdm.autonotebook import tqdm


In [2]:
# Build the evaluator from config.yaml; the constructor also creates the
# timestamped output directories.
sbert_evaluator = EvaluateSBERTModels(config_file_path='config.yaml')

In [3]:
# Display the configuration that was loaded from the YAML file.
sbert_evaluator.config

{'data_path': 'Mental_Health_FAQ.csv',
 'target_dir': '/Users/satishsilveri/Documents/ML/Search/BEIR_RES',
 'models': ['thenlper/gte-large',
  'BAAI/bge-large-en-v1.5',
  'intfloat/e5-large-v2'],
 'batch_size': 100,
 'score_function': 'cos_sim',
 'question_column': 'Questions',
 'answer_column': 'Answers',
 'negative_samples': True,
 'negative_sampler_model_id': 'random',
 'negative_sample_size': 10,
 'threshold_sample_fraction': 0.5}

In [4]:
# Run the full pipeline: write evaluator inputs, evaluate each configured
# model and write the per-metric JSON result files.
sbert_evaluator.run_evaluator()

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Min: 0.557099461555481
Max: 0.8427419066429138
Threshold: 0.6523136099179586


  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/98 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]