In [61]:
from datasets import load_dataset
import os
from dotenv import load_dotenv
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import openai
from openai import APIError
import os
import json
import re
import numpy as np
from sklearn.cluster import KMeans
from transformers import AutoModel
from sentence_transformers import SentenceTransformer, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import pandas as pd
import torch
import numpy as np
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModel
from collections import defaultdict
import pytrec_eval
import json
from sentence_transformers.evaluation import InformationRetrievalEvaluator
import CustomInformationRetrievalEvaluator
import importlib
# from sklearn.metrics.pairwise import cosine_similarity


# Re-import the local evaluator module so that edits made to its source file
# are picked up without restarting the kernel.
importlib.reload(CustomInformationRetrievalEvaluator)


# Pull the settings stored in the .env file into the environment.
load_dotenv()

# Hand the OpenAI key to the client; it is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY


In [67]:
def save_jobs(data_pair, filename):
    """Write one embedding request per entry of ``data_pair`` as JSON Lines.

    Every line of the output file is a JSON object with the keys ``model``
    (always ``"text-embedding-3-small"``), ``metadata`` (``{"id": <key>}``)
    and ``input`` (the value stored under that key).

    Args:
        data_pair: Mapping of identifier -> text to embed. Keys and values
            must be JSON-serialisable.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the directory or the file cannot be created.
        TypeError: If a key or value cannot be serialised by ``json.dumps``.
    """
    # Create the target folder on demand so a fresh checkout does not fail
    # with FileNotFoundError before the first request is written.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # An explicit encoding keeps the file independent of the platform locale.
    with open(filename, "w", encoding="utf-8") as f:
        # Stream the requests straight to disk instead of materialising an
        # intermediate list of every job first.
        for indx, text in data_pair.items():
            job = {
                "model": "text-embedding-3-small",
                # "response_format": "json", # TODO check
                # "temperature": 0,
                "metadata": {"id": indx},
                "input": text,
            }
            f.write(json.dumps(job) + "\n")



# python evaluation-pipetine-test/api_request_parallel_processor.py   --requests_filepath evaluation-pipetine-test/processed_datasets/queries   --save_filepath evaluation-pipetine-test/processed_datasets/example_requests_to_parallel_process_results.jsonl   --request_url https://api.openai.com/v1/embeddings   --max_requests_per_minute 1500   --max_tokens_per_minute 6250000   --token_encoding_name cl100k_base   --max_attempts 5   --logging_level 20 


In [68]:
def get_data_for_evaluation(dataset_name, max_rows=6):
    """Build the queries / corpus / relevance mappings from a parquet file.

    The parquet file is read into a DataFrame. Each processed row contributes
    its ``context`` value as one corpus document and every entry of its
    ``queries`` value as a separate query relevant to that document.
    As a side effect the collected queries are written to
    ``processed_datasets/queries`` through ``save_jobs``.

    Args:
        dataset_name: Path of the parquet file to read.
        max_rows: Iteration stops at the first row whose index label is
            ``>= max_rows``. The default of 6 keeps the previously hard-coded
            limit; pass ``None`` to process every row.

    Returns:
        A tuple ``(queries, corpus, relevant_docs)`` where ``queries`` maps a
        running query id (starting at 1) to the stripped query text,
        ``corpus`` maps a row's index label to its ``context`` and
        ``relevant_docs`` maps each query id to a set holding the index label
        of the row the query came from.
    """
    loaded_table = pq.read_table(dataset_name)
    df = loaded_table.to_pandas()
    corpus = {}
    queries = {}
    relevant_docs = {}
    query_idx = 1
    for idx, row in df.iterrows():
        # The cut-off compares the index label, which equals the row position
        # only when the frame carries a default 0..n-1 index.
        if max_rows is not None and idx >= max_rows:
            break
        corpus[idx] = row['context']
        for query in row['queries']:
            queries[query_idx] = query.strip()
            # Query ids are never reused, so every query starts with a fresh
            # set containing just the row it was generated from.
            relevant_docs[query_idx] = {idx}
            query_idx += 1
    save_jobs(queries, "processed_datasets/queries")
    return queries, corpus, relevant_docs

def load_sentence_tranformer_from_transformer(model_name):
    """Wrap a transformer checkpoint in a mean-pooling ``SentenceTransformer``.

    Args:
        model_name: Model identifier or path passed to ``models.Transformer``
            as ``model_name_or_path``.

    Returns:
        A ``SentenceTransformer`` made of the transformer module followed by
        a pooling module with ``pooling_mode_mean_tokens=True``.
    """
    word_embedding_model = models.Transformer(model_name_or_path=model_name)
    # Read the embedding width from the module that is already loaded rather
    # than loading the same checkpoint a second time just for its config.
    pooling_model = models.Pooling(
        word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
    )
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])

def get_model_or_model_name(model_name, is_openAI):
    """Return the ``(model_name, model)`` pair used by ``evaluate``.

    Exactly one element of the pair is populated:

    * ``is_openAI`` truthy  -> ``(model_name, None)``; nothing is loaded.
    * ``is_openAI`` falsy   -> ``(None, model)`` where ``model`` comes from
      ``load_sentence_tranformer_from_transformer(model_name)``.
    """
    if is_openAI:
        return model_name, None
    return None, load_sentence_tranformer_from_transformer(model_name)

def evaluate(model_name, dataset_name, is_openAI):
    """Run the custom information-retrieval evaluator for one model/dataset.

    Args:
        model_name: Name of the model to evaluate.
        dataset_name: Path of the parquet dataset, forwarded to
            ``get_data_for_evaluation``.
        is_openAI: Forwarded to ``get_model_or_model_name`` to decide whether
            a local model object or only the model name reaches the evaluator.

    Returns:
        Whatever the evaluator call returns.
    """
    queries, corpus, relevant_docs = get_data_for_evaluation(dataset_name)

    ir_evaluator = CustomInformationRetrievalEvaluator.InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"{model_name}-{dataset_name}-evlatuation",
        write_csv=True,
    )

    # Only one of the two is non-None, depending on is_openAI.
    openai_model_name, local_model = get_model_or_model_name(model_name, is_openAI)

    return ir_evaluator(local_model, openAI_model=openai_model_name)




datasets = ["datasets/dataset_processed.parquet"]

# Model name -> flag passed to evaluate() as is_openAI.
models_ = {
#  "google-bert/bert-base-multilingual-cased": False
 "text-embedding-3-small" : True
}

# Evaluate every model on every dataset. Pairing the two collections with
# zip() would silently skip entries as soon as their lengths differ.
for dataset_name in datasets:
    for model_name, is_openAI in models_.items():
        print(dataset_name)
        print(model_name)
        print(is_openAI)
        print(evaluate(model_name=model_name, dataset_name=dataset_name, is_openAI=is_openAI))




datasets/dataset_processed.parquet
text-embedding-3-small
True
{'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@1': 0.8620689655172413, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@3': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@5': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@10': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@1': np.float64(0.8620689655172413), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@3': np.float64(0.3333333333333334), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@5': np.float64(0.20000000000000007), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@10': np.float64(0.10000000000000003), 'text-embedding-3-small-datasets/data

In [63]:
from pprint import pprint

In [64]:
pprint({'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@1': 0.8620689655172413, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@3': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@5': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@10': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@1': np.float64(0.8620689655172413), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@3': np.float64(0.3333333333333334), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@5': np.float64(0.20000000000000007), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@10': np.float64(0.10000000000000003), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_recall@1': np.float64(0.8620689655172413), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_recall@3': np.float64(1.0), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_recall@5': np.float64(1.0), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_recall@10': np.float64(1.0), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_ndcg@10': np.float64(0.9445789400246336), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_mrr@10': 0.9252873563218391, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_map@100': np.float64(0.925287356321839), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_accuracy@1': 0.8620689655172413, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_accuracy@3': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_accuracy@5': 1.0, 
'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_accuracy@10': 1.0, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_precision@1': np.float64(0.8620689655172413), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_precision@3': np.float64(0.3333333333333334), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_precision@5': np.float64(0.20000000000000007), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_precision@10': np.float64(0.10000000000000003), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_recall@1': np.float64(0.8620689655172413), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_recall@3': np.float64(1.0), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_recall@5': np.float64(1.0), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_recall@10': np.float64(1.0), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_ndcg@10': np.float64(0.9445789400246336), 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_mrr@10': 0.9252873563218391, 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_dot_map@100': np.float64(0.925287356321839)})

{'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@1': 0.8620689655172413,
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@10': 1.0,
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@3': 1.0,
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_accuracy@5': 1.0,
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_map@100': np.float64(0.925287356321839),
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_mrr@10': 0.9252873563218391,
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_ndcg@10': np.float64(0.9445789400246336),
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@1': np.float64(0.8620689655172413),
 'text-embedding-3-small-datasets/dataset_processed.parquet-evlatuation_cosine_precision@10': np.float64(0.100000000000000