In [9]:
!pip install transformers sentence_transformers chromadb chardet indox semantic_text_splitter
!pip install -U sentence-transformers

Collecting semantic_text_splitter
  Downloading semantic_text_splitter-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: semantic_text_splitter
Successfully installed semantic_text_splitter-0.14.0


In [None]:
import os
from getpass import getpass

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch
from indox import IndoxRetrievalAugmentation
from indox.llms import HuggingFaceModel
from indox.embeddings import HuggingFaceEmbedding
from indox.data_loader_splitter.SimpleLoadAndSplit import SimpleLoadAndSplit
from indox.vector_stores import ChromaVectorStore

# Default evaluation settings; "bert_score_model" names the pretrained BERT
# checkpoint that BertScore loads for its tokenizer and encoder.
cfg = dict(bert_score_model="bert-base-uncased")

class Evaluation:
    """Score a single question/answer sample and report it as a one-row table.

    The instance owns a ``BertScore`` scorer built from ``config`` and
    delegates every call to it.
    """

    def __init__(self, dimensions=None, config=cfg):
        """Build the scorer.

        Args:
            dimensions: Accepted for interface compatibility; not used here.
            config: Settings mapping handed to ``BertScore``.
        """
        self.config = config
        self.bert_score = BertScore(config)

    def __call__(self, inputs=None) -> pd.DataFrame:
        """Run the scorer on ``inputs`` and wrap the result in a DataFrame.

        Args:
            inputs: Sample passed unchanged to the ``BertScore`` instance.

        Returns:
            A DataFrame with a single row (index ``0``) whose columns are the
            keys of the score dict returned by the scorer.
        """
        score_row = self.bert_score(inputs)
        return pd.DataFrame(score_row, index=[0])

class BertScore:
    """Embedding-similarity score between an answer and its context passages.

    Every text is encoded with a BERT model, its token vectors are mean-pooled
    into a single vector, and the answer vector is compared with each context
    vector by cosine similarity.

    NOTE(review): this is a sentence-level similarity, not the token-matching
    BERTScore metric; "Recall" is set equal to "Precision" below, so the three
    reported values are numerically (almost) the same number.
    """

    def __init__(self, cfg):
        """Load the tokenizer and encoder named by ``cfg["bert_score_model"]``."""
        self.tokenizer = BertTokenizer.from_pretrained(cfg["bert_score_model"])
        self.model = BertModel.from_pretrained(cfg["bert_score_model"])

    def embed(self, texts):
        """Return one mean-pooled embedding per input text.

        Args:
            texts: List of strings, encoded together as one padded batch.

        Returns:
            Tensor with one row per text (batch, hidden_size).
        """
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            hidden = self.model(**inputs).last_hidden_state
            # Padding positions must not take part in the average: otherwise a
            # text's embedding changes with the length of the other texts that
            # happen to share its batch.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # clamp guards against division by zero for an all-padding row.
            token_counts = mask.sum(dim=1).clamp(min=1)
            embeddings = (hidden * mask).sum(dim=1) / token_counts
        return embeddings

    def __call__(self, inputs):
        """Score ``inputs["answer"]`` against ``inputs["context"]``.

        Args:
            inputs: Mapping with an ``"answer"`` string and a ``"context"``
                that is either a list of passages or a single passage.

        Returns:
            Dict with the keys ``"Precision"``, ``"Recall"`` and ``"F1-score"``.

        Raises:
            ValueError: If the context list is empty.
        """
        answer, context = inputs['answer'], inputs['context']
        if not isinstance(context, list):
            context = [context]
        if not context:
            # Fail with a clear message instead of an opaque tokenizer/model error.
            raise ValueError("'context' must contain at least one passage.")

        answer_embedding = self.embed([answer])
        context_embeddings = self.embed(context)

        # One similarity per context passage, averaged into a single value.
        similarities = cosine_similarity(answer_embedding, context_embeddings).flatten()
        P_avg = similarities.mean()
        R_avg = P_avg  # recall is not computed separately; it mirrors precision
        # The small epsilon keeps the division defined when P + R is zero.
        F1_avg = 2 * P_avg * R_avg / (P_avg + R_avg + 1e-12)

        scores = {"Precision": P_avg, "Recall": R_avg, "F1-score": F1_avg}
        return scores

def upload_file():
    """Open the Colab upload widget and return the first uploaded file's name.

    Returns:
        The first key of the mapping returned by ``files.upload()``.
    """
    # Imported inside the function, so google.colab is only required when
    # this helper is actually called.
    from google.colab import files

    uploaded_names = list(files.upload())
    file_path = uploaded_names[0]
    print(f"File '{file_path}' uploaded successfully.")
    return file_path

# Retrieval-augmentation entry point used for storing and querying documents.
indox = IndoxRetrievalAugmentation()

# SECURITY: the Hugging Face token must never be committed to source. Read it
# from the HUGGINGFACE_API_KEY environment variable, or prompt for it without
# echoing when the variable is not set. Any token that was previously
# hard-coded here should be revoked.
hf_api_key = os.environ.get("HUGGINGFACE_API_KEY") or getpass("Hugging Face API key: ")

# LLM used to generate answers, and the embedding model used for the vector store.
mistral_qa = HuggingFaceModel(api_key=hf_api_key, model="mistralai/Mistral-7B-Instruct-v0.2")
embed = HuggingFaceEmbedding(model="multi-qa-mpnet-base-cos-v1")

# Ask the user for a document via the Colab upload widget; keep its file name.
file_path = upload_file()

# Load the uploaded file and split it into chunks (max_chunk_size=200).
# NOTE(review): remove_sword presumably toggles stop-word removal — confirm
# against the indox SimpleLoadAndSplit documentation.
simpleLoadAndSplit = SimpleLoadAndSplit(file_path=file_path, remove_sword=False, max_chunk_size=200)
docs = simpleLoadAndSplit.load_and_chunk()

# Create the Chroma collection "sample" with the embedding model, attach it to
# the indox instance, then store the chunks in it.
db = ChromaVectorStore(collection_name="sample", embedding=embed)
indox.connect_to_vectorstore(vectorstore_database=db)
indox.store_in_vectorstore(docs)

# Built once, outside the query loop: constructing Evaluation loads the BERT
# tokenizer and model through BertScore.
evaluator = Evaluation()

# Interactive loop: read a question, answer it with the retriever/LLM, then
# score the answer against the retriever's context and print the scores.
while True:
    # Strip surrounding whitespace so inputs such as "exit " still terminate
    # the loop instead of being sent to the LLM as a question.
    query = input("Enter your question (or 'exit' to stop): ").strip()
    if query.lower() == 'exit':
        break
    if not query:
        # Blank input: ask again rather than spending an LLM call on it.
        continue

    retriever = indox.QuestionAnswer(vector_database=db, llm=mistral_qa, top_k=5)
    answer = retriever.invoke(query=query)
    # Read after invoke(); presumably the passages retrieved for this query —
    # TODO confirm against the indox QuestionAnswer API.
    context = retriever.context

    print("Answer: ", answer)

    inputs = {
        "question": query,
        "answer": answer,
        "context": context
    }

    # The evaluator returns a one-row DataFrame of similarity scores.
    result = evaluator(inputs)
    print(result)
