In [None]:
ls

In [None]:
cd ..

In [None]:
# NOTE(review): `!` executes the command in a throwaway subshell, so activating
# the virtualenv here does not change the interpreter or the packages seen by
# the already-running kernel. To use `venv`, start Jupyter from the activated
# environment or register it as a kernel.
!source venv/bin/activate

In [None]:
from datasets import load_dataset
from tqdm.notebook import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd
import numpy as np

from research.functions import get_natural_questions_sample, get_ms_marco_sample
from rag_techniques.proposition_chunking import PropositionsGenerator
from database.vbase import QdrantBase

In [None]:
# Build the proposition generator with its default configuration; it is called
# on individual text chunks further down.
generator = PropositionsGenerator()

In [None]:
# Handle to the collection that will receive the generated propositions.
# vector_dimension=1024 presumably matches the output size of the embedding
# model named here — confirm against the model card.
base = QdrantBase(
    collection_name='natural_questions_proposition',
    model_name='BAAI/bge-m3',
    vector_dimension=1024,
    # device='cuda'
)

In [None]:
# Load only the first 1000 examples of the validation split; the sampling loop
# below stops well before exhausting them.
natural_questions = load_dataset("natural_questions", split="validation[:1000]")

In [None]:
# Question/answer pairs kept for the evaluation set, aligned index-by-index
# with `texts` (the context of each kept sample).
dataset = {'question': [], 'answer': []}
texts = []

# At most `max_null` samples with an empty answer are kept.
null_count = 0
max_null = 20

for data in tqdm(natural_questions):
    # Stop as soon as 100 contexts have been collected.
    if len(texts) == 100:
        break

    question, context, answer = get_natural_questions_sample(data)

    if answer == '':
        # Unanswered sample: skip it once the quota is used up.
        if null_count >= max_null:
            continue
        null_count += 1

    texts.append(context)
    dataset['question'].append(question)
    dataset['answer'].append(answer)

In [None]:
# Splitter for the raw contexts: pieces of at most 600 characters (length is
# measured with `len`), no overlap between neighbours. Separators are listed
# from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=600,
    chunk_overlap=0,
    length_function=len,
)

In [None]:
# Flat list of chunks across all collected contexts, in context order.
chunks = []

for text in tqdm(texts):
    chunks.extend(text_splitter.split_text(text))

In [None]:
# Sanity check: how many chunks the splitter produced in total.
len(chunks)

In [None]:
# Inspect a single chunk (index 1) by eye.
chunks[1]

In [None]:
# Preview what the proposition generator returns for that same chunk before
# running it over the whole list.
generator(chunks[1])

In [None]:
# Flat list of everything the generator returns, in chunk order.
# NOTE(review): this assumes `generator(chunk)` returns a list of propositions;
# a plain string would be spread character by character — confirm.
new_chunks = []

for chunk in tqdm(chunks):
    new_chunks.extend(generator(chunk))

In [None]:
# Sanity check: total number of items produced by the generator.
len(new_chunks)

In [None]:
# Add every generated item to `base` (the QdrantBase collection created above),
# one `add_point` call per item.
for chunk in tqdm(new_chunks):
    base.add_point(chunk)

# Оценка

In [None]:
from bs4 import BeautifulSoup
import re
from datasets import load_dataset
from tqdm.notebook import tqdm
import pandas as pd

from rag_techniques.simple_rag_giga import SimpleRag
from research.functions import get_natural_questions_sample, get_ms_marco_sample

In [None]:
# RAG pipeline under evaluation; it points at the same collection name that
# was passed to QdrantBase in the indexing part of the notebook.
model = SimpleRag(collection_name='natural_questions_proposition')

In [None]:
# Question/answer pairs to evaluate on; the first CSV column is used as the index.
# NOTE(review): nothing visible in this notebook writes this file (the `dataset`
# dict built earlier is never saved here) — presumably it is produced elsewhere;
# confirm it matches the contexts indexed above.
df = pd.read_csv('research/data/qa_dataset.csv', index_col=0)

In [None]:
# Column-oriented record of the evaluation run: one entry per question in each list.
natural_questions_dataset = {
    column: [] for column in ("question", "answer", "contexts", "ground_truth")
}

# Each row is unpacked positionally, so `df` must hold exactly two columns in
# the order (question, reference answer).
for i, (question, answer) in tqdm(df.iterrows(), total=len(df)):
    result = model(question)

    natural_questions_dataset["question"].append(question)
    natural_questions_dataset["answer"].append(result['answer'])
    natural_questions_dataset["contexts"].append(result['context'])

    # A missing reference answer is replaced by an explicit placeholder.
    answer = 'No answer' if pd.isna(answer) else answer
    natural_questions_dataset["ground_truth"].append(answer)

In [None]:
from src.answer_correctness import AnswerCorrectness

In [None]:
# Score the generated answers against the references; arguments are passed
# positionally as (questions, generated answers, ground truths).
evaluator = AnswerCorrectness()
res = evaluator.get_correctness(
    natural_questions_dataset['question'],
    natural_questions_dataset['answer'],
    natural_questions_dataset['ground_truth'],
)

In [None]:
# Average of the second element of the evaluator's result — presumably the
# per-question correctness scores; confirm against AnswerCorrectness.
# `np` is imported only in the first import cell of the notebook, not in the
# evaluation imports, so that cell must have been run in this kernel.
np.mean(res[1])

In [None]:
from src.ragas import Evaluator

In [None]:
# Project-local Evaluator wrapper; `research_name` presumably labels this run
# in whatever the evaluator stores or reports — confirm.
ragas = Evaluator(research_name='proposition_natural_questions')

In [None]:
# Run the evaluator over the collected question / answer / contexts /
# ground_truth columns and keep its result in `e`.
e = ragas.eval(natural_questions_dataset)