In [4]:
import re
from model import embedder
from logging_config import conf_logging
from langchain.docstore.document import Document

# Set up logging once for this notebook session using the project-local helper
# imported above (what it configures is defined in logging_config, not shown here).
conf_logging()

In [5]:
class Search:
    """In-memory question/answer store with embedding-based retrieval.

    Each stored chunk is a ``Document`` whose ``page_content`` is the answer
    and whose ``metadata`` holds ``question``, ``chunk_id`` and (once
    computed) ``embedding``. Only the *question* text is embedded; queries are
    compared against those question embeddings.

    The ``model`` object must provide two coroutines:
    ``get_embedding(text)`` and ``get_similarity(embedding_a, embedding_b)``.
    """

    # Task prefixes prepended to text before embedding. Stored questions and
    # incoming queries use different prefixes; every code path that embeds a
    # stored question must use DOCUMENT_PREFIX so all chunks are comparable.
    DOCUMENT_PREFIX = 'search_document: '
    QUERY_PREFIX = 'search_query: '

    def __init__(self, model, chunks=None):
        """Create a store.

        Args:
            model: Embedding backend (see class docstring for the contract).
            chunks: Optional pre-built list of documents. The list object is
                stored as given (not copied). ``None`` becomes an empty list
                so the chunk methods work before ``load_questions`` is called.
        """
        self.model = model
        self.last_chunk_id = 0
        self.documents = []
        self.chunks = chunks if chunks is not None else []

    @staticmethod
    def _split_questions_answers(text_data):
        """Split raw text into ``(question, answer)`` pairs on '?' characters.

        The question is the last line before each '?'; the answer is the text
        after it up to (but excluding) the line that holds the next question.
        """
        parts = text_data.split('?')
        last_index = len(parts) - 1
        pairs = []

        for i in range(1, len(parts)):
            question = parts[i - 1].split('\n')[-1].strip()
            if i == last_index:
                # Final segment: everything after the last '?' is the answer.
                answer = parts[i].strip()
            else:
                # Drop the trailing line, which is the next question's text.
                answer = '\n'.join(parts[i].split('\n')[:-1]).strip()
            pairs.append((question + '?', answer))

        return pairs

    async def load_questions(self, text_data=None, path='data2.txt'):
        """Parse question/answer text into documents.

        Args:
            text_data: Raw text to parse. When falsy, the text is read from
                ``path`` instead.
            path: UTF-8 text file used when ``text_data`` is not supplied.

        Parsed documents are appended to ``self.documents`` with sequential
        ``chunk_id`` values starting at 0. If no chunks are present yet,
        ``self.chunks`` is pointed at that same list.
        """
        # Read the file contents when no text was passed in.
        if not text_data:
            with open(path, 'r', encoding='utf-8') as file:
                text_data = file.read()

        # Split the text into questions and answers.
        pairs = self._split_questions_answers(text_data)

        for i, (question, answer) in enumerate(pairs):
            metadata = {
                "question": question,
                "chunk_id": i,
            }
            self.documents.append(Document(page_content=answer, metadata=metadata))

        if not self.chunks:
            self.chunks = self.documents

    async def chunk_text_with_embeddings(self):
        """Embed the question of every chunk and record the highest chunk id.

        The embedding is stored in ``chunk.metadata['embedding']``. Questions
        are embedded with ``DOCUMENT_PREFIX``, the same form ``add_chunk``
        uses, so bulk-loaded and individually added chunks are consistent.
        """
        last_chunk_id = 0
        for chunk in self.chunks:
            question = chunk.metadata['question']
            last_chunk_id = max(last_chunk_id, chunk.metadata['chunk_id'])

            embedding = await self.model.get_embedding(f'{self.DOCUMENT_PREFIX}{question}')
            chunk.metadata['embedding'] = embedding

        self.last_chunk_id = last_chunk_id

    async def add_chunk(self, question, answer, chunk_id=None):
        """Embed a question and append a new chunk.

        Args:
            question: Question text (this is what gets embedded).
            answer: Answer text, stored as the document's ``page_content``.
            chunk_id: Explicit id to use. When ``None`` the next id after
                ``self.last_chunk_id`` is assigned. ``0`` is a valid id.
        """
        # `is None`, not truthiness: chunk id 0 is assigned by load_questions.
        if chunk_id is None:
            chunk_id = self.last_chunk_id + 1
        embedding = await self.model.get_embedding(f'{self.DOCUMENT_PREFIX}{question}')
        metadata = {
            "question": question,
            "chunk_id": chunk_id,
            'embedding': embedding
        }

        doc = Document(page_content=answer, metadata=metadata)
        self.chunks.append(doc)
        # Advance the counter so the next auto-assigned id is unique.
        self.last_chunk_id = max(self.last_chunk_id, chunk_id)

    async def delete_chunk(self, chunk_id):
        """Remove the first chunk whose ``chunk_id`` matches; no-op if absent."""
        for i, chunk in enumerate(self.chunks):
            if chunk.metadata['chunk_id'] == chunk_id:
                self.chunks.pop(i)
                break

    async def edit_chunk(self, chunk_id, question, answer):
        """Replace a chunk's question/answer, keeping its ``chunk_id``.

        Implemented as delete followed by add, so the edited chunk moves to
        the end of the list and its embedding is recomputed.
        """
        await self.delete_chunk(chunk_id)
        await self.add_chunk(question, answer, chunk_id=chunk_id)

    async def get_chunk(self, chunk_id):
        """Return the first chunk with the given id, or ``None`` if not found."""
        for chunk in self.chunks:
            if chunk.metadata['chunk_id'] == chunk_id:
                return chunk
        return None

    async def get_all_chunks(self):
        """Return the internal list of chunks (not a copy)."""
        return self.chunks

    async def search_query(self, query, top_k=0, threshold_embed=0.5):
        """Find chunks whose question is similar to ``query``.

        Args:
            query: Free-text query; embedded with ``QUERY_PREFIX``.
            top_k: Maximum number of results; ``0`` means no limit.
            threshold_embed: Minimum similarity (inclusive) for a chunk to be
                returned.

        Returns:
            A list of ``(chunk, similarity)`` tuples sorted by similarity,
            highest first. Empty when nothing reaches the threshold.

        Raises:
            KeyError: If a chunk has no ``'embedding'`` in its metadata yet.
        """
        query_embedding = await self.model.get_embedding(f'{self.QUERY_PREFIX}{query}')
        similarities = []

        for chunk in self.chunks:
            embedding = chunk.metadata['embedding']
            similarity = await self.model.get_similarity(query_embedding, embedding)
            if similarity >= threshold_embed:
                similarities.append((chunk, similarity))

        if not similarities:
            return []
        similarities.sort(key=lambda x: x[-1], reverse=True)

        if top_k:
            return similarities[:top_k]

        return similarities

In [6]:
# Build the store on top of the shared embedder, parse the Q&A text (no
# argument is passed, so load_questions reads its default file), then compute
# an embedding for every stored question.
# Top-level `await` works here only because the notebook runs cells inside an
# event loop; in a plain script these calls would need asyncio.run(...).
searcher = Search(model=embedder)
await searcher.load_questions()
await searcher.chunk_text_with_embeddings()

Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.65s/it]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.71it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.26it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.37it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.94it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.35it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.67it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.62it/s]
Batches: 100%|██████████████████████████

In [4]:
# Run main.py as a shell subprocess from the notebook. The saved output below
# shows the run was interrupted (^C).
# NOTE(review): this cell's execution count is lower than the preceding cell's,
# so it was run out of order — confirm it still belongs in the notebook.
!python main.py #

^C
