In [None]:
import os

import clickhouse_connect

from FlagEmbedding import BGEM3FlagModel
from sentence_transformers import SentenceTransformer

from utils.utils import Chunker

In [None]:
# Connection settings are read from the environment so real credentials never have to
# be committed with the notebook. Each fallback is the literal that used to be
# hardcoded here, so the cell behaves as before when no variable is set.
client = clickhouse_connect.get_client(
    host=os.environ.get('CLICKHOUSE_HOST', '192.168.1.1'),
    port=int(os.environ.get('CLICKHOUSE_PORT', '1234')),  # env values are strings
    username=os.environ.get('CLICKHOUSE_USER', '-'),
    password=os.environ.get('CLICKHOUSE_PASSWORD', '-'),
    database=os.environ.get('CLICKHOUSE_DATABASE', 'db'),
)

In [None]:
# First retriever: BGE-M3, constructed with fp16 enabled and CLS pooling requested.
bge_retriever_model = BGEM3FlagModel(
    'BAAI/bge-m3',
    use_fp16=True,
    pooling_method="cls",
)

# Second retriever: multilingual E5-large, placed on the CUDA device.
e5_retriever_model = SentenceTransformer(
    "intfloat/multilingual-e5-large",
    device='cuda',
)
# The sequence-length cap is set on the model object after construction.
e5_retriever_model.max_seq_length = 512

In [None]:
# Splitter configured with a maximum chunk length of 2500 and an overlap of 500
# (units are defined by utils.utils.Chunker - presumably characters; TODO confirm).
chunker = Chunker(
    max_chunk_len=2500,
    overlap_len=500,
)

# Pull every row of `document` and index the resulting frame by its `url` column.
texts = (
    client
    .query_df('select * from document')
    .set_index('url')
)

# split_texts yields two sequences; they are written below as the `text` and `url` columns.
chunks, urls = chunker.split_texts(texts)

# NOTE(review): re-running this cell issues the insert again - confirm whether the
# `chunk` table deduplicates or whether the cell needs a guard.
client.insert(
    table='chunk',
    data=[chunks, urls],
    column_names=['text', 'url'],
    column_oriented=True,
)

In [None]:
# Read the stored chunks back and key the frame by the `uuid` column.
chunks_df = (
    client
    .query_df('select * from chunk')
    .set_index('uuid')
)
# Show the first rows as the cell output.
chunks_df.head()

In [None]:
# Embed every chunk with both retrievers and store the vectors alongside the chunk uuid.

# Materialise the inputs once: both encoders consume the same text list, and the uuid
# list must stay in the same row order as the texts.
chunk_texts = chunks_df.text.tolist()
chunk_uuids = chunks_df.index.tolist()

# BGE-M3: request only the dense vectors; sparse and ColBERT outputs are switched off.
bge_embeddings = bge_retriever_model.encode(
    chunk_texts,
    return_dense=True,
    return_sparse=False,
    return_colbert_vecs=False,
    batch_size=16,
    max_length=512,
)

# E5: normalised embeddings, with a progress bar while encoding.
# NOTE(review): the multilingual-e5 model card recommends prefixing indexed texts with
# "passage: " - confirm whether the query side expects that before changing it here.
e5_embeddings = e5_retriever_model.encode(
    chunk_texts,
    batch_size=16,
    normalize_embeddings=True,
    show_progress_bar=True,
)

# The insert is column-oriented, so all three columns must have one entry per chunk;
# fail loudly here instead of writing misaligned rows.
assert len(chunk_uuids) == len(bge_embeddings['dense_vecs']) == len(e5_embeddings), \
    "embedding count does not match chunk count"

client.insert(
    table='chunk_embedding',
    data=[
        chunk_uuids,
        bge_embeddings['dense_vecs'].tolist(),
        e5_embeddings.tolist(),
    ],
    column_names=[
        'chunk_uuid',
        'bge_m3_embedding',
        'e5_embedding',
    ],
    column_oriented=True,
)

In [None]:
# Preview the first rows of the stored embeddings.
(
    client
    .query_df('select * from chunk_embedding')
    .head()
)