In [9]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import os
from tqdm.autonotebook import tqdm
tqdm.pandas(desc="progress-bar")

# Instantiate the sentence-embedding model from its checkpoint name
# (a Turkish BERT variant, going by the name).
model = SentenceTransformer(
    model_name_or_path="emrecan/bert-base-turkish-cased-mean-nli-stsb-tr",
)

In [2]:
# Set the model's maximum sequence length to 512; the model summary printed by
# the next cell reflects this as 'max_seq_length': 512.
# NOTE(review): presumably inputs longer than this are truncated — confirm
# against the sentence-transformers documentation.
model.max_seq_length = 512

In [3]:
# Target device for the model. 'cuda' is hard-coded, so this cell presumably
# fails on a machine without a CUDA device — TODO confirm whether a CPU
# fallback is wanted.
device = 'cuda'
# Kept as the cell's last expression so the notebook displays the model's
# module summary (Transformer + Pooling) as output.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [4]:
# Load the query/document pairs from a pickled DataFrame. The path is relative,
# so it resolves against the notebook's current working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
df = pd.read_pickle("Steps/dataset_qdpairs.pkl")

In [5]:
# Preview the first rows: the frame is indexed by `id` and carries two text
# columns, `query` and `document`.
df.head()

Unnamed: 0_level_0,query,document
id,Unnamed: 1_level_1,Unnamed: 2_level_1
386123400,DAVA: Davacı vekili dava dilekçesinde özetle; ...,DEĞERLENDİRME:Dava dilekçesi ve sair tüm evrak...
386123500,Davacı vekilinin mahkememize verdiği dava dile...,Delillerin Değerlendirilmesi :Tüm dosya kapsam...
386129400,Davacı şirket yetkilisi dava dilekçesinde özet...,Dosya kapsamında yer alan kayıt ve belgeler in...
386132200,Davacı vekili dava dilekçesinde özetle; Keşide...,DELİLLER VE GEREKÇE/Davacı vekilinin 19/01/201...
386172900,Davacı vekili dava dilekçesinde özetle; müvekk...,DELİLLER VE GEREKÇE / ... ATM'nin ... Esas sa...


In [6]:
import os  # repeated from the first cell so this cell also runs on its own

# Encode the `document` column in fixed-size slices and write one .npy file per
# slice; each slice's embeddings are on disk as soon as that slice finishes.
directory = "bert_doc_embedding_chunks"
# exist_ok=True replaces the separate exists() check and keeps re-runs harmless.
os.makedirs(directory, exist_ok=True)

chunk_size = 10240  # rows of df encoded and saved per file
batch_size = 128    # forwarded to model.encode as its batch_size
# Ceiling division. `len(df) // chunk_size + 1` yields one extra, empty chunk
# whenever len(df) is an exact multiple of chunk_size; that empty slice would
# still be encoded and saved, and its array may not concatenate with the others.
# max(1, ...) keeps writing a single chunk file for an empty frame, so
# num_chunks is never 0 for code that loads the files afterwards.
num_chunks = max(1, -(-len(df) // chunk_size))

for chunk in tqdm(range(num_chunks), desc="Processing chunks"):
    start_idx = chunk * chunk_size
    end_idx = min(start_idx + chunk_size, len(df))
    # .iloc makes the slice explicitly positional; df is indexed by `id`,
    # not by 0..n-1.
    chunk_df = df.iloc[start_idx:end_idx]

    chunk_embeddings = model.encode(
        chunk_df["document"].to_list(),
        batch_size=batch_size,
        normalize_embeddings=True,
    )

    # Build the path from `directory` so the files land in the folder created
    # above; a forward slash keeps the printed path identical on every platform.
    file_path = f"{directory}/doc_embeddings_curchunk_{chunk}.npy"
    np.save(file_path, chunk_embeddings)

    print(f"Processed chunk {chunk+1}/{num_chunks}. Saved embeddings to {file_path}")


Processing chunks:   0%|          | 0/27 [00:00<?, ?it/s]

Processed chunk 1/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_0.npy
Processed chunk 2/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_1.npy
Processed chunk 3/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_2.npy
Processed chunk 4/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_3.npy
Processed chunk 5/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_4.npy
Processed chunk 6/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_5.npy
Processed chunk 7/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_6.npy
Processed chunk 8/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_7.npy
Processed chunk 9/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_8.npy
Processed chunk 10/27. Saved embeddings to bert_doc_embedding_chunks/doc_embeddings_curchunk_9.npy
Processed chunk 11/

In [10]:
# Reassemble the per-chunk .npy files into one array and persist it.
# NOTE: this cell relies on `num_chunks` from the chunk-encoding cell still
# being defined in the kernel.
directory = "bert_doc_embedding_chunks"

# Per-chunk arrays, collected in chunk order
chunk_embeddings_list = []

# Load chunks by ascending index so rows keep the order they were encoded in.
for chunk in tqdm(range(num_chunks), desc="Loading embeddings"):
    # Derive the path from `directory` so changing the folder in one place is
    # enough (the folder name is not hard-coded a second time).
    file_path = f"{directory}/doc_embeddings_curchunk_{chunk}.npy"
    chunk_embeddings = np.load(file_path)
    chunk_embeddings_list.append(chunk_embeddings)

# Stack the chunks along the first axis into a single numpy array
doc_embeddings = np.concatenate(chunk_embeddings_list, axis=0)

# Save the merged embeddings as doc_embeddings.npy in the working directory
np.save("doc_embeddings.npy", doc_embeddings)

print("Merged and saved the embeddings successfully.")

Loading embeddings:   0%|          | 0/27 [00:00<?, ?it/s]

Merged and saved the embeddings successfully.
