# Create Initial Embbeding Files

In [1]:
%load_ext autoreload
%autoreload
# Importing the needed libraries & Modules

# Import cudf. cudf is part of the NVIDIA RAPIDS datascience SDK and is used to store the dataframes 
# used in gpu memory.
import cudf

# Import SentenceTransformer and util from the HuggingFace sentence_transformer library which has
# been pre-installed in this environment.
from sentence_transformers import SentenceTransformer, util

# Import pickle. pickle is used to store the embedding
import pickle

# Import Path. Used to manage file system
from pathlib import Path

# Import smart_search_models. This module was created for this example to simplify the management of the 
# various models that can be used for the embedding process.
import smart_search

import time

# Set some notebook variables
DATASET_NAME = "enron"
DATA_PATH = "../data/"
MODEL_PATH = "../models/"
EMBEDDING_FOLDER = DATA_PATH + "../data/embeddings/"
PARQUET_PATH = DATA_PATH + '../data/enron_extracted/email_data.parquet'

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Verify the dataset exists. If not, download, extract, and preprocess the dataset.
file_path = Path(PARQUET_PATH)
if file_path.exists():
    print("The file exists.")
else:
    print("The file does not exist. Setting up dataset now.")
    %run data_setup.py

The file does not exist. Setting up dataset now.
Download path does not exist. Creating it now..
Downloading data. This may take several minutes.


In [3]:
df = cudf.read_parquet(PARQUET_PATH).reset_index(drop=True)
print("The dataset contains {} entrees".format(df.shape[0]))

The dataset contains 517401 entrees


In [5]:
# Dropping duplicates in the 'messages' column
df = df.drop_duplicates(subset='message').reset_index(drop=True)
print("The dataset contains {} unique entrees".format(df.shape[0]))

The dataset contains 234821 unique entrees


In [None]:
# Select and load model.
# Note: If a given model hasn't been used since the container has been loaded it will be downloaded automatically.

# The sentence_models list is a large list of models. They have not been grouped by task beyond sentence similarity 
#model_name = smart_search_models.sentence_models[6]
#model_name = smart_search_models.default_model

# asymmetric_cosine_similarity_models are special purpose models for Asymmetric Semantic Similarity through cosine similarity calculations
# model_name = smart_search.asymmetric_cosine_similarity_models[2]

# symmetric_models are special purpose models for Symmetric Sematic Similarity
#model_name = smart_search.symmetric_models[1]

# Multilingual Models as described on Sentence Transformers website
model_name = smart_search.multilingual_models[0]

print("Loading model: '{}'".format(model_name))
model = SentenceTransformer(model_name,cache_folder = MODEL_PATH)

In [14]:
# Create helper functions to read and write embedding to files.
def load_embeddings(embedding_file_path):
        
    #Load sentences & embeddings from disc
    with open(embedding_file_path, "rb") as fIn:
        stored_data = pickle.load(fIn)
        stored_message_id = stored_data['message_id']
        stored_embeddings = stored_data['embeddings']

    # As of now we only need the stored embeddings
    return stored_embeddings

def write_embeddings(embedding_folder, embedding_file_name,message_ids,source_embeddings):
   
    # Check if directory exits
    dir_path = Path(embedding_folder)
    
    if not dir_path.is_dir():
        print("Directory does not exist. Creating it now.")
        # If the directory doesn't exist create it.
        dir_path.mkdir()
        
    # Create the file path
    file_path = embedding_folder + embedding_file_name
    
    # Write out the embedding and message_id to disk
    with open(file_path, "wb") as fOut:
        pickle.dump({'message_id': message_ids, 'embeddings': source_embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

def embedd_dataframe(df,model_name):
    # Flag for multi-gpu embedding.
    TRAIN_MULTI = False

    # Load Model
    model = SentenceTransformer(model_name,cache_folder = MODEL_PATH)
    
    # Create the file name that would be used to store the embeddings.
    embedding_file_name = "embeddings_{}_{}.pkl".format(DATASET_NAME,model_name)
    
    # Create embedding Path object
    embedding_file = Path(EMBEDDING_FOLDER + embedding_file_name)
    
    # Check if the file 
    if embedding_file.is_file():
        # If a file exists with the embedding file for this dataset / model combination exists load it.
        print("Embedding file exists. Loading it now.")
        
        
        source_embeddings = load_embeddings(embedding_file)
    else:
        # If an embedding file does not exist. Embed the dataset and cache the data.
        print("Embedding file does not exist. Creating now.")
        
        if TRAIN_MULTI:
            pool = model.start_multi_process_pool()
            source_embeddings = model.encode_multi_process(df.message.to_pandas(),pool)
            model.stop_multi_process_pool(pool)
        else:
            start_time = time.time()  # Start timing before processing
            source_embeddings = model.encode(df.message.to_pandas(),convert_to_tensor=True,show_progress_bar=True)
            end_time = time.time()  # End timing after processing
            total_time = end_time - start_time
            print(f"Total Embedding time: {total_time}")
        
        # Write out the generated embeddings
        write_embeddings(EMBEDDING_FOLDER,embedding_file_name,df.message_id.to_pandas(),source_embeddings)
        
    print(embedding_file)

In [15]:
# Create Embeddings for symmetric models
models = smart_search.symmetric_models

for model_name in models:
    embedd_dataframe(df,model_name)


Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [14:29<00:00,  8.44it/s]


Total Embedding time: 876.5277631282806
../data/../data/embeddings/embeddings_enron_all-mpnet-base-v2.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [18:32<00:00,  6.60it/s]


Total Embedding time: 1120.6342859268188
../data/../data/embeddings/embeddings_enron_multi-qa-mpnet-base-dot-v1.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [08:22<00:00, 14.60it/s] 


Total Embedding time: 509.81654620170593
../data/../data/embeddings/embeddings_enron_all-distilroberta-v1.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [02:31<00:00, 48.52it/s]


Total Embedding time: 159.5010917186737
../data/../data/embeddings/embeddings_enron_all-MiniLM-L12-v2.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [08:22<00:00, 14.61it/s] 


Total Embedding time: 510.0738785266876
../data/../data/embeddings/embeddings_enron_multi-qa-distilbert-cos-v1.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [02:45<00:00, 44.38it/s] 


Total Embedding time: 173.6871976852417
../data/../data/embeddings/embeddings_enron_all-MiniLM-L6-v2.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [05:07<00:00, 23.87it/s] 


Total Embedding time: 315.4171566963196
../data/../data/embeddings/embeddings_enron_multi-qa-MiniLM-L6-cos-v1.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [03:16<00:00, 37.39it/s] 


Total Embedding time: 204.96796989440918
../data/../data/embeddings/embeddings_enron_paraphrase-albert-small-v2.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [01:28<00:00, 82.60it/s] 


Total Embedding time: 96.93834710121155
../data/../data/embeddings/embeddings_enron_paraphrase-MiniLM-L3-v2.pkl


| GPU | Driver | CUDA | Model | Time (s) |
| :-- | ------ | ---- | ----- | ---: |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | all-mpnet-base-v2 | 877 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | multi-qa-mpnet-base-dot-v1 | 1121 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | all-distilroberta-v1 | 510 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | all-MiniLM-L12-v2 | 160 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | multi-qa-distilbert-cos-v1 | 510 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | all-MiniLM-L6-v2 | 174 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | multi-qa-MiniLM-L6-cos-v1 | 315 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | paraphrase-albert-small-v2 | 205 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | paraphrase-MiniLM-L3-v2 | 97 |

## Asymmetric Cosine Similarity Models

In [16]:
models = smart_search.asymmetric_cosine_similarity_models

print(f"Embedding {len(models)} models.")

for model_name in models:
    embedd_dataframe(df,model_name)

Embedding 5 models.
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [08:22<00:00, 14.61it/s] 


Total Embedding time: 510.31094694137573
../data/../data/embeddings/embeddings_enron_msmarco-distilbert-base-v4.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [16:22<00:00,  7.47it/s]


Total Embedding time: 991.3924067020416
../data/../data/embeddings/embeddings_enron_msmarco-roberta-base-v3.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [08:23<00:00, 14.58it/s] 


Total Embedding time: 512.0414416790009
../data/../data/embeddings/embeddings_enron_msmarco-distilbert-base-v3.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [05:02<00:00, 24.25it/s] 


Total Embedding time: 310.11714577674866
../data/../data/embeddings/embeddings_enron_msmarco-MiniLM-L-6-v3.pkl
Embedding file does not exist. Creating now.


Batches: 100%|██████████| 7339/7339 [09:36<00:00, 12.73it/s]


Total Embedding time: 584.1150236129761
../data/../data/embeddings/embeddings_enron_msmarco-MiniLM-L-12-v3.pkl


| GPU | Driver | CUDA | Model | Time (s) |
| --- | ------ | ---- | ----- | ---- |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | msmarco-distilbert-base-v4 | 510 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | msmarco-roberta-base-v3 | 991 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | msmarco-distilbert-base-v3 | 512 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | msmarco-MiniLM-L-6-v3 | 310 |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | msmarco-MiniLM-L-12-v3 | 584 |


## Multilingual Models


In [None]:
models = smart_search.multilingual_models

print(f"Embedding {len(models)} models.")

for model_name in models:
    embedd_dataframe(df,model_name)

| GPU | Driver | CUDA | Model | Time (s) |
| --- | ------ | ---- | ----- | ---- |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | MODEL_NAME | SECONDS |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | MODEL_NAME | SECONDS |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | MODEL_NAME | SECONDS |
| NVIDIA RTX A3500 Ada | 537.77 | 12.2 | MODEL_NAME | SECONDS |