In [10]:
import pandas as pd
# Load the prepared source sentences; the CSV is read from the notebook's working directory.
df_data_1 = pd.read_csv(filepath_or_buffer="source_sentences_shaped.csv")

In [11]:
# Pull the 'text' column out of the frame as a plain Python list for the tokenizer.
text_column = df_data_1['text']
source_sentences = list(text_column)
# Bare last expression so the notebook displays how many sentences were loaded.
len(source_sentences)

200

In [None]:
# The dataset above has 200 review entries, which need about 8 GB of RAM for text processing.
# If your local machine has less RAM,
# uncomment the line below to subset the data to 100 entries (or fewer, if needed).

#source_sentences = source_sentences[:100]

In [3]:
!pip install -U sentence-transformers



In [3]:
!pip install torchvision

Collecting torch==1.12.1
  Using cached torch-1.12.1-cp38-none-macosx_10_9_x86_64.whl (137.8 MB)
Installing collected packages: torch
Successfully installed torch-1.12.1


In [6]:
from transformers import AutoTokenizer, AutoModel 
import torch 
import torch.nn.functional as F
import numpy as anp
from sentence_transformers import SentenceTransformer

In [7]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings (assumed shape: batch x tokens x hidden).
        attention_mask: Mask with one entry per token (assumed shape:
            batch x tokens); positions with value 0 are excluded from the mean.

    Returns:
        Tensor with the token dimension (dim 1) reduced away: the mask-weighted
        sum of the embeddings divided by the mask total for each sequence.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so it lines up element-wise.
    mask = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row cannot divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

In [8]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [12]:
# Tokenize all source sentences in one batch: pad to a common length,
# truncate over-long inputs, and return PyTorch tensors.
encoded_input = tokenizer(
    source_sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

In [13]:
# Compute the embeddings: run the encoder over the whole tokenized batch.
# Gradient tracking is switched off because this is inference only.
# NOTE: despite its name, `embedding_model` holds the model's *output* for the
# batch (it is passed to mean_pooling in a later cell), not a model object.
with torch.no_grad():
    embedding_model = model(**encoded_input)


In [14]:
# Collapse the per-token vectors into one vector per sentence (mask-aware mean).
pooled_embeddings = mean_pooling(embedding_model, encoded_input['attention_mask'])
# L2-normalise along dim 1 so every sentence vector has unit length.
unit_embeddings = F.normalize(pooled_embeddings, p=2, dim=1)
# Convert to NumPy and flatten everything into a single 1-D array of values.
sentence_embeddings = unit_embeddings.detach().numpy().flatten()
print("Embeddings shape: {}".format(sentence_embeddings.shape))

Embeddings shape: (76800,)


In [15]:
import numpy as np
# Summarise the flattened embedding values as a density histogram (NumPy's default binning).
sentence_embeddings_hist = np.histogram(sentence_embeddings, density=True)
hist_density, hist_edges = sentence_embeddings_hist
train_embeddings_vals = hist_density.tolist()
# There is one more edge than there are bins; drop the final edge so each
# bin is represented by its left edge and both lists have equal length.
train_embeddings_bins = hist_edges[:-1].tolist()
train_embeddings = {
    'train_vals': train_embeddings_vals,
    'train_bins': train_embeddings_bins,
}

In [16]:
import pandas as pd
# Tabulate the histogram summary: one row per bin, one column per dictionary key.
embeddings_df = pd.DataFrame(data=train_embeddings)

In [18]:
# Persist the training histogram statistics as CSV with a header row.
# `index` is left at its pandas default, so the row index is written as well.
embeddings_df.to_csv(path_or_buf="training_data_drift_stats.csv", header=True)

In [None]:
# If you are running this notebook on your local machine/laptop, use the following code to save the file:
#embeddings_df.to_csv('train_data_drift_stats.csv', index=False, mode='w+')