In [15]:
#setting up the training data reader in case you are using this notebook on your local machine/laptop

In [17]:
import pandas as pd
# Training-side statistics; the 'train_bins' and 'train_vals' columns are read by compute_drift.
training_data_drift_stats = pd.read_csv(filepath_or_buffer="training_data_drift_stats.csv")

In [19]:
!pip install -U sentence-transformers



In [20]:
#setting up mean pooling function for the model's token embeddings

def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighting each token by its mask value.

    Parameters
    ----------
    model_output : sequence
        Model output whose first element holds the token embeddings
        (assumed shape (batch, seq_len, hidden) -- TODO confirm).
    attention_mask : torch.Tensor
        Mask with one entry per token; it is broadcast to the shape of the
        token embeddings, so masked-out (0) tokens do not contribute.

    Returns
    -------
    torch.Tensor
        Masked sum of the embeddings over dimension 1 divided by the
        per-row mask total.
    """
    import torch

    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    masked_sum = torch.sum(embeddings * mask, 1)
    # Clamp the denominator so a fully masked row yields zeros, not a division by zero.
    mask_total = torch.clamp(mask.sum(1), min=1e-9)

    return masked_sum / mask_total

In [21]:
#setting up drift computation function

from functools import lru_cache


@lru_cache(maxsize=1)
def _load_encoder(model_name):
    """Load the tokenizer and model for ``model_name`` once and reuse them on later calls."""
    from transformers import AutoTokenizer, AutoModel

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    return tokenizer, model


def compute_drift(data):
    """Compute the drift between the training statistics and a scoring payload.

    The payload texts are embedded, mean-pooled with ``mean_pooling``,
    L2-normalised and flattened; a density histogram of the resulting values
    is compared with the stored training histogram using the Wasserstein
    distance.

    Relies on two names defined elsewhere in the notebook: the
    ``training_data_drift_stats`` frame (columns ``train_bins`` and
    ``train_vals``) and the ``mean_pooling`` function.

    Parameters
    ----------
    data : dict
        Payload whose texts are read from
        ``data["input_data"][0]["values"][1]``.

    Returns
    -------
    str
        The Wasserstein distance rounded to 4 decimal places, as a string.

    Raises
    ------
    ValueError
        If the payload contains no text values.
    """
    import numpy as np
    import torch
    import torch.nn.functional as F
    from scipy.stats import wasserstein_distance

    # getting training data
    train_bins = training_data_drift_stats['train_bins']
    train_vals = training_data_drift_stats['train_vals']

    # getting payload data
    texts = data.get("input_data")[0].get("values")[1]
    if texts is None or len(texts) == 0:
        # Fail early with a clear message instead of an opaque tokenizer/model error.
        raise ValueError("payload contains no text values to compute drift on")

    # transforming payload data (tokenizer and model are loaded once and cached)
    tokenizer, model = _load_encoder('sentence-transformers/all-MiniLM-L12-v2')

    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    embedding_values = sentence_embeddings.detach().numpy().flatten()

    scoring_vals, bin_edges = np.histogram(embedding_values, density=True)
    # Left bin edges are used as the bin positions -- presumably the same
    # convention used to produce 'train_bins'; verify against the training stats.
    scoring_bins = bin_edges[:-1]

    # calculating drift using wasserstein_distance
    distance = wasserstein_distance(u_values=train_bins, v_values=scoring_bins,
                                    u_weights=train_vals, v_weights=scoring_vals)

    return str(round(float(distance), 4))

In [22]:
# defining payload data for local testing

# Sample payload laid out column-wise: one list in "values" per name in "fields".
payload_data = {
    "input_data": [
        {
            "fields": ["stars", "text"],
            "values": [
                ["1.0", "4.0", "3.0"],
                ["it was fun", "it was bad", "it was okay"],
            ],
        },
    ],
}

In [23]:
# Computing drift for the local test payload; compute_drift returns the
# distance as a string rounded to 4 decimal places.

compute_drift(payload_data)

'0.0166'