### 1. Import libraries:
This line imports the AutoModel and AutoTokenizer classes from the transformers library. These classes are used to load pre-trained models and their corresponding tokenizers.

In [10]:
from transformers import AutoModel, AutoTokenizer

### 2. Define model and tokenizer:
model_name: This variable stores the identifier of the pre-trained SentenceBERT model.<br>
model = AutoModel.from_pretrained(model_name): This line uses the AutoModel class to load the pre-trained model specified by model_name.<br>
tokenizer = AutoTokenizer.from_pretrained(model_name): This line uses the AutoTokenizer class to load the tokenizer corresponding to the loaded model.

In [11]:
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

### 3. Encode the sentence:
sentence: This variable stores the sentence to be processed for embedding generation.<br>
encoded_inputs = tokenizer(sentence, return_tensors="pt"): This line uses the tokenizer to convert the sentence into its numerical representation suitable for the model. The return_tensors="pt" argument specifies that the encoded inputs should be returned in PyTorch tensor format.

In [20]:
# Define the two sentences to compare
sentence_1 = "This is the first sentence."
sentence_2 = "hello how are you ?"
encoded_inputs1 = tokenizer(sentence_1, return_tensors="pt")
encoded_inputs2 = tokenizer(sentence_1, return_tensors="pt")

### 4. Generate sentence embedding:
model(**encoded_inputs): This line performs the forward pass through the loaded SentenceBERT model with the encoded inputs.<br>
last_hidden_state: This attribute access retrieves the hidden state outputs from the last layer of the model.<br>
[:, 0]: This slicing operation selects the first element (index 0) from each hidden state vector. This essentially extracts the sentence embedding from the model output.

In [21]:
embeddings1 = model(**encoded_inputs1).last_hidden_state[:, 0].detach().numpy()
embeddings2 = model(**encoded_inputs2).last_hidden_state[:, 0].detach().numpy()

In [22]:
print(embeddings1)

[[ 7.04945773e-02 -1.06843926e-01 -9.50959977e-03 -5.29272556e-02
   6.68270327e-03 -1.04966369e-02 -2.76828222e-02 -5.70532717e-02
  -2.62527391e-02 -2.05893070e-04  8.96935314e-02  4.10136580e-03
  -4.70163450e-02 -1.10510617e-01  1.36844963e-01 -2.41944939e-03
   3.87143753e-02  4.30521518e-02  5.33392355e-02  1.44108534e-02
  -7.59598017e-02  1.21188704e-02 -1.81850567e-02  1.94070563e-02
  -1.36638552e-01  2.86452845e-03 -9.01607424e-03  1.14043072e-01
   5.67375943e-02 -5.85725904e-02  4.53214794e-02  7.21457601e-02
   5.67626022e-03 -5.24433777e-02  2.09663659e-02 -4.43966053e-02
  -8.02388974e-03 -2.08029859e-02 -1.11555144e-01 -9.33286175e-02
   5.59185967e-02 -4.85525243e-02  3.27471048e-02 -9.94767621e-03
  -9.23259854e-02 -1.04153343e-02  1.47632033e-01 -2.26222202e-02
  -1.31247759e-01  1.01324497e-02  1.21878926e-02 -3.41909155e-02
  -7.52454549e-02 -1.42779388e-03  4.23800088e-02 -1.68871470e-02
  -3.68107297e-03 -5.78581356e-02 -8.30912217e-03  4.43729497e-02
  -3.31390

In [23]:
type(embeddings1)

numpy.ndarray

In [24]:
embeddings1.shape

(1, 768)

#### What is the dimension of second embedding? 

In [25]:
embeddings2.shape

(1, 768)

### 5. Calculate cosine similarity between the embeddings

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_score = cosine_similarity(embeddings1.reshape(1, -1), embeddings2.reshape(1, -1))[0][0]


In [27]:
# Print the similarity score
print(f"Cosine similarity: {similarity_score}")

Cosine similarity: 1.0000001192092896
