<a href="https://colab.research.google.com/github/porameht/text-embeddings-hugging-face/blob/main/hf_text_embeddings_%26_semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install** the libraries this notebook depends on (Datasets, Transformers, pandas, and FAISS) before running the remaining cells.


In [1]:
# Print the Python version of the runtime this notebook is executing on.
!python --version

Python 3.10.11


In [36]:
!pip install datasets transformers[sentencepiece] pandas
!pip install faiss-cpu==1.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu==1.7.1
  Using cached faiss-cpu-1.7.1.tar.gz (40 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: faiss-cpu
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for faiss-cpu (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for faiss-cpu[0m[31m
[0m[?25h  Running setup.py clean for faiss-cpu
Failed to build faiss-cpu
Installing collected packages: faiss-cpu
  Attempting uninstall: faiss-cpu
    Found existing installation: faiss-cpu 1.7.

In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

# Checkpoint used for both the tokenizer and the encoder.
model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Small example batch used to demonstrate the embedding pipeline.
sentences = [
    "I took my dog for a walk",
    "Today is going to rain",
    "I took my cat for a walk",
]

# Tokenize the whole batch at once, padding to a common length.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Inference only: skip gradient tracking for the forward pass.
with torch.no_grad():
    model_output = model(**encoded_input)

# One hidden vector per token, before any pooling.
token_embeddings = model_output.last_hidden_state
print(f"Token embeddings shape: {token_embeddings.size()}")

Token embeddings shape: torch.Size([3, 9, 384])


In [4]:
import torch.nn.functional as F


def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor.
        attention_mask: Tensor that is non-zero for positions to include; it is
            unsqueezed on the last axis and expanded to the hidden-state shape.

    Returns:
        The mask-weighted sum over dimension 1 of ``last_hidden_state`` divided
        by the per-sequence mask total (clamped to at least 1e-9 so a fully
        masked sequence does not divide by zero).
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask across the hidden dimension so it can weight each vector.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total


# Pool the token vectors into one vector per sentence, then scale each
# vector to unit L2 norm.
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input["attention_mask"]),
    p=2,
    dim=1,
)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")

Sentence embeddings shape: torch.Size([3, 384])


In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert to NumPy only while the embeddings are still a tensor, so this cell
# can be re-run without failing on an ndarray that has no .detach().
if isinstance(sentence_embeddings, torch.Tensor):
    sentence_embeddings = sentence_embeddings.detach().cpu().numpy()

# Called with a single argument, cosine_similarity returns the full pairwise
# matrix in one call; scores[i, j] compares sentence i with sentence j.
# The cast keeps the float64 dtype of the buffer this cell used to fill.
scores = cosine_similarity(sentence_embeddings).astype(np.float64)

In [10]:
from datasets import load_dataset

# Take a reproducible 100-example sample of the SQuAD validation split:
# shuffle with a fixed seed, then keep the first 100 rows.
squad = (
    load_dataset("squad", split="validation")
    .shuffle(seed=42)
    .select(range(100))
)


def get_embeddings(text_list):
    """Embed text with the notebook's tokenizer/model and mean pooling.

    Args:
        text_list: Text (or list of texts) passed straight to ``tokenizer``;
            inputs are padded to a common length and truncated.

    Returns:
        The tensor produced by ``mean_pooling`` for the batch. It lives on the
        model's device and is NOT L2-normalized, so callers that need a NumPy
        array should call ``.cpu().numpy()`` on it.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Put every input tensor on the same device as the model. This is a no-op
    # on CPU, but without it the forward pass fails once the model is on GPU.
    encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input["attention_mask"])


def _embed_context(example):
    """Return the embedding of one example's context as a 1-D NumPy array."""
    context_vector = get_embeddings(example["context"]).cpu().numpy()[0]
    return {"embeddings": context_vector}


# Add an "embeddings" column holding one vector per SQuAD context.
squad_with_embeddings = squad.map(_embed_context)



In [54]:
import pandas as pd

# Build a FAISS index over the stored context embeddings.
squad_with_embeddings.add_faiss_index(column="embeddings")

# Embed the query the same way the contexts were embedded.
question = "Who headlined the halftime show for Super Bowl 50?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
scores, samples = squad_with_embeddings.get_nearest_examples(
    "embeddings", question_embedding, k=3
)

# One row per returned hit. Zipping the result columns (instead of indexing
# positions 0..2 by hand) keeps this correct when fewer than k hits come back.
# NOTE(review): with the default index settings the scores look like
# distances (smaller = closer), matching the ascending order in the output.
data = [
    {"Context": context, "Score": score, "Embeddings": embedding}
    for context, score, embedding in zip(
        samples["context"], scores, samples["embeddings"]
    )
]

# Create a pandas DataFrame
results_df = pd.DataFrame(data)

# Display the results in a table format
print(results_df)


  0%|          | 0/1 [00:00<?, ?it/s]

                                             Context      Score  \
0  CBS broadcast Super Bowl 50 in the U.S., and c...  23.663609   
1  The league announced on October 16, 2012, that...  32.570621   
2  After a punt from both teams, Carolina got on ...  34.298473   

                                          Embeddings  
0  [0.11940547078847885, 0.03607199341058731, -0....  
1  [0.16786009073257446, 0.06727147847414017, 0.1...  
2  [-0.007492087781429291, 0.0761270597577095, -0...  


In [55]:
import gradio as gr

def search_question(question):
    """Look up the 3 indexed examples nearest to ``question``.

    Args:
        question: Query text; it is embedded with ``get_embeddings``.

    Returns:
        The examples object returned by ``get_nearest_examples`` (the scores
        are discarded).

    NOTE(review): the returned examples carry every dataset column, including
    the raw "embeddings" vectors, and this value is wired to a "text" output
    below. Consider returning only the formatted contexts.
    """
    query_vector = get_embeddings([question]).cpu().detach().numpy()
    _, nearest_examples = squad_with_embeddings.get_nearest_examples(
        "embeddings", query_vector, k=3
    )
    return nearest_examples

# Minimal text-in / text-out UI around search_question.
iface = gr.Interface(
    fn=search_question,
    inputs="text",
    outputs="text",
    title="Super Bowl Halftime Search",
)
# share=True publishes the app on a temporary public URL; anyone with the
# link can query it while the notebook is running.
iface.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://3327fadff03cb85cec.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces


