![](/../assets/80_getting_started_with_embeddings/thumbnail.png)

In [1]:
model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = "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"

The first time you generate the embeddings it may take a while (approximately 20 seconds) for the API to return them. We use the `retry` decorator (install with `pip install retry`) so that if on the first try `output = query(dict(inputs = texts))` doesn't work, wait 10 seconds and try again three times. The reason this happens is because on the first request, the model needs to be downloaded and installed in the server, but subsequent calls are much faster.

In [2]:
%%capture
!pip install retry

In [3]:
import requests
from retry import retry

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

In [4]:
@retry(tries=3, delay=10)
def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts})
    result = response.json()
    if isinstance(result, list):
      return result
    elif list(result.keys())[0] == "error":
      raise RuntimeError(
          "The model is currently loading, please re-run the query."
          )

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [63]:

def open_file(filepath):
    with open(filepath, 'r', encoding='cp1252') as infile:
        return infile.read()



In [124]:

texts = open_file('/content/drive/MyDrive/DS/DataLaw.txt')


In [89]:
!pip install Unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Unidecode
Successfully installed Unidecode-1.3.6


In [90]:
!pip install autocorrect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.8/622.8 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.6.1-py3-none-any.whl size=622381 sha256=f7262f55eb8375ed9637107fe024df9875896c848fb68d30d863dbdf51205ab5
  Stored in directory: /root/.cache/pip/wheels/72/b8/3b/a90246d13090e85394a8a44b78c8abf577c0766f29d6543c75
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.6.1


In [91]:
import unidecode
import pandas as pd
import re
import time
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import timeit

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [125]:
def remove_newlines_tabs(text):
    # Replacing all the occurrences of \n,\\n,\t,\\ with a space.
    Formatted_text = text.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
    return Formatted_text

In [126]:
texts=remove_newlines_tabs(str(texts))
#texts

In [127]:
def spelling_correction(text):

    # Check for spellings in English language
    spell = Speller(lang='en')
    Corrected_text = spell(text)
    return Corrected_text


In [128]:
texts=spelling_correction(texts)

In [129]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [130]:
texts=nltk.sent_tokenize(texts)
texts


[' Disclaimer: All of the translations contained on this website are unofficial.',
 'Only the original Slovene texts of the laws and regulations have legal effect, and the translations are to be used solely as reference materials to aid in the understanding of Slovene laws and regulations.',
 'The Government of the Republic of Slovenia is not responsible for the accuracy, reliability or currency of the translations provided on this website, or for any consequence resulting from the use of information on this website.',
 'For all purposes of interpreting and applying law to any legal issue or dispute, users should consult the original Slovene texts published in the Official Gazette of the Republic of Slovenia.',
 'The unofficial consolidated version of the Agricultural Land Act comprises: -         Agricultural Land Act – ZZ (Official Gazette of the Republic of Slovenia [Radii list S], No.',
 '59/96 of 25 October 1996), -         Decision abrogating the provision of paragraph two of Art

In [131]:
len(texts)

173

In [132]:


output = query(texts)
output

[[-0.04365445673465729,
  0.053396087139844894,
  0.032511766999959946,
  -0.006458945572376251,
  0.03370673581957817,
  -0.01269362960010767,
  0.059301357716321945,
  0.011271574534475803,
  -0.03378558158874512,
  0.021866969764232635,
  0.11735968291759491,
  0.015544049441814423,
  -0.01337969209998846,
  -0.01643136702477932,
  -0.022696036845445633,
  -0.09347230941057205,
  0.055053189396858215,
  0.0943152904510498,
  -0.024411525577306747,
  0.024931272491812706,
  0.014796697534620762,
  -0.012490807101130486,
  0.10143692046403885,
  0.07171066105365753,
  0.05757985636591911,
  -0.034625425934791565,
  -0.039212167263031006,
  -0.009635851718485355,
  0.027726691216230392,
  -0.008978418074548244,
  -0.09989910572767258,
  0.08836421370506287,
  0.006237565539777279,
  -0.04126385599374771,
  0.037871185690164566,
  0.044746868312358856,
  -0.04744622856378555,
  -0.014419451355934143,
  -0.008630238473415375,
  0.0765286237001419,
  -0.03430873900651932,
  -0.01819440349

In [133]:
import pandas as pd

embeddings = pd.DataFrame(output)

In [134]:
print(embeddings)

          0         1         2         3         4         5         6    \
0   -0.043654  0.053396  0.032512 -0.006459  0.033707 -0.012694  0.059301   
1    0.027941  0.100248 -0.050451 -0.024451  0.045630  0.032227 -0.013666   
2   -0.026801  0.064039 -0.057095 -0.015096  0.001785 -0.017931  0.013010   
3   -0.025162  0.071724 -0.082708  0.010537 -0.039075  0.034327  0.011974   
4   -0.014646  0.043869 -0.033361  0.033800  0.080880 -0.049252 -0.000432   
..        ...       ...       ...       ...       ...       ...       ...   
168  0.015626  0.038448  0.010180  0.012550  0.027783 -0.016841  0.036713   
169  0.028080  0.030014  0.003858 -0.028693  0.008462 -0.038375 -0.092687   
170 -0.001668  0.081919  0.019691  0.036154  0.025472 -0.010333  0.023802   
171  0.058865  0.046915  0.033923 -0.050113  0.022905 -0.036178 -0.012412   
172  0.052786  0.048403 -0.031089 -0.075470  0.030023  0.011238  0.046337   

          7         8         9    ...       374       375       376  \
0  

## 2. Host embeddings for free on the Hugging Face Hub


In [11]:
%%capture
!pip install huggingface-hub

In [12]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) N
Token is valid.
Your token has been saved to /root/.cache/huggingface

In [135]:
#!huggingface-cli repo create embedded_faqs_medicare --type dataset --organization ITESM

In [14]:
# This is code required to install git-lfs however it already is installed in Colab instances.
#!git lfs install

In [116]:
!git clone https://{sohag00013}:{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}@huggingface.co/datasets/ITESM/embedded_faqs_medicare

fatal: destination path 'embedded_faqs_medicare' already exists and is not an empty directory.


In [136]:
embeddings.to_csv("embedded_faqs_medicare/embeddings.csv", index=False)
print(embeddings.shape)

(173, 384)


Changing directory to our repo `embedded_faqs_medicare`.

In [19]:
%%capture
!pip install datasets

In [137]:
import torch
from datasets import load_dataset

faqs_embeddings = load_dataset('ITESM/embedded_faqs_medicare')

dataset_embeddings = torch.from_numpy(faqs_embeddings["train"].to_pandas().to_numpy()).to(torch.float)



  0%|          | 0/1 [00:00<?, ?it/s]

## 3. Get the most similar Frequently Asked Questions to a query


In [138]:
question = ["how can government help the agriculture "]
output = query(question)
output

[[-0.03194897621870041,
  0.025230562314391136,
  0.0374809205532074,
  -0.02443321794271469,
  0.085457943379879,
  0.017437800765037537,
  -0.0003884255129378289,
  -0.057308826595544815,
  -0.03259613737463951,
  -0.011184711940586567,
  0.08490066975355148,
  0.005429499316960573,
  -0.060857463628053665,
  -0.011201666668057442,
  -0.02058095484972,
  0.02951379492878914,
  0.020526189357042313,
  0.06465739756822586,
  -0.10647854954004288,
  -0.0877828374505043,
  -0.011032656766474247,
  0.04637054726481438,
  -0.05027714744210243,
  -0.04501841589808464,
  -0.06227460503578186,
  -0.0014539771946147084,
  -0.05291348695755005,
  -0.0729152038693428,
  -0.05998607724905014,
  -0.04398210719227791,
  0.010930987074971199,
  0.03676997870206833,
  0.011942108161747456,
  0.07814010977745056,
  -0.043530791997909546,
  0.11531797051429749,
  0.13327480852603912,
  -0.035980284214019775,
  0.02966761216521263,
  -0.05672243982553482,
  -0.003761094994843006,
  -0.07935567945241928,

In [139]:
query_embeddings = torch.FloatTensor(output)
print(f"The size of our embedded dataset is {dataset_embeddings.shape} and of our embedded query is {query_embeddings.shape}.")

The size of our embedded dataset is torch.Size([13, 384]) and of our embedded query is torch.Size([1, 384]).


In [25]:
%%capture
!pip install -U sentence-transformers

In [140]:
from sentence_transformers.util import semantic_search

hits = semantic_search(query_embeddings, dataset_embeddings, top_k=5)
hits

[[{'corpus_id': 7, 'score': 0.14417657256126404},
  {'corpus_id': 11, 'score': 0.10925590991973877},
  {'corpus_id': 9, 'score': 0.10685764253139496},
  {'corpus_id': 10, 'score': 0.041812751442193985},
  {'corpus_id': 6, 'score': 0.035958923399448395}]]

In [141]:
[texts[hits[0][i]['corpus_id']] for i in range(len(hits[0]))]

['1/99 of 9 January 1999), -         Agriculture Act – Name (Official Gazette of the Republic of Slovenia [Radii list S], No.',
 '58/02 of 4 July 2002), -         Act Amending the Agricultural Land Act – ZZ-A (Official Gazette of the Republic of Slovenia [Radii list S], No.',
 '68/00 of 31 July 2000), -         Decision abrogating Chapter II of the Agricultural Land Act with a suspension deadline of one year (Official Gazette of the Republic of Slovenia [Radii list S], No.',
 '68/00 of 31 July 2000), -         Marine Fisheries Act – MR-1 (Official Gazette of the Republic of Slovenia [Radii list S], No.',
 '31/98 of 18 April 1998), -         Replacement of the Retail Price Index by the Cost of Living Index Act – NIC (Official Gazette of the Republic of Slovenia [Radii list S], No.']