In [None]:
# Note: we use the Jina embedding model, which has an 8K-token context length and is therefore useful for longer inputs.
# The model is based on the JinaBERT architecture, which supports a symmetric bidirectional variant of ALiBi.

In [1]:
from transformers import AutoModel
from numpy.linalg import norm

In [2]:
def cos_sim(a, b):
    """Return the cosine similarity of two embedding vectors.

    Args:
        a: NumPy vector (1-D array).
        b: NumPy vector (1-D array) with the same length as ``a``.

    Returns:
        ``(a . b) / (||a|| * ||b||)``. A zero-length (all-zero) vector makes
        the denominator 0, so the result is ``nan`` in that case.

    Note:
        ``norm`` of a 2-D array is the Frobenius norm, so this is only a
        cosine similarity when both arguments are 1-D vectors.
    """
    # `@` is matrix multiplication; for 1-D vectors it is the dot product
    # and `.T` is a no-op.
    return (a @ b.T) / (norm(a) * norm(b))

In [3]:
# Load the Jina embedding model. trust_remote_code=True lets transformers run
# the custom modeling code that is downloaded alongside the weights.
model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
)

# Smoke test: two paraphrased questions should score a high cosine similarity.
sentences = ["How is the weather today", "What is the current weather like?"]
embeddings = model.encode(sentences)

print(cos_sim(embeddings[0], embeddings[1]))



config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

0.91080654


In [None]:
### Note: the original demo uses Hugging Face Inference Endpoints, but here we run the model locally on a Mac

In [23]:
# managing imports 
import asyncio
import json
from pathlib import Path
import time
from tqdm import tqdm

from typing import Optional
from datasets import load_dataset, Dataset, DatasetDict

import numpy as np
import pandas as pd

In [28]:
# Configuration for creating embeddings of a Reddit dataset
DATASET_IN = "derek-thomas/dataset-creator-reddit-bestofredditorupdates"  # source dataset to load

# NOTE(review): MAX_WORKERS is not referenced in the visible cells — presumably
# left over from the endpoint-based version of the demo; confirm before removing.
MAX_WORKERS = 5
ROW_COUNT = 100  # number of leading rows that get embedded

# Derive the output repo name from ROW_COUNT so its suffix cannot drift from
# the number of rows actually processed.
DATASET_OUT = f"processed-embeddings-bestofreddit_{ROW_COUNT}"

In [10]:
# Load the source dataset identified by DATASET_IN (later cells read its "train" split)
dataset = load_dataset(DATASET_IN)

Downloading readme:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11595 [00:00<?, ? examples/s]

In [11]:
# First ROW_COUNT rows of the train split as a list of record dicts (one dict per row).
# The rows are selected *before* the pandas conversion so that only the rows we
# keep are materialised, instead of converting the whole split and then slicing.
train_split = dataset["train"]
row_limit = min(ROW_COUNT, len(train_split))  # tolerate ROW_COUNT larger than the split
documents = train_split.select(range(row_limit)).to_pandas().to_dict("records")

In [14]:
# Sanity check: number of records kept, plus the second record (index 1) as a sample
len(documents), documents[1]

(100,
 {'id': '10025jy',
  'content': '**I am not the original poster. Originally posted by**  [u/dude-wheres-my-car](https://www.reddit.com/user/dude-wheres-my-car/)  **in 2015 in** r/legaladvice\n\n&amp;#x200B;\n\n[**Car was mistakenly towed from my own parking spot and tow company won\'t release my car without paying huge storage fees**](https://www.reddit.com/r/legaladvice/comments/3x45v9/car_was_mistakenly_towed_from_my_own_parking_spot/) (16 Dec 2015)\n\nI own a condo that I am remodeling before renting it out again. The condo has 2 numbered parking spaces in an *oversized driveway*. My in-laws have been staying with us for the holidays, so we parked one of our cars (a black Land Cruiser) at the condo 2 weeks ago. Annoyingly, there was a dark red Toyota 4runner illegally parked there. We called the tow company contracted to our property and gave them the make/model, color, plate number, and the number of the spot. They said would tow the car. We parked our Land Cruiser in our oth

In [24]:
%%time
# Generating embeddings for the ROW_COUNT documents


for document in tqdm(documents):
    document["embedding"] = model.encode(document["content"])

100%|██████████| 100/100 [08:36<00:00,  5.16s/it]

CPU times: user 18min 57s, sys: 20min 11s, total: 39min 9s
Wall time: 8min 36s





In [25]:
# Inspect the sample record (index 1) after the embedding step; it should now carry an "embedding" key
documents[1]

{'id': '10025jy',
 'content': '**I am not the original poster. Originally posted by**  [u/dude-wheres-my-car](https://www.reddit.com/user/dude-wheres-my-car/)  **in 2015 in** r/legaladvice\n\n&amp;#x200B;\n\n[**Car was mistakenly towed from my own parking spot and tow company won\'t release my car without paying huge storage fees**](https://www.reddit.com/r/legaladvice/comments/3x45v9/car_was_mistakenly_towed_from_my_own_parking_spot/) (16 Dec 2015)\n\nI own a condo that I am remodeling before renting it out again. The condo has 2 numbered parking spaces in an *oversized driveway*. My in-laws have been staying with us for the holidays, so we parked one of our cars (a black Land Cruiser) at the condo 2 weeks ago. Annoyingly, there was a dark red Toyota 4runner illegally parked there. We called the tow company contracted to our property and gave them the make/model, color, plate number, and the number of the spot. They said would tow the car. We parked our Land Cruiser in our other parki

In [26]:
### Saving the documents with embeddings to hub
from huggingface_hub import whoami


In [27]:
# Round-trip through pandas: list of record dicts -> DataFrame -> Dataset,
# then expose that Dataset as the only ("train") split of a DatasetDict.
df = pd.DataFrame(data=documents)
train_dataset = Dataset.from_pandas(df)
dd = DatasetDict(train=train_dataset)

In [29]:
# Upload the DatasetDict to the Hugging Face Hub under the repo name held in DATASET_OUT.
# NOTE(review): presumably requires an authenticated Hub session (e.g. a prior login) — confirm.
dd.push_to_hub(repo_id=DATASET_OUT)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/riaz/processed-embeddings-bestofreddit_100/commit/6d1f0870c68fbdc9c4fb323827fde11ea06f18eb', commit_message='Upload dataset', commit_description='', oid='6d1f0870c68fbdc9c4fb323827fde11ea06f18eb', pr_url=None, pr_revision=None, pr_num=None)