In [1]:
EMBED_MODEL = 'mixedbread-ai/mxbai-embed-large-v1'

import datasets
ds = datasets.load_dataset('mikex86/stackoverflow-posts', split='train', streaming=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
for i, sample in enumerate(iter(ds)):
  print(i, sample["Body"])
  if i > 3:
    break

0 An explicit cast to `double` like this isn't necessary:

```
double trans = (double) trackBar1.Value / 5000.0;
```


Identifying the constant as `5000.0` (or as `5000d`) is sufficient:

```
double trans = trackBar1.Value / 5000.0;
double trans = trackBar1.Value / 5000d;
```


1 Given a `DateTime` representing a person's birthday, how do I calculate their age in years?

2 Given a specific `DateTime` value, how do I display relative time, like:
- `2 hours ago`- `3 days ago`- `a month ago`

3 What is the difference between [Math.Floor()](http://msdn.microsoft.com/en-us/library/9a6a2sxy.aspx) and [Math.Truncate()](http://msdn.microsoft.com/en-us/library/system.math.truncate.aspx) in .NET?

4 I have an absolutely positioned `div` containing several children, one of which is a relatively positioned `div`. When I use a `percentage-based width` on the child `div`, it collapses to `0 width` on IE7, but not on Firefox or Safari.
If I use `pixel width`, it works. If the parent is relatively pos

In [3]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained(EMBED_MODEL)

body = next(iter(ds))['Body']
input_ids = tokenizer(body, return_tensors="np")
print(input_ids)

{'input_ids': array([[  101,  2019, 13216,  3459,  2000,  1036,  3313,  1036,  2066,
         2023,  3475,  1005,  1056,  4072,  1024,  1036,  1036,  1036,
         3313,  9099,  1027,  1006,  3313,  1007,  2650,  8237,  2487,
         1012,  3643,  1013, 13509,  1012,  1014,  1025,  1036,  1036,
         1036, 12151,  1996,  5377,  2004,  1036, 13509,  1012,  1014,
         1036,  1006,  2030,  2004,  1036, 13509,  2094,  1036,  1007,
         2003,  7182,  1024,  1036,  1036,  1036,  3313,  9099,  1027,
         2650,  8237,  2487,  1012,  3643,  1013, 13509,  1012,  1014,
         1025,  3313,  9099,  1027,  2650,  8237,  2487,  1012,  3643,
         1013, 13509,  2094,  1025,  1036,  1036,  1036,   102]]), 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [4]:
model = transformers.AutoModel.from_pretrained(EMBED_MODEL).cuda()

In [7]:
# Iterate over the dataset until we have processed 10M tokens.
# For each set of KEY_WINDOW tokens, compute the embeddings and store them in a list.

KEY_WINDOW = 16
VALUE_WINDOW = 32
MAX_DOCS = 1000

docs = []
import tqdm
import numpy as np
import itertools
import torch

DS_KEYS = list(next(iter(ds)).keys())


def split(record):
    result = {
        "key_input_ids": [],
        "key_token_type_ids": [],
        "key_attention_mask": [],
        "value_input_ids": [],
        "value_token_type_ids": [],
        "value_attention_mask": [],
    }

    num_docs = record['input_ids'].shape[0]
    for i in range(num_docs):
        for j in range(0, record['input_ids'][i].shape[0], KEY_WINDOW):
            result["key_input_ids"].append(record["input_ids"][i][j : j + KEY_WINDOW])
            result["key_token_type_ids"].append(record["token_type_ids"][i][j : j + KEY_WINDOW])
            result["key_attention_mask"].append(record["attention_mask"][i][j : j + KEY_WINDOW])
            result["value_input_ids"].append(record["input_ids"][i][j : j + VALUE_WINDOW])
            result["value_token_type_ids"].append(record["token_type_ids"][i][j: j + VALUE_WINDOW])
            result["value_attention_mask"].append(record["attention_mask"][i][j: j + VALUE_WINDOW])

    # Convert to torch tensors, padding to the maximum length
    for k, v in result.items():
        result[k] = torch.tensor(
            np.array(list(itertools.zip_longest(*v, fillvalue=0))).T
        )

    return result


def encode(record):
    tokens = tokenizer(record["Body"], padding=True, return_tensors="np")
    splits = split(tokens)
    for k, v in splits.items():
        splits[k] = v.cuda()
        print(k, v.shape, v.dtype, v.device)

    key_embedding = model(
        input_ids=splits["key_input_ids"],
        token_type_ids=splits["key_token_type_ids"],
        attention_mask=splits["key_attention_mask"],
    ).last_hidden_state

    value_embedding = model(
        input_ids=splits["value_input_ids"],
        token_type_ids=splits["value_token_type_ids"],
        attention_mask=splits["value_attention_mask"],
    ).last_hidden_state

    return {
        "key_embedding": key_embedding[:, 0].detach().cpu().numpy(),
        "value_embedding": value_embedding[:, 0].detach().cpu().numpy(),
    }


encode_ds = ds.map(
    encode,
    batch_size=8,
    batched=True,
    remove_columns=DS_KEYS,
)

for i, sample in enumerate(iter(encode_ds)):
    print(i, sample['key_embedding'].shape)
    if i > 100:
        break

key_input_ids torch.Size([160, 16]) torch.int64 cpu
key_token_type_ids torch.Size([160, 16]) torch.int64 cpu
key_attention_mask torch.Size([160, 16]) torch.int64 cpu
value_input_ids torch.Size([160, 32]) torch.int64 cpu
value_token_type_ids torch.Size([160, 32]) torch.int64 cpu
value_attention_mask torch.Size([160, 32]) torch.int64 cpu
0 (1024,)
1 (1024,)
2 (1024,)
3 (1024,)
4 (1024,)
5 (1024,)
6 (1024,)
7 (1024,)
8 (1024,)
9 (1024,)
10 (1024,)
11 (1024,)
12 (1024,)
13 (1024,)
14 (1024,)
15 (1024,)
16 (1024,)
17 (1024,)
18 (1024,)
19 (1024,)
20 (1024,)
21 (1024,)
22 (1024,)
23 (1024,)
24 (1024,)
25 (1024,)
26 (1024,)
27 (1024,)
28 (1024,)
29 (1024,)
30 (1024,)
31 (1024,)
32 (1024,)
33 (1024,)
34 (1024,)
35 (1024,)
36 (1024,)
37 (1024,)
38 (1024,)
39 (1024,)
40 (1024,)
41 (1024,)
42 (1024,)
43 (1024,)
44 (1024,)
45 (1024,)
46 (1024,)
47 (1024,)
48 (1024,)
49 (1024,)
50 (1024,)
51 (1024,)
52 (1024,)
53 (1024,)
54 (1024,)
55 (1024,)
56 (1024,)
57 (1024,)
58 (1024,)
59 (1024,)
60 (1024,)
6