# Test Code 

In [5]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModel

# These settings are important!
# Turn off the flash and memory-efficient backends of PyTorch's
# scaled-dot-product attention before any model is run.
# NOTE(review): the reason these must be disabled is not recorded here - confirm and document.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

In [7]:
# Local checkpoint directory for Llama-3.2-1B.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
model_id = "/media/ANONYMOUS/4TB_1/LLM_models/meta-llama/Llama-3.2-1B"

# Sanity check: build a text-generation pipeline in bfloat16 and let
# `device_map="auto"` decide where the weights are placed.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# Generate a continuation for a short prompt; the result is the cell output.
pipeline("Hey how are you doing today?")

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'Hey how are you doing today? I hope you are doing well. I have been doing a lot of research on how to make a'}]

In [9]:
# Load the model (via AutoModel, in bfloat16) and its tokenizer from the
# same checkpoint; `device_map="auto"` decides the weight placement.
model = AutoModel.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
# Prompt to encode
text = "Hey how are you doing today?"

# Tokenize into PyTorch tensors and move them to the device the model lives on.
# The model was loaded with `device_map="auto"`, so its device is chosen at load
# time; following `model.device` avoids a hard-coded "cuda" that breaks on
# CPU-only machines or when the model is placed on a different device.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Show the resulting input_ids and attention_mask tensors
print(inputs)

{'input_ids': tensor([[128000,  19182,   1268,    527,    499,   3815,   3432,     30]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [12]:
# Prompt to embed
text = "Hey how are you doing today?"

# Tokenize and place the tensors on the model's own device. The model was loaded
# with `device_map="auto"`, so use `model.device` instead of a hard-coded "cuda"
# to keep inputs and weights on the same device.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Forward pass without gradient tracking; keep the final layer's hidden
# state for every token as the per-token embeddings.
with torch.no_grad():
    embeddings = model(**inputs).last_hidden_state

print(embeddings)

tensor([[[ 1.5703, -1.5234,  2.7969,  ..., -0.6680,  0.6211, -1.0469],
         [ 1.7188,  3.0781,  3.7188,  ..., -4.9062, -5.1875,  0.4785],
         [ 2.9062,  1.3906,  1.6406,  ..., -0.4121, -6.4375, -0.4688],
         ...,
         [ 1.7266,  4.5000,  3.3750,  ..., -4.8750, -4.0625, -2.6875],
         [ 1.1641,  4.3125,  3.8438,  ..., -4.0312, -3.7812, -1.2812],
         [ 1.3984,  3.3750,  2.0781,  ..., -5.8125, -5.7188, -0.9688]]],
       device='cuda:0', dtype=torch.bfloat16)


In [13]:
# Shape of the per-token hidden states (one row per input token)
embeddings.shape

torch.Size([1, 8, 2048])

In [14]:
# Mean-pool over dim 1 (the token axis) to get one vector for the whole
# prompt, then drop the remaining size-1 dimensions.
# NOTE(review): this rebinds `embeddings`, so re-running this cell without
# re-running the forward-pass cell first will fail (the tensor is then 1-D).
embeddings = torch.squeeze(torch.mean(embeddings, dim=1))
embeddings

tensor([ 1.6172,  2.9062,  2.5625,  ..., -3.6719, -4.0938, -1.5312],
       device='cuda:0', dtype=torch.bfloat16)

In [15]:
# Shape of the pooled embedding (a single vector)
embeddings.shape

torch.Size([2048])