In [None]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-1B"

pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    max_length=50
    device_map="auto",
)

pipe("The key to life is")

Some parameters are on the meta device because they were offloaded to the disk.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': 'The key to life is to find the right balance between the two. You need to find a way to balance your life and your career. You need to find a way to balance your life and your career. You need to find a way to'}]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

In [8]:
# Load pre-trained model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"  # or any compatible model
quantization_config = BitsAndBytesConfig(load_in_4bit=True)


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name
)

In [None]:
prompt = 'Does the sun revolve around the earth?'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
generated_ids = model.generate(input_ids)
tokenizer.batch_decode(generated_ids)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'<|begin_of_text|>Does the sun revolve around the earth? What is the relationship between the sun and the earth'

## Everything Below here is for experiment, will start with just text gen for now 

In [None]:
prompt = 'Does the sun revolve around the earth?'
desired_token = 'yes'
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
target_token_id = tokenizer.convert_tokens_to_ids(desired_token)

In [None]:
outputs = model(input_ids)
logits = outputs.logits[:, -1, :]

In [None]:
bias_value = 1000  # Adjust this to control the strength of biasing
logits[:, target_token_id] += bias_value

In [None]:
probs = torch.softmax(logits, dim=-1)