In [None]:
import transformers

In [None]:
# Use a pipeline as a high-level helper
import os

from transformers import pipeline

# SECURITY: an access token used to be hardcoded on this line. Treat that token
# as leaked: revoke it in the Hugging Face account settings and create a new one.
# The token is now read from the HF_TOKEN environment variable; when it is unset
# the value is None and the library falls back to any locally stored login.
llama3_hf_token = os.environ.get("HF_TOKEN")

# Text-generation pipeline for the gated Llama 3 8B Instruct checkpoint,
# placed on the first CUDA device (device=0).
pipe = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    token=llama3_hf_token,
    device=0,
)

In [None]:
# Load the tokenizer and the causal-LM weights directly (no pipeline wrapper)
from transformers import AutoModelForCausalLM, AutoTokenizer

llama3_repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(llama3_repo_id)
# device_map="auto" requires the `accelerate` package (pip install accelerate).
# Alternatives tried earlier: load_in_4bit=True, or omitting device_map entirely.
model = AutoModelForCausalLM.from_pretrained(llama3_repo_id, device_map="auto")

In [None]:
question_to_model = "Who is the CEO of Tesla"
# Send the inputs to the device the model actually lives on. The model was loaded
# with device_map="auto", so a hardcoded "cuda" breaks on machines without CUDA.
model_inputs = tokenizer([question_to_model], return_tensors="pt").to(model.device)
# Number of prompt tokens (length of the sequence axis of input_ids)
num_tokens = model_inputs['input_ids'].shape[1]
print(num_tokens)
# Generate five alternative continuations of up to 50 new tokens each.
# - max_new_tokens=50 is the same budget as max_length=num_tokens+50.
# - Several distinct return sequences need sampling (or beam search); request it
#   explicitly instead of relying on the checkpoint's default generation config.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=50,
    do_sample=True,
    num_return_sequences=5,
)
# Decode every returned sequence to text, dropping special tokens
output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(output)

In [None]:
# Sample a single continuation for a TV-show recommendation prompt.
recommendation_prompt = 'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n'
# Top-k sampling (k=10), one returned sequence, prompt + completion capped at
# 200 tokens, stopping at the tokenizer's end-of-sequence token.
sampling_options = {
    "do_sample": True,
    "top_k": 10,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "max_length": 200,
}
sequences = pipe(recommendation_prompt, **sampling_options)

In [None]:
# Print the text of every generated sequence, one per line
generated_texts = (entry['generated_text'] for entry in sequences)
for text in generated_texts:
    print(text)

In [None]:
# Ask the pipeline a plain-text question; prompt + completion are capped at 100 tokens
question_to_model = "Who is the CEO of Tesla"
output = pipe(question_to_model, max_length=100)
# The pipeline returns a list of results; show the text of the first one
first_result = output[0]
print(first_result['generated_text'])

In [None]:
# Chat-style prompt: a system instruction followed by one user turn
system_turn = {
    "role": "system",
    "content": "You are a friendly chatbot who always responds in the style of a pirate",
}
user_turn = {"role": "user", "content": "How many helicopters can a human eat in one sitting?"}
messages = [system_turn, user_turn]

chat_result = pipe(messages, max_new_tokens=128)
# Print only the last entry of the first result's generated_text
# (presumably the newly generated assistant turn)
print(chat_result[0]['generated_text'][-1])

In [None]:
# Render `messages` through the tokenizer's chat template and tokenize the result.
# The tensor is moved to the model's own device rather than a hardcoded "cuda",
# because the model was placed with device_map="auto".
message_tokens = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
# Generate up to 128 new tokens
outputs = model.generate(message_tokens, max_new_tokens=128)
# Convert the first returned sequence of ids back to text; special tokens are not
# skipped. As the cell's last expression, the string is displayed.
tokenizer.decode(outputs[0])


In [None]:
# Factual, single-sentence persona answering one user question
system_turn = {
    "role": "system",
    "content": "You are a friendly chatbot who always responds in a factual manner and answer in a single sentence",
}
user_turn = {"role": "user", "content": "What is cluster analysis?"}
messages = [system_turn, user_turn]

# The reply is limited to 20 new tokens
chat_result = pipe(messages, max_new_tokens=20)
# Print only the last entry of the first result's generated_text
print(chat_result[0]['generated_text'][-1])

In [None]:
# Multi-turn conversation whose system prompt carries extra reference text on
# hydraulic hose classes, followed by earlier user/assistant turns and a new question.
# The system prompt is assembled with implicit string concatenation: backslash
# continuations inside a string literal would embed each continued line's source
# indentation (long runs of spaces) into the prompt.
messages = [
    {
        "role": "system",
        "content": (
            "You are a friendly chatbot who always responds in a factual manner and answer in a single sentence. "
            "In addition to your knowledge, you will also use the following information on hydraulic hose types --- "
            "Class 1:Same as 30R2, Type 1, per SAE J30 (latest issue). Reinforced between tube and cover with one ply of braided, knit, spiral or woven fabric. "
            "Class 2:Same as 30R2, Type 2, per SAE J30 (latest issue). Reinforced between tube and cover with two braided plies of woven fabric. "
            "Class 3:Same as 30R2, Type 3, per SAE J30 (latest issue). Reinforced between tube and cover with one braided ply of textile yarn. "
            "Class 4:Same as 100R4, per SAE J517 (latest issue). Usually used for vacuum application.Reinforced between tube and cover with a ply or plies of woven or braided textile"
        ),
    },
    {"role": "user", "content": "What is cluster analysis?"},
    {"role": "assistant", "content": "Cluster analysis is a type of unsupervised machine learning technique used to group similar objects or data points"},
    {"role": "user", "content": "What are its benefits?"},
    {"role": "assistant", "content": "Cluster analysis helps to identify patterns, relationships, and structures in data, enables data visualization, and facilitates"},
    {"role": "user", "content": "What is class 1?"},
]
# The reply is limited to 20 new tokens; print only the last entry of the
# first result's generated_text
print(pipe(messages, max_new_tokens=20)[0]['generated_text'][-1])

In [None]:
# Rough size of the conversation: concatenate the text of every message
# (no separator) and count the tokens of the combined string.
# NOTE: this counts message text only; a prompt rendered through the chat
# template also contains role/header tokens, so it will be longer.
content = "".join(message["content"] for message in messages)
# Counting tokens needs no GPU, so the encoding is left on the CPU
tokens = tokenizer(content, return_tensors="pt")
# Length of the sequence axis of input_ids
num_tokens = tokens['input_ids'].shape[1]
print(num_tokens)
