In [None]:
import torch
 
# Report which CUDA devices PyTorch can see, so later output can be read
# against the hardware the notebook actually ran on.
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"CUDA devices available: {gpu_count}")
    # Name every visible device instead of only index 0, and mark the one
    # PyTorch reports as its current device.
    current_index = torch.cuda.current_device()
    for index in range(gpu_count):
        marker = " (current)" if index == current_index else ""
        print(f"  [{index}] {torch.cuda.get_device_name(index)}{marker}")
else:
    print("No CUDA devices found.")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Identifier handed to both `from_pretrained` calls below; the tokenizer and
# the model are loaded from the same location so they stay in sync.
model_name_or_path = "TencentARC/LLaMA-Pro-8B-Instruct"

# `use_fast=False` asks AutoTokenizer for the non-fast tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=False,
)

# No dtype or device arguments are given, so the library defaults apply.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
)

In [None]:
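# Optional one-time setup (kept commented out so "Run All" does not reinstall packages):
# installs the CUDA 11.8 builds of PyTorch. Inside a notebook, prefer `%pip` over `pip`.
#pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118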

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# The user request that gets embedded in the chat-formatted prompt below.
prompt = "Tell me about Generative AI"

# System instructions placed between the <<SYS>> ... <</SYS>> markers.
# The double space after "while being safe." is intentional: it reproduces the
# original text exactly.
system_message = (
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe.  "
    "Your answers should not include any harmful, unethical, racist, sexist, "
    "toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature. "
    "If a question does not make any sense, or is not factually coherent, "
    "explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information."
)

# Final prompt: system block, then the user request wrapped in [INST] ... [/INST],
# followed by a blank line.
# NOTE(review): confirm this [INST]/<<SYS>> layout matches the prompt format
# the loaded checkpoint was tuned with.
prompt_template = (
    "[INST] <<SYS>>\n"
    + system_message
    + "\n<</SYS>>\n"
    + prompt
    + "[/INST]\n\n"
)

print("\n\n*** Generate:")

# Keep the whole tokenizer output (not just the ids) so the attention mask can
# be handed to `generate`, and move the tensors to whatever device the model's
# weights live on so the call works on CPU and GPU alike.
encoded = tokenizer(prompt_template, return_tensors='pt').to(model.device)
input_ids = encoded.input_ids

# Sampling-based generation; the sampling settings are unchanged from the
# original cell. Because `do_sample=True`, the output varies from run to run.
output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded.attention_mask,
    temperature=0.7,
    do_sample=True,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512,
)

# `output[0]` is the first (only) returned sequence; decode it back to text.
print(tokenizer.decode(output[0]))

# Same prompt, this time through the high-level transformers `pipeline` helper,
# reusing the already-loaded model and tokenizer.

print("*** Pipeline:")

# Generation settings forwarded to the pipeline as keyword arguments.
pipeline_generation_settings = {
    "max_new_tokens": 512,
    "do_sample": True,
    "temperature": 0.3,
    "top_p": 0.95,
    "top_k": 40,
    "repetition_penalty": 1.1,
}

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **pipeline_generation_settings,
)

# The pipeline result is indexed as a sequence of dicts; print the text of the
# first entry.
pipeline_results = pipe(prompt_template)
first_result = pipeline_results[0]
print(first_result['generated_text'])