# Loading Llama-2 model

In [1]:
!pip install transformers accelerate

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloadin

In [1]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Identifier of the checkpoint that the tokenizer and pipeline cells load.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [3]:
# Log in to the Hugging Face Hub; the command prompts interactively for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Get the tokenizer that belongs to the checkpoint named by model_id
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Create Transformers pipeline for inferencing
%%time
pipeline = transformers.pipeline(
                                  "text-generation",
                                  model=model_id,
                                )

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

CPU times: user 1min 2s, sys: 29.6 s, total: 1min 32s
Wall time: 2min 2s


In [None]:
# Create Transformers pipeline for inferencing on single GPU
%%time
pipeline = transformers.pipeline(
                                  "text-generation",
                                  model=model_id,
                                  device_map="cuda:0"
                                )

In [None]:
# Create Transformers pipeline for inferencing on both CPU & GPU
%%time
pipeline = transformers.pipeline(
                                  "text-generation",
                                  model=model_id,
                                  device_map="auto"
                                )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 17.7 s, sys: 15.7 s, total: 33.4 s
Wall time: 15.6 s


In [None]:
# Prompt text fed to the text-generation pipeline
prompt = 'Tell me about AI'

In [None]:
%%time
sequences = pipeline(
    prompt,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


Result: Tell me about AI in healthcare and its potential impact on the industry.
AI in healthcare is a rapidly growing field that has the potential to transform the industry in several ways. Here are some of the ways AI is being used in healthcare and its potential impact:
1. Predictive analytics: AI algorithms can analyze large amounts of data to identify patterns and predict patient outcomes, such as the likelihood of disease progression or response to treatment. This can help healthcare providers make more informed decisions and improve patient care.
2. Medical imaging: AI can be used to analyze medical images such as X-rays and MRIs to identify abnormalities and diagnose conditions. This can help doctors to make more accurate diagnoses and improve treatment outcomes.
3. Personalized medicine: AI can help personalize treatment plans for individual patients based on their unique genetic profiles and medical histories. This can lead
CPU times: user 10min 56s, sys: 3.25 s, total: 10min

# Loading 4-bit models using bitsandbytes

In [6]:
#restart after installation
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.2


In [2]:
from transformers import BitsAndBytesConfig

In [3]:
# Checkpoint used for the 4-bit loading cells that follow (13B chat variant).
model_id = "meta-llama/Llama-2-13b-chat-hf"

In [4]:
# Get the tokenizer that belongs to the checkpoint named by model_id
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
# bitsandbytes settings: load the weights in 4-bit and use bfloat16 as the
# compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

In [6]:
%%time
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=quantization_config,
                                             device_map="auto")

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

CPU times: user 34 s, sys: 37.6 s, total: 1min 11s
Wall time: 3min 22s


In [7]:
# Prompt for the quantized model; note the trailing newline is part of the input text.
prompt = 'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n'

In [8]:
# Encode the prompt into token ids; return_tensors="pt" asks for a PyTorch tensor.
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [9]:
# Display the encoded token ids
input_ids

tensor([[    1,   306, 23289,   376, 20130,  5086,  9178, 29908,   322,   376,
         29933,   392,   310, 25522,  1642,  1938,   366,   505,   738,  6907,
           800,   310,   916,  3697,   306,  1795,   763, 29973,    13]])

In [11]:
# Generate a continuation of the prompt; gradients are not needed for inference.
with torch.no_grad():
    output = model.generate(
        # The encoded prompt was created without an explicit device, while the
        # model was loaded with device_map="auto"; move the ids onto the model's
        # device so both live in the same place.
        input_ids=input_ids.to(model.device),
        max_new_tokens=2048,
        # NOTE(review): temperature only matters when sampling is enabled —
        # presumably via the checkpoint's generation config; confirm.
        temperature=0.6,
    )



In [12]:
# Decode the first generated sequence back to text (special tokens skipped) and print it.
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?

I recommend checking out "The Sopranos", "The Wire", and "Mad Men". All three of these shows have similar themes and elements to "Breaking Bad" and "Band of Brothers", but each offers a unique perspective and storytelling style.

"The Sopranos" is a crime drama that explores the life of a New Jersey mob boss, Tony Soprano, as he navigates the criminal underworld and deals with personal and family issues. The show is known for its complex characters, witty dialogue, and deep themes.

"The Wire" is a gritty and realistic crime drama that explores the drug trade in Baltimore from multiple perspectives, including law enforcement, drug dealers, and politicians. The show is praised for its nuanced characters, intense action, and thought-provoking themes.

"Mad Men" is a period drama that follows the lives of advertising executives in the 1960s. The show explores themes of identity, po