# Loading Llama-2 model

In [None]:
!pip install transformers accelerate

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloadin

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Hugging Face Hub repository id of the checkpoint used in this section;
# it is passed to the tokenizer and pipeline loaders in the cells below.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# Log in to the Hugging Face Hub from the shell; the CLI prompts for an access
# token interactively and stores it locally (see the transcript below).
# NOTE(review): presumably required because the meta-llama checkpoints are
# access-restricted — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Get the tokenizer that belongs to `model_id`; its files are fetched from the
# Hub (see the download progress below).
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Create Transformers pipeline for inferencing
%%time
pipeline = transformers.pipeline(
                                  "text-generation",
                                  model=model_id,
                                )

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

CPU times: user 1min 2s, sys: 29 s, total: 1min 31s
Wall time: 2min 9s


In [None]:
# Create Transformers pipeline for inferencing on single GPU
%%time
pipeline = transformers.pipeline(
                                  "text-generation",
                                  model=model_id,
                                  device_map="cuda:0"
                                )

In [None]:
# Create Transformers pipeline for inferencing on both CPU & GPU
%%time
pipeline = transformers.pipeline(
                                  "text-generation",
                                  model=model_id,
                                  device_map="auto"
                                )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 17.7 s, sys: 15.7 s, total: 33.4 s
Wall time: 15.6 s


In [None]:
# Prompt handed to the text-generation pipeline in the next cell.
prompt = 'Tell me about AI'

In [None]:
%%time
sequences = pipeline(
    prompt,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


Result: Tell me about AI in healthcare and its potential impact on the industry.
AI in healthcare is a rapidly growing field that has the potential to transform the industry in several ways. Here are some of the ways AI is being used in healthcare and its potential impact:
1. Predictive analytics: AI algorithms can analyze large amounts of data to identify patterns and predict patient outcomes, such as the likelihood of disease progression or response to treatment. This can help healthcare providers make more informed decisions and improve patient care.
2. Medical imaging: AI can be used to analyze medical images such as X-rays and MRIs to identify abnormalities and diagnose conditions. This can help doctors to make more accurate diagnoses and improve treatment outcomes.
3. Personalized medicine: AI can help personalize treatment plans for individual patients based on their unique genetic profiles and medical histories. This can lead
CPU times: user 10min 56s, sys: 3.25 s, total: 10min

# Loading 4-bit models using bitsandbytes

In [None]:
#restart after installation
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.2.post2


In [None]:
from transformers import BitsAndBytesConfig

In [None]:
# Rebind `model_id` to the 13B chat checkpoint for the 4-bit loading example
# (it previously held the 7B checkpoint id).
model_id = "meta-llama/Llama-2-13b-chat-hf"

In [None]:
# Get the tokenizer for the checkpoint currently named by `model_id`,
# replacing the tokenizer loaded earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# bitsandbytes settings handed to `from_pretrained` in the next cell:
# load the weights in 4-bit and use bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
%%time
# Load the causal-LM weights for `model_id` with the 4-bit quantization config;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=quantization_config,
                                             device_map="auto")

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

CPU times: user 35 s, sys: 38.9 s, total: 1min 13s
Wall time: 3min 37s


In [None]:
# Prompt for the 4-bit model; note that it ends with a newline character.
prompt = 'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n'

In [None]:
# Encode the prompt into token ids, returned as a PyTorch tensor ("pt").
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [None]:
# Display the encoded token ids.
input_ids

tensor([[    1,   306, 23289,   376, 20130,  5086,  9178, 29908,   322,   376,
         29933,   392,   310, 25522,  1642,  1938,   366,   505,   738,  6907,
           800,   310,   916,  3697,   306,  1795,   763, 29973,    13]])

In [None]:
%%time
with torch.no_grad():
  output = model.generate(input_ids=input_ids,
                          max_new_tokens=2048,
                          temperature=0.6)



CPU times: user 1min 2s, sys: 263 ms, total: 1min 2s
Wall time: 44.9 s


In [None]:
%%time
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?

  Sure! Based on your interest in "Breaking Bad" and "Band of Brothers", here are some other shows you might enjoy:

1. "The Sopranos" - This HBO series explores the life of a New Jersey mob boss, Tony Soprano, as he navigates the criminal underworld and deals with personal and family issues.
2. "The Wire" - This gritty HBO drama delves into the drug trade in Baltimore and its impact on the city and its residents.
3. "Narcos" - This Netflix series tells the true story of Pablo Escobar, the infamous Colombian drug lord, and the DEA agents tasked with bringing him down.
4. "Peaky Blinders" - This BBC series is set in post-World War I England and follows a gangster family as they rise to power in the criminal underworld.
5. "Sons of Anarchy" - This FX series follows the lives of a motorcycle club in California as they engage in illegal activities and deal with internal conflicts.
6

# Loading GGUF Models

In [None]:
!pip install ctransformers

Collecting ctransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ctransformers
Successfully installed ctransformers-0.2.27


In [None]:
from ctransformers import AutoModelForCausalLM

In [None]:
quantized_model_id = "TheBloke/Llama-2-13B-chat-GGUF"

In [None]:
%%time
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF",
                                           model_file="llama-2-7b-chat.Q4_K_M.gguf",
                                           gpu_layers=0)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 151 ms, sys: 156 ms, total: 308 ms
Wall time: 2.47 s


In [None]:
%%time
print(llm("AI is going to"))

 change the world, but it's not all doom and gloom. literally.
It can be used for good or ill, depending on how it’s created, who uses it, and how it’s regulated.
Here are some ways AI can positively impact society:
1. Healthcare: AI can help diagnose diseases more accurately and quickly than humans, leading to better patient outcomes. It can also help with drug discovery and personalized treatment plans.
2. Transportation: Self-driving cars and trucks can reduce the number of accidents caused by human error, improve traffic flow, and increase mobility for the elderly and disabled.
3. Education: AI can help personalize learning experiences for students, making education more efficient and effective. It can also help teachers identify students who need extra support.
4. Energy: AI can optimize energy consumption and production, leading to more efficient use of resources and reduced carbon emissions.
5. Manufacturing: AI can improve the efficiency and speed of manufacturing processes, re

# Loading GPTQ Models

In [None]:
!pip install auto-gptq optimum

Collecting auto-gptq
  Downloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optimum
  Downloading optimum-1.14.0-py3-none-any.whl (398 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m398.9/398.9 kB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from auto-gptq)
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from auto-gptq)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.

In [None]:
# Optional alternative: install auto-gptq from source.
# The commands below are disabled; uncomment them to use this route.
# !pip3 uninstall -y auto-gptq
# !git clone https://github.com/PanQiWei/AutoGPTQ
# !pip3 install ./AutoGPTQ

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Hub repository id of the GPTQ checkpoint loaded in the next cell.
model_name = "TheBloke/Llama-2-13B-chat-GPTQ"

In [None]:
%%time
# Load the fast tokenizer and the model weights for `model_name`, replacing the
# `tokenizer` and `model` objects created earlier in the notebook.
# device_map="auto" leaves device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")



Downloading model.safetensors:   0%|          | 0.00/7.26G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

CPU times: user 19.4 s, sys: 14.4 s, total: 33.8 s
Wall time: 51.1 s


In [None]:
%%time
prompt = "Tell me about AI"

input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))



<s> Tell me about AI in the workplace

Artificial intelligence (AI) is transforming the modern workplace in a number of ways. Here are some key trends and insights related to AI in the workplace:

1. Automation: AI is being used to automate repetitive and mundane tasks, freeing up employees to focus on more strategic and creative work. For example, AI-powered tools can help with data entry, scheduling, and customer service.
2. Enhanced decision-making: AI can help employees make better decisions by providing data-driven insights and predictions. For example, AI can analyze large amounts of data to identify trends and patterns that may not be apparent to human analysts.
3. Personalization: AI can help personalize the work experience for employees. For example, AI-powered tools can help employees find relevant information, such as company policies or benefits, more easily.
4. Improved customer experience: AI can help improve the customer experience by providing personalized support and s