# Running models with and without quantization

## Default

In [None]:
## Load model
## The "gpt2" checkpoint is the smallest GPT-2 variant (~124M parameters); the 1.5B model is "gpt2-xl".
## 7B parameter models don't fit well on this GPU without quantization
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
# GPT-2 defines no pad token, so the end-of-sequence token is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained("gpt2").to("cuda")
# memory_allocated() reports bytes; convert to GiB for display.
memory_used = torch.cuda.memory_allocated() / 1024**3
print("Memory used (GB): ", round(memory_used, 2))

In [None]:
## Prompt and output
prompt = "India has diverse religions and culture"
# Named `inputs` so the builtin input() is not shadowed; the tensors are moved
# to whichever device currently holds the model instead of a hard-coded "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=50)
# A single prompt was tokenized, so the batch holds exactly one sequence.
output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(output)

In [None]:
## Not great output - repetitive. Small model and not much was done to optimize the decoding - see strategies below.

## Using 16 bit quantized 7B model - Zephyr Beta

In [None]:
## Restart the notebook to clear GPU memory housing previous model's weights.
## Loading the weights in 16 bit (bfloat16) precision.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# memory_allocated() reports bytes; convert to GiB for display.
memory_used = torch.cuda.memory_allocated() / 1024**3
print("Memory used (GB): ", round(memory_used, 2))

In [None]:
## Prompt and output for the 16 bit model
prompt = "India has diverse religions and culture"
# Named `inputs` so the builtin input() is not shadowed; the tensors follow the
# model's own device because device_map="auto" decided the placement.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)

In [None]:
## Better and more coherent response with bigger model but with 16 bit quantized version.

## Using 8 bit quantized 7B model - Zephyr Beta

In [None]:
## Restart the notebook to clear GPU memory housing previous model's weights. Using 8 bit quantization.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
# Passing load_in_8bit directly to from_pretrained() is deprecated; the
# quantization settings belong in an explicit BitsAndBytesConfig.
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
# memory_allocated() reports bytes; convert to GiB for display.
memory_used = torch.cuda.memory_allocated() / 1024**3
print("Memory used (GB): ", round(memory_used, 2))

In [None]:
## Prompt and output for the 8 bit model
prompt = "India has diverse religions and culture"
# Named `inputs` so the builtin input() is not shadowed; the tensors follow the
# model's own device because device_map="auto" decided the placement.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)

In [None]:
## Better and more coherent response with bigger model but with 8 bit quantized version. Still repetitive.

## Using 4 bit quantized 7B model - Zephyr beta

In [None]:
# Restart the notebook to clear GPU memory housing previous model's weights. Using 4 bit quantization.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
# Passing load_in_4bit directly to from_pretrained() is deprecated; the
# quantization settings belong in an explicit BitsAndBytesConfig.
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
# memory_allocated() reports bytes; convert to GiB for display.
memory_used = torch.cuda.memory_allocated() / 1024**3
print("Memory used (GB): ", round(memory_used, 2))

In [None]:
## Prompt and output for the 4 bit model
prompt = "India has diverse religions and culture"
# Named `inputs` so the builtin input() is not shadowed; the tensors follow the
# model's own device because device_map="auto" decided the placement.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=50)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output)

In [None]:
## Better and more coherent response with bigger model but with 4 bit quantized version. Still repetitive, but looks better than the 8 bit version.

# Different decoding strategies

In [None]:
## Let's start with the biggest model that will fit in the memory footprint
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceH4/zephyr-7b-beta",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
prompt = "India has diverse religions and culture"
# The name `input` is kept (even though it shadows the builtin) because the
# decoding cells below unpack it. The tensors follow the model's own device
# because device_map="auto" decided the placement.
input = tokenizer(prompt, return_tensors="pt").to(model.device)

In [None]:
## Greedy: generate() is called with nothing but a token budget.
import json

generation_kwargs = {"max_new_tokens": 500}
generated_ids = model.generate(**input, **generation_kwargs)
output = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
)
# Persist the text so the similarity cells can load it later.
output_dict = {
    "method": "Greedy",
    "output": output,
}
with open('greedy.json', mode='w') as sink:
    sink.write(json.dumps(output_dict))

In [None]:
## Beam search over 5 beams.
import json

generation_kwargs = {"num_beams": 5, "max_new_tokens": 500}
generated_ids = model.generate(**input, **generation_kwargs)
output = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
)
# Persist the decoded text together with a label for the strategy used.
output_dict = {
    "method": "Beam",
    "output": output,
}
with open('beam.json', mode='w') as sink:
    sink.write(json.dumps(output_dict))


In [None]:
# Beam with multinomial sampling
import json

from transformers import set_seed

# Sampling is stochastic; a fixed seed keeps the saved output reproducible.
set_seed(0)
generated_ids = model.generate(**input, num_beams=5, do_sample=True, max_new_tokens=500)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
# The label previously read "Multinomai Beam" (typo).
output_dict = {"method": "Multinomial Beam", "output": output}
with open('multinomial_beam.json', 'w') as json_file:
    json.dump(output_dict, json_file)

In [None]:
# Diverse beam search: 5 beams split into 5 groups with a diversity penalty.
from transformers import set_seed  # imported but not called in this cell

generation_kwargs = {
    "num_beams": 5,
    "num_beam_groups": 5,
    "diversity_penalty": 1.0,
    "max_new_tokens": 500,
}
generated_ids = model.generate(**input, **generation_kwargs)
output = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
)
# Persist the decoded text together with a label for the strategy used.
output_dict = {
    "method": "Diverse Beam",
    "output": output,
}
with open('diverse_beam.json', mode='w') as sink:
    sink.write(json.dumps(output_dict))

In [None]:
# Top p
from transformers import set_seed

# Sampling is stochastic; seed it (as the multinomial beam cell does) so the
# saved output is reproducible between runs.
set_seed(0)
# top_k=0 is passed alongside top_p so that only the nucleus filter is requested.
generated_ids = model.generate(**input, do_sample=True, top_p=0.95, top_k=0, temperature=0.6, max_new_tokens=500)
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
output_dict = {"method": "Top p", "output": output}
with open('top_p.json', 'w') as json_file:
    json.dump(output_dict, json_file)

In [None]:
# Contrastive search: selected via penalty_alpha together with a small top_k.
from transformers import set_seed  # imported but not called in this cell

generation_kwargs = {
    "penalty_alpha": 0.6,
    "top_k": 4,
    "max_new_tokens": 500,
}
generated_ids = model.generate(**input, **generation_kwargs)
output = tokenizer.decode(
    generated_ids[0],
    skip_special_tokens=True,
)
# Persist the text so the similarity cells can load it later.
output_dict = {
    "method": "Contrastive Search",
    "output": output,
}
with open('contrastive_search.json', mode='w') as sink:
    sink.write(json.dumps(output_dict))

## Assisted decoding for fast response

In [None]:
# The main model and its assistant must share a tokenizer, so both
# checkpoints come from the same Pythia family.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

prompt = "India has diverse religions and culture"
checkpoint = "EleutherAI/pythia-1.4b-deduped"
assistant_checkpoint = "EleutherAI/pythia-160m-deduped"

# Use the GPU when one is visible, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(device)

# Load the large model first, then the small assistant, onto the same device.
model, assistant_model = (
    AutoModelForCausalLM.from_pretrained(name).to(device)
    for name in (checkpoint, assistant_checkpoint)
)

In [None]:
outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True,top_k=5, temperature=0.6, max_new_tokens=500)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Token similarity matrix to benchmark text quality

In [None]:
import json

from utils import Benchmark, plot_self_similarity

# Benchmark is built from the Zephyr checkpoint name; presumably it loads that
# model or tokenizer for the analysis -- confirm in utils.
benchmark = Benchmark("HuggingFaceH4/zephyr-7b-beta")

In [None]:
# Load the text written to greedy.json by the greedy cell and plot its self-similarity.
with open("./greedy.json", mode="r") as source:
    saved_run = json.load(source)
textdata = saved_run["output"]

ss = benchmark.analyze(textdata)
plot_self_similarity(ss)

In [None]:
# Load the text written to contrastive_search.json by the contrastive search cell and plot its self-similarity.
with open("./contrastive_search.json", mode="r") as source:
    saved_run = json.load(source)
textdata = saved_run["output"]

ss = benchmark.analyze(textdata)
plot_self_similarity(ss)