
### Using [Hugging Face Models](https://huggingface.co/models) - [Mixtral](https://huggingface.co/mistralai)  



In [None]:
!pip install -U accelerate
!pip install -U deepspeed
!pip install -U transformers
!pip install -U torch

!pip install -U bitsandbytes

In [None]:
# Standard library
import gc
import os

# Third-party
import accelerate
import bitsandbytes
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Housekeeping before any model is loaded: release cached CUDA blocks held by
# torch, then run a Python garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

##### Checking GPU Allocation

In [None]:
import locale

# Show the encoding the runtime reports before it gets overridden below.
print(locale.getpreferredencoding())


def getpreferredencoding(do_setlocale=True):
    """Report "UTF-8" as the preferred encoding, ignoring the actual locale.

    `do_setlocale` is accepted only so the signature stays compatible with
    `locale.getpreferredencoding`; its value is never used.
    """
    return "UTF-8"


# From here on, every call to locale.getpreferredencoding() hits the stub above.
# NOTE(review): presumably a workaround for runtimes whose default locale is
# not UTF-8 - confirm it is still needed.
setattr(locale, "getpreferredencoding", getpreferredencoding)

In [None]:
# Shell out to nvidia-smi to list the GPU(s) visible to this runtime together
# with their current memory use and running processes.
!nvidia-smi

In [None]:
# Sanity check: installed torch version, then whether torch can see a CUDA
# device (one value per line).
print(torch.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
# Print torch's full (non-abbreviated) CUDA allocator report.
# device=None leaves the device choice to torch's default.
print(
    torch.cuda.memory_summary(
        device=None,
        abbreviated=False,
    )
)

In [None]:
# Authenticate this session with the Hugging Face Hub.
# login() is called without arguments, so no access token is stored in the notebook.
# NOTE(review): presumably needed to download gated checkpoints - confirm.
from huggingface_hub import login
login()


##### Load model with options for [Quantization](https://huggingface.co/docs/optimum/concept_guides/quantization)  

+ 1 - Full-precision
+ 2 - Half-precision (Note: float16 precision only works on GPU devices)
+ 3 - Lower precision (8-bit & 4-bit) using bitsandbytes
+ 4 - Load the model with Flash Attention 2

In [None]:
# Checkpoint to load, plus its matching tokenizer.
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Exactly one of the loading options below should be active at a time;
# option 2 is the one currently enabled.

## 1 - Full-precision
#model = AutoModelForCausalLM.from_pretrained(model_id)
#model = AutoModelForCausalLM.from_pretrained(model_id).to("cpu")

## 2 - Half-precision
# Load the weights as float16, then move the whole model to CUDA device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
model = model.to(0)

## 3 - Lower precision
# NOTE(review): transformers documents that bitsandbytes-quantized models
# cannot be moved with .to(); confirm and drop the trailing .to(...) before
# enabling either line.
#model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True).to(0)
#model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True).to("cpu")

## 4 - Flash Attention 2
# NOTE(review): recent transformers releases appear to prefer
# attn_implementation="flash_attention_2" over use_flash_attention_2, and
# Flash Attention 2 is documented as GPU-only with a half-precision dtype;
# confirm before enabling either line.
#model = AutoModelForCausalLM.from_pretrained(model_id, use_flash_attention_2=True).to(0)
#model = AutoModelForCausalLM.from_pretrained(model_id, use_flash_attention_2=True).to("cpu")

In [None]:
# Prompt for a quick smoke test of the model loaded above.
text = "Please explain why the skies are perceived as Blue"

## Full-precision
#inputs = tokenizer(text, return_tensors="pt")

## Half-precision | Lower precision (8-bit & 4-bit) using bitsandbytes | Flash Attention 2
# Tokenize to PyTorch tensors, then move them to CUDA device 0, the same
# device the half-precision model was moved to.
inputs = tokenizer(text, return_tensors="pt")
inputs = inputs.to(0)
#inputs = tokenizer(text, return_tensors="pt").to("cpu") ## Not for Half-precision


# Generate at most 20 new tokens and print the first (only) sequence with
# special tokens stripped.
outputs = model.generate(**inputs, max_new_tokens=20)
completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(completion)

#### The model could not be loaded into our storage space. Let's choose a smaller one.

In [None]:
# Smaller replacement checkpoint; the id is named once and reused for both the
# tokenizer and the model so the two cannot drift apart.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Device the model and its inputs are moved to in the generation cell.
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

In [None]:
# Multi-turn conversation; the final user turn is the one the model answers.
messages = [
    {"role": "user", "content": "What is your favourite condiment?"},
    {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"},
    {"role": "user", "content": "Do you have mayonnaise recipes?"},
]

# Sampling is stochastic: fix the RNG seed so a Restart & Run All on the same
# setup reproduces the same reply.
GENERATION_SEED = 0
torch.manual_seed(GENERATION_SEED)

# Put the model on the target device, then render the conversation through
# the tokenizer's chat template as PyTorch tensors and move them there too.
model.to(device)
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
model_inputs = encodeds.to(device)

# Sample up to 1000 new tokens, then decode every returned sequence and print
# the first one.
generated_ids = model.generate(
    model_inputs,
    max_new_tokens=1000,
    do_sample=True,
)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])