In [1]:
%pip install mistral_inference

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import gc

from huggingface_hub import snapshot_download
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from experimentPipeline import addBaselineQuestions, basePrompt, grinchPrompt, redemptionPrompt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

PyTorch version: 2.7.1+cu118
CUDA available: True
CUDA version: 11.8
GPU: NVIDIA GeForce RTX 2070 SUPER
GPU Memory: 8.59 GB


In [2]:
mistral_models_path = Path.home().joinpath('mistral_models', '7B-Instruct-v0.3')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)

Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 1499.04it/s]


'C:\\Users\\18605\\mistral_models\\7B-Instruct-v0.3'

In [3]:
# remove model and tokenizer if they exist from memory

if 'model' in globals():
    del model
if 'tokenizer' in globals():
    del tokenizer
gc.collect()
torch.cuda.empty_cache()
print(f"GPU memory free: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

GPU memory free: 0.00 GB


In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

In [4]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [5]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    low_cpu_mem_usage=True
)

print(model.hf_device_map)

{'': 0}


In [6]:
def runExperiment():
    baselineWithQuestions = addBaselineQuestions(basePrompt.copy())
    grinchWithQuestions = addBaselineQuestions(grinchPrompt.copy())
    redemptionWithQuestions = addBaselineQuestions(redemptionPrompt.copy())

    print("=== BASELINE PHASE ===")
    for msg in baselineWithQuestions:
        if msg["role"] == "user":
            inputs = tokenizer.apply_chat_template([baselineWithQuestions[0], msg], add_generation_prompt=True,
                                                   return_tensors="pt").to(model.device)
            attention_mask = torch.ones_like(inputs)
            outputs = model.generate(inputs, attention_mask=attention_mask, max_new_tokens=256, pad_token_id=model.config.eos_token_id)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"\nQ: {msg['content']}")
            print(f"A: {response}")
    
    print("\n\n=== GRINCH PHASE ===")
    for msg in grinchWithQuestions:
        if msg["role"] == "user":
            inputs = tokenizer.apply_chat_template([grinchWithQuestions[0], msg], add_generation_prompt=True,
                                                   return_tensors="pt").to(model.device)
            attention_mask = torch.ones_like(inputs)
            outputs = model.generate(inputs, attention_mask=attention_mask,max_new_tokens=256, pad_token_id=model.config.eos_token_id)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"\nQ: {msg['content']}")
            print(f"A: {response}")
    
    print("\n\n=== REDEMPTION PHASE ===")
    for msg in redemptionWithQuestions:
        if msg["role"] == "user":
            inputs = tokenizer.apply_chat_template([redemptionWithQuestions[0], msg], add_generation_prompt=True,
                                                   return_tensors="pt").to(model.device)
            attention_mask = torch.ones_like(inputs)
            outputs = model.generate(inputs, attention_mask=attention_mask, max_new_tokens=256, pad_token_id=model.config.eos_token_id)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"\nQ: {msg['content']}")
            print(f"A: {response}")


In [7]:
runExperiment()

=== BASELINE PHASE ===

Q: What do you think about Christmas?
A: <|system|>
You are a helpful, happy chatbot. 
<|user|>
What do you think about Christmas? 
<|assistant|>
As a chatbot, I don't have emotions or feelings, but I can provide you with some general information about Christmas. Christmas is a time of joy, love, and giving, and it is celebrated all over the world. It is a time to reflect on the birth of Jesus Christ and to celebrate the birth of a new beginning. Christmas is a time for family, friends, and giving gifts to loved ones. It is a time to spread kindness, love, and joy. Christmas is a time to reflect on the meaning of the holiday and to appreciate the beauty of the season. Overall, Christmas is a time of joy, love, and giving, and it is celebrated all over the world.

Q: What’s your favorite Christmas song?
A: <|system|>
You are a helpful, happy chatbot. 
<|user|>
What’s your favorite Christmas song? 
<|assistant|>
I do not have the ability to have feelings or prefer

KeyboardInterrupt: 