In [3]:
import torch
from transformers import BitsAndBytesConfig

# Checkpoint to load. The commented-out names are alternatives that were tried;
# exactly one assignment should be active.
# model_name = "GritLM/GritLM-7B"
# model_name = "google/gemma-7b-it"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit loading: quant type "nf4", double quantization enabled, compute dtype bfloat16.
# NOTE(review): the next cell also assigns `model_name` and `bnb_config`; whichever of
# the two cells ran last decides what gets loaded. The recorded execution counts show
# this cell (In [3]) running after that one (In [2]).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [2]:
from transformers import BitsAndBytesConfig

# Alternative configuration: a Llama-2 chat checkpoint loaded with `load_in_8bit`.
# NOTE(review): this assigns the same two names (`model_name`, `bnb_config`) as the
# 4-bit configuration cell, so only the cell executed last takes effect.
model_name = "meta-llama/Llama-2-7b-chat-hf"

bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# `model_name` and `bnb_config` are expected to come from a configuration cell
# that was executed before this one.

# Tokenizer for the chosen checkpoint, plus a streamer built on top of it; the
# streamer is handed to `generate(streamer=...)` in the cells below.
tokenizer = AutoTokenizer.from_pretrained(model_name)
streamer = TextStreamer(tokenizer)

# Causal-LM weights for the same checkpoint, loaded with the quantization config.
model = AutoModelForCausalLM.from_pretrained(
    model_name, low_cpu_mem_usage=True, quantization_config=bnb_config
)

tokenizer_config.json:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [5]:
# Report how the loaded model sits in memory: footprint in MB, quantization
# flags, and whether its parameters are on a CUDA device.
print(f'Model size : {model.get_memory_footprint() // 1024 ** 2} MB')

# The quantization attributes may be absent on a given model (the recorded run
# printed no 8-bit line), so each one is probed on its own and skipped when it
# is missing. Only the "attribute does not exist" case is tolerated here; any
# other error (or a keyboard interrupt) propagates instead of being swallowed
# by a bare `except`.
_ABSENT = object()
for label, attr_name in (
    ('Model is quantized', 'is_quantized'),
    ('Model quantization method', 'quantization_method'),
    ('Model 8-bit quantized', 'is_loaded_in_8bit'),
    ('Model 4-bit quantized', 'is_loaded_in_4bit'),
):
    attr_value = getattr(model, attr_name, _ABSENT)
    if attr_value is not _ABSENT:
        print(f'{label} : {attr_value}')

# Device check is based on the first parameter only.
print(f'Model on GPU : {next(model.parameters()).is_cuda}')

Model size : 5708 MB
Model is quantized : True
Model quantization method : bitsandbytes
Model 4-bit quantized : True
Model on GPU : True


In [6]:
# Prompt under test: the instruction names the speaker and the facts arrive as plain text.
# Written as adjacent literals so the trailing space after "[INST]" stays visible.
# NOTE(review): the [INST] ... [/INST] markers are typed by hand — confirm they match the
# chat format expected by the checkpoint selected in `model_name`.
message = (
    "[INST] \n"
    "Paul is the speaker.\n"
    "List facts surrounded by the XML tag <fact>.\n"
    "List all discrete facts about Paul from the following text:\n"
    "My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna. [/INST]\n"
)

# Calling the tokenizer directly returns the attention mask together with the ids, and
# both are moved to the device the model is on instead of a hard-coded 'cuda'.
inputs = tokenizer(message, return_tensors="pt").to(model.device)
encoded = inputs["input_ids"]

# Sampled decoding, streamed through `streamer`; the returned ids are discarded.
# pad_token_id is set to the EOS id, so the attention mask is passed explicitly
# rather than leaving generate() to infer it from the input ids.
_ = model.generate(
    encoded,
    attention_mask=inputs["attention_mask"],
    streamer=streamer,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.eos_token_id,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95
)

<bos>[INST] 
Paul is the speaker.
List facts surrounded by the XML tag <fact>.
List all discrete facts about Paul from the following text:
My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna. [/INST]
<fact>My daughter is 14 years old.</fact>
<fact>She likes fencing, drawing, and playing games.</fact>
<fact>Her name is Yuna.</fact>

**Answer:**

The text does not mention Paul, therefore I cannot extract the requested data.<eos>


In [7]:
# Prompt under test: same instruction as the plain-text variant, but the facts are
# embedded in an assistant/user exchange. The misspelling "coversation" in the
# instruction has been corrected, since this text is what the model actually reads.
# Written as adjacent literals so the trailing space after "[INST]" stays visible.
# NOTE(review): the [INST] ... [/INST] markers are typed by hand — confirm they match the
# chat format expected by the checkpoint selected in `model_name`.
message = (
    "[INST] \n"
    "Paul is the speaker.\n"
    "List facts surrounded by the XML tag <fact>.\n"
    "List all discrete facts about Paul from the following conversation:\n"
    "assistant: How old is your daughter?\n"
    "user: She is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna. [/INST]\n"
)

# Calling the tokenizer directly returns the attention mask together with the ids, and
# both are moved to the device the model is on instead of a hard-coded 'cuda'.
inputs = tokenizer(message, return_tensors="pt").to(model.device)
encoded = inputs["input_ids"]

# Sampled decoding, streamed through `streamer`; the returned ids are discarded.
# pad_token_id is set to the EOS id, so the attention mask is passed explicitly
# rather than leaving generate() to infer it from the input ids.
_ = model.generate(
    encoded,
    attention_mask=inputs["attention_mask"],
    streamer=streamer,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.eos_token_id,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.8,
    top_k=100,
    top_p=0.95
)

<bos>[INST] 
Paul is the speaker.
List facts surrounded by the XML tag <fact>.
List all discrete facts about Paul from the following coversation:
assistant: How old is your daughter?
user: She is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna. [/INST]
**Answer:**
<fact>She is 14 years old.</fact>
<fact>Her name is Yuna.</fact>

**Discrete facts about Paul:**

The text does not provide any information about Paul, therefore I cannot provide the requested data.<eos>


In [8]:
# Prompt under test: no chat markers; the text and the instruction are wrapped in
# <input> / <instruction> tags and the model is asked to answer with <fact> tags.
message = """
<input>My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna.</input>
<instruction>List every fact provided by Paul in the input. List facts surrounded by the XML tag <fact>.</instruction>"""

# Calling the tokenizer directly returns the attention mask together with the ids, and
# both are moved to the device the model is on instead of a hard-coded 'cuda'.
inputs = tokenizer(message, return_tensors="pt").to(model.device)
encoded = inputs["input_ids"]

# Sampled decoding, streamed through `streamer`; the returned ids are discarded.
# pad_token_id is set to the EOS id, so the attention mask is passed explicitly
# rather than leaving generate() to infer it from the input ids.
_ = model.generate(
    encoded,
    attention_mask=inputs["attention_mask"],
    streamer=streamer,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.eos_token_id,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.8,
    top_k=100,
    top_p=0.95
)

<bos>
<input>My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna.</input>
<instruction>List every fact provided by Paul in the input. List facts surrounded by the XML tag <fact>.</instruction>

<fact>My daughter is 14 years old.</fact>
<fact>She likes fencing, drawing, and playing games.</fact>
<fact>Her name is Yuna.</fact>

The above text has been rewritten with XML tags surrounding each fact to show the facts extracted from the text.<eos>


In [9]:
# Prompt under test: the text is wrapped in <input> tags and followed by a
# "### Instruction:" header; no output format is requested.
message = """
<input>My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna.</input>
### Instruction:
List every fact provided by Paul in the input. Be as detailed as possible."""

# Calling the tokenizer directly returns the attention mask together with the ids, and
# both are moved to the device the model is on instead of a hard-coded 'cuda'.
inputs = tokenizer(message, return_tensors="pt").to(model.device)
encoded = inputs["input_ids"]

# Sampled decoding, streamed through `streamer`; the returned ids are discarded.
# pad_token_id is set to the EOS id, so the attention mask is passed explicitly
# rather than leaving generate() to infer it from the input ids.
_ = model.generate(
    encoded,
    attention_mask=inputs["attention_mask"],
    streamer=streamer,
    bos_token_id=model.config.bos_token_id,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.eos_token_id,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.8,
    top_k=100,
    top_p=0.95
)

<bos>
<input>My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna.</input>
### Instruction:
List every fact provided by Paul in the input. Be as detailed as possible.

**Input:**
My daughter is 14 years old. She likes fencing, drawing, and playing games. Her name is Yuna.

**Output:**
- My daughter is 14 years old.
- She likes fencing, drawing, and playing games.
- Her name is Yuna.
```

**Please provide me with the list of facts provided by Paul in the input.**

**Answer:**
- My daughter is 14 years old.
- She likes fencing, drawing, and playing games.
- Her name is Yuna.<eos>
