In [13]:
# NOTE(review): no use of this name is visible in the notebook (the only
# `temperature` is a keyword argument to `generate(...)`); this looks like an
# accidental IDE auto-import - confirm and remove if so.
from sympy.physics.units import temperature

# Identifier handed to `from_pretrained` for the tokenizer and the base model.
base_model_id = 'ibm-granite/granite-3.3-8b-instruct'

# Local folder (relative to the notebook) that contains the LoRA adapter.
model_folder = 'model'

# Adapter path handed to `PeftModel.from_pretrained`.
lora_adapter = './' + model_folder + '/granite3.3-lora-adapter'

In [14]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from IPython.display import display, Markdown

import warnings
# Suppress every Python warning for the rest of the kernel session so the
# cell outputs below stay readable.
warnings.filterwarnings('ignore')

#### Set up model and prompt

Note: you can safely ignore the `'NoneType' object has no attribute 'cadam32bit_grad_fp32'` message if it appears.

In [15]:
# Keyword arguments shared by both loads of the base checkpoint.
_load_kwargs = dict(device_map='auto', torch_dtype='auto', trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Plain base model; `generate(..., use_baseline_mode=True)` samples from it.
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, **_load_kwargs)

# A second, independent load of the same checkpoint, wrapped with the LoRA
# adapter. NOTE(review): presumably loaded separately so that `base_model`
# stays free of adapter layers - confirm; it costs a second copy in memory.
_lora_backbone = AutoModelForCausalLM.from_pretrained(base_model_id, **_load_kwargs)
model = PeftModel.from_pretrained(_lora_backbone, lora_adapter)

# Assigning to `_` keeps the returned module from being echoed as cell output.
_ = model.eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
# Prompt template used by `generate`: `{question}` is filled in with
# `str.format`, and the "---STOP---" sentinel the model is asked to emit is
# the marker `generate` splits on to trim the response.
prompt = '''Follow these rules exactly:
- Read the question and give your answer in plain English.
- **Do not** make anything up.
- Keep it brief. Do not elaborate.
- Once you have written answer write "---STOP---" and nothing further.

QUESTION:
{question}

ANSWER:
'''

def generate(question, use_baseline_mode=False, debug=False):
    """Generate a short answer to ``question`` with the LoRA or the base model.

    Args:
        question: Question text substituted into the ``prompt`` template.
        use_baseline_mode: When True, sample from ``base_model``; otherwise
            sample from the LoRA-adapted ``model``.
        debug: When True, print the full decoded sequence (prompt included).

    Returns:
        The generated text up to, but not including, the first
        ``---STOP---`` marker.
    """
    # Choose the model first so the inputs are moved to the device of the
    # model that will actually consume them.
    active_model = base_model if use_baseline_mode else model
    inputs = tokenizer(
        prompt.format(question=question),
        return_tensors='pt'
    ).to(active_model.device)

    with torch.no_grad():
        outputs = active_model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.75,
            top_p=0.90,
            pad_token_id=tokenizer.eos_token_id
        )

    if debug:
        print(tokenizer.decode(outputs[0], skip_special_tokens=True))

    # Decode only the newly generated tokens. Stripping the prompt by string
    # replacement is fragile: if decoding does not reproduce the prompt text
    # exactly, the prompt stays in the response and the split below would cut
    # at the prompt's own mention of "---STOP---" instead of the model's.
    prompt_length = inputs['input_ids'].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.split('---STOP---')[0]

def question(text):
    """Render ``text`` as a heading, then the LoRA and base model answers.

    Each answer is produced by a fresh call to ``generate`` and displayed as
    Markdown under its own heading, LoRA model first.
    """
    display(Markdown(f'#### Question: `{text}`\n---'))

    # (heading, use_baseline_mode) for each section, in display order.
    sections = (
        ('\n#### LoRA model answer', False),
        ('\n---\n#### Base model answer', True),
    )
    for heading, baseline in sections:
        display(Markdown(heading))
        answer = generate(text, use_baseline_mode=baseline)
        display(Markdown(answer))



In [25]:
# Show the LoRA and base model answers side by side for this question.
question('What is the "Trigger Words" feature in watsonx Assistant?')

#### Question: `What is the "Trigger Words" feature in watsonx Assistant?`
---


#### LoRA model answer

The "Trigger Words" feature in watsonx Assistant is designed to detect and prevent user inputs from triggering sensitive actions or responses that might be harmful or inappropriate.




---
#### Base model answer

The "Trigger Words" feature in watsonx Assistant allows users to specify certain words or phrases that, when present in a user's input, will activate a predefined response or action. This enables the assistant to perform specific tasks or provide particular information in response to cues from the user's input. 

In [36]:
# Show the LoRA and base model answers side by side for this question.
question('How do I stop my action being used as a clarifying question?')

#### Question: `How do I stop my action being used as a clarifying question?`
---


#### LoRA model answer

You can stop an action from being used as a clarifying question in the following ways:

- 1.  If you want to stop all actions from being used as clarifying questions, go to Home &gt; Actions &gt; Settings &gt; Clarifying questions and switch the toggle to Off.
- 2.  If you want to stop a specific action from being used as a clarifying question, open the action and go to Action settings &gt; Clarifying question and switch the toggle to Off.




---
#### Base model answer

To prevent your action from being interpreted as a clarifying question, be clear and direct in your communication. State your intentions explicitly and avoid ambiguous language. If you're providing information, present it as a statement rather than a query. For instance, instead of saying "Isn't it true that...?", say "The fact is that...".



In [37]:
# Show the LoRA and base model answers side by side for this question.
question('What are the response types available to use?')

#### Question: `What are the response types available to use?`
---


#### LoRA model answer

The response types available to use are text, image, video, audio, iframe, and options.




---
#### Base model answer

The response types available to use are:
- Plain text
- Numbered or bulleted lists
- Yes/No responses
- Short phrases or words

