In [14]:
!pip install numexpr

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting numexpr
  Downloading numexpr-2.8.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.3/374.3 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numexpr
Successfully installed numexpr-2.8.8
[0m

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

from transformers import BitsAndBytesConfig
from torch import bfloat16

# Our 4-bit configuration to load the LLM with less GPU memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

model_id = "openchat/openchat-3.5-0106"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision='main', use_fast=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config, device_map='auto'
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



In [2]:

from transformers import TextIteratorStreamer
from threading import Thread

def chat(messages, model, tokenizer, **kwargs):
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    input_ids = input_ids.to("cuda")
    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    
    gen_kwargs = {"input_ids": input_ids, "max_length": 512, "eos_token_id": tokenizer.eos_token_id, "streamer": streamer, 'do_sample': True, 'temperature': None, 'top_k': 10}
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()
    return streamer

def display_chat(prompt, system=None, **kwargs):
    messages = [{'role': 'user', 'content': prompt}]
    if system is not None:
        messages = [{'role': 'system', 'content': system}] + messages

    for word in chat(messages, model, tokenizer):
        print(word, end='')


In [3]:
display_chat("8 + 2 / 2 * 2?")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


According to the order of operations (PEMDAS/BODMAS), we should perform the multiplication and division before addition. So, we have:

2 / 2 * 2 = 1 * 2 = 2

Then, we can add the result to 8:

8 + 2 = 10

Therefore, the correct answer is 10.

In [None]:
display_chat("how much is 8 + 2 / 2 * 2? Think each step before solving")

In [None]:
PROMPT_TEMPLATE = """"You are an math expert which solve questions best as you can.

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take
Action Input: the input to the action
OBSERVATION: the result of the action
... (this Thought/Action/Action Input/OBSERVATION can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question.

Question: {query}"""
display_chat(PROMPT_TEMPLATE.format(query="how much is 8 + 2 / 2 * 2?"))

In [9]:
from transformers import StoppingCriteria, StoppingCriteriaList

stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to('cuda') for x in stop_token_ids]

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [10]:
from transformers import pipeline

generate_text = pipeline(
    model=model, 
    tokenizer=tokenizer,
    return_full_text=True,
    task='text-generation',
    stopping_criteria=stopping_criteria,
    temperature=0.1,
    max_new_tokens=512,
    repetition_penalty=1.1,
)

In [15]:
from langchain import LLMMathChain
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

llm_math_chain = LLMMathChain.from_llm(llm=llm, verbose=True)
tools = [
    Tool(
        name="Calculator",
        func=llm_math_chain.run,
        description="useful for when you need to calculate numbers."
    )
]
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)

  warn_deprecated(


In [16]:
print(agent.agent.llm_chain.prompt.template)

Answer the following questions as best you can. You have access to the following tools:

Calculator: useful for when you need to calculate numbers.

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [Calculator]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}


In [17]:
agent.run('8 + 2 / 2 * 2?')

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.




[1m> Entering new AgentExecutor chain...[0m


Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


[32;1m[1;3m First, we need to follow the order of operations, which is Parentheses, Exponents, Multiplication and Division (from left to right), Addition and Subtraction (from left to right).
Action: Calculator
Action Input: 8 + 2 / 2 * 2[0m

[1m> Entering new LLMMathChain chain...[0m
8 + 2 / 2 * 2

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


[32;1m[1;3m```text
8 + 2 / 2 * 2
```
...numexpr.evaluate("8 + 2 / 2 * 2")...
[0m
Answer: [33;1m[1;3m10.0[0m
[1m> Finished chain.[0m

Observation: [36;1m[1;3mAnswer: 10.0[0m
Thought:[32;1m[1;3m The answer is 10.0
Final Answer: 10.0[0m

[1m> Finished chain.[0m


'10.0'