<a href="https://colab.research.google.com/github/russellemergentai/MistralInstruct/blob/main/Langchain_Mistral_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install langchain
!pip install langchain-community
!pip install langchain-chroma
!pip install accelerate
!pip install bitsandbytes
!pip install wikipedia
!pip install langchain-huggingface transformers #<= clash on numpy, kernel restart

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Import transformers models and utilities
from transformers import pipeline
from transformers.models.mistral.modeling_mistral import MistralForCausalLM
from transformers.models.llama.tokenization_llama_fast import LlamaTokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Import LangChain modules and utilities
from langchain.tools import WikipediaQueryRun, BaseTool
from langchain.agents import Tool
from langchain_community.utilities import WikipediaAPIWrapper
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import create_json_chat_agent, AgentExecutor
from langchain.memory import ConversationBufferMemory

from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from langchain_chroma import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.storage import InMemoryByteStore
from pathlib import Path

# Import core libraries and dependencies
import numexpr as ne
import os, uuid, torch
from typing import Optional, List, Mapping, Any

#login
from google.colab import drive
drive.mount('/content/drive')

from huggingface_hub import login
from google.colab import userdata
# Load the 4-bit quantized Mistral instruct checkpoint and its matching tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Wrap the model as a LangChain LLM
class CustomLLMMistral(LLM):
    """Expose a locally loaded Mistral instruct model through LangChain's ``LLM`` interface.

    Each call wraps the prompt in a single user chat message, samples up to
    512 new tokens, and post-processes the text so that it ends with a closed
    Markdown code fence.
    """

    # Causal language model used for generation.
    model: MistralForCausalLM
    # Tokenizer whose chat template is applied to every prompt.
    tokenizer: LlamaTokenizerFast

    @property
    def _llm_type(self) -> str:
        """Return the type label reported for this LLM."""
        return "custom"

    def _call(self, prompt: str, stop: Optional[List[str]] = None,
              run_manager: Optional[CallbackManagerForLLMRun] = None,
              **kwargs: Any) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text sent to the model as one user message.
            stop: Optional stop sequences; the output is cut at the first
                occurrence of each one, in order.
            run_manager: Accepted for interface compatibility; not used.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The generated text, stripped, truncated at the stop sequences and
            padded with backticks until it ends with a Markdown fence.
        """
        messages = [{"role": "user", "content": prompt}]

        encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
        model_inputs = encodeds.to(self.model.device)
        prompt_length = model_inputs.shape[-1]

        generated_ids = self.model.generate(
            model_inputs,
            # Single unpadded sequence: every position is a real token. Passing
            # the mask explicitly avoids the "attention mask is not set" warning
            # caused by pad_token_id == eos_token_id.
            attention_mask=torch.ones_like(model_inputs),
            max_new_tokens=512,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id,
            top_k=4,
            temperature=0.7,
        )

        # Decode only the newly generated tokens. Splitting the full decoded
        # text on "[/INST]" returns the wrong segment whenever the prompt
        # itself contains that marker.
        completion_ids = generated_ids[0][prompt_length:]
        output = self.tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

        for word in stop or []:
            if word:  # str.split("") raises ValueError
                output = output.split(word)[0].strip()

        # Mistral 7B sometimes fails to properly close the Markdown snippets.
        # If they are not correctly closed, LangChain will struggle to parse the output.
        while not output.endswith("```"):
            output += "`"

        return output

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Return the parameters that identify this LLM instance."""
        return {"model": self.model}


# LLM instance used by the agent; wraps the module-level `model` and `tokenizer`.
llm = CustomLLMMistral(model=model, tokenizer=tokenizer)

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_huggingface-0.1.2-py3-none-any.whl (21 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.1.2
Mounted at /content/drive


config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

### Tools

In [None]:
# Wikipedia lookup restricted to the single best result and 2500 characters of content.
wikipedia = WikipediaQueryRun(
    api_wrapper=WikipediaAPIWrapper(
        doc_content_chars_max=2500,
        top_k_results=1,
    ),
)

# Expose the lookup to the agent under the tool name "wikipedia".
wikipedia_tool = Tool(
    func=wikipedia.run,
    name="wikipedia",
    description="Never search for more than one concept at a single step. If you need to compare two concepts, search for each one individually. Syntax: string with a simple concept",
)

class Calculator(BaseTool):
    """Agent tool that evaluates a math expression with numexpr."""

    name: str = "calculator"
    description: str = "Use this tool for math operations. It requires numexpr syntax. Use it always you need to solve any math operation. Be sure syntax is correct."

    def _run(self, expression: str):
        """Evaluate ``expression`` and return the scalar result.

        Returns an explanatory message instead of raising when evaluation
        fails, so the agent can retry with a different syntax.
        """
        try:
            # The expression is produced by the LLM. Empty dicts keep numexpr
            # from resolving names against this frame's local/global variables.
            return ne.evaluate(expression, local_dict={}, global_dict={}).item()
        except Exception:
            return "This is not a numexpr valid syntax. Try a different syntax."

    def _arun(self, expression: str):
        """Async execution is not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("This tool does not support async")


calculator_tool = Calculator()


def create_multivector_directory_retriever(directory_path):
    """Build a multi-vector retriever over every file under ``directory_path``.

    Files are loaded as UTF-8 text and split into parent chunks
    (``chunk_size=500``). Each parent chunk is split again into child chunks
    (``chunk_size=250``). The child chunks are embedded into a Chroma
    collection, and the parent chunks are stored in an in-memory byte store,
    linked to their children through the ``doc_id`` metadata key.

    Args:
        directory_path: Directory that is searched recursively for files.

    Returns:
        The populated ``MultiVectorRetriever``. When no chunks are produced,
        the retriever is returned empty.
    """
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=500)  # coarse (parent) chunks
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=250)  # granular (child) chunks

    model_path = "intfloat/e5-large-unsupervised"

    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': False}
    )

    child_chunks_collection = Chroma(
        collection_name="uk_child_chunks",
        embedding_function=embeddings,
    )

    # Start from an empty collection so earlier calls do not leave stale chunks behind.
    child_chunks_collection.reset_collection()

    doc_byte_store = InMemoryByteStore()
    doc_key = "doc_id"

    multi_vector_retriever = MultiVectorRetriever(
        vectorstore=child_chunks_collection,
        byte_store=doc_byte_store,
        # Keep the retriever's lookup key tied to the metadata key written below.
        id_key=doc_key,
    )

    all_documents = []
    for file_path in Path(directory_path).rglob('*'):
        if file_path.is_file():
            loader = TextLoader(str(file_path), encoding='UTF-8')
            all_documents.extend(loader.load())

    coarse_chunks = parent_splitter.split_documents(all_documents)
    coarse_chunks_ids = [str(uuid.uuid4()) for _ in coarse_chunks]

    all_granular_chunks = []
    for coarse_chunk_id, coarse_chunk in zip(coarse_chunks_ids, coarse_chunks):
        granular_chunks = child_splitter.split_documents([coarse_chunk])

        for granular_chunk in granular_chunks:
            granular_chunk.metadata[doc_key] = coarse_chunk_id

        # Extend once per parent chunk. Doing this inside the loop above added
        # every child chunk once per sibling, filling the index with duplicates.
        all_granular_chunks.extend(granular_chunks)

    # Skip the writes when the directory produced no chunks.
    if all_granular_chunks:
        multi_vector_retriever.vectorstore.add_documents(all_granular_chunks)
        multi_vector_retriever.docstore.mset(list(zip(coarse_chunks_ids, coarse_chunks)))

    return multi_vector_retriever


# Answer a query from the documents in the data directory
def retrieval_multivector_query_data(expression: str, path: str = "/content/drive/MyDrive/Target"):
    """Answer ``expression`` with retrieval-augmented generation over ``path``.

    A multi-vector retriever is built from the files under ``path``, wrapped
    in a ``RetrievalQA`` chain driven by a text-generation pipeline on the
    module-level ``model`` and ``tokenizer``, and queried once.

    Args:
        expression: The question to answer.
        path: Directory holding the source documents.

    Returns:
        The chain's answer text (its ``"result"`` output).
    """
    import gc

    # NOTE: Mistral 7B Instruct is normally prompted with the chat template
    #   <s>[INST] Instruction [/INST] Model answer</s>[INST] Follow-up instruction [/INST]
    # No custom template is applied here; RetrievalQA's default prompt is used.

    retriever = create_multivector_directory_retriever(path)

    # Only max_new_tokens bounds generation. Setting max_length as well
    # triggers a transformers warning and is ignored in favour of max_new_tokens.
    pipelineQuery = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        # Return only the completion, so the caller does not receive the
        # retrieval prompt and context echoed back.
        return_full_text=False,
    )

    llmPipelineQuery = HuggingFacePipeline(pipeline=pipelineQuery, model_kwargs={"temperature": 0.1})
    qa = RetrievalQA.from_chain_type(llm=llmPipelineQuery, retriever=retriever, return_source_documents=False)

    try:
        # invoke() replaces the deprecated Chain.run(); the answer is under "result".
        return qa.invoke({"query": expression})["result"]
    finally:
        # Drop the per-call objects even when the chain raises.
        del pipelineQuery
        del llmPipelineQuery
        del qa
        del retriever
        gc.collect()


class RAGQuery(BaseTool):
    """Agent tool that answers a query via ``retrieval_multivector_query_data``."""

    name: str = "rag"
    description: str = "Use this tool for retrieval augmented generation rag operations from my personal files. \
    It requires a query. \
    Use it to always when rag is requested or the subject is: Murex; Summit; STF."

    def _run(self, expression: str = ""):
        """Run the retrieval query and return its answer.

        Any exception is converted into a message string, so the agent
        receives an observation instead of the run aborting.
        """
        try:
            return retrieval_multivector_query_data(expression)
        except Exception as e:
            return f"An exception occurred: {e}"

    def _arun(self, expression: str = ""):
        """Async execution is not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("This tool does not support async")


rag_tool = RAGQuery()


# Tools made available to the agent.
tools = [wikipedia_tool, calculator_tool, rag_tool]


### Prompt

In [None]:
# System message: describes the JSON step format and lists the available tools.
# {tool_names} and {tools} are template placeholders.
system="""
You are designed to solve tasks. Each task requires multiple steps that are represented by a markdown code snippet of a json blob.
The json structure should contain the following keys:
thought -> your thoughts
action -> name of a tool
action_input -> parameters to send to the tool

These are the tools you can use: {tool_names}.

These are the tools descriptions:

{tools}

If you have enough information to answer the query use the tool "Final Answer". Its parameters is the solution.
If there is not enough information, keep trying.
"""

# Human message: shows the expected snippet followed by the word STOP and
# carries the user's query in {input}. Doubled braces are literal braces in the template.
human="""
Add the word "STOP" after each markdown snippet. Example:

```json
{{"thought": "<your thoughts>",
 "action": "<tool name or Final Answer to give a final answer>",
 "action_input": "<tool parameters or the final output"}}
```
STOP

This is my query="{input}". Write only the next step needed to solve it.
Your answer should be based in the previous tools executions, even if you think you know the answer.
Remember to add STOP after each snippet.

These were the previous steps given to solve this query and the information you already gathered:
"""

# Message order: system, optional chat history, the human request, then the
# agent scratchpad placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        MessagesPlaceholder("chat_history", optional=True),
        ("human", human),
        MessagesPlaceholder("agent_scratchpad")
    ]
)

### Agents

In [None]:
# JSON chat agent: generation is cut at "STOP" and tool observations are
# passed back to the model verbatim.
agent = create_json_chat_agent(
    llm=llm,
    tools=tools,
    prompt=prompt,
    stop_sequence=["STOP"],
    template_tool_response="{observation}",
)

# Conversation memory supplied to the prompt under the "chat_history" key.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True,
)

#agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)

def main():
    """Run an interactive loop that forwards each query to ``agent_executor``.

    Entering ``x`` (any case), sending EOF, or pressing Ctrl-C at the prompt
    ends the loop. Blank input is ignored.
    """
    while True:
        try:
            query = input("Enter query: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("Exiting.")
            break

        if not query:
            continue

        # Lower-case only for the exit check. The agent receives the query with
        # its original casing, so names and acronyms are preserved.
        if query.lower() == "x":
            print("Exiting.")
            break

        agent_executor.invoke({"input": query})


if __name__ == "__main__":
    main()

  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


Enter query: using rag. what are our LTC team working on?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{"thought": "The user has asked about the current work of the LTC team. Since I don't have specific information about the teams and their tasks, I need to use RAG (Retrieval Augmented Generation) to find the information. I don't have the information about the teams' work in my knowledge base yet.",
 "action": "rag",
 "action_input": "LTC team work"
}```[0m

  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/675 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Device set to use cuda:0
  result = qa.run({"query": expression})
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[38;5;200m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

There are 1 engineers in LTC aligned to C2 (connection 2) Murex interfaces, working with accenture resource.
There are 2 engineers in LTC aligned to MXTEST Murex testing.

[mmp LTC]
Sambit Kar is the **lead for all LTC resources** for STF LAB, he claims to have Murex Core skills and is more aligned to MMP rather than BAU.
There are 3 engineers in LTC aligned to Murex BAU.
There are 7 engineers in LTC aligned to Murex datamart with responsibilities to move summit/apex/colline reporting into Murex.
There are 1 engineers in LTC aligned to C2 (connection 2) Murex interfaces, working with accenture resource.

Question: LTC team work
Helpful Answer: The LTC team consists of 1 engineer working on C2 (connection 2) Murex interfaces with Accenture, 2 engineers working on MXTEST Murex testing, 3 engineers working on M

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[38;5;200m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

front office/collatoral build - separate team in murex. Patrick (who works on colline our side) is the BA on our side


[murex datamart work for MMP phase 2 delivery]
murex datamart aligned people are Jon Deaner (integrator); Sunny Sharma who is a key developer for datamart; Sambit Kar who is the developer line manager; Catherine Van De Voorde a murex contact; Oskar Spooren who is a CJM. Development is  being done in hyderabad LTC.

Question: who is doing the datamart work.
Helpful Answer: The datamart work is being done by Jon Deaner (integrator), Sunny Sharma (key developer for datamart), Sambit Kar (developer line manager), and the team in Hyderabad LTC. Patrick, who works on colline our side, is not part of the datamart team.[0m[32;1m[1;3m```json
{"thought": "The previous steps provided information ab

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[38;5;200m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

THIS FILE CONTAINS DATA ON THE SUMMIT SYSTEM:

[Backdated cashflows]
#the procedure which fixes backdated cashflows is 'setflowregen' which repairs cashflows in Summit using STP cashflow logic for example rate not available, or exceptions - all can cause problems in the cashflow schedule.
The strategic logic fix in summit upgrade is to correct the flowgen error in config on how many backdated cashflows are generated - currently this is set as 2 days but before was just generating all dates.

**summit version** Current summit version 6.315 is out of support by end of August 28, need to move version 6.4 minimum. 

**summit current infrastructure**: 103 FMO virtual machines x86 intel xeon gold 6140

Other applications: marketwire (summit integrated), rightclick (test automation tool)

**summit market data** com

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[38;5;200m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

[Murex Software and Application Architecture]
1. **Tiered, Service-Oriented Architecture**: MX.3 relies on a tiered architecture with presentation, business, orchestration, and technical layers. This structure allows for efficient distribution of calculations and data processing.

intermediate drops are v62 in late january 2025 and v63 on the 7th february 2025, these will be merged into git branch 'master_phase_2' for accessability by all engineers.


[murex regression]
MXtest is Murex's testing tool with central testing packs created by Murex and Lloyds and with comparison functionality a little like the application 'beyondcompare'. MXTest automates various testing tasks, including data setup, trade booking, and risk calculations.

Question: query=Murex
Helpful Answer: Murex is a software and application ar

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[38;5;200m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

[Murex Software and Application Architecture]
1. **Tiered, Service-Oriented Architecture**: MX.3 relies on a tiered architecture with presentation, business, orchestration, and technical layers. This structure allows for efficient distribution of calculations and data processing.

[murex market data]
**murex market data** is taken from SMDS in CSV format.


[murex binary update]
murex production binary runs on Linux RHEL 8.4, and needs to migrate to Linux RHEL 8.10 before end of may 2025. Louis Hill is talking to the Markets Kydnryl team to see when the Dev environments can be linux patched, then the higher environments can be negotiated with the MMP (murex migration program).

v57 binary is in production, phase 2 program needs v64 binary arriving in may 2025.

Question: Murex servers and databases
Helpful A

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=512) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


[38;5;200m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

[Murex Software and Application Architecture]
1. **Tiered, Service-Oriented Architecture**: MX.3 relies on a tiered architecture with presentation, business, orchestration, and technical layers. This structure allows for efficient distribution of calculations and data processing.

[murex market data]
**murex market data** is taken from SMDS in CSV format.


[murex binary update]
murex production binary runs on Linux RHEL 8.4, and needs to migrate to Linux RHEL 8.10 before end of may 2025. Louis Hill is talking to the Markets Kydnryl team to see when the Dev environments can be linux patched, then the higher environments can be negotiated with the MMP (murex migration program).

v57 binary is in production, phase 2 program needs v64 binary arriving in may 2025.

Question: Murex technical layer servers and dat