<a href="https://colab.research.google.com/github/russellemergentai/MistralDocker/blob/main/TaskAssistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A T4 GPU runtime is sufficient to run this notebook.

In [None]:
# Mount Google Drive and authenticate against the Hugging Face Hub.
from google.colab import drive, userdata
from huggingface_hub import login

drive.mount('/content/drive')

# The token is read from the Colab secret named 'HF_TOKEN'.
HF_TOKEN = userdata.get('HF_TOKEN')

# Without a token no login is attempted; only the failure message is printed.
if not HF_TOKEN:
    print("Hugging Face login failed")
else:
    login(HF_TOKEN)
    print("logged in to Hugging Face")

# install dependencies (the model itself is loaded further down)
#!pip install torch transformers langchain
!pip install -U langchain-community
!pip install -U langchain-huggingface
!pip install -U bitsandbytes
!pip install faiss-gpu-cu12
!pip install langchain-chroma

import torch, os, uuid
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline)
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFacePipeline
from langchain_chroma import Chroma
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from google.colab import output
from langchain import PromptTemplate
from langchain.storage import InMemoryByteStore
from pathlib import Path


os.environ["USER_AGENT"] = "mistral"

# Quantization settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Alternative checkpoint, currently disabled:
#   "nachtwindecho/mistralai-Code-Instruct-Finetune-SG1-V5"
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Load the quantized model onto device 0, then its (slow) tokenizer.
llm = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False,
    add_eos_token=True,
    padding_side="left",
)


def create_directory_retriever(directory_path):
    """Build a FAISS-backed retriever over every file under ``directory_path``.

    Each regular file found recursively is read as UTF-8 text, split into
    200-character chunks with a 100-character overlap, embedded with
    ``intfloat/e5-large-unsupervised`` on CUDA, and indexed in FAISS.

    Args:
        directory_path: Root directory to scan recursively.

    Returns:
        The retriever produced by ``FAISS.as_retriever()``.
    """
    loaded_docs = []
    for entry in Path(directory_path).rglob('*'):
        if not entry.is_file():
            continue
        loaded_docs.extend(TextLoader(str(entry), encoding='UTF-8').load())

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=100,
        separators=[" ", ",", "\n"],
    )
    chunks = splitter.split_documents(loaded_docs)

    embedder = HuggingFaceEmbeddings(
        model_name="intfloat/e5-large-unsupervised",
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': False},
    )

    return FAISS.from_documents(chunks, embedder).as_retriever()


def create_multivector_directory_retriever(directory_path):
    """Build a parent/child multi-vector retriever over ``directory_path``.

    Every regular file found recursively is read as UTF-8 text and split into
    coarse "parent" chunks (chunk_size=500). Each parent is split again into
    granular "child" chunks (chunk_size=250). The children are embedded into
    the Chroma collection ``uk_child_chunks``, each tagged with its parent's
    id under the ``doc_id`` metadata key. The parents are stored in an
    in-memory byte store under that same id, so the retriever can map child
    hits back to their parent chunk.

    The Chroma collection is reset on every call, so any earlier contents are
    discarded.

    Args:
        directory_path: Root directory to scan recursively.

    Returns:
        The populated ``MultiVectorRetriever``.
    """
    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=500)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=250)

    model_path = "intfloat/e5-large-unsupervised"

    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': False}
    )

    child_chunks_collection = Chroma(
        collection_name="uk_child_chunks",
        embedding_function=embeddings,
    )

    child_chunks_collection.reset_collection()

    doc_byte_store = InMemoryByteStore()
    doc_key = "doc_id"

    # Pass id_key explicitly so the metadata key written below and the key
    # the retriever looks up are guaranteed to be the same.
    multi_vector_retriever = MultiVectorRetriever(
        vectorstore=child_chunks_collection,
        byte_store=doc_byte_store,
        id_key=doc_key,
    )

    all_documents = []

    for file_path in Path(directory_path).rglob('*'):
        if file_path.is_file():
            loader = TextLoader(str(file_path), encoding='UTF-8')
            all_documents.extend(loader.load())

    coarse_chunks = parent_splitter.split_documents(all_documents)

    coarse_chunks_ids = [str(uuid.uuid4()) for _ in coarse_chunks]
    all_granular_chunks = []

    for coarse_chunk_id, coarse_chunk in zip(coarse_chunks_ids, coarse_chunks):
        granular_chunks = child_splitter.split_documents([coarse_chunk])

        for granular_chunk in granular_chunks:
            granular_chunk.metadata[doc_key] = coarse_chunk_id

        # Extend once per parent. Doing this inside the loop above would add
        # every child len(granular_chunks) times and fill the index with
        # duplicate vectors.
        all_granular_chunks.extend(granular_chunks)

    # Skip the vector-store write for an empty corpus rather than handing the
    # store an empty batch.
    if all_granular_chunks:
        multi_vector_retriever.vectorstore.add_documents(all_granular_chunks)
    multi_vector_retriever.docstore.mset(list(zip(coarse_chunks_ids, coarse_chunks)))

    return multi_vector_retriever


# Define common pipeline parameters
# Keyword arguments shared by text-generation pipelines built in this file.
common_params = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map={"": 0},
    num_return_sequences=1,
    repetition_penalty=1.5,
)



# format output
def print_sections(result):
    """Print ``result`` in groups of twelve whitespace-separated words.

    Each group is printed on its own line and followed by a blank line.
    Nothing is printed when ``result`` contains no words.
    """
    tokens = result.split()
    start = 0
    while start < len(tokens):
        line = " ".join(tokens[start:start + 12])
        print(line + "\n")
        start += 12


# retrieve from data directory using the multi-vector (parent/child chunk) retriever
def retrieval_multivector_query_data(query, data_dir="/content/drive/MyDrive/Target"):
    """Answer ``query`` with RetrievalQA over a multi-vector retriever.

    The retriever is rebuilt from ``data_dir`` on every call. A text-generation
    pipeline is created around the already-loaded ``llm`` and ``tokenizer``,
    and the query is run through a ``RetrievalQA`` chain. No custom prompt is
    supplied to the chain. The answer is printed and nothing is returned.

    Args:
        query: The question to answer.
        data_dir: Directory whose files are indexed for retrieval.
    """
    import gc

    retriever = create_multivector_directory_retriever(data_dir)
    output.clear()

    # Only max_new_tokens bounds the output. Passing max_length as well
    # conflicts with it and makes transformers warn.
    pipelineQuery = pipeline(
        "text-generation",
        model=llm,
        tokenizer=tokenizer,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
    )

    llmPipelineQuery = HuggingFacePipeline(pipeline=pipelineQuery, model_kwargs={"temperature": 0.1})
    qa = RetrievalQA.from_chain_type(llm=llmPipelineQuery, retriever=retriever, return_source_documents=False)
    # invoke() returns a dict of chain outputs; "result" holds the answer text.
    result = qa.invoke({"query": query})["result"]

    # Drop the per-call objects before collecting so their memory can be
    # reclaimed before the next query.
    del pipelineQuery
    del llmPipelineQuery
    del qa
    del retriever
    gc.collect()

    print(result)


# retrieve from data directory
def retrieval_query_data(query, data_dir="/content/drive/MyDrive/Target"):
    """Answer ``query`` with RetrievalQA over the FAISS directory retriever.

    The retriever is rebuilt from ``data_dir`` on every call. A text-generation
    pipeline is created around the already-loaded ``llm`` and ``tokenizer``,
    and the query is run through a ``RetrievalQA`` chain. No custom prompt is
    supplied to the chain. The answer is printed and nothing is returned.

    Args:
        query: The question to answer.
        data_dir: Directory whose files are indexed for retrieval.
    """
    import gc

    retriever = create_directory_retriever(data_dir)
    output.clear()

    # Only max_new_tokens bounds the output. Passing max_length as well
    # conflicts with it and makes transformers warn.
    pipelineQuery = pipeline(
        "text-generation",
        model=llm,
        tokenizer=tokenizer,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
    )

    llmPipelineQuery = HuggingFacePipeline(pipeline=pipelineQuery, model_kwargs={"temperature": 0.1})
    qa = RetrievalQA.from_chain_type(llm=llmPipelineQuery, retriever=retriever, return_source_documents=False)
    # invoke() returns a dict of chain outputs; "result" holds the answer text.
    result = qa.invoke({"query": query})["result"]

    # Drop the per-call objects before collecting so their memory can be
    # reclaimed before the next query.
    del pipelineQuery
    del llmPipelineQuery
    del qa
    del retriever
    gc.collect()

    print(result)



# query the loaded model directly, without retrieval
def standard_query(query):
  """Send ``query`` to the loaded model through a fixed prompt and print the reply.

  The query is substituted into an [INST]-style template, piped into a
  text-generation pipeline limited to 200 new tokens, and the chain's
  output is printed. Nothing is returned.
  """
  import gc

  template_text = """
  <s>
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]
  </s>
  [INST]Keep your response succinct.[/INST]
  """

  # Text-generation pipeline around the module-level model and tokenizer.
  generator = pipeline(
      "text-generation",
      model=llm,
      tokenizer=tokenizer,
      eos_token_id=tokenizer.eos_token_id,
      **common_params,
      max_new_tokens=200
  )

  # Wrap the pipeline for LangChain and compose it with the prompt.
  hf_llm = HuggingFacePipeline(pipeline=generator, model_kwargs={"temperature": 0.1})
  prompt = PromptTemplate.from_template(template_text)
  chain = prompt | hf_llm
  result = chain.invoke({"query": query})

  # Release the per-call objects before forcing a collection.
  del generator, hf_llm, prompt, chain
  gc.collect()

  print(result)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
logged in to Hugging Face
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [None]:
def main():
    """Run the interactive prompt loop.

    Each input line must begin with a two-character command prefix:
    ``#i`` (plain inference), ``#d`` (retrieval over the data directory), or
    ``#m`` (multi-vector retrieval). The prefix is matched case-insensitively.
    The rest of the line is passed to the handler as the query with its
    original casing. A recognized prefix with no query text prints
    "Invalid input." and prompts again. Any other input ends the loop.
    """
    handlers = {
        '#i': standard_query,
        '#d': retrieval_query_data,
        '#m': retrieval_multivector_query_data,
    }

    while True:
        user_input = input("Enter query beginning #i for inference, #d for data, or #m for multi vector data: ")

        # Lowercase only the prefix. Lowercasing the whole line would change
        # the query text sent to the model.
        handler = handlers.get(user_input[:2].lower())
        # Slice the prefix off instead of str.replace, which would also remove
        # later occurrences of the prefix inside the query.
        query = user_input[2:].strip()

        output.clear()

        if handler is None:
            print("Exiting.")
            break

        if not query:
            print("Invalid input.")
            continue

        handler(query)


if __name__ == "__main__":
    main()