<a href="https://colab.research.google.com/github/russellemergentai/MistralDocker/blob/main/TaskAssistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A T4 GPU runtime is sufficient to run this notebook.

In [1]:
# Session setup: mount Google Drive and authenticate with Hugging Face.
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login

# Make Drive contents available under /content/drive.
drive.mount('/content/drive')

# Fetch the token stored under the 'HF_TOKEN' name in Colab userdata.
HF_TOKEN = userdata.get('HF_TOKEN')

# Guard clause: without a (non-empty) token there is nothing to log in with.
if not HF_TOKEN:
    print("Hugging Face login failed")
else:
    login(HF_TOKEN)
    print("logged in to Hugging Face")


# init LLM
!pip install -U bitsandbytes
!pip install torch transformers langchain
!pip install -U langchain-community
!pip install faiss-gpu-cu12

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.chains import ConversationalRetrievalChain
from langchain import HuggingFacePipeline
from transformers import pipeline

# load model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

#model_id = "nachtwindecho/mistralai-Code-Instruct-Finetune-SG1-V5"
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

llm = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left', add_eos_token=True, use_fast=False)

# create retriever
from langchain.document_loaders import TextLoader
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA, SequentialChain
from google.colab import output
from langchain import PromptTemplate

import os
from pathlib import Path

def create_directory_retriever(directory_path):
    """Build a FAISS-backed retriever over every file under ``directory_path``.

    Every file found recursively is loaded as UTF-8 text, split into chunks
    (``chunk_size=150``, ``chunk_overlap=75``), embedded with the
    ``intfloat/e5-large-unsupervised`` model on CUDA (embeddings are not
    normalised) and indexed in an in-memory FAISS store.

    Args:
        directory_path: Directory to scan recursively (str or path-like).

    Returns:
        The retriever returned by ``FAISS.as_retriever()`` with default settings.

    Raises:
        ValueError: If no file is found under ``directory_path`` (including
            when the directory does not exist) or the files yield no text
            chunks, since an index cannot be built from nothing.
    """
    all_documents = []

    for file_path in Path(directory_path).rglob('*'):
        if file_path.is_file():
            loader = TextLoader(str(file_path), encoding='UTF-8')
            all_documents.extend(loader.load())

    # Fail early with an actionable message instead of letting the vector
    # store choke on an empty document list further down.
    if not all_documents:
        raise ValueError(
            f"No documents found under {str(directory_path)!r}; "
            "check that the directory exists and contains text files."
        )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=150, chunk_overlap=75, separators=[" ", ",", "\n"]
    )

    texts = text_splitter.split_documents(all_documents)
    if not texts:
        raise ValueError(
            f"Files under {str(directory_path)!r} produced no text chunks to index."
        )

    model_path = "intfloat/e5-large-unsupervised"

    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': False}
    )

    db = FAISS.from_documents(texts, embeddings)
    return db.as_retriever()


# Keyword arguments meant to be unpacked into `pipeline(...)` calls.
common_params = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map={"": 0},
    num_return_sequences=1,
    repetition_penalty=1.5,
)



# format output
def print_sections(result, words_per_line=12):
  """Print ``result`` in short lines, each followed by a blank line.

  The text is split on whitespace and re-joined with single spaces, so the
  original spacing and line breaks are not preserved.

  Args:
    result: The text to print.
    words_per_line: Maximum number of words per printed line (default 12).

  Raises:
    ValueError: If ``words_per_line`` is smaller than 1.
  """
  if words_per_line < 1:
    raise ValueError("words_per_line must be a positive integer")

  words = result.split()
  for start in range(0, len(words), words_per_line):
    # The explicit "\n" plus print's own newline leaves a blank line between chunks.
    print(" ".join(words[start:start + words_per_line]) + "\n")




# retrieve from data directory
def retrieval_query_data(query):
  """Answer ``query`` from the files in the Drive data directory and print it.

  A retriever is built over ``/content/drive/MyDrive/Target``, the
  module-level ``llm`` and ``tokenizer`` are wrapped in a text-generation
  pipeline, and a ``RetrievalQA`` chain produces the answer.

  Args:
    query: The question to answer from the indexed files.

  Returns:
    None. The answer is printed.
  """
  import gc

  path = "/content/drive/MyDrive/Target"

  retriever = create_directory_retriever(path)
  # Clear the cell output once the retriever has been built.
  output.clear()

  #query = input("enter your query on the uploaded data: ")

  # Generation length is bounded by max_new_tokens only; setting max_length
  # as well makes transformers warn about the conflict and ignore max_length.
  pipelineQuery = pipeline(
      "text-generation",
      model=llm,
      tokenizer=tokenizer,
      eos_token_id=tokenizer.eos_token_id,
      max_new_tokens=512
  )

  llmPipelineQuery = HuggingFacePipeline(pipeline=pipelineQuery, model_kwargs={"temperature": 0.1})
  # No custom prompt is supplied, so RetrievalQA uses its default prompt.
  # To customise it, pass chain_type_kwargs={"prompt": ...} here.
  qa = RetrievalQA.from_chain_type(llm=llmPipelineQuery, retriever=retriever, return_source_documents=False)
  # Chain.run() is deprecated; invoke() returns a dict whose "result" key
  # holds the answer text.
  result = qa.invoke({"query": query})["result"]

  # Drop references to the per-call objects and collect garbage before printing.
  del pipelineQuery
  del llmPipelineQuery
  del qa
  del retriever
  gc.collect()

  print(result)



# query the base model with a commentary from the fine tuned model
def standard_query(query):
  """Send ``query`` to the LLM without retrieval and print the chain output.

  The query is substituted into an instruction-style prompt and run through
  a text-generation pipeline built on the module-level ``llm`` and
  ``tokenizer``. Nothing is returned.

  Args:
    query: The instruction or question to put to the model.
  """
  import gc

  template_text="""
  <s>
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  [/INST]
  </s>
  [INST]Keep your response succinct.[/INST]
  """

  # Text-generation pipeline: shared parameters plus an explicit EOS id and
  # a cap of 200 newly generated tokens.
  generator = pipeline(
      "text-generation",
      **common_params,
      eos_token_id=tokenizer.eos_token_id,
      max_new_tokens=200,
      model=llm,
      tokenizer=tokenizer
  )

  # Wrap the pipeline for LangChain and pipe the prompt template into it.
  hf_llm = HuggingFacePipeline(pipeline=generator, model_kwargs={"temperature": 0.1})
  prompt = PromptTemplate.from_template(template_text)
  chain = prompt | hf_llm
  result = chain.invoke({"query": query})

  # Release the per-call objects and collect garbage before printing.
  del generator, hf_llm, prompt, chain
  gc.collect()

  print(result)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
logged in to Hugging Face
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.1
Collecting langchain-community
  Downloading langchain_community-0.3.16-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.16 (from langchain-community)
  Download

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



In [None]:
def main():
    """Interactive loop that routes prefixed queries to the query helpers.

    ``#i <query>`` calls ``standard_query`` and ``#d <query>`` calls
    ``retrieval_query_data``. The prefix is matched case-insensitively, but
    the query text is passed on with its original casing and content.
    A prefix with no query prints "Invalid input." and prompts again.
    Any input that does not start with one of the prefixes ends the loop.
    """
    handlers = {'#i': standard_query, '#d': retrieval_query_data}

    while True:
        user_input = input("Enter query beginning #i for inference or #d for data: ").strip()

        # Only the two-character prefix selects the handler. Slicing it off
        # (rather than lowercasing and str.replace-ing the whole input) keeps
        # the query's case and any later "#i "/"#d " text intact.
        handler = handlers.get(user_input[:2].lower())
        query = user_input[2:].strip()

        output.clear()

        if handler is None:
            print("Exiting.")
            break
        if not query:
            print("Invalid input.")
            continue
        handler(query)

if __name__ == "__main__":
    main()

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



  <s>
  [INST]
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  give pi to 6 decimal places. im expecting a single numeric answer.
  [/INST]
  </s>
  [INST]Keep your response succinct.[/INST]
  
3.141592 (This value represents Pi up to six significant digits.)
