<a href="https://colab.research.google.com/github/polyexplorer/open-llm/blob/main/HF_Model_Chat_Interface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# # #Dependencies
# ! pip install git+https://github.com/huggingface/transformers.git@72958fcd3c98a7afdc61f953aa58c544ebda2f79
# ! pip install optimum
# ! pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/  # Use cu117 if on CUDA 11.7
# ! pip install langchain
# ! pip install "unstructured[pdf]"

Collecting git+https://github.com/huggingface/transformers.git@72958fcd3c98a7afdc61f953aa58c544ebda2f79
  Cloning https://github.com/huggingface/transformers.git (to revision 72958fcd3c98a7afdc61f953aa58c544ebda2f79) to /tmp/pip-req-build-q47lskuk
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-q47lskuk
  Running command git rev-parse -q --verify 'sha^72958fcd3c98a7afdc61f953aa58c544ebda2f79'
  Running command git fetch -q https://github.com/huggingface/transformers.git 72958fcd3c98a7afdc61f953aa58c544ebda2f79
  Running command git checkout -q 72958fcd3c98a7afdc61f953aa58c544ebda2f79
  Resolved https://github.com/huggingface/transformers.git to commit 72958fcd3c98a7afdc61f953aa58c544ebda2f79
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.15,>=0.14 (from transformers==

In [2]:
# Mistral Wrapper
from transformers import AutoModelForCausalLM, AutoTokenizer,GPTQConfig, pipeline,TextStreamer
import torch
from typing import Any, List, Mapping, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

class MistralModel:
    def __init__(self):
        # Refresh CUDA Memory
        torch.cuda.empty_cache()
        self.model,self.tokenizer = self.get_model()
        streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=4092,
            do_sample=True,
            temperature=0.1,
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.15,
            streamer=streamer,
        )


    def format_prompt(self,prompt):
        return f"""<s>[INST] {prompt} [/INST]"""

    def generate_instruction(
        self,
        prompt:str,
        instruction:str = 'Think carefully and answer the given question as truthfully as possible',
        llm_template = None
    ):
        # if not llm_template:
        #     llm_template = self.format_prompt
        instruction_format = f"""### Instruction: {instruction}:

    ### Input:
    {prompt}

    ### Response:
    """
        if llm_template:
            return llm_template(instruction_format)
        else:
            return instruction_format


    def get_model(self):
        # model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
        model_name_or_path = "TheBloke/Mistral-7B-OpenOrca-GPTQ"
        # To use a different branch, change revision
        # For example: revision="main"
        quantization_config_loading = GPTQConfig(bits=4, use_exllama = False)
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path,

                                                  quantization_config=quantization_config_loading,
                                                  device_map="cuda",
                                                  trust_remote_code=True,
                                                  revision="gptq-4bit-32g-actorder_True")

        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
        return model, tokenizer

    def _predict(self, prompt):
        torch.cuda.empty_cache()
        response =  self.pipe(self.format_prompt(prompt))[0]['generated_text']
        return response

    def predict(self,prompt):
        return self._predict(prompt).split(r'INST]')[-1].strip()

    def ask(self,question,instruction = None):
        formatted_prompt = self.generate_instruction(prompt=question,instruction=instruction)
        return self.predict(formatted_prompt)

class MistralLLM(LLM):
    mistral_model: MistralModel

    @property
    def _llm_type(self) -> str:
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        # if stop is not None:
        #     raise ValueError("stop kwargs are not permitted.")
        return self.mistral_model.ask(prompt)

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model": self.mistral_model}

In [5]:
def get_model(model_name, revision = 'main'):
  # model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
  # To use a different branch, change revision
  # For example: revision="main"
  quantization_config_loading = GPTQConfig(bits=4, use_exllama = False)
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config_loading,
    device_map="cuda",
    trust_remote_code=True,
    revision=revision
  )
  return model

In [7]:
model_name = "TheBloke/zephyr-7B-alpha-GPTQ"
revision = "gptq-4bit-32g-actorder_True"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = get_model(model_name, revision)

Downloading tokenizer_config.json:   0%|          | 0.00/983 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


Downloading model.safetensors:   0%|          | 0.00/4.57G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [13]:
def complete(prompt,model,tokenizer, config=None):
  curr_config = {
        'temperature':0.1,
        'do_sample':True,
        'top_p':0.95,
        'top_k':40,
        'max_new_tokens':512
    }
  if config:
    for k,v in config.items():
      curr_config[k] = v

  config = curr_config

  input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()

  output = model.generate(input_ids, **config)
  return tokenizer.decode(output[0])

In [14]:
complete("How are you?",model,tokenizer)

'<s> How are you?\n\nI’m good. How about you?\n\nI’m doing well, thanks.\n\nThat’s great to hear. So, what’s new with you?\n\nWell, I’ve been working on a new project at work. It’s been keeping me busy, but it’s also been really exciting.\n\nThat sounds interesting. Can you tell me more about it?\n\nSure! We’re working on a new product that’s going to revolutionize the industry. It’s going to be a game-changer.\n\nWow, that sounds amazing. How far along are you in the project?\n\nWe’re still in the testing phase, but we’re making great progress. We’ve already received some positive feedback from our beta testers.\n\nThat’s really impressive. How did you come up with the idea for this product?\n\nActually, it was a collaboration between our team and some of our clients. We identified a need in the market and worked together to create a solution that would meet that need.\n\nThat’s really cool. How do you think this product will impact the industry?\n\nI think it will have a significant 

In [19]:
from transformers import TextStreamer

class LLMInterface:
  def __init__(self, model, tokenizer, prompt_template= """<s>[INST] {prompt} [/INST]"""):
    self.model = model
    self.prompt_template = prompt_template
    self.tokenizer = tokenizer
    streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
    self.pipe = pipeline(
      "text-generation",
      model=self.model,
      tokenizer=self.tokenizer,
      max_new_tokens=512,
      do_sample=True,
      temperature=0.1,
      top_k=40,
      top_p=0.95,
      repetition_penalty=1.15,
      streamer=streamer,
    )

  def generate_instruction(
        self,
        prompt:str,
        instruction:str = 'Think carefully and answer the given question as truthfully as possible',
        llm_template = None
    ):
    # if not llm_template:
    #     llm_template = self.format_prompt
    instruction_format = f"""### Instruction: {instruction}:
### Input:
{prompt}

### Response:
"""
    return instruction_format

  def _predict(self, prompt):
    torch.cuda.empty_cache()
    response =  self.pipe(self.prompt_template.format(prompt=prompt))[0]['generated_text']
    return response

  def predict(self,prompt):
    return self._predict(prompt).split(r'INST]')[-1].strip()

  def ask(self,question,instruction = None):
    formatted_prompt = self.generate_instruction(prompt=question,instruction=instruction)
    return self.predict(formatted_prompt)


In [20]:
model_interface = LLMInterface(model,tokenizer)

In [29]:
!pip install gradio

NotImplementedError: ignored

In [24]:
gr.ChatInterface(model_interface.ask).queue().launch()

NameError: ignored