# Using HuggingFace model
This code explains how to download the model, quantize and fine tune it

## Installing Llama 3
### first install Hugging Face CLI

`pip install -U "huggingface_hub[cli]"`

### login to your account:

`huggingface-cli login`

### Download the model (specify the path you want and gitignore it)

`huggingface-cli download meta-llama/Meta-Llama-3-8B-Instruct --exclude "original/*" --local-dir LargeModel/Meta-Llama-3-8B-Instruct`

## Create quanised pipeline

In [8]:
import torch
import transformers

class Llama3:
    def __init__(self, model_path):
        self.model_id = model_path
        self.pipeline = transformers.pipeline(
            "text-generation",
            model=self.model_id,
            model_kwargs={
                "torch_dtype": torch.float16,
                #"quantization_config": {"load_in_4bit": True},
                #"low_cpu_mem_usage": True,
            },
        )
        self.terminators = [
            self.pipeline.tokenizer.eos_token_id,
            self.pipeline.tokenizer.convert_tokens_to_ids(""),
        ]
  
    def get_response(
          self, query, message_history=[], max_tokens=4096, temperature=0.6, top_p=0.9
      ):
        user_prompt = message_history + [{"role": "user", "content": query}]
        prompt = self.pipeline.tokenizer.apply_chat_template(
            user_prompt, tokenize=False, add_generation_prompt=True
        )
        outputs = self.pipeline(
            prompt,
            max_new_tokens=max_tokens,
            eos_token_id=self.terminators,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
        response = outputs[0]["generated_text"][len(prompt):]
        return response, user_prompt + [{"role": "assistant", "content": response}]
    
    def chatbot(self, system_instructions=""):
        conversation = [{"role": "system", "content": system_instructions}]
        while True:
            user_input = input("User: ")
            if user_input.lower() in ["exit", "quit"]:
                print("Exiting the chatbot. Goodbye!")
                break
            response, conversation = self.get_response(user_input, conversation)
            print(f"Assistant: {response}")
  


In [9]:
if __name__ == "__main__":
    bot = Llama3("LargeModel/Meta-Llama-3-8B-Instruct")
    bot.chatbot()

Loading checkpoint shards: 100%|██████████| 4/4 [00:36<00:00,  9.18s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [3]:

def get_response(
          query,model_path, message_history=[], max_tokens=4096, temperature=0.6, top_p=0.9
          ):
        pipeline = transformers.pipeline(
            "text-generation",
            model=model_path,
            model_kwargs={
                "torch_dtype": torch.float16,
                #"quantization_config": {"load_in_4bit": True},
                #"low_cpu_mem_usage": True,
            },
        )
        terminators = [
            pipeline.tokenizer.eos_token_id,
            pipeline.tokenizer.convert_tokens_to_ids(""),
        ]
        user_prompt = message_history + [{"role": "user", "content": query}]
        prompt = pipeline.tokenizer.apply_chat_template(
            user_prompt, tokenize=False, add_generation_prompt=True
        )
        outputs = pipeline(
            prompt,
            max_new_tokens=max_tokens,
            eos_token_id=terminators,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
        )
        response = outputs[0]["generated_text"][len(prompt):]
        complete = user_prompt + [{"role": "assistant", "content": response}]
        return response, complete 

In [4]:
model_path = "LargeModel/Meta-Llama-3-8B-Instruct"
query = "describe quantum theory in 100 words with a simple language"
get_response(query,model_path)

TypeError: get_response() missing 1 required positional argument: 'model_path'