In [None]:
#@markdown # Chinkara 7B (Improved)
#@markdown This is the first model from the [MaralGPT](https://huggingface.co/MaralGPT) project. It's an effort to build a free/libre and open source software (FLOSS) compatible Large Language Model (LLM). This model can be run on only 8 GB of GPU VRAM and is based on _Meta's LLaMa-2_.

In [None]:
#@markdown ## Installing Libraries
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q einops

In [None]:
#@markdown ## Setting Up the Model and Imports

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hugging Face Hub ids of the base checkpoint and of the adapter applied on top.
model_name = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"
adapters_name = 'MaralGPT/chinkara-7b-improved'

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)

# Per-GPU memory budget handed to the automatic device placement.
max_memory = {i: '24000MB' for i in range(torch.cuda.device_count())}

# NOTE: the 4-bit setting is passed only through `quantization_config`.
# Passing `load_in_4bit=True` as a keyword argument as well is redundant, and
# recent transformers releases reject the combination with a ValueError.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_memory=max_memory,
    quantization_config=quantization_config,
)

# Wrap the quantized base model with the adapter weights and load the
# tokenizer that belongs to the base checkpoint.
model = PeftModel.from_pretrained(model, adapters_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
#@markdown ## Setting up the model and QLoRa adapter

from peft import LoraConfig, PeftModel, get_peft_model

# The model setup cell already wraps `model` with the adapter. Only attach it
# here when that has not happened yet, so re-running this cell (or running it
# after the setup cell) does not load the adapter onto an already wrapped model.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(model, adapters_name)

# Reloading the tokenizer is idempotent, so it is safe to repeat.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
#@markdown ## Running the inference
#@markdown Note that this may take some time, and because the model is quantized, it may hallucinate heavily.

prompt = "Who was the president of the united states in 1996?" #@param{type: "string"}

# Wrap the raw question in the "### Human: ... ### Assistant:" template used
# by this notebook.
prompt = f"### Human: {prompt} ### Assistant:"

# Tokenize and move the tensors to the device the model lives on instead of
# hard-coding "cuda:0".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Pass the full encoding (input ids and attention mask) to generate().
# `temperature` only takes effect in sampling mode, so sampling is enabled
# explicitly; without `do_sample=True` decoding is greedy and the temperature
# is ignored. Sampling makes the output vary between runs.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.75,
    repetition_penalty=1.2,
)

# Decode only the newly generated tokens. Splitting the full decoded text on
# "### Assistant:" returns the wrong piece whenever the user's own question
# contains that marker.
prompt_length = inputs["input_ids"].shape[1]
answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

# Drop anything after the point where the model starts a new "### Human:" turn.
# split() returns the whole string when the marker is absent, so no guard is
# needed.
answer = answer.split("### Human:")[0].strip()

print(answer)


 The President of the United States in 1996 was Bill Clinton. Unterscheidung between "Bill Clinton" and "William Clinton":
* "Bill Clinton" refers to William Jefferson Clinton, who served as the 42nd President of the United States from 1993 to 2001.
* "William Clinton" is an alternative spelling for the name of the same person, but it is less common than
