# Inference notebook

## Imports

In [None]:
!pip install datasets
!pip install sentencepiece
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes
!pip install git+https://github.com/huggingface/peft.git
!pip install gradio

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-bmuuwmww
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-bmuuwmww
  Resolved https://github.com/huggingface/transformers.git to commit c8c8dffbe45ebef0a8dba4a51024e5e5e498596b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-req-build-wu3allx9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-req-build-wu3allx9
  Resolved https://github.com/huggingface/accelerate.git to commit cb8b7c637a8588668c52bd306f9b2828f69d9585
  Installing build dependencies ... [?25l[?25hdon

In [None]:
import torch
from peft import PeftModel
import transformers
import gradio as gr
assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

## Setup

In [None]:
tokenizer = LlamaTokenizer.from_pretrained("baffo32/decapoda-research-llama-7B-hf")

In [None]:
BASE_MODEL = "baffo32/decapoda-research-llama-7B-hf"
LORA_WEIGHTS = "kunchum/capstone-llama-finetuned"

In [None]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
try:
    if torch.backends.mps.is_available():
        device = "mps"
except:
    pass

In [None]:
if device == "cuda":
    model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        device_map="auto",
    )
    model = PeftModel.from_pretrained(
        model, LORA_WEIGHTS, force_download=True
    )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]



adapter_model.bin:   0%|          | 0.00/8.41M [00:00<?, ?B/s]

In [None]:
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input}
### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Response:"""

In [None]:
model.half()
model.eval()
if torch.__version__ >= "2":
    model = torch.compile(model)

## Evaluate function setup

In [None]:
def evaluate(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
    max_new_tokens=128,
    repetition_penalty=1.15,
    **kwargs,
):
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        do_sample=True,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        **kwargs,
    )
    with torch.autocast("cuda"):
        generation_output = model.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    return output.split("### Response:")[1].strip()

    #     response = ""
    # for s in generation_output.sequences:
    #     response = tokenizer.decode(generation_output.sequences[0], skip_special_tokens=True)

    # # Truncate the response at the first occurrence of triple quotes (""")
    # if '"""' in response:
    #     response = response.split('"""')[0].strip()
    #     response = response.split('### Response:')[1].strip()

    # return response

## Gradio Interface

In [None]:
# import gradio as gr

# g = gr.Interface(
#     fn=evaluate,
#     inputs=[
#         gr.Textbox(
#             lines=2, label="Instruction", placeholder="When should I get a second credit card?"
#         ),
#         gr.Textbox(lines=2, label="Input", placeholder="none"),
#         gr.Dropdown(
#             choices=["LLaMa 7B", "Mistral 7B"],
#             label="Model",
#             value="LLaMa 7B",
#         ),
#         gr.Slider(minimum=0, maximum=1, value=0.6, label="Temperature"),
#         gr.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
#         # gr.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
#         # gr.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
#         gr.Slider(minimum=1, maximum=512, step=1, value=128, label="Max tokens"),
#         gr.Slider(minimum=0.1, maximum=2, step=0.1, value=1.15, label="Repetition Penalty"),
#     ],
#     outputs=[
#         gr.Textbox(
#             lines=10,
#             label="Output",
#         )
#     ],
#     title="FinAdvisor - Financial Advisory Tool for contextually-aware personal finance assistance",
#     description="FinAdvisor LLM based project to generate quick, personal advice focused on wealth/finance."
# )

# # Remove concurrency_count argument
# g.queue()
# g.launch(share=True, debug=True)

In [26]:
def dummy_model_selection(model_name):
    return f"Selected model: {model_name}"

# Gradio Interface
g = gr.Blocks()

with g:
    gr.Markdown("# FinAdvisor - Financial Advisory Tool for Contextually-Aware Personal Finance Assistance")
    gr.Markdown("Generate quick, personal advice focused on wealth/finance.")

    with gr.Row():
        instruction = gr.Textbox(
            lines=2, label="Instruction", placeholder="When should I get a second credit card?"
        )
        input_box = gr.Textbox(lines=2, label="Input", placeholder="none")

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=["LLaMa 7B", "Mistral 7B"],
            label="Model",
            value="LLaMa 7B",
        )

    with gr.Row():
        temperature = gr.Slider(minimum=0, maximum=1, value=0.6, label="Temperature")
        top_p = gr.Slider(minimum=0, maximum=1, value=0.95, label="Top p")
        max_tokens = gr.Slider(minimum=1, maximum=512, step=1, value=128, label="Max tokens")
        repetition_penalty = gr.Slider(minimum=0.1, maximum=2, step=0.1, value=1.15, label="Repetition Penalty")

    with gr.Row():
        output = gr.Textbox(lines=10, label="Output")

    evaluate_button = gr.Button("Evaluate")

    evaluate_button.click(
        fn=evaluate,
        inputs=[instruction, input_box, temperature, top_p, max_tokens, repetition_penalty],
        outputs=[output],
    )

# Launch interface
g.queue()
g.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://ec2ee8039062a14861.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://ec2ee8039062a14861.gradio.live


