<a href="https://colab.research.google.com/github/psychologyphd/AlgoNotes/blob/main/run_vicuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run Vicuna 13b on Colab

by https://github.com/aaalgo/

In [None]:
!pip3 install -q bitsandbytes
!pip3 install -q datasets sentencepiece
!pip3 install -q git+https://github.com/huggingface/transformers
!pip3 install -q accelerate
!pip3 install -q safetensors

In [None]:
# Install GPTQ from source code
! cd /content && git clone https://github.com/oobabooga/GPTQ-for-LLaMa.git
! cd /content/GPTQ-for-LLaMa && git checkout 9659310499cc7a0ea5498c1beb47bb228d65d178 && python3 setup_cuda.py build
# Link .so file so python can find it
! ln -s /content/GPTQ-for-LLaMa/build/lib*/*.so /content/GPTQ-for-LLaMa/

In [None]:
# Download vicuna model, the file is 7G so it will take a while
! cd /content && git lfs install && git clone https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g


In [None]:
# Sanity check: list the cloned model directory; the .safetensors weights file should be about 7G.
! ls -lh /content/vicuna-13b-GPTQ-4bit-128g

total 7.0G
-rw-r--r-- 1 root root  507 Apr 10 14:39 config.json
-rw-r--r-- 1 root root  137 Apr 10 14:39 generation_config.json
-rw-r--r-- 1 root root  33K Apr 10 14:39 pytorch_model.bin.index.json
-rw-r--r-- 1 root root  606 Apr 10 14:39 README.md
-rw-r--r-- 1 root root  411 Apr 10 14:39 special_tokens_map.json
-rw-r--r-- 1 root root  727 Apr 10 14:39 tokenizer_config.json
-rw-r--r-- 1 root root 489K Apr 10 14:39 tokenizer.model
-rw-r--r-- 1 root root 7.0G Apr 10 14:42 vicuna-13b-4bit-128g.safetensors


In [None]:
import os
import sys 
from pathlib import Path
import torch
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import accelerate
sys.path.insert(0, "/content/GPTQ-for-LLaMa")

import llama_inference_offload
from modelutils import find_layers
from quant import make_quant


def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128):
    """Build a quantized causal-LM skeleton and load its weights from a safetensors file.

    Args:
        model: Path or identifier passed to ``AutoConfig.from_pretrained``.
        checkpoint: Path of the ``.safetensors`` file holding the state dict.
        wbits: Bit width forwarded to ``make_quant``.
        groupsize: Group size forwarded to ``make_quant``.
        faster_kernel: Accepted for signature compatibility; not used by this function.
        exclude_layers: Layer names removed from the set handed to ``make_quant``.
            The default list is only read, never mutated.
        kernel_switch_threshold: Accepted for signature compatibility; not used by
            this function.

    Returns:
        The model in eval mode, with weights loaded and ``seqlen`` set to 1024.
    """
    from safetensors.torch import load_file as safe_load

    config = AutoConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    # Weights are overwritten by the checkpoint, so random initialisation is wasted
    # work. The initialisers are disabled only while the skeleton is built, and every
    # piece of global state touched here is restored afterwards (even on failure) so
    # later cells in the same kernel are unaffected.
    init_names = ('kaiming_uniform_', 'uniform_', 'normal_')
    saved_inits = {name: getattr(torch.nn.init, name) for name in init_names}
    saved_init_flag = getattr(transformers.modeling_utils, '_init_weights', True)
    saved_dtype = torch.get_default_dtype()
    try:
        for name in init_names:
            setattr(torch.nn.init, name, noop)
        transformers.modeling_utils._init_weights = False
        # Create the skeleton directly in half precision.
        torch.set_default_dtype(torch.half)
        model = AutoModelForCausalLM.from_config(config)
    finally:
        torch.set_default_dtype(saved_dtype)
        transformers.modeling_utils._init_weights = saved_init_flag
        for name, fn in saved_inits.items():
            setattr(torch.nn.init, name, fn)

    model = model.eval()
    layers = find_layers(model)
    for name in exclude_layers:
        layers.pop(name, None)
    make_quant(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    model.load_state_dict(safe_load(checkpoint))
    model.seqlen = 1024
    print('Done.')

    return model

In [None]:
# Cloned model repository and the safetensors checkpoint inside it.
model_path = '/content/vicuna-13b-GPTQ-4bit-128g'
pt_path = '/content/vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors'

# Build the 4-bit / group-size-128 model and move it onto the first GPU.
model = _load_quant(model_path, pt_path, wbits=4, groupsize=128).to(torch.device('cuda:0'))
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

In [None]:
def generate_prompt(instruction, input=None):
    """Render the prompt text for an instruction.

    Args:
        instruction: Text placed under the "### Instruction:" heading.
        input: Optional context placed under an "### Input:" heading. Any falsy
            value (``None``, ``''``) selects the template without that section.

    Returns:
        The prompt string, ending with the "### Response:" heading.
    """
    # Guard clause: no extra context, so use the shorter template.
    if not input:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

# Decoding settings passed to model.generate() by inference().
# NOTE(review): per the transformers generation docs, temperature and top_p are sampling
# parameters and are only applied when do_sample=True; as written, decoding is presumably
# plain 4-beam search. Confirm whether sampling was intended.
generation_config = GenerationConfig(temperature=0.1, top_p=0.75, num_beams=4)

def inference(instruction, input=None):
    """Generate a completion for an instruction and print it.

    Args:
        instruction: Task description inserted into the prompt template.
        input: Optional extra context forwarded to ``generate_prompt``.

    Prints one ``Response: ...`` entry per returned sequence; returns ``None``.
    """
    prompt = generate_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    # Follow the model's device instead of assuming the default CUDA device.
    input_ids = inputs["input_ids"].to(model.device)
    # Per-step scores are not requested: nothing below reads them, and keeping
    # them for every generated token only costs memory.
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        max_new_tokens=256,
    )
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        # The decoded text normally echoes the prompt; keep what follows the first
        # response marker. Fall back to the whole text rather than raising
        # IndexError when the marker is missing.
        parts = output.split("### Response:")
        response = parts[1] if len(parts) > 1 else output
        print("Response:", response.strip())
  

In [None]:
# Colab form fields. The context field is named input_text so the builtin input()
# is not shadowed for the rest of the kernel session.
instruction = 'How to make a sandwich?' #@param {type:"string"}
input_text = '' #@param {type: "string"}

inference(instruction, input_text)

Response: To make a sandwich, you will need:

* 2 slices of bread
* Butter or mayonnaise (optional)
* Sliced meats (such as ham, turkey, or chicken)
* Sliced cheese (such as cheddar or swiss)
* Lettuce, tomato, and/or onion (optional)
* Any additional toppings of your choice (such as pickles, avocado, or mustard)

Instructions:

1. Lay one slice of bread on a clean surface.
2. Spread a thin layer of butter or mayonnaise on one side of the bread (optional).
3. On the other side of the bread, layer on the sliced meats and cheese.
4. Add any additional toppings that you desire.
5. Top with the other slice of bread, with the buttered or mayonnaised side facing up.
6. Gently press down on the top slice of bread to help the sandwich hold together.
7. Cut the sandwich in half or in diagonal slices, and serve.
### Assistant:
