<a href="https://colab.research.google.com/github/softmurata/colab_notebooks/blob/main/gpt/gpt4all_with_llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title alpaca lllama index
# Ref Docs https://github.com/thohag/alpaca_llama_index
# peft==0.3.0.dev0
!pip install langchain==0.0.126 llama_index==0.4.26 
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/peft.git

In [None]:
!pip install sentencepiece bitsandbytes 

In [None]:
!pip install sentence_transformers

In [None]:
import torch
from langchain.llms.base import LLM
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding
from llama_index import SimpleDirectoryReader, LLMPredictor, PromptHelper, GPTSimpleVectorIndex
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

In [None]:
# alpaca lora
hf_model_path = "decapoda-research/llama-7b-hf"
alpaca_model_path = "tloen/alpaca-lora-7b"

tokenizer = LlamaTokenizer.from_pretrained(hf_model_path)

model = LlamaForCausalLM.from_pretrained(
    hf_model_path,
    load_in_8bit=True, #Dissabling could solve some errors
    device_map="auto",
)
model = PeftModel.from_pretrained(model, alpaca_model_path)


device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
max_length = 1500 #2048
max_new_tokens = 48

In [3]:
class LLaMALLM(LLM):
    def _call(self, prompt, stop=None):
        prompt += "### Response:"

        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()
        
        generation_config = GenerationConfig(
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.15,
        )
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=128,
            )
        response = ""
        for s in generation_output.sequences:
            response += tokenizer.decode(s)
            
        response = response[len(prompt):]
        print("Model Response:", response)
        return response

    def _identifying_params(self):
        return {"name_of_model": "alpaca"}

    def _llm_type(self):
        return "custom"

In [None]:
!wget https://raw.githubusercontent.com/thohag/alpaca_llama_index/master/data/gatsby.txt -P data

In [8]:
max_input_size = max_length
num_output = max_new_tokens
max_chunk_overlap = 20

prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
documents = SimpleDirectoryReader('data').load_data()
llm_predictor = LLMPredictor(llm=LLaMALLM())
index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, embed_model=embed_model, prompt_helper=prompt_helper)

index.save_to_disk('index.json')
new_index = GPTSimpleVectorIndex.load_from_disk('index.json', embed_model=embed_model, llm_predictor=llm_predictor, prompt_helper=prompt_helper)

In [9]:
response = new_index.query("What did Gatsby do before he met Daisy?")
print(response.response)

response = new_index.query("What did the narrator do after getting back to Chicago?")
print(response.response)

Model Response: nse: Before meeting Daisy, Gatsby worked hard to build his fortune through illegal activities such as bootlegging alcohol during Prohibition.
Model Response: nse: 
Before meeting Daisy, Gatsby worked hard to build his fortune through legal and illegal means. He was involved in bootlegging alcohol during Prohibition, but also invested heavily in real estate and other businesses.
nse: 
Before meeting Daisy, Gatsby worked hard to build his fortune through legal and illegal means. He was involved in bootlegging alcohol during Prohibition, but also invested heavily in real estate and other businesses.
Model Response: nse: The narrator returned to Chicago and spent the rest of the day looking for evidence related to the murder case.
Model Response: nse: 
The narrator returned to Chicago and spent the rest of the day looking for evidence related to the murder case.
nse: 
The narrator returned to Chicago and spent the rest of the day looking for evidence related to the murder c

In [1]:
#@title alpaca llama index japanese alpaca lora
import torch
from langchain.llms.base import LLM
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding
from llama_index import SimpleDirectoryReader, LLMPredictor, PromptHelper, GPTSimpleVectorIndex
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda118.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [2]:
# alpaca lora japanese
# BASE_MODEL = "decapoda-research/llama-7b-hf"
# BASE_MODEL = "decapoda-research/llama-13b-hf"
# BASE_MODEL = "decapoda-research/llama-30b-hf"
# BASE_MODEL = "decapoda-research/llama-65b-hf"

# LORA_WEIGHTS = "kunishou/Japanese-Alpaca-LoRA-7b-v0"
# LORA_WEIGHTS ="kunishou/Japanese-Alpaca-LoRA-13b-v0"
# LORA_WEIGHTS = "kunishou/Japanese-Alpaca-LoRA-30b-v0"
# LORA_WEIGHTS = "kunishou/Japanese-Alpaca-LoRA-65b-v0"

hf_model_path = "decapoda-research/llama-7b-hf"
alpaca_model_path = "kunishou/Japanese-Alpaca-LoRA-7b-v0"

tokenizer = LlamaTokenizer.from_pretrained(hf_model_path)

model = LlamaForCausalLM.from_pretrained(
    hf_model_path,
    load_in_8bit=True, #Dissabling could solve some errors
    device_map="auto",
)
model = PeftModel.from_pretrained(model, alpaca_model_path)


device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
max_length = 1500 #2048
max_new_tokens = 48

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [3]:
class LLaMALLM(LLM):
    def _call(self, prompt, stop=None):
        prompt += "### 応答:"

        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()
        
        generation_config = GenerationConfig(
            temperature=0.6,
            top_p=0.95,
            repetition_penalty=1.15,
        )
        with torch.no_grad():
            generation_output = model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=128,
            )
        response = ""
        for s in generation_output.sequences:
            response += tokenizer.decode(s)
            
        response = response[len(prompt):]
        print("Model Response:", response)
        return response

    def _identifying_params(self):
        return {"name_of_model": "alpaca"}

    def _llm_type(self):
        return "custom"

In [4]:
max_input_size = max_length
num_output = max_new_tokens
max_chunk_overlap = 20

prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())
documents = SimpleDirectoryReader('japanesedata').load_data()
llm_predictor = LLMPredictor(llm=LLaMALLM())
index = GPTSimpleVectorIndex(documents, llm_predictor=llm_predictor, embed_model=embed_model, prompt_helper=prompt_helper)

index.save_to_disk('japanindex.json')
new_index = GPTSimpleVectorIndex.load_from_disk('japanindex.json', embed_model=embed_model, llm_predictor=llm_predictor, prompt_helper=prompt_helper)

In [None]:
response = new_index.query("後藤ひとりはどんな人?")
print(response.response)

In [None]:
# 7bだとクソほども微妙です、、

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
# llamaをhuggigfaceで動かす方法について調査する。
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf")

In [None]:
!git clone --recurse-submodules https://github.com/nomic-ai/gpt4all.git
!git submodule configure && git submodule update
%cd gpt4all
!python -m pip install -r requirements.txt
%cd /content/gpt4all/transformers
!pip install -e . 
%cd /content/gpt4all/peft
!pip install -e .

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LlamaTokenizer 
import torch

In [None]:
model = AutoModelForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", device_map="auto", torch_dtype=torch.float16)
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")

In [3]:
from peft import PeftModelForCausalLM

In [None]:
lora = True
lora_path = "nomic-ai/gpt4all-lora"
added_tokens = tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>"})
if added_tokens > 0:
  model.resize_token_embeddings(len(tokenizer))

if lora:
  model = PeftModelForCausalLM.from_pretrained(model, lora_path, device_map="auto", torch_dtype=torch.float16)
  model.to(dtype=torch.float16)

print(f"Mem needed: {model.get_memory_footprint() / 1024 / 1024 / 1024:.2f} GB")


In [None]:
#@title vicuna13b

In [None]:
!pip install bitsandbytes
!pip install -q datasets loralib sentencepiece
# !pip install -q git+https://github.com/zphang/transformers@c3dc391
!pip install git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/huggingface/peft.git

In [5]:
import os
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
from transformers import AutoConfig, LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict

In [6]:
MICRO_BATCH_SIZE = 2  # this could actually be 5 but i like powers of 2
BATCH_SIZE = 8 # 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 3  # we don't need 3 tbh
LEARNING_RATE = 3e-4  # the Karpathy constant
CUTOFF_LEN = 256  # 256 accounts for about 96% of the data
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
VAL_SET_SIZE=2000

In [7]:
model_name = "anon8231489123/vicuna-13b-GPTQ-4bit-128g"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, add_eos_token=True
)

model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token



Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]