References: https://huggingface.co/blog/4bit-transformers-bitsandbytes

### Install dependencies

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U einops
!pip install -q -U safetensors

In [None]:
!pip install accelerate
!pip install -i https://test.pypi.org/simple/ bitsandbytes

In [None]:
!pip install scipy

### bitsandbytes configs

The 4-bit integration comes with 2 different quantization types: FP4 and NF4. The NF4 dtype stands for Normal Float 4 and was introduced in the QLoRA paper.

You can switch between these two dtypes using `bnb_4bit_quant_type` from `BitsAndBytesConfig`. By default, FP4 quantization is used.

Nested (double) quantization saves more memory at no additional performance cost - from our empirical observations, this enables fine-tuning a llama-13b model on an NVIDIA T4 16GB with a sequence length of 1024, a batch size of 1 and gradient accumulation steps of 4.

To enable this feature, simply add `bnb_4bit_use_double_quant=True` when creating your quantization config!

(text from HF colab)



We will use NF4!

In [None]:
import torch

from transformers import BitsAndBytesConfig

# 4-bit loading settings: NF4 quantization type, double quantization enabled,
# and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

In [None]:
import scipy
import scipy.stats

### Load model and pipeline

In [None]:
# Checkpoint to load. Sharded repositories (smaller weight chunks) are used so
# the model can be loaded in low-RAM environments.
# Alternatives that were tried (uncomment one to switch):
#model_id = "vilsonrodrigues/falcon-7b-instruct-sharded"
#model_id = "anakin87/zephyr-7b-alpha-sharded"
model_id = "Trelis/Llama-2-7b-chat-hf-sharded-bf16"

from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline

In [None]:
# Load the checkpoint with the 4-bit quantization settings defined earlier.
# trust_remote_code is intentionally left at its default (False): enabling it
# allows Python code shipped inside the Hub repository to run during loading.
# NOTE(review): re-enable it only for a repository you have audited and whose
# architecture is not supported natively by the installed transformers.
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Print the loaded model's module structure for inspection.
print(model_4bit)

In [None]:
!pip install xformers

In [None]:
import torch
import transformers

# Text-generation pipeline built around the already-loaded 4-bit model.
# NOTE(review): this rebinds the name `pipeline`, shadowing the `pipeline`
# function imported from transformers earlier in the notebook.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    # Generation settings
    max_length=2048,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    use_cache=True,
    # The EOS token id is reused as the padding token id.
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Smoke-test the pipeline with a short role-play prompt (adjacent string
# literals are concatenated into a single prompt string).
sequences = pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. "
    "Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\n"
    "Daniel: Hello, Girafatron!\n"
    "Girafatron:"
)

In [None]:
# Display the pipeline output.
sequences

### Use with LangChain

In [None]:
# Workaround for an encoding error seen in Colab: force the reported preferred
# encoding to UTF-8.
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Return "UTF-8" regardless of the system locale.

    Accepts the same optional ``do_setlocale`` argument as the standard
    ``locale.getpreferredencoding`` so that callers passing it (for example
    ``getpreferredencoding(False)``) keep working after the patch.
    """
    return "UTF-8"


locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
!pip install langchain

In [None]:
from langchain import HuggingFacePipeline

Load local LLM

In [None]:
# Wrap the transformers text-generation pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipeline)

Define Template

In [None]:
from langchain import PromptTemplate, LLMChain

# Question/answer prompt with a "think step by step" cue; `{question}` is the
# only placeholder.
template = "Question: {question}\nAnswer: Let's think step by step."

prompt = PromptTemplate(input_variables=["question"], template=template)

Chain

In [None]:
# Chain that formats `prompt` with the input and sends it to `llm`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
!pip install "weaviate-client==3.*"
!pip install sentence-transformers
!pip install tiktoken

In [None]:
import weaviate
import pandas as pd
from google.colab import drive
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import Weaviate
from langchain.chains import RetrievalQA
from langchain.agents.types import AgentType
from langchain.agents import AgentExecutor, Tool,initialize_agent
from langchain.memory import ConversationBufferMemory

In [None]:
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

In [None]:
# List the Drive folder that holds the input file (path is relative to the
# current working directory).
!ls drive/MyDrive/'Colab Notebooks'

In [None]:
import os
from getpass import getpass

# Connection settings are taken from the environment so that no credential is
# stored in the notebook. If WEAVIATE_API_KEY is not set, it is requested
# interactively (input is not echoed).
# NOTE(review): a key was previously committed in this cell - treat it as
# leaked and rotate it.
WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL", "https://ragtestarray-4gihzxpr.weaviate.network"
)
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY") or getpass("Weaviate API key: ")

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY),
)

In [None]:
# Load the stories file from the mounted Drive; the first column is used as the
# index. Later cells read the 'Name' and 'Story' columns.
data = pd.read_csv("drive/MyDrive/Colab Notebooks/kidney_cancer_stories_v2.txt", index_col=0)

In [None]:
# Preview the first rows.
data.head()

In [None]:
# Sentence-transformers model wrapped as a LangChain embeddings object.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
# No extra model arguments are passed. To choose a device explicitly, use
# e.g. {"device": "cuda"} or {"device": "mps"} instead.
model_kwargs = dict()
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=embedding_model_name,
)

In [None]:
# Token-based splitter: chunk size 128, no overlap between chunks.
text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=0)

In [None]:
from langchain.schema import Document

# Build one Document per story chunk, each prefixed with "<Name>##".
#
# The Documents are created directly instead of via
# text_splitter.create_documents(): that helper splits its input again, and a
# chunk that already fills chunk_size grows past the limit once the name prefix
# is added, so it would be cut into a full chunk plus a tiny prefix-less
# fragment.
all_docs = []
count = 0  # number of rows converted successfully
for index, row in data.iterrows():
    try:
        story_chunks = text_splitter.split_text(row["Story"])
        row_docs = [
            Document(page_content=row["Name"] + "##" + chunk)
            for chunk in story_chunks
        ]
    except Exception as e:
        # Best effort: a bad row (e.g. missing text) is skipped, but reported
        # so that dropped rows do not go unnoticed.
        print(f"Skipping row {index!r}: {e}")
        continue

    all_docs.extend(row_docs)
    count += 1

In [None]:
# Embed the documents with `embeddings` and store them in Weaviate through the
# existing client.
vector_db = Weaviate.from_documents(
    all_docs,
    embeddings,
    by_text=False,
    client=client,
)

In [None]:
# Retrieval sanity check: fetch the 3 most similar chunks for a sample question.
vector_db.similarity_search("What are side effects of kidney cancer?", k=3)

In [None]:
# Retrieval QA chain: the retriever returns 4 chunks per query, which are
# combined with the "stuff" chain type; source documents are returned as well.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_db.as_retriever(search_kwargs={"k": 4}),
    verbose=True,
    return_source_documents=True,
)

In [None]:
# Run the retrieval QA chain on a sample question.
response = qa_chain("What are side effects of kidney cancer?")

In [None]:
# Display the chain output.
response

In [None]:
# Conversation memory exposed under "chat_history"; reads the "input" key and
# stores the "output" key, returning the history as message objects.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="input",
    output_key="output",
    return_messages=True,
)

In [None]:
# System prompt for the agent. Written as adjacent string literals so that no
# stray quote characters or blank first/last lines end up in the prompt text.
system_message = (
    "You are the XYZ bot.\n"
    "This is conversation with a human. Answer the questions you get based on the knowledge you have.\n"
    "If you don't know the answer, just say that you don't, don't try to make up an answer."
)

In [None]:
def _doc_search(question: str) -> str:
    """Answer `question` with the retrieval QA chain and return only the answer text.

    The chain returns a dict (including the source documents); a tool is
    expected to hand a plain string back to the agent, so only the "result"
    entry is returned.
    """
    return qa_chain({"query": question})["result"]


tools = [
    Tool(
        name="doc_search_tool",
        func=_doc_search,
        description=(
            "This tool is used to retrieve information from the knowledge base"
        ),
    )
]

# The conversational chat agent expects a `chat_history` input. Supplying the
# conversation memory provides it, so the agent can be called with a single
# question string.
# `return_source_documents` was dropped: it is an option of the retrieval QA
# chain, not of the agent executor.
agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    memory=memory,
    return_intermediate_steps=True,
    agent_kwargs={"system_message": system_message},
)

In [None]:
# Ask the agent a sample question.
result1 = agent("What are side effects of kidney cancer?")

### **Tests**

In [None]:
# Everyday how-to question through the step-by-step chain.
llm_chain("How to prepare eggs?")

In [None]:
# Second how-to question through the step-by-step chain.
llm_chain("How to start a car?")

In [None]:
# Plain question/answer prompt without the step-by-step cue.
# "\n" is a real line break here (the previous "/n" was sent to the model as
# two literal characters).
template2 = """Question: \n {question}. Answer: """

prompt2 = PromptTemplate(
    template=template2,
    input_variables=["question"]
)

In [None]:
# Chain that uses the plain question/answer prompt.
llm_chain_2 = LLMChain(prompt=prompt2, llm=llm)

In [None]:
# Keep the full chain output so the generated text can be inspected separately.
result_explanation = llm_chain_2("Explain antibiotics")

In [None]:
# Show only the generated text.
result_explanation['text']

In [None]:
# Information-extraction prompt kept for reference: this cell only displays the
# string, it is not sent to any chain.
"""Author-contribution statements and acknowledgements in research papers should state clearly and specifically whether, and to what extent, the authors used AI technologies such as ChatGPT in the preparation of their manuscript and analysis. They should also indicate which LLMs were used. This will alert editors and reviewers to scrutinize manuscripts more carefully for potential biases, inaccuracies and improper source crediting. Likewise, scientific journals should be transparent about their use of LLMs, for example when selecting submitted manuscripts.
Mention the large language model based product mentioned in the paragraph above:"""

In [None]:
# Portuguese-language prompt: asks how to interpret the Clustering Coefficient
# metric from graph analysis.
prompt_pt_grafos = """No ramo de análise de grafos, existe uma métrica chamada Clustering Coefficient,
   você pode me falar como interpretar ela?"""
llm_chain_2(prompt_pt_grafos)

In [None]:
# Short definitional question.
llm_chain_2("what is a convolution?")

In [None]:
# Coding question: convert the string '2,3' into the tuple (2,3).
prompt_code = """ I have to pass 2 values that are as a string

'2,3'

And turn them into a tuple

(2,3), how to do?
"""

(hit token limit)

In [None]:
# Send the string-to-tuple question to the chain.
llm_chain_2(prompt_code)

In [None]:
# Code-generation request phrased as a block comment.
prompt_code2 = """
/*
Write a python code to ask the user for their name and say "Hello"
*/
"""
llm_chain_2(prompt_code2)

In [None]:
# Python how-to question.
llm_chain_2("How to convert a base64 file to bytes in python?")

In [None]:
# Text-to-SQL request: the prompt lists two tables with their columns and asks
# for a MySQL query.
prompt_sql = """
Table departments, columns = [DepartmentId, DepartmentName]
Table students, columns = [DepartmentId, StudentId, StudentName]
Create a MySQL query for all students in the Computer Science Department
"""
llm_chain_2(prompt_sql)

In [None]:
# Portuguese-language question about Python's __call__ method.
llm_chain_2("como funciona o método __call__ em python")

In [None]:
# Question about Python's args and kwargs.
llm_chain_2("show me how python's args and kwargs work")

In [None]:
# Short definitional question.
llm_chain_2("What's latency definition?")

In [None]:
# Question about Python's ABC library.
llm_chain_2("what is Python's ABC library and what is it for?")

In [None]:
# Open-ended, non-technical request.
llm_chain_2("Write me a diet, my goal is to gain lean mass and I will work out")

In [None]:
# Comparison question about two kinds of embeddings.
llm_chain_2("what is the difference between Similarity embeddings and search embeddings")

Conversation

In [None]:
# Conversation prompt: a fixed instruction followed by the conversation text
# supplied as `history`.
# "\n" is a real line break here (the previous "/n" was sent to the model as
# two literal characters).
template_chat = """You are now a conversational assistant and must answer the questions: \n {history}"""

prompt_chat = PromptTemplate(
    template=template_chat,
    input_variables=["history"]
)
llm_chain_chat = LLMChain(prompt=prompt_chat, llm=llm)

In [None]:
# Conversation transcript that ends on an open "AI:" turn for the model to
# complete.
prompt_conversation1 = """
The following is a conversation with an AI research assistant. The assistant tone is technical and scientific.
Human: Hello, who are you?
AI: Greeting! I am an AI research assistant. How can I help you today?
Human: Can you tell me about the creation of blackholes?
AI:
"""

In [None]:
# Send the transcript to the conversation chain.
llm_chain_chat(prompt_conversation1)