# Run on a T4 GPU instance

Run all the cells below to load the model used by the `chat` function.

In [1]:
# Clone the RickBot repository; its `lora_model` directory (moved out in the next cell)
# holds the trained QLoRA adapter that this notebook loads.
!git clone https://github.com/speedwagon1299/RickBot.git

Cloning into 'RickBot'...
remote: Enumerating objects: 39, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 39 (delta 9), reused 37 (delta 7), pack-reused 0[K
Receiving objects: 100% (39/39), 2.28 MiB | 8.03 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [2]:
# Move the adapter directory into the working directory so it can later be loaded
# by the relative name "lora_model".
!mv RickBot/lora_model ./

In [3]:
# Delete the remainder of the clone; the cells below only use ./lora_model.
!rm -rf RickBot

In [4]:
# Only the dependencies needed to load Llama-3 8B together with the QLoRA adapter.
# NOTE(review): Unsloth is installed from the repository's current HEAD (unpinned), so
# the resolved commit can differ between runs — consider pinning a commit for reproducibility.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# --no-deps: install exactly these packages without pulling in their own dependencies.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-1327fby3/unsloth_c0e50419bf8548929d64d0aa6ab0c306
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-1327fby3/unsloth_c0e50419bf8548929d64d0aa6ab0c306
  Resolved https://github.com/unslothai/unsloth.git to commit ba515ec92dbc85c03c65d3f31e10166cc73ef323
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tyro (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.5-py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.4/103.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.16.0 (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Dow

In [5]:
from unsloth import FastLanguageModel
import torch
# Settings forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048
dtype = None  # NOTE(review): presumably lets Unsloth pick the dtype for the GPU — confirm
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Retrieving Llama-3 8B in quantized (bnb 4-bit) form, together with its tokenizer.
# NOTE(review): `model` and `tokenizer` are rebound by the later
# from_pretrained("lora_model") call — confirm this separate base-model load is required.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

In [6]:
# Wrap the base model with LoRA adapters (r = 16) on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
# NOTE(review): the `model` produced here is reassigned by the later
# from_pretrained("lora_model") call, so these initial adapters do not appear to be the
# ones used for chatting — confirm whether this cell is still needed.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
# Loading settings, repeated here with the same values as the base-model cell.
max_seq_length = 2048
dtype = None
load_in_4bit = True
from unsloth import FastLanguageModel

# Load the fine-tuned RickBot adapter from the local "lora_model" directory.
# This rebinds both `model` and `tokenizer`; these are the objects passed to chat().
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model", # local adapter directory moved out of the RickBot clone
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Prepare the loaded model for generation (Unsloth helper) before any chat turns run.
FastLanguageModel.for_inference(model)

# Prompt template with two positional slots: the text under "### Input:" and the reply
# under "### Response:". chat() leaves the second slot empty for the turn being generated.
# NOTE(review): presumably this must match the template used during fine-tuning
# character for character, so the wording should not be edited — confirm.
alpaca_prompt = """Respond to the given text the way Rick would in the show Rick and Morty using as much context from the input to harbor a response

### Input:
{}

### Response:
{}"""

# End-of-sequence string from the tokenizer; formatting_prompts_func appends it to
# every formatted example.
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

# Function to format the prompts
def formatting_prompts_func(examples):
    """Render a batch of Input/Output pairs as EOS-terminated prompt strings.

    Args:
        examples: Mapping holding parallel sequences under the keys "Input"
            and "Output".

    Returns:
        A dict with the single key "text" whose value is a list containing one
        formatted string per (Input, Output) pair.
    """
    pairs = zip(examples["Input"], examples["Output"])
    # Every example is terminated with EOS_TOKEN; per the original author's note,
    # generation otherwise goes on forever.
    rendered = [alpaca_prompt.format(source, target) + EOS_TOKEN
                for source, target in pairs]
    return {"text": rendered}

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [8]:
from transformers import TextStreamer

# Shared conversation state: a list of (user_input, bot_response) tuples.
# chat() appends to and trims this list in place and never clears it, so the history
# carries over between successive chat() calls until this cell is re-run.
conversation_history = []

def format_conversation(history):
    """Render every turn of the conversation into a single prompt string.

    Args:
        history: Iterable of (prompt, response) pairs, oldest first.

    Returns:
        The `alpaca_prompt` template filled in once per pair, concatenated in
        order with nothing inserted between turns; "" when `history` is empty.
    """
    return "".join(
        alpaca_prompt.format(turn_prompt, turn_reply)
        for turn_prompt, turn_reply in history
    )

def chat(model, tokenizer, max_history=5):
    """Run an interactive console chat loop against the loaded model.

    Reads lines from standard input until the user enters "exit" (any case,
    surrounding whitespace ignored). Each turn is recorded in the module-level
    `conversation_history`, the most recent turns are rendered with
    `format_conversation`, and the model's reply is printed and stored.

    Args:
        model: Model exposing `generate(**inputs, max_new_tokens=...)`.
        tokenizer: Tokenizer paired with `model`; used to encode the prompt
            (as PyTorch tensors) and to decode the generated tokens.
        max_history: Maximum number of turns, including the one being
            answered, that are kept and sent to the model. Values below 1 are
            treated as 1, because the current turn must always be present.

    Returns:
        None. Replies are printed and `conversation_history` is updated in place.
    """
    # A negative index of 0 would select the whole list, so clamp to at least 1.
    turns_to_keep = max(1, max_history)

    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            break

        # Record the new turn with an empty reply; the empty slot is where the
        # model continues the prompt.
        conversation_history.append((user_input, ""))

        # Trim in place so the shared module-level list itself is updated.
        conversation_history[:] = conversation_history[-turns_to_keep:]

        formatted_history = format_conversation(conversation_history)

        # Inputs are moved to the CUDA device, so a GPU runtime is required.
        inputs = tokenizer([formatted_history], return_tensors="pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=128)

        # The generated sequence starts with the prompt tokens. Decode only what
        # follows them instead of string-splitting the decoded text on the prompt,
        # which returns the entire history whenever the decoded prompt does not
        # match the original text exactly.
        prompt_length = inputs["input_ids"].shape[1]
        generated_tokens = outputs[0][prompt_length:]
        bot_response = tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        ).strip()

        # Fill in the reply for the turn that was just answered.
        conversation_history[-1] = (user_input, bot_response)

        print(f"Bot: {bot_response}")



# Enjoy testing Rick's Patience

1. Enter 'exit' (any case) to end the conversation
2. Since Llama-3 8B is still a WIP, it may trail off and start giving garbage: repetitive or completely nonsensical responses (more than normal Rick). If so, kindly rerun the cell and start the conversation again.
3. Remember to disconnect and delete the runtime after use, as T4 GPU usage is limited for free users.



In [9]:
# Start the interactive session with the fine-tuned model; enter "exit" to stop.
chat(model, tokenizer)

You: hi there, who are you?
Bot: I'm the leader of these people, and I'm also their god. I'm also their prophet. I'm also their messiah. I'm also their savior. And I'm also the one true god.
You: And your name?
Bot: Uh, Rick Sanchez, Morty Smith. We're from Earth, and we're here to take over the galaxy.
You: I am not morty, i am his gazorpazorpian son
Bot: What do you think of this, Morty? It's a fully functioning portal gun, but it's also a fully functioning flamethrower.
You: you plan to burn me alive?
Bot: I'm gonna be able to get a lot of use out of this thing, Morty. It's got a lot of practical applications. You know, I'm gonna be able to take it to a lot of places, Morty. You know, I'm gonna be able to show it to a lot of people.
You: quit this rick
Bot: I'm not gonna let you die, Morty. I'm not gonna let you die. I'm gonna take you with me when I leave, Morty. You're gonna be a huge part of my new life.
You: quit
Bot: I'm not gonna let you die, Morty. I'm not gonna let you die. 