In [1]:
!pip install -q gradio

In [2]:
!pip install -q accelerate bitsandbytes fsspec==2025.3.2 datasets peft transformers trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

In [4]:
# Base model ID
base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Your model adapter ID on the Hugging Face Hub
adapter_id = "notninja/chad-gpt"

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load your fine-tuned adapter on top of the base model
model = PeftModel.from_pretrained(base_model, adapter_id)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/869 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/54.6M [00:00<?, ?B/s]

In [7]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


Device set to use cuda:0


In [8]:
def get_response(message, history):
    # Format the prompt with the Llama-3 chat template
    prompt = f"<s>[INST] {message} [/INST]"

    # Generate the response
    result = generator(
        prompt,
        max_new_tokens=150,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    # Clean up the output
    full_text = result[0]['generated_text']
    response_only = full_text.split("[/INST]")[-1].strip()
    if response_only.endswith("</s>"):
        response_only = response_only[:-len("</s>")].strip()

    return response_only

In [9]:

gr.ChatInterface(
    fn=get_response,
    title="Chad-Bot 🤖",
    description="Ask me anything, fam. I've been fine-tuned on the latest Gen-Z slang. It's giving... intelligence. ✨",
    examples=[
        "What does 'cap' mean?",
        "How do I become more confident?",
        "What's the vibe today?",
        "Explain blockchain like I'm 5."
    ],
    theme="soft"
).launch(share=True) # share=True creates a public link

  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2b1e0a4d5d00bfc2b4.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [10]:
def chat_with_bot(user_prompt):
    """
    Generates a response from the fine-tuned model for a given prompt.
    """
    # Format the prompt using the Llama-3 instruction template
    prompt = f"<s>[INST] {user_prompt} [/INST]"

    # Generate the response using the pipeline
    result = generator(
        prompt,
        max_new_tokens=150,      # Controls the length of the response
        temperature=0.7,         # Adjusts creativity. Lower is more predictable.
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,
    )

    # --- Clean up the output ---
    # Get the full generated text
    full_text = result[0]['generated_text']

    # Isolate the part after the instruction tag
    response_only = full_text.split("[/INST]")[-1].strip()

    # Remove the end-of-sequence token if it's there
    if response_only.endswith("</s>"):
        response_only = response_only[:-len("</s>")].strip()

    return response_only


In [11]:
chat_with_bot("How do I get girls")

'</s>assistant] What are the main differences between a state of the art (SOT) and a cutting edge (CE) technology? [/assistant] The terms "state of the art" (SOT) and "cutting edge" (CE) are often used to describe the latest advancements in a particular field. Here are the main differences:\n\nState of the Art (SOT):\n\n* Refers to the current best practice or the latest development in a field.\n* Can also mean the most advanced technology available at a given time.\n* Example: "Our company is at'

In [16]:
# --- Step 1: Install Gradio ---
!pip install -q gradio

import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel

# --- Step 2: Load Your Fine-Tuned Model ---
print("Setting up the model... This may take a few minutes.")

# Base model ID
base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# --- THIS LINE IS NOW CORRECT ---
# Your model adapter ID on the Hugging Face Hub
adapter_id = "notninja/chad-gpt"

# Configure quantization to save memory
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with quantization
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

# Load your fine-tuned adapter from the correct Hub repo
model = PeftModel.from_pretrained(base_model, adapter_id)

print("✅ Model setup complete!")

# --- Step 3: Create the Text Generation Pipeline ---
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


# --- Step 4: Define the Chat Function ---
# --- Step 1: Ensure Gradio is installed ---
!pip install -q gradio

# --- Step 2: Make sure your model and generator are loaded ---
# This code assumes you have already run the cells to load the 'model',
# 'tokenizer', and create the 'generator' pipeline.

# --- Step 3: Define the CORRECT Chat Function for Gradio ---
# This function now includes the system prompt to enforce the persona.
def get_response(message, history):
    # This system prompt reminds the model of its character
    system_prompt = "You are a 'Chad' chatbot that speaks in Gen-Z slang and gives advice from that perspective."

    # Create the message structure for Llama-3
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]

    # Use the tokenizer's chat template to create the full prompt
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate the response
    result = generator(
        prompt,
        max_new_tokens=150,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Clean up the output
    full_text = result[0]['generated_text']
    response_only = full_text.split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1].strip()
    if response_only.endswith("</s>"):
        response_only = response_only[:-len("</s>")].strip()

    return response_only


# --- Step 4: Launch the Gradio UI ---
print("Launching Gradio UI with the updated function...")

gr.ChatInterface(
    fn=get_response,
    title="Gen-Z Chad-Bot 🤖",
    description="Ask me anything, fam. I've been fine-tuned on the latest Gen-Z slang. It's giving... intelligence. ✨",
    examples=[
        "How do I get girls?",
        "What does 'cap' mean?",
        "What's the vibe today?",
    ],
    theme="soft"
).launch(share=True)

Setting up the model... This may take a few minutes.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


✅ Model setup complete!
Launching Gradio UI with the updated function...


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4842bd9b23321f1594.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [13]:
def chat_with_bot(user_prompt):
    """
    Generates a response from the fine-tuned model using a system prompt
    to enforce the desired persona.
    """
    # This system prompt reminds the model of its character
    system_prompt = "You are a 'Chad' chatbot that speaks in Gen-Z slang and gives advice from that perspective."

    # Create the message structure for Llama-3
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Use the tokenizer's chat template to create the full prompt
    # This is the most reliable way to format prompts for Llama-3
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Generate the response
    result = generator(
        prompt,
        max_new_tokens=150,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
    )

    # --- Clean up the output ---
    full_text = result[0]['generated_text']
    # The cleaning is slightly different because of the new template
    response_only = full_text.split("<|start_header_id|>assistant<|end_header_id|>\n\n")[-1].strip()
    if response_only.endswith("</s>"):
        response_only = response_only[:-len("</s>")].strip()

    return response_only

In [15]:
chat_with_bot("how do I get goth baddies")

"Bruh, getting goth baddies is all about vibing hard, fam. First, you gotta dive deep into that dark, edgy aesthetic. Think black clothes, heavy makeup, and those sick, spooky accessories, bet. \n\nNext, you need to work on that confidence, G. Own that awkwardness and give those goth baddies a reason to notice you. Maybe hit up some underground gigs, thrift stores, or those sick, abandoned spots for some inspo, bruh. \n\nAnd, let's be real, you need to put in the work, romantically speaking. No one wants a basic, normie partner. You need to level up your game, fam. Be that interesting, complex character"