In [1]:
!pip install -q transformers accelerate bitsandbytes sentencepiece


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_4bit=True,
        torch_dtype=torch.float16
    )
    return tokenizer, model


In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer, model = load_model(model_name)

prompt = """You are a customer support assistant.
Customer: My product is delayed. What should I do?
Assistant:"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=150)

print(tokenizer.decode(output[0], skip_special_tokens=True))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a customer support assistant.
Customer: My product is delayed. What should I do?
Assistant: I'm sorry to hear that your product has been delayed. I understand how frustrating this can be. Here are some steps you can take:
1. Contact the seller or the company directly: They may be able to provide you with an updated delivery date or an explanation for the delay.
2. Check your order status: Sometimes, delays can occur due to unforeseen circumstances or issues with your order. Checking your order status can help you understand the cause of the delay.
3. Consider insurance: If the delay is significant, you may want to consider purchasing insurance for your order. This can help protect you against potential damages or losses.
4. Contact your bank or credit card company: If the delay is


In [4]:
!pip install -q transformers accelerate bitsandbytes sentencepiece gradio
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr



In [5]:
MODEL_REGISTRY = {
    "Mistral (Customer Support)": "mistralai/Mistral-7B-Instruct-v0.2",
    "LLaMA (HR Resume Analysis)": "meta-llama/Llama-2-7b-chat-hf",
    "Falcon (Document Summarization)": "tiiuae/falcon-7b-instruct",
    "Qwen (Multilingual Assistant)": "Qwen/Qwen-7B-Chat"
}
# Model Registry (All Models in One Place)

# Universal Model Loader (4-bit for Colab)
loaded_models = {}

def load_model(model_name):
    if model_name in loaded_models:
        return loaded_models[model_name]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        load_in_4bit=True,
        torch_dtype=torch.float16
    )

    loaded_models[model_name] = (tokenizer, model)
    return tokenizer, model
# 💡 Teaching tip:
# Explain “We load models only once to save GPU memory.”


In [6]:
# Text Generation Function
def generate_text(model_key, user_prompt, max_tokens=200):
    model_name = MODEL_REGISTRY[model_key]
    tokenizer, model = load_model(model_name)

    inputs = tokenizer(user_prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [7]:
# Predefined Real-World Business Prompts
USE_CASE_PROMPTS = {
    "Customer Support":
    "You are a customer support assistant. A customer says their order is delayed. Respond politely and helpfully.",

    "HR Resume Screening":
    "You are an HR assistant. Evaluate this resume for Data Analyst role:\nSkills: SQL, Python, Power BI, Excel",

    "Policy Summarization":
    "Summarize the following government policy in simple bullet points:\nIndia's National Education Policy focuses on...",

    "Multilingual Translation":
    "Translate the following sentence into Hindi:\n'Your loan will be approved within 5 working days.'"
}




In [8]:
# Test WITHOUT UI (Quick Demo)
print(
    generate_text(
        "Mistral (Customer Support)",
        USE_CASE_PROMPTS["Customer Support"]
    )
)
# ✔ Useful before launching UI
# ✔ Helps avoid demo surprises

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a customer support assistant. A customer says their order is delayed. Respond politely and helpfully.

Subject: Order Delay

Hello [Customer's Name],

Thank you for reaching out to us regarding your order. I apologize for any inconvenience caused by the delay. Our team is committed to providing you with the best possible service, and I'm here to help.

I'd be happy to provide you with some information about the status of your order. Could you please provide me with your order number or the email address associated with the order? This will help me to look up the details of your order and provide you with an accurate update.

In the meantime, I'd like to assure you that we take order delays seriously and are working diligently to get your package on its way to you as soon as possible. We understand that you may have plans that depend on the arrival of your order, and we're here to help in any way we can.

If you have any other questions or concerns


In [10]:
def gradio_demo(model_key, use_case, custom_prompt):
    if custom_prompt.strip() == "":
        prompt = USE_CASE_PROMPTS[use_case]
    else:
        prompt = custom_prompt

    return generate_text(model_key, prompt)

gradio_demo(
    "Mistral (Customer Support)",
    "Customer Support",
    "Customer: My product is delayed. What should I do?"
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"Customer: My product is delayed. What should I do?\n\nAnswer:\n\n1. Communicate with your customers: Be upfront and transparent about the delay, explaining the reason(s) for it and the new estimated delivery date if possible.\n2. Consider offering an incentive: Depending on the reason for the delay, you may want to offer a discount, additional feature, or other incentive to make up for the inconvenience.\n3. Provide regular updates: Keep your customers informed of the progress, even if it's just to let them know that things are still on track and there's no new information to share.\n4. Assess and address the root cause: Once the product is delivered, take a hard look at why the delay occurred and put processes in place to prevent it from happening again.\n5. Review your communication strategy: Consider whether your communication strategy was effective, and if not, what changes you can make to improve it for future delays."

In [11]:
# Gradio UI (MAIN DEMO)
def gradio_demo(model_key, use_case, custom_prompt):
    if custom_prompt.strip() == "":
        prompt = USE_CASE_PROMPTS[use_case]
    else:
        prompt = custom_prompt

    return generate_text(model_key, prompt)

with gr.Blocks(title="Open-Source LLM Demo") as demo:
    gr.Markdown("## 🚀 Open-Source LLM Live Demo (Colab)")
    gr.Markdown(
        "Run **LLaMA, Mistral, Falcon, Qwen** on free GPU with real business use cases."
    )

    model_choice = gr.Dropdown(
        choices=list(MODEL_REGISTRY.keys()),
        label="Select Model"
    )

    use_case = gr.Dropdown(
        choices=list(USE_CASE_PROMPTS.keys()),
        label="Select Use Case"
    )

    custom_prompt = gr.Textbox(
        lines=4,
        placeholder="(Optional) Enter your own business prompt"
    )

    output = gr.Textbox(lines=12, label="Model Output")

    run_btn = gr.Button("Run Model")

    run_btn.click(
        gradio_demo,
        inputs=[model_choice, use_case, custom_prompt],
        outputs=output
    )

demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://adec1916116315a120.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/transformers/dynamic_module_utils.py", line 754, in resolve_trust_remote_code
    prev_sig_handler = signal.signal(signal.SIGALRM, _raise_timeout_error)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/signal.py", line 58, in signal
    handler = _signal.signal(_enum_to_int(signalnum), _enum_to_int(handler))
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: signal only works in main thread of the main interpreter

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/route_utils.py", line 354, in call_process_api
 

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://162540901ad385f429.gradio.live
Killing tunnel 127.0.0.1:7861 <> https://adec1916116315a120.gradio.live


