<a href="https://colab.research.google.com/github/priyal6/General-llm/blob/main/slm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Candidate small language models (display name → Hugging Face model id)
# Phi-3 Mini        → microsoft/Phi-3-mini-4k-instruct
# LLaMA 3 8B        → meta-llama/Meta-Llama-3-8B-Instruct
# TinyLlama         → TinyLlama/TinyLlama-1.1B-Chat-v1.0
# Gemma             → google/gemma-2b-it
# Flan-T5           → google/flan-t5-base
# MiniLM            → sentence-transformers/all-MiniLM-L6-v2  (sentence-embedding model, not a text generator)
# Qwen 2.5          → Qwen/Qwen2.5-7B-Instruct

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Smoke test: load one small chat checkpoint and generate a short completion.
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves device placement to the loader, so the model may
# end up on a GPU when one is available.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"
)

prompt = "What is machine learning?"
# Move the prompt tensors to wherever the model was placed; otherwise the
# inputs stay on the CPU while the weights may live on an accelerator.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(**inputs, max_new_tokens=80)
print(tokenizer.decode(output[0], skip_special_tokens=True))




What is machine learning? How does it work, and what are its applications?


In [4]:
# Generate again from the `inputs`, `model` and `tokenizer` created in the
# previous cell, then print the decoded first sequence.
output = model.generate(
    max_new_tokens=80,
    **inputs,
)
print(
    tokenizer.decode(
        output[0],
        skip_special_tokens=True,
    )
)

What is machine learning? How does it work, and what are its applications?


In [8]:
# Routing table: task label -> model id that is loaded for that task.
TASKS = dict(
    chat="Qwen/Qwen2.5-7B-Instruct",
    code="Qwen/Qwen2.5-Coder-7B-Instruct",
    clean_text="google/gemma-2b-it",
    nlp="google/flan-t5-base",
)


In [9]:
def classify_task(prompt: str) -> str:
    """Pick a task label for ``prompt`` using ordered keyword rules.

    The prompt is lower-cased and tested against each rule in turn:
    ``"code"``, then ``"nlp"``, then ``"clean_text"``. The first rule with a
    matching keyword wins; when nothing matches, ``"chat"`` is returned.

    A keyword only counts when it starts a word (i.e. it is not preceded by
    another letter). Plain substring matching misroutes prompts such as
    "encoder-decoder" (contains "code") or "impolite" (contains "polite").
    Suffixes still match ("functions", "emails"), and "debug" is listed
    explicitly so it keeps routing to ``"code"``.

    Args:
        prompt: The user's request text.

    Returns:
        One of ``"code"``, ``"nlp"``, ``"clean_text"`` or ``"chat"``.
    """
    import re  # local import keeps this cell self-contained

    text = prompt.lower()

    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("code", ("code", "function", "python", "bug", "debug", "algorithm")),
        ("nlp", ("summarize", "classify", "translate", "question:")),
        ("clean_text", ("email", "polite", "rewrite", "professional")),
    )

    for task, keywords in rules:
        alternatives = "|".join(re.escape(word) for word in keywords)
        # (?<![a-z]) rejects matches that begin in the middle of a word.
        if re.search(rf"(?<![a-z])(?:{alternatives})", text):
            return task

    return "chat"


In [10]:
import torch, gc
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM
)

# Single-slot cache: the model, its tokenizer, and the model id they were
# loaded from. All three start out empty.
LOADED_MODEL = LOADED_TOKENIZER = LOADED_NAME = None


In [11]:
def load_model(model_name, task):
    """Return the ``(model, tokenizer)`` pair for ``model_name``, loading on demand.

    When ``model_name`` equals the cached ``LOADED_NAME`` the cached pair is
    returned as-is. Otherwise the current entry is dropped via
    ``unload_model()``, the tokenizer and model are loaded, and the cache
    globals are updated once both loads have succeeded.

    Args:
        model_name: Model id passed to the ``from_pretrained`` loaders.
        task: Task label. ``"nlp"`` selects ``AutoModelForSeq2SeqLM`` with
            default arguments; any other value selects
            ``AutoModelForCausalLM`` with ``torch.float16`` weights and
            ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global LOADED_MODEL, LOADED_TOKENIZER, LOADED_NAME

    cache_hit = LOADED_NAME == model_name
    if not cache_hit:
        # Drop the previous model before loading the next one.
        unload_model()

        print(f"Loading {model_name}...")

        fresh_tokenizer = AutoTokenizer.from_pretrained(model_name)

        if task != "nlp":
            fresh_model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto"
            )
        else:
            fresh_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        # Publish to the cache only after both loads succeeded.
        LOADED_MODEL, LOADED_TOKENIZER, LOADED_NAME = (
            fresh_model,
            fresh_tokenizer,
            model_name,
        )

    return LOADED_MODEL, LOADED_TOKENIZER


In [12]:
def unload_model():
    """Drop the cached model and tokenizer, if any, and reclaim memory.

    Does nothing when ``LOADED_MODEL`` is ``None``. Otherwise the three cache
    globals are reset to ``None``, a garbage-collection pass is run, and
    ``torch.cuda.empty_cache()`` is called.
    """
    global LOADED_MODEL, LOADED_TOKENIZER, LOADED_NAME

    if LOADED_MODEL is None:
        return

    # Rebinding to None releases this module's references to the objects.
    LOADED_MODEL = LOADED_TOKENIZER = LOADED_NAME = None

    gc.collect()
    torch.cuda.empty_cache()


In [13]:
def run(prompt: str, max_new_tokens=150, temperature=0.7):
    """Route ``prompt`` to a task-specific model and return the decoded output.

    The prompt is classified with ``classify_task``, the model id is looked
    up in ``TASKS``, and the model/tokenizer pair comes from ``load_model``.
    The prompt is tokenized, moved to the model's device, passed to
    ``generate``, and the first returned sequence is decoded with special
    tokens stripped.

    Args:
        prompt: The user's request text.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Sampling temperature. A positive value enables sampling
            (``do_sample=True``) with that temperature. ``0`` or ``None``
            disables sampling and the temperature is not forwarded.

    Returns:
        The decoded text of the first generated sequence.

    Raises:
        KeyError: If ``classify_task`` returns a label missing from ``TASKS``.
    """
    task = classify_task(prompt)
    model_name = TASKS[task]

    model, tokenizer = load_model(model_name, task)

    # Follow the model's actual placement instead of assuming "cuda"; this
    # also covers models that were loaded onto the CPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature and temperature > 0:
        # `temperature` only takes effect in sample-based generation, so
        # sampling has to be switched on explicitly.
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False

    output = model.generate(**inputs, **generation_kwargs)

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
# Exercise the router: one prompt for each keyword rule in classify_task,
# followed by one that matches no rule.
demo_prompts = [
    "Write a Python function to reverse a list",
    "Summarize: Transformers are sequence models",
    "Write a polite email declining a meeting",
    "Explain attention mechanisms simply",
]

for demo_prompt in demo_prompts:
    print(run(demo_prompt))
