# Autoregressive LLM Demo
## MIS 769 - Advanced Big Data Analytics
### Prof. Richard Young (ryoung@unlv.edu)

This notebook demonstrates how language models generate text one token at a time (autoregressive generation).

Colab notes:
- Works on GPU when available (faster)
- Falls back to CPU automatically
- Installs missing dependencies in-notebook


In [None]:
# Colab/Jupyter dependency setup
import importlib
import importlib.util
import logging
import os
import subprocess
import sys

# Keep huggingface_hub from implicitly sending a stored token and silence its
# HTTP-layer log noise for anonymous downloads.
os.environ["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = "1"
logging.getLogger("huggingface_hub.utils._http").setLevel(logging.ERROR)

# Minimum versions this notebook relies on; ``None`` means "any version".
# The same table drives both the installed-version check and the pip command,
# so the two cannot drift apart.
MIN_VERSIONS = {
    "transformers": "4.45.0",
    "accelerate": "0.33.0",
    "sentencepiece": None,
}

# Step 1: is anything missing outright? find_spec() on a top-level name only
# locates the package; it does not import it.
needs_install = False
for pkg in MIN_VERSIONS:
    if importlib.util.find_spec(pkg) is None:
        needs_install = True
        break

# Step 2: everything is present, so check the pinned minimum versions.
if not needs_install:
    try:
        # Read versions from the installed distribution metadata instead of
        # importing the packages. Importing transformers here would cache the
        # old module in sys.modules, so a pip upgrade below would not be
        # picked up by later cells until the runtime is restarted.
        import importlib.metadata

        from packaging import version

        for pkg, minimum in MIN_VERSIONS.items():
            if minimum is None:
                continue
            installed = importlib.metadata.version(pkg)
            if version.parse(installed) < version.parse(minimum):
                needs_install = True
                print(f"Upgrading {pkg} from {installed} to >={minimum}")
    except Exception:
        # Deliberately broad: if the versions cannot be verified for any
        # reason (missing `packaging`, broken metadata, unparsable version),
        # the safe response is simply to (re)install.
        needs_install = True

if needs_install:
    requirements = [
        f"{name}>={minimum}" if minimum else name
        for name, minimum in MIN_VERSIONS.items()
    ]
    # Use the kernel's own interpreter so packages land in its environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *requirements])
    print("Dependencies installed/updated.")
else:
    print("Dependencies already installed and version-compatible.")



In [None]:
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoints used by the demo: the instruct model shown in class (pinned to a
# specific revision) and a much smaller model to fall back on if it cannot load.
PRIMARY_MODEL = "microsoft/Phi-3-mini-4k-instruct"
PRIMARY_REVISION = "f39ac1d28e925b323eae81227eaba4464caced4e"
FALLBACK_MODEL = "distilgpt2"

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")


In [None]:
# Module-level cache filled by load_model() so the weights are loaded only once
# per kernel session.
_tokenizer = None
_model = None
_loaded_model_id = None


def _load_causal_lm(model_id, weight_dtype, model_kwargs):
    """Instantiate the causal LM, accepting either spelling of the dtype option.

    Args:
        model_id: Hub identifier passed to ``from_pretrained``.
        weight_dtype: torch dtype to load the weights in.
        model_kwargs: Remaining keyword arguments for ``from_pretrained``.

    Returns:
        The loaded model.
    """
    try:
        return AutoModelForCausalLM.from_pretrained(
            model_id, dtype=weight_dtype, **model_kwargs
        )
    except TypeError:
        # NOTE(review): the `dtype=` keyword appears to be accepted only by
        # newer transformers releases, while older ones (still allowed by the
        # setup cell's minimum version) spell it `torch_dtype=` and reject the
        # unknown keyword. Retry with the older spelling; confirm the exact
        # cut-over version against the transformers docs.
        return AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=weight_dtype, **model_kwargs
        )


def load_model():
    """Load the demo model once and reuse it across runs.

    Tries ``PRIMARY_MODEL`` (pinned to ``PRIMARY_REVISION``) first and, if
    anything goes wrong, ``FALLBACK_MODEL``. The first candidate that loads is
    stored in the module-level cache and returned on every later call.

    Returns:
        tuple: ``(tokenizer, model, model_id)`` for the candidate that loaded.

    Raises:
        RuntimeError: If no candidate could be loaded; chained from the last
            underlying error.
    """
    global _tokenizer, _model, _loaded_model_id

    if _tokenizer is not None and _model is not None:
        return _tokenizer, _model, _loaded_model_id

    # Use native Transformers implementation for Phi-3 to avoid remote-code rope_scaling issues.
    model_candidates = [
        (PRIMARY_MODEL, PRIMARY_REVISION, False),
        (FALLBACK_MODEL, None, False),
    ]

    # Half precision on GPU, full precision on CPU.
    weight_dtype = torch.float16 if DEVICE == "cuda" else torch.float32

    last_error = None
    for model_id, revision, trust_remote_code in model_candidates:
        try:
            # Options shared by the tokenizer and the model download.
            # token=False requests anonymous access to the Hub.
            shared_kwargs = {"token": False}
            if trust_remote_code:
                shared_kwargs["trust_remote_code"] = True
            if revision is not None:
                shared_kwargs["revision"] = revision

            tokenizer = AutoTokenizer.from_pretrained(model_id, **shared_kwargs)

            model_kwargs = dict(shared_kwargs, low_cpu_mem_usage=True)
            if model_id == PRIMARY_MODEL:
                model_kwargs["attn_implementation"] = "eager"
            if DEVICE == "cuda":
                model_kwargs["device_map"] = "auto"

            model = _load_causal_lm(model_id, weight_dtype, model_kwargs)

            if DEVICE == "cpu":
                model = model.to("cpu")

            # generate() is later given pad_token_id; reuse EOS when the
            # tokenizer defines no dedicated padding token.
            if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
                tokenizer.pad_token = tokenizer.eos_token

            _tokenizer, _model, _loaded_model_id = tokenizer, model, model_id
            print(f"Loaded model: {model_id}")
            return _tokenizer, _model, _loaded_model_id
        except Exception as e:
            # Deliberately broad: any failure moves on to the next candidate.
            last_error = e
            print(f"Could not load {model_id}: {e}")

    raise RuntimeError(
        "Failed to load models from Hugging Face. "
        "Check Colab internet access and rerun the setup cell."
    ) from last_error


def _build_input_ids(prompt, tokenizer, model_device, model_id, use_chat_template=True):
    """Turn *prompt* into input ids on *model_device*.

    The chat template is applied only when it was requested, the loaded model
    is ``PRIMARY_MODEL``, and the tokenizer offers ``apply_chat_template``;
    in every other case the prompt is tokenized as plain text.

    Returns:
        tuple: ``(input_ids, used_chat_template)`` where the second item is
        ``True`` only if the chat template was applied.
    """
    template_applies = (
        use_chat_template
        and model_id == PRIMARY_MODEL
        and hasattr(tokenizer, "apply_chat_template")
    )

    if not template_applies:
        encoded = tokenizer(prompt, return_tensors="pt")
        return encoded.input_ids.to(model_device), False

    # Wrap the prompt as a single user turn and append the generation prompt.
    conversation = [{"role": "user", "content": prompt}]
    encoded = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
    return encoded["input_ids"].to(model_device), True


def _eos_token_ids(tokenizer, model):
    """Collect the token ids the tokenizer or the model's generation config mark as end-of-sequence."""
    eos_ids = set()
    generation_config = getattr(model, "generation_config", None)
    for candidate in (
        getattr(tokenizer, "eos_token_id", None),
        getattr(generation_config, "eos_token_id", None),
    ):
        if candidate is None:
            continue
        # The value may be a single id or a collection of ids.
        if isinstance(candidate, (list, tuple, set)):
            eos_ids.update(candidate)
        else:
            eos_ids.add(candidate)
    return eos_ids


def show_autoregressive_generation(
    prompt,
    max_tokens=15,
    temperature=0.7,
    delay=0.3,
    use_chat_template=True,
    stop_on_sentence_end=False,
    seed=None,
):
    """Generate one token at a time to visualize autoregressive decoding.

    Each step asks the model for exactly one new token, prints it together
    with the text so far, and feeds the extended sequence back in.

    Args:
        prompt: Text to continue.
        max_tokens: Maximum number of single-token steps. With a value <= 0 no
            step runs and the decoded prompt is returned.
        temperature: Sampling temperature. Values <= 0 (or ``None``) switch to
            greedy decoding instead of being passed to ``generate``.
        delay: Seconds to pause after each step; falsy disables the pause.
        use_chat_template: Forwarded to ``_build_input_ids``.
        stop_on_sentence_end: Stop once the text ends with ``.``, ``!`` or
            ``?``, but only from the 7th step onward.
        seed: If given, seeds torch (and CUDA when available) before sampling.

    Returns:
        str: The decoded text after the last step (special tokens skipped).
    """
    tokenizer, model, model_id = load_model()

    model_device = next(model.parameters()).device
    input_ids, used_chat_template = _build_input_ids(
        prompt,
        tokenizer,
        model_device,
        model_id,
        use_chat_template=use_chat_template,
    )

    current_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    # Defined up front so the final return works even when the loop body never
    # runs (max_tokens <= 0).
    new_text = current_text

    if seed is not None:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    print(f"\nModel: {model_id}")
    print(f"Prompt: '{prompt}'")
    print(f"Seed: {seed if seed is not None else 'random'}")
    print(f"Chat template: {'on' if used_chat_template else 'off'}")
    print("\nGenerating token by token...")

    # Sampling needs a strictly positive temperature; otherwise decode greedily.
    do_sample = temperature is not None and temperature > 0
    generate_kwargs = {
        "max_new_tokens": 1,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if do_sample:
        generate_kwargs["temperature"] = temperature

    eos_ids = _eos_token_ids(tokenizer, model)

    for step in range(max_tokens):
        outputs = model.generate(
            input_ids,
            # One un-padded sequence: every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            **generate_kwargs,
        )

        new_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if new_text.startswith(current_text):
            new_token = new_text[len(current_text):]
        else:
            # Re-decoding changed earlier characters; show the full text.
            new_token = new_text

        print(f"\nStep {step + 1}:")
        print(f"New token: {new_token!r}")
        print(f"Text so far: {new_text}")

        grew = outputs.shape[-1] > input_ids.shape[-1]
        hit_eos = grew and int(outputs[0, -1]) in eos_ids

        input_ids = outputs
        current_text = new_text

        # Do not keep feeding the model after it has ended the sequence.
        if hit_eos:
            print("\nReached end-of-sequence token; stopping early.")
            break

        if delay:
            time.sleep(delay)

        if stop_on_sentence_end and new_text.endswith((".", "!", "?")) and step > 5:
            break

    return new_text



In [None]:
# Classroom demo: walk through generation one token per step (Step 1..Step N).
prompt = "Some people like cats but I am more of"

# seed=103 keeps the run repeatable in class; max_tokens=10 caps the demo at
# ten prediction steps, and sentence-end stopping is switched off.
generated_text = show_autoregressive_generation(
    prompt=prompt,
    seed=103,
    max_tokens=10,
    temperature=0.7,
    delay=0.4,
    use_chat_template=True,
    stop_on_sentence_end=False,
)

print(f"\nFinal output: {generated_text}")

