<a href="https://colab.research.google.com/github/prasanna14200/generativeai/blob/main/chatgptoutputhighstreamingmode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Response Output: Full Block vs. Low Level Streaming Mode vs. High Level Streaming Mode

This notebook can run on a low-cost or free T4 runtime.

In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, TextIteratorStreamer
import torch
import gradio as gr
import threading

## Sign in to Hugging Face

In [3]:
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [4]:
# Define the instruct model names
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Test Different types of Response Output

In [5]:
# define helper function to load model and tokenizer
def load_model(model_name):
  quant_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_quant_type="nf4"
  )
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokenizer.pad_token = tokenizer.eos_token
  model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quant_config)
  return tokenizer, model

In [6]:
# define different generating functions:
#   1- full response
#   2- low level streaming response
#   3- low level streaming response

def generate_full(tokenizer, model, user_input, max_tokens=2000):
  global messages
  # Append the user's new message to the conversation history
  messages.append({"role": "user", "content": user_input})

  inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
  outputs = model.generate(inputs, max_new_tokens=max_tokens)
  response = tokenizer.decode(outputs[0])
  print(response)

def generate_stream_low_level(tokenizer, model, user_input, max_tokens=2000):
    global messages
    # Append the user's new message to the conversation history
    messages.append({"role": "user", "content": user_input})

    # Prepare the initial input
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")

    # Generate up to 2000 tokens
    for _ in range(max_tokens):
        outputs = model(input_ids)  # Get the model's output (logits) for the given input IDs
        # Select the token with the highest probability from the last position's logits
        next_token_id = outputs.logits[:, -1].argmax(dim=-1).unsqueeze(-1)

        input_ids = torch.cat([input_ids, next_token_id], dim=-1)  # Append new token
        next_token = tokenizer.decode(next_token_id[0])  # Decode and print
        # flush=True ensures the output is immediately written to the console.
        # By default, print output is buffered, so it may not appear instantly.
        # flush=True forces the buffer to flush, making real-time output possible.
        print(next_token, end="", flush=True)

        if next_token_id.item() == tokenizer.eos_token_id:  # Stop if EOS token
            break
    print()

def generate_stream_high_level(tokenizer, model, user_input, max_tokens=2000):
  global messages
  # Append the user's new message to the conversation history
  messages.append({"role": "user", "content": user_input})

  inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
  # we skip using TextStreamer() here cause it streams back results to stdout and thats not what we want in gradio app
  # and we use TextIteratorStreamer() instead

  # Initialize the TextIteratorStreamer for streaming output
  streamer = TextIteratorStreamer(
      tokenizer,
      skip_prompt=True,
      decode_kwargs={"skip_special_tokens": True}
  )

  # Run the generation process in a separate thread
  thread = threading.Thread(
      target=model.generate,
      kwargs={"inputs": inputs, "max_new_tokens": max_tokens, "streamer": streamer}
  )
  thread.start()

  # Stream and print the output progressively
  for text_chunk in streamer:
    filtered_chunk = text_chunk.replace("<|eot_id|>", "")  # Remove special tokens if present
    print(filtered_chunk, end="")  # Print without adding new lines

## Load the model and tokenizer in order to test generating

In [7]:
# call the helper function and load the model and tokenizer
tokenizer, model = load_model(LLAMA)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

## Start testing the three generating functions

In [8]:
# initialize the messages history, the max tokens for the model, and the user_input
messages = [
    {"role": "system", "content": "You are a helpful assistant"}
]

max_tokens = 2000

user_input = "What is the meaning of life? Answer in markdown and in 5 lines maximum."

In [9]:
generate_full(tokenizer, model, user_input)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the meaning of life? Answer in markdown and in 5 lines maximum.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

**The Meaning of Life**

The meaning of life is a subjective and complex question that has puzzled philosophers, theologians, and scientists for centuries. Some possible answers include:

* **Existential fulfillment**: finding purpose and happiness in life's experiences and relationships.
* **Self-actualization**: realizing one's potential and living authentically.
* **Spiritual connection**: seeking a higher power or transcendence.
* **Personal growth**: learning, growing, and contributing to the world.
* **Happiness and well-being**: living a life that brings joy and fulfillment.<|eot_id|>


In [10]:
generate_stream_low_level(tokenizer, model, user_input)

**The Meaning of Life**

The meaning of life is a subjective question with various interpretations. 
It can be seen as a pursuit of happiness, fulfillment, and personal growth.
For some, it's about leaving a lasting legacy or making a positive impact.
Others may find meaning in spiritual or philosophical pursuits.
Ultimately, the meaning of life is unique to each individual.<|eot_id|>


In [11]:
generate_stream_high_level(tokenizer, model, user_input)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


**The Meaning of Life**

The meaning of life is a subjective and complex question that has puzzled philosophers, theologians, and scientists for centuries. It is a personal and individual concept that can vary greatly from person to person. Some possible answers include:

*   **Existential fulfillment**: Living a life of purpose, happiness, and fulfillment.
*   **Spiritual connection**: Seeking a deeper connection with a higher power or the universe.
*   **Self-discovery**: Continuously learning, growing, and evolving as a person.

Ultimately, the meaning of life is a question that each individual must answer for themselves.

# Implement the Gradio Interface

In [12]:
# define the streaming function for gradio (using yield)
def generate_stream(user_input):
    # Global variables for modifications
    global tokenizer, model, messages, max_tokens

    # Step 1: Append the user's new message to the conversation history
    messages.append({"role": "user", "content": user_input})

    # Step 2: Tokenize the input messages and convert them into a tensor
    # - apply_chat_template: Formats the messages according to the model's expected input format.
    # - return_tensors="pt": Returns the result as a PyTorch tensor.
    # - add_generation_prompt=True: Adds a special prompt or token for generation.
    # - .to("cuda"): Moves the tensor to the GPU for faster computation.
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
    # Type: torch.Tensor of shape [batch_size, sequence_length].to("cuda")

    # Initialize an empty string to accumulate the generated result
    result = ""

    # Step 3: Start generating tokens in a loop, up to a maximum of 2000 tokens
    for _ in range(max_tokens):
        # Step 4: Pass the current input sequence to the model to predict the next token
        # - outputs.logits: Contains the raw prediction scores (logits) for all possible tokens.
        # - Shape of outputs.logits: [batch_size, sequence_length, vocab_size].
        # - outputs.logits[:, -1]: Selects the logits for the last token position (shape: [batch_size, vocab_size]).
        outputs = model(input_ids)
        # Type: transformers.modeling_outputs.CausalLMOutputWithPast containing logits of shape [batch_size, sequence_length, vocab_size]

        # Step 5: Find the token ID with the highest score (greedy decoding)
        # - argmax(dim=-1): Selects the index of the maximum value along the vocab_size dimension.
        # - unsqueeze(-1): Adds a new dimension at the last position, resulting in a shape of [batch_size, 1].
        next_token_id = outputs.logits[:, -1].argmax(dim=-1).unsqueeze(-1)
        # Type: torch.Tensor of shape [batch_size, 1].unsqueeze(-1)

        # Step 6: Append the newly generated token ID to the input_ids tensor
        # - torch.cat(): Concatenates the current input_ids with the next_token_id along the last dimension.
        # - This updates input_ids to include the newly generated token, so the model can use the updated sequence in the next iteration.
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)
        # Type: torch.Tensor of shape [batch_size, updated_sequence_length]

        # Step 7: Decode the newly generated token ID into a human-readable string
        # - tokenizer.decode(): Converts the token ID into its corresponding string.
        # - skip_special_tokens=True: Ensures special tokens like <eos> (end-of-sequence) are not included in the output.
        next_token = tokenizer.decode(next_token_id[0], skip_special_tokens=True)
        # Type: str representing the decoded token

        # Step 8: Accumulate the decoded token into the result string
        result += next_token

        # Step 9: Yield the accumulated result for streaming output
        # - yield allows the function to return partial results without stopping, enabling real-time streaming.
        yield result

        # Step 10: Check if the model predicted the end-of-sequence (EOS) token
        # - tokenizer.eos_token_id: The special token ID representing EOS.
        # - If EOS is detected, break the loop to stop further generation.
        if next_token_id.item() == tokenizer.eos_token_id:
            break

    # Append the final assistant response to the conversation history
    messages.append({"role": "assistant", "content": result})

In [13]:
# optimize the streaming function for gradio (using TextIteratorStreamer)
def generate_stream_optimized(user_input):
  # Global variables for modifications
  global tokenizer, model, messages, max_tokens

  # Step 1: Append the user's new message to the conversation history
  messages.append({"role": "user", "content": user_input})

  # Step 2: Prepare the inputs for the model by applying the chat template
  # The inputs include the conversation history and the user's latest message
  inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")
  # we skip using TextStreamer() here cause it streams back results to stdout and thats not what we want in gradio app
  # we use TextIteratorStreamer() instead

  # Step 3: Initialize the TextIteratorStreamer
  streamer = TextIteratorStreamer(
      tokenizer,
      skip_prompt=True,  # Ensures that the input prompt is not repeatedly included in the streamed output.
      decode_kwargs={"skip_special_tokens": True}  # Filters out special tokens (e.g., <s>, </s>, <pad>, <cls>, <sep>) from the generated text.
  )

  # Step 4: Create a thread to run the generation process in the background
  thread = threading.Thread(
      target=model.generate,  # Specifies that the model's `generate` method will be run in the thread.
      kwargs={                           # Passes the arguments required for text generation
          "inputs": inputs,              # The tokenized input prompt for the model.
          "max_new_tokens": max_tokens,  # Limits the number of tokens to be generated.
          "streamer": streamer           # The TextIteratorStreamer to handle streaming the output.
          }
  )

  # Step 5: Start the thread to begin the generation process
  thread.start()

  # Step 6: Initialize an empty string to accumulate the growing output
  accumulated_reply = ""

  # Step 7: Stream the output progressively
  for text_chunk in streamer:  # Iterate over each chunk of text streamed by the model
      # Filter out any unexpected special tokens manually if they appear to ensure a clean output
      # `<|eot_id|>` is a special token (e.g., end-of-text marker) that may still appear in some outputs
      filtered_chunk = text_chunk.replace("<|eot_id|>", "")

      # Append the filtered chunk to the accumulated text that holds all the generated text seen so far
      accumulated_reply += filtered_chunk

      # Yield the accumulated text to the calling function/UI for progressive updates,
      # ensuring the output is continuously refreshed with new content
      yield accumulated_reply

  # Step 8: Append the final assistant response to the conversation history
  messages.append({"role": "assistant", "content": accumulated_reply})

In [14]:
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Chat with AI (Streaming Enabled)")
    with gr.Row():
      with gr.Column():
        user_input = gr.Textbox(label="Your message", placeholder="Type something...")
        output_box = gr.Markdown(label="AI Response", min_height=50)
        send_button = gr.Button("Send")

    send_button.click(fn=generate_stream_optimized, inputs=user_input, outputs=output_box)

demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://68a0440740bcc4000e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [2]:
import json

notebook_filename = "chatgptoutputhighstreamingmode.ipynb"  # 👈 choose any cleaned name

# Read the current notebook as text (works inside Colab only)
from google.colab import _message

# Save current notebook content as string
notebook_data = _message.blocking_request('get_ipynb')['ipynb']

# Clean widgets metadata
if "widgets" in notebook_data.get("metadata", {}):
    del notebook_data["metadata"]["widgets"]

# Save the cleaned notebook to local drive
with open(notebook_filename, "w") as f:
    json.dump(notebook_data, f, indent=2)

print(f"✅ Cleaned notebook saved as {notebook_filename}")


✅ Cleaned notebook saved as chatgptoutputhighstreamingmode.ipynb


In [3]:
import json
from google.colab import _message

# Get the notebook content
notebook_data = _message.blocking_request('get_ipynb')['ipynb']

# Fix: remove problematic widgets metadata
if "widgets" in notebook_data.get("metadata", {}):
    print("🧹 Removing invalid 'metadata.widgets' section...")
    del notebook_data["metadata"]["widgets"]
else:
    print("✅ No widgets metadata found. Nothing to clean.")

# Save cleaned notebook
fixed_filename = "chatgptoutputhighstreamingmode.ipynb"
with open(fixed_filename, "w") as f:
    json.dump(notebook_data, f, indent=2)

print(f"✅ Cleaned notebook saved as: {fixed_filename}")


🧹 Removing invalid 'metadata.widgets' section...
✅ Cleaned notebook saved as: chatgptoutputhighstreamingmode.ipynb
