In [None]:
!pip install -q torch torchvision transformers huggingface_hub qwen_vl_utils hf_transfer


# VLM Generation
#### Due to limited GPU resources, Qwen3-VL-4B was used. For better results, you can try Qwen3-VL-8B.

In [None]:
import torch
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


def describe_actions(video_path, model_id="Qwen/Qwen3-VL-4B-Instruct", max_new_tokens=512):
    """Describe the basketball actions in a video with a Qwen3-VL model.

    The processor and model are loaded on every call; the model is loaded in
    float16 and moved to CUDA, so a CUDA device is required. The number of
    frames handed to the model and the generated description are printed.

    Args:
        video_path: Video location, passed as the ``video`` entry of the chat message.
        model_id: Hugging Face id of the checkpoint to load. Defaults to the
            4B instruct model; a larger checkpoint can be passed instead.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded model output with the prompt tokens removed.
    """
    action_context = (
        "You are an expert basketball video analyst. Describe actions precisely, chronologically.\n\n"
        "Frames represent ≈3 FPS (~3 frames ≈ 1 second).\n\n"
        "Rules to decide this:\n"
        "• If two players are contending or the ball is loose, write 'uncertain'.\n"
        "• Once a player clearly controls the ball, set possession to that team at that second and treat them as Offense.\n"
        "• Do NOT switch offense/defense later unless you explicitly describe a turnover, steal, rebound after a miss, or an inbound.\n\n"
        "Per-second instructions (only what CHANGES vs previous second):\n"
        "• Possession & roles (who controls the ball; Offense = team in possession).\n"
        "• Actions (dribbles, passes, drives, screens, cuts, contests).\n"
        "• Ball trajectory (where it moves; if in flight, from whom to whom/where).\n"
        "• Key events (shot attempt/make/miss, foul, turnover, rebound, block).\n\n"
        "Visual cues:\n"
        "• Refer to teams ONLY by the **color of the jersey** and with the help of the **colored ellipses** (e.g., Blue, White). Do not use numbers unless clearly readable; otherwise omit numbers.\n"
        "• The green triangle ball marker can be inaccurate—prefer motion/possession; if unclear, say 'possession uncertain'.\n"
        "• Never mention the court or the crowd.\n\n"
        "Style:\n"
        "• Be objective; do not hallucinate.\n"
        "• Do not repeat the same event across adjacent seconds.\n"
        "• End each second with: Outcome: one of [none, pass, shot_attempt, made, missed, foul, turnover, rebound, block]."
    )

    # Single user turn: the video followed by the instruction text.
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path,
                # NOTE(review): the prompt tells the model the frames are ≈3 FPS, but the
                # explicit setting below is disabled, so the sampling rate is whatever
                # process_vision_info uses by default — confirm the two agree.
                # "fps": 3.0
            },
            {
                "type": "text",
                "text": action_context
            }
        ]
    }]

    processor = AutoProcessor.from_pretrained(model_id)
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_id, torch_dtype=torch.float16
    ).to('cuda')

    # Inference only: no gradients are needed for preprocessing or generation.
    with torch.no_grad():
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        print("Number of frames processed:", video_inputs[0].shape[0])
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt"
        ).to("cuda")

        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Drop the prompt tokens so that only the newly generated answer is decoded.
        generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]
        description = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

    print(description)

    return description

In [None]:
# Run the VLM on the video file; describe_actions also prints the frame count and the generated text.
video = "output_video.avi"
description = describe_actions(video)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Description obtained by the VLM (hard-coded snapshot).
# NOTE: this assignment replaces whatever `describe_actions` returned, so the following
# cells always work from this fixed text.
# Plain string on purpose: nothing is interpolated, and an f-string would try to evaluate
# any literal braces that appear in pasted model output.
description = """
0.0 seconds
Blue controls the ball. A Blue player dribbles toward the basket, guarded by a White player. The Blue player passes the ball to a teammate who is cutting toward the basket. The ball is in flight from the Blue player to the teammate. Outcome: pass

0.2 seconds
The Blue player receives the pass and drives toward the basket, guarded by a White player. The Blue player attempts a shot. The shot is missed. Outcome: shot_attempt, missed"""

# LLM Refinement
#### The description obtained from the VLM is refined using Qwen3-4B-Instruct-2507.

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face id of the text-only instruct model used for the refinement step.
model_name = "Qwen/Qwen3-4B-Instruct-2507"


# Load the tokenizer and the model. dtype and device placement are both left to
# the library ("auto"), so no explicit .to(device) call is made here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

# Build the instruction prompt for the commentator LLM. The current value of `description`
# is interpolated at the very end, wrapped in single quotes. The leading indentation inside
# the triple-quoted string is part of the text that is sent to the model.
prompt = f"""
            You are a professional live basketball commentator.
            Your task is to generate a vivid and emotionally engaging **spoken commentary** based on a detailed frame-by-frame action description.

            Imagine you're broadcasting live: your words will be **converted into audio** using text-to-speech, so speak naturally, with excitement and clarity.

            ---

            **Guidelines**:
            - Follow the **CHRONOLOGY OF ACTIONS CLOSELY**, reflecting the sequence described (use the **timestamps as implicit structure**, but **don't include them** in your output). DON'T REVEAL the OUTCOME of the action BEFORE IT HAPPENS.
            - Make sure you COVER THE DESCRIPTION CHRONOLOGICALLY. You can USE THE GIVEN TIMESTAMPS to comment until the end of the action.
            - Your commentary must be **live, energetic, and natural**.
            - FIT EVERYTHING WITHIN <10 SECONDS — aim for **4 to 5 VERY SHORT AND CONCISE SENTENCES**.
            - **Your commentary must fit exactly within a few seconds**, so USE CONCISE YET NATURAL SENTENCES.
            - JUST GIVE THE OUTPUT, avoid unnecessary introduction. **DO NOT include extra labels like 'Commentary:' or explanations about what you are doing.**
            - Jersey color is linked to team identity.

            **Tone**:
            -**Use TEXT FORMATTING to create expressive speech** for SUSPENSEFUL ACTIONS:
                - **SHOUT important words**: `"DUNKKKK!!!"`, `"WHAT A SHOT!"`, `"UNBELIEVABLE SAVE!!!"`
                - **Elongate dramatic words**: `"3 POINTSSSS!"`, `"NOOOO! HE MISSED!!!"`
                - **Use suspenseful pauses**: `"Curry... takes a step... SHOOTS—WOW!!!"`
                - **Use sound-like words** for realism: `"OH WOW! WHAT A SHOT! THE CROWD ERUPTS!!!"`
            - However don't try to name the players, DON'T put things like [Player Name]

            ---

            **IMPORTANT**:
            - Cover the full action step-by-step.
            - **Don't summarize**. Capture the evolving intensity.
            - **Only output the final commentary**, no metadata, no labels.

            ---

            Here is the action description to turn into a few-second commentary (MAXIMUM 4 TO 5 SENTENCES - MAXIMUM 200 characters):
            '{description}'
            """

# Wrap the prompt as a single user turn and render it with the model's chat template.
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# The prompt asks for at most 4-5 short sentences (about 200 characters), so a small
# generation budget is ample. A bounded value also keeps a run that fails to stop from
# generating thousands of tokens.
MAX_NEW_TOKENS = 256

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=MAX_NEW_TOKENS
)
# The returned sequence starts with the prompt tokens; keep only the newly generated ids.
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

content = tokenizer.decode(output_ids, skip_special_tokens=True)

print("content:", content)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

content: Blue passes—teammate cuts!  
Blue drives, guarded—tries to shoot…  
NOOOO! HE MISSED!!!  
White’s defense holds—ball live!


In [1]:
# Hard-coded copy of the commentary printed by the refinement step; it overrides any
# `content` left over from that step. Plain string: nothing is interpolated, so the
# f-prefix is not needed.
content = """Blue passes—teammate cuts!
Blue drives, guarded—tries to shoot…
NOOOO! HE MISSED!!!
White’s defense holds—ball live!
"""

# Audio generation with a TTS model (Zonos)

In [2]:
!git clone https://github.com/Zyphra/Zonos.git
!cd Zonos

# For gradio
#!docker compose up


Cloning into 'Zonos'...
remote: Enumerating objects: 340, done.[K
remote: Counting objects: 100% (196/196), done.[K
remote: Compressing objects: 100% (80/80), done.[K
remote: Total 340 (delta 159), reused 116 (delta 116), pack-reused 144 (from 1)[K
Receiving objects: 100% (340/340), 3.03 MiB | 21.99 MiB/s, done.
Resolving deltas: 100% (211/211), done.


In [None]:
from transformers.models.dac.modeling_dac import DacModel
from transformers import AutoProcessor





In [3]:
!pip install kanjize
!pip install phonemizer
!pip install sudachipy
!pip install sudachidict_full

Collecting kanjize
  Downloading kanjize-1.6.1-py3-none-any.whl.metadata (2.5 kB)
Downloading kanjize-1.6.1-py3-none-any.whl (6.8 kB)
Installing collected packages: kanjize
Successfully installed kanjize-1.6.1
Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting segments (from phonemizer)
  Downloading segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.7.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting isodate (from csvw>=1.5.6->segments->phonemizer)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting rfc3986<2 (from csvw>=1.5.6->segments->phonemizer)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting lan

In [4]:
# System-level install of espeak-ng (presumably the speech backend used by the
# `phonemizer` package — TODO confirm). Requires a Debian/Ubuntu environment with apt.
!apt-get install -y espeak-ng


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 38 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-1

In [5]:
import sys
from pathlib import Path

import torch
import torchaudio

# Location of the cloned Zonos checkout, relative to the notebook's working directory.
# The same path is used for the import path and for the sample asset, instead of mixing
# a relative "./Zonos" with a Colab-only absolute "/content/Zonos/..." path.
ZONOS_DIR = Path("Zonos")
if str(ZONOS_DIR) not in sys.path:  # do not stack duplicate entries when the cell is re-run
    sys.path.append(str(ZONOS_DIR))

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
from zonos.utils import DEFAULT_DEVICE as device


# NOTE: this rebinds `model`, which the refinement step used for the Qwen LLM.
# model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device=device)
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)


# The example clip shipped with the repository provides the speaker embedding.
wav, sampling_rate = torchaudio.load(str(ZONOS_DIR / "assets" / "exampleaudio.mp3"))
speaker = model.make_speaker_embedding(wav, sampling_rate)

# Condition the generation on the commentary text, the speaker embedding and the language.
cond_dict = make_cond_dict(text=content, speaker=speaker, language="en-us")
conditioning = model.prepare_conditioning(cond_dict)

codes = model.generate(conditioning)

# Decode the generated codes to waveforms and write the first one to disk.
wavs = model.autoencoder.decode(codes).cpu()
torchaudio.save("output.wav", wavs[0], model.autoencoder.sampling_rate)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


ResNet293_SimAM_ASP_base.pt:   0%|          | 0.00/397M [00:00<?, ?B/s]

ResNet293_SimAM_ASP_base_LDA-128.pt:   0%|          | 0.00/265k [00:00<?, ?B/s]

Generating:   0%|          | 0/2588 [00:00<?, ?it/s]W1028 15:59:15.208000 822 torch/fx/experimental/symbolic_shapes.py:6823] [1/0] _maybe_guard_rel() was called on non-relation expression Eq(s38, 1) | Eq(s38, 2)
W1028 16:00:20.093000 822 torch/_inductor/utils.py:1436] [3/0_1] Not enough SMs to use max_autotune_gemm mode
Generating:  26%|██▌       | 675/2588 [03:54<11:03,  2.88it/s]
  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


In [6]:
from IPython.display import Audio
# Last expression of the cell, so the Audio object for "output.wav" is rendered as the cell output.
Audio("output.wav")
