In [8]:
!apt-get update -qq
!apt-get install ffmpeg -y
!pip install --upgrade "websockets>=13.0,<15.0"
!pip install --upgrade gradio --quiet
!pip install --upgrade openai --quiet
!pip install transformers==4.29.0 torch torchvision --quiet
!pip install huggingface_hub --quiet
!pip install -U openai-whisper --quiet

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 26 not upgraded.


In [9]:
import os
import subprocess
import json
import math
import gradio
import openai
import torch
import whisper
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from openai import OpenAI

# SECURITY: the API key must never be committed to the notebook. It is read
# from the OPENAI_API_KEY environment variable instead (in Colab, set it with
# `%env OPENAI_API_KEY=...` or from the Secrets panel before running this cell).
# Any key that was previously pasted into this notebook must be treated as
# compromised and revoked in the OpenAI dashboard.
_openai_api_key = os.environ.get("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this cell."
    )
client = OpenAI(api_key=_openai_api_key)

# Run the models on the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Speech-to-text model used by extract_audio_transcript().
whisper_model = whisper.load_model("base", device=device)

# Image/text matching model and its preprocessor used by analyze_frame_with_clip().
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


# Closed vocabularies for the six classification keys. Every value returned by
# the LLM is checked for membership in the matching set.
HIERARCHY_TAGS = {
    "product",
    "category",
    "industry",
    "brand",
    "none",
}
STORYLINE_TAGS = {
    "unboxing",
    "testimonial",
    "before-after",
    "tutorial",
    "listicle",
    "daily-routine",
    "voice-over-showcase",
    "dialogue",
    "replicate-ad",
    "demonstration",
    "none",
}
HOOK_TAGS = {
    "strong-reaction",
    "dramatize-problem",
    "absurd-alternative",
    "visual-trick",
    "highlight-popularity",
    "target-audience-callout",
    "controversy",
    "emphasize-one-usp",
    "none",
}
CTA_TAGS = {
    "buy_now",
    "download_now",
    "visit_website",
    "sign_up",
    "subscribe",
    "start_free_trial",
    "learn_more",
    "none",
}
ICP_TAGS = {"moms", "athletes", "students", "none"}
ACTOR_TAGS = {"male", "female", "mixed", "none"}


# Natural-language captions scored against each frame by CLIP; the caption with
# the highest probability becomes that frame's observation. Extend or trim this
# list to change what CLIP can report.
clip_candidate_texts = [
    # who is on screen
    "a female actor speaking",
    "a male actor speaking",
    "no people visible",
    # storyline cues
    "someone unboxing a product",
    "someone giving a personal testimonial",
    "someone showing a before-after scenario",
    "someone teaching a tutorial",
    "someone in a daily routine",
    "multiple people in a dialogue",
    "someone demonstrating a product",
    # hook cues
    "someone with a strong reaction",
    "someone dramatizing a problem",
    "someone highlighting brand story",
]

  checkpoint = torch.load(fp, map_location=device)
  return torch.load(checkpoint_file, map_location="cpu")


In [10]:

def extract_frames(video_path, timestamps=(0, 4, 8, 12, 16, 20, 24, 28)):
    """Grab still frames from a video at fixed timestamps using ffmpeg.

    Args:
        video_path: Path of the video file to sample.
        timestamps: Offsets in seconds at which to grab one frame each.
            Sampling stops at the first offset that falls within the last
            half second of the video (or beyond its end).

    Returns:
        List of JPEG paths (``frame_<index>.jpg`` in the current working
        directory) for the frames that ffmpeg actually produced, in
        timestamp order. Empty if nothing could be extracted.
    """
    # Arguments are passed as a list (no shell) so that quotes or shell
    # metacharacters in the uploaded file name cannot alter the command.
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        str(video_path),
    ]
    try:
        probe = subprocess.run(
            probe_cmd, check=True, capture_output=True, stdin=subprocess.DEVNULL
        )
        duration = float(probe.stdout.decode("utf-8").strip())
    except (OSError, subprocess.SubprocessError, ValueError):
        # ffprobe missing, failed, or printed something that is not a number:
        # fall back to assuming a 10 second clip.
        duration = 10.0

    frame_files = []
    for index, second in enumerate(timestamps):
        if second >= duration - 0.5:
            break
        frame_file = f"frame_{index}.jpg"
        # Remove a leftover frame from an earlier run; otherwise a failed
        # extraction would silently return a frame of a different video.
        if os.path.exists(frame_file):
            os.remove(frame_file)
        grab_cmd = [
            "ffmpeg", "-ss", str(second), "-i", str(video_path),
            "-frames:v", "1", "-q:v", "2", frame_file, "-y",
        ]
        try:
            subprocess.run(
                grab_cmd,
                check=False,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError:
            # ffmpeg binary could not be started; no frame for this offset.
            continue
        if os.path.exists(frame_file):
            frame_files.append(frame_file)
    return frame_files


def extract_audio_transcript(video_path):
    """Extract the audio track of a video with ffmpeg and transcribe it with Whisper.

    Args:
        video_path: Path of the video file.

    Returns:
        The transcript text, or ``None`` when the audio could not be extracted
        (for example the video has no audio stream or ffmpeg is unavailable)
        or when transcription fails. The reason is printed in either case.
    """
    import tempfile  # local import: not part of the notebook's import cell

    # The WAV lives in a private scratch directory that is deleted on exit, so
    # runs never see each other's audio and nothing is left in the working dir.
    with tempfile.TemporaryDirectory() as scratch_dir:
        wav_file = os.path.join(scratch_dir, "temp_audio.wav")
        # List form (no shell): the file name is passed verbatim to ffmpeg.
        cmd = [
            "ffmpeg", "-i", str(video_path),
            "-q:a", "0", "-map", "a", wav_file, "-y",
        ]

        # Run the command and capture output
        try:
            subprocess.run(
                cmd, check=True, capture_output=True, stdin=subprocess.DEVNULL
            )
        except subprocess.CalledProcessError as e:
            print(f"FFmpeg command failed with error: {e.stderr.decode(errors='replace')}")
            return None
        except OSError as e:
            # Raised when the ffmpeg executable itself cannot be launched.
            print(f"FFmpeg could not be started: {e}")
            return None

        # Check if the WAV file was created
        if not os.path.exists(wav_file):
            print(f"Error: {wav_file} was not created.")
            return None

        # Transcribe the audio
        try:
            result = whisper_model.transcribe(wav_file)
            return result["text"]
        except Exception as e:
            print(f"Whisper transcription failed: {e}")
            return None


def analyze_frame_with_clip(frame_path):
    """Pick the candidate caption that CLIP scores highest for one frame.

    Args:
        frame_path: Path of the image file to analyse.

    Returns:
        Tuple ``(label, probability)`` where ``label`` is the entry of
        ``clip_candidate_texts`` with the highest softmax probability and
        ``probability`` is that probability as a float.
    """
    # Open inside a context manager so the file handle is released right away;
    # convert() returns an independent in-memory copy that outlives the file.
    with Image.open(frame_path) as raw_image:
        im = raw_image.convert("RGB")

    inputs = clip_processor(
        text=clip_candidate_texts, images=im, return_tensors="pt", padding=True
    ).to(device)
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # One image was passed, so row 0 holds the scores over all captions.
    probs = outputs.logits_per_image.softmax(dim=1)[0].tolist()

    max_prob = max(probs)
    chosen_label = clip_candidate_texts[probs.index(max_prob)]
    return chosen_label, max_prob




# Template for the opening of the system prompt; the {placeholders} are filled
# with the allowed tag vocabularies below. Kept under its own name so that
# re-running this code never formats an already-formatted string.
_SYSTEM_PROMPT_HEAD_TEMPLATE = """
You are an ad classification assistant.
We have EXACTLY 6 keys: "hierarchy","storyline","hook","cta","icp","actor".
Each key must be a single string from the sets below, no arrays or multiple strings:

hierarchy: {hierarchy}
storyline: {storyline}
hook: {hook}
cta: {cta}
icp: {icp}
actor: {actor}

If the ad is borderline, pick the single best match. Return valid JSON with these 6 keys.
No extra commentary, no arrays.
"""

# sorted() instead of list(): iteration order of a set of strings can differ
# between interpreter sessions (string hashing is randomized), which would make
# the prompt, and therefore the temperature=0 results, vary from run to run.
SYSTEM_PROMPT_HEAD = _SYSTEM_PROMPT_HEAD_TEMPLATE.format(
    hierarchy=sorted(HIERARCHY_TAGS),
    storyline=sorted(STORYLINE_TAGS),
    hook=sorted(HOOK_TAGS),
    cta=sorted(CTA_TAGS),
    icp=sorted(ICP_TAGS),
    actor=sorted(ACTOR_TAGS),
)
# Few-shot example: unboxing storyline, no hook / CTA / audience.
EXAMPLE_1 = """
EXAMPLE 1:
Transcript:
"I just opened this brand new box, let's see what's inside!
So excited to discover everything for the first time."

Frame Observations:
frame_0: "someone unboxing a product"
frame_1: "a female actor speaking"

FINAL classification:
{
  "hierarchy": "product",
  "storyline": "unboxing",
  "hook": "none",
  "cta": "none",
  "icp": "none",
  "actor": "female"
}
"""

# Few-shot example: before/after storyline with a single-USP hook.
EXAMPLE_2 = """
EXAMPLE 2:
Transcript:
"I have used this cream for 2 months, let me show you my before-and-after results.
I'm stunned by the difference."

Frame Observations:
frame_0: "someone showing a before-after scenario"
frame_1: "a female actor speaking"

FINAL classification:
{
  "hierarchy": "product",
  "storyline": "before-after",
  "hook": "emphasize-one-usp",
  "cta": "none",
  "icp": "none",
  "actor": "female"
}
"""

# Few-shot example: every key populated (hook, CTA and target audience).
EXAMPLE_3 = """
EXAMPLE 3:
Transcript:
"This sale is insane—20% discount for new moms only!
We dramatize the problem if you keep using old products,
sign up now to see the difference."

Frame Observations:
frame_0: "someone dramatizing a problem"
frame_1: "a female actor speaking"

FINAL classification:
{
  "hierarchy": "product",
  "storyline": "testimonial",
  "hook": "dramatize-problem",
  "cta": "sign_up",
  "icp": "moms",
  "actor": "female"
}
"""

# All examples in order, separated by one extra newline.
SYSTEM_PROMPT_EXAMPLES = "\n".join((EXAMPLE_1, EXAMPLE_2, EXAMPLE_3))

# Closing instructions appended after the few-shot examples.
SYSTEM_PROMPT_TAIL = """
Now I will give you the real transcript + frame observations.
Use hidden chain-of-thought, but do NOT output it. Output final JSON only, no arrays, no extra fields.
"""


def build_system_prompt():
    """Return the full system prompt: head, examples and tail joined by newlines."""
    sections = (SYSTEM_PROMPT_HEAD, SYSTEM_PROMPT_EXAMPLES, SYSTEM_PROMPT_TAIL)
    return "\n".join(sections)

In [17]:

def parse_and_validate_single_string(response_text):
    """Parse an LLM reply and check it is a valid single-label classification.

    The reply must be a JSON object containing the six keys ``hierarchy``,
    ``storyline``, ``hook``, ``cta``, ``icp`` and ``actor``, each mapped to one
    string taken from the matching ``*_TAGS`` set. A Markdown code fence
    around the JSON is tolerated.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A dict with exactly the six expected keys (any additional keys in the
        reply are dropped), or ``None`` if the reply is not valid.
    """
    if not isinstance(response_text, str):
        return None

    # Chat models frequently wrap JSON in ```json ... ``` fences; unwrap them.
    candidate = response_text.strip()
    if candidate.startswith("```"):
        _, _, candidate = candidate.partition("\n")  # drop the opening fence line
        candidate = candidate.rstrip()
        if candidate.endswith("```"):
            candidate = candidate[:-3]

    try:
        data = json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError
        return None

    # Valid JSON can also be a list, string or number; only an object qualifies.
    if not isinstance(data, dict):
        return None

    expected_keys = ["hierarchy", "storyline", "hook", "cta", "icp", "actor"]
    if any(key not in data for key in expected_keys):
        return None

    # allowed sets
    allowed_map = {
        "hierarchy": HIERARCHY_TAGS,
        "storyline": STORYLINE_TAGS,
        "hook": HOOK_TAGS,
        "cta": CTA_TAGS,
        "icp": ICP_TAGS,
        "actor": ACTOR_TAGS,
    }

    validated = {}
    for key in expected_keys:
        value = data[key]
        # Arrays / nested objects are rejected, as are unknown tags.
        if not isinstance(value, str) or value not in allowed_map[key]:
            return None
        validated[key] = value

    return validated

def _request_classification(messages, model):
    """Send one chat-completion request and return the reply text ('' if the reply is empty)."""
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content or ""


def advanced_llm_classify(transcript, clip_info, model="gpt-3.5-turbo"):
    """Classify an ad from its transcript and frame observations with the LLM.

    1) Build a user prompt from the transcript (if any) and frame observations.
    2) Send it together with the few-shot system prompt.
    3) Validate the reply; if it is invalid, ask the model once to fix it.

    Args:
        transcript: Transcript text, or ``None``/empty when no audio is available.
        clip_info: Newline-separated frame observations.
        model: Name of the chat model to query.

    Returns:
        A pretty-printed JSON string with the six classification keys on
        success; otherwise a human-readable message describing the failure
        (invalid output after the retry, or an OpenAI API error).
    """
    system_prompt = build_system_prompt()

    # Prepare the user prompt based on whether a transcript is available
    has_transcript = (
        bool(transcript)
        and bool(transcript.strip())
        and transcript != "No audio transcript available."
    )
    if has_transcript:
        user_prompt = f"""
    Transcript:
    {transcript}

    Frame Observations:
    {clip_info}

    Now produce final classification as valid JSON.
    Remember, exactly one label from each set, no arrays, no commentary.
    """
    else:
        user_prompt = f"""
    No audio transcript is available. Classify the video based on the following frame observations:

    Frame Observations:
    {clip_info}

    Now produce final classification as valid JSON.
    Remember, exactly one label from each set, no arrays, no commentary.
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        # FIRST attempt
        raw_text1 = _request_classification(messages, model)
        parsed1 = parse_and_validate_single_string(raw_text1)
        if parsed1 is not None:
            return json.dumps(parsed1, indent=2)

        # second attempt
        fix_prompt = f"""
Your previous output was invalid or had unknown tags/arrays.
We need single-string for each key.
Allowed sets are shown above.
No arrays or multiple items.
Here's your last output:

{raw_text1}

Fix it and produce valid JSON:
"""
        # Continue the same conversation so the model still sees the transcript
        # and frame observations while correcting its answer, instead of having
        # only its own invalid output to work from.
        retry_messages = messages + [
            {"role": "assistant", "content": raw_text1},
            {"role": "user", "content": fix_prompt},
        ]
        raw_text2 = _request_classification(retry_messages, model)
    except openai.OpenAIError as e:
        # Network, authentication or rate-limit problems: report them as text
        # so the UI shows the cause rather than an opaque failure.
        return f"OpenAI request failed: {e}"

    parsed2 = parse_and_validate_single_string(raw_text2)
    if parsed2 is not None:
        return json.dumps(parsed2, indent=2)
    return f"Still invalid after second attempt.\n\nFirst attempt:\n{raw_text1}\n\nSecond attempt:\n{raw_text2}"

def classify_ad(video):
    """Run the full pipeline on one uploaded video and return its tags as text.

    Steps: transcribe the audio, grab frames, label each frame with CLIP, then
    ask the LLM for the final classification.

    Args:
        video: File path of the video, a dict carrying the path under
            ``"name"``, or ``None`` when nothing was uploaded.

    Returns:
        The classification as a JSON string, or a human-readable message when
        no video was given, nothing could be extracted, or the LLM failed.
    """
    # if no video
    if video is None:
        return "No video provided."

    # handle path
    video_path = video["name"] if isinstance(video, dict) else video

    # 1) Transcribe
    transcript_text = extract_audio_transcript(video_path)

    # 2) Frames
    frames = extract_frames(video_path)

    # 3) CLIP each frame
    clip_obs = []
    try:
        for frame_file in frames:
            label, prob = analyze_frame_with_clip(frame_file)
            clip_obs.append(f"{frame_file}: \"{label}\" (prob={prob:.2f})")
    finally:
        # The frames are scratch files; remove them so they neither pile up
        # nor get mistaken for frames of the next upload. Best effort only:
        # a file that is already gone is not an error.
        for frame_file in frames:
            try:
                os.remove(frame_file)
            except OSError:
                pass

    # Nothing to classify: skip the LLM call, it would only be guessing.
    if not transcript_text and not clip_obs:
        return "Could not extract audio or frames from the video."

    clip_info = "\n".join(clip_obs)

    # 4) advanced LLM classify
    return advanced_llm_classify(transcript_text, clip_info)

In [18]:
def run_app():
    """Build the Gradio interface around classify_ad and launch it in debug mode."""
    import gradio as gr

    ui_description = """
BEST version:
- Deterministic frames at 0,4,8,12,16,20,24,28s
- Whisper for transcript
- CLIP with expanded prompts
- GPT-3.5 with multi examples (chain-of-thought hidden)
- Single label categories, fallback fix prompt if invalid
"""
    # Single video upload in, plain text (the JSON tags or a message) out.
    video_input = gr.Video(label="Upload MP4")
    demo = gr.Interface(
        fn=classify_ad,
        inputs=video_input,
        outputs="text",
        title="Whisper + CLIP + LLM (Best Version)",
        description=ui_description,
    )
    demo.launch(debug=True)


In [20]:
# Start the Gradio UI. The app is launched with debug=True, so this cell keeps
# running until the server is interrupted.
run_app()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://179932bf95a9a5a773.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




hi
hi
hi
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://179932bf95a9a5a773.gradio.live
