In [1]:
import os
import shutil

def clear_previous_snippets_and_outputs():
    """
    Removes the '/kaggle/working/snippets' folder (and all subfolders)
    plus any 'final_output.txt' file, ensuring a clean slate.
    """
    snippets_dir = "/kaggle/working/snippets"
    final_output_file = "/kaggle/working/final_output.txt"

    # Remove snippet directories
    if os.path.exists(snippets_dir):
        shutil.rmtree(snippets_dir)
        print(f"Removed old snippet directories at {snippets_dir}")
    else:
        print(f"No snippet directories found at {snippets_dir}")

    # Remove final_output.txt if present
    if os.path.exists(final_output_file):
        os.remove(final_output_file)
        print(f"Removed old final_output.txt at {final_output_file}")
    else:
        print(f"No final_output.txt found at {final_output_file}")

# Call it here or in a separate cell before running your main code
clear_previous_snippets_and_outputs()

No snippet directories found at /kaggle/working/snippets
No final_output.txt found at /kaggle/working/final_output.txt


In [2]:
import os
import subprocess

# Write the entire video summarization code to a file.
# Note: We convert the pip install command into a subprocess call inside the script.
code_content = r'''import subprocess

# ======================
# 1. Install Dependencies
# ======================
subprocess.run(["pip", "install", "--no-cache-dir", "opencv-python-headless", "ffmpeg-python", "pydub", "faiss-cpu", "torch", "torchvision", "transformers", "openai-whisper==20231106", "tqdm"], check=True)

# ======================
# 2. Imports
# ======================
import os
import cv2
import faiss
import json
import torch
import numpy as np
from tqdm.auto import tqdm
from pydub import AudioSegment
from typing import List, Dict
import torch.nn.functional as F

from transformers import CLIPProcessor, CLIPModel
import whisper

# ============================
# 3. Read Entire Video Into Memory
# ============================
def load_entire_video(video_path: str, resize_dim=(640, 240)):
    print("[load_entire_video] Loading all frames into memory...")
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        fps = 30.0
    frames_in_mem = []
    idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, resize_dim)
        frames_in_mem.append((frame, idx))
        idx += 1
    cap.release()
    print(f"[load_entire_video] Loaded {len(frames_in_mem)} frames total.")
    return frames_in_mem, fps

# ============================
# 4. Dynamic Frame Extraction (In Memory)
# ============================
def dynamic_extraction_in_memory(frames_in_mem: List[tuple], pixel_thresh=30, min_interval=10):
    print("[dynamic_extraction_in_memory] Starting dynamic extraction...")
    keyframes = []
    prev_gray = None
    count = 0
    for (frame, real_idx) in frames_in_mem:
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        diff_val = 0.0
        if prev_gray is not None:
            diff_img = cv2.absdiff(prev_gray, gray)
            diff_val = float(np.mean(diff_img))
        if diff_val > pixel_thresh or (count % min_interval == 0):
            keyframes.append({
                "frame": frame,
                "real_index": real_idx,
                "diff_score": diff_val
            })
            prev_gray = gray
        elif prev_gray is None:
            prev_gray = gray
        count += 1
    print(f"[dynamic_extraction_in_memory] Extracted {len(keyframes)} keyframes total.")
    return keyframes

# ============================
# 5. CLIP Scoring
# ============================
def clip_score_keyframes(keyframes: List[dict], prompt_emb: np.ndarray, clip_model, clip_proc, device):
    from torch.nn.functional import normalize
    print("[clip_score_keyframes] Scoring with CLIP...")
    results = []
    for kf in tqdm(keyframes, desc="CLIP scoring"):
        inp = clip_proc(images=kf["frame"], return_tensors="pt").to(device)
        with torch.no_grad():
            img_feat = clip_model.get_image_features(**inp)
        img_feat = normalize(img_feat, p=2, dim=1).cpu().numpy()
        dot = float(np.dot(img_feat, prompt_emb.T).squeeze())
        kf["clip_score"] = dot
        kf["clip_emb"] = img_feat[0]
        results.append(kf)
    results.sort(key=lambda x: x["clip_score"], reverse=True)
    print("[clip_score_keyframes] Done. Sorted descending by clip_score.")
    return results

# ============================
# 6. Diversity Filter (Skip Approach)
# ============================
def diversity_skip(keyframes: List[dict], threshold_dot=0.98):
    print(f"[diversity_skip] threshold_dot={threshold_dot}")
    final_list = []
    for kf in keyframes:
        emb = kf["clip_emb"]
        keep = True
        for chosen in final_list:
            dot_ = float(np.dot(emb, chosen["clip_emb"]))
            if dot_ > threshold_dot:
                keep = False
                break
        if keep:
            final_list.append(kf)
    print(f"[diversity_skip] After skip => {len(final_list)} frames remain.")
    return final_list

# ============================
# 7. Audio with Whisper
# ============================
class AudioProcessor:
    def __init__(self):
        self.audio = None
        self.word_timestamps = []
        self.segments = []
    def extract_and_transcribe(self, video_path: str):
        audio_path = "/kaggle/working/tmp_audio.wav"
        os.system(f"ffmpeg -i \"{video_path}\" -vn -acodec pcm_s16le -ar 16000 -ac 1 \"{audio_path}\" -y")
        print("[AudioProcessor] Transcribing with Whisper 'medium' model...")
        model = whisper.load_model("medium")
        result = model.transcribe(audio_path, word_timestamps=True)
        self.audio = AudioSegment.from_wav(audio_path)
        self.word_timestamps = []
        for seg in result["segments"]:
            self.word_timestamps.extend(seg.get("words", []))
        self.segments = []
        for seg in result["segments"]:
            self.segments.append({
                "start": seg["start"],
                "end": seg["end"],
                "text": seg["text"]
            })
    def get_audio_snippet(self, start_f, end_f, fps):
        if self.audio is None:
            return None
        start_ms = (start_f / fps) * 1000
        end_ms = (end_f / fps) * 1000
        if end_ms <= start_ms or start_ms >= len(self.audio):
            return None
        if end_ms > len(self.audio):
            end_ms = len(self.audio)
        return self.audio[start_ms:end_ms]

# ============================
# 8. Snippet Generation
# ============================
def generate_snippet(frames_in_mem: List[tuple], start_f: int, end_f: int):
    snippet_frames = []
    for (frm, idx) in frames_in_mem:
        if idx >= start_f and idx <= end_f:
            snippet_frames.append(frm)
    return snippet_frames

def measure_local_motion_snippet(snippet_frames: List[np.ndarray], resize_dim=(640,360)):
    if len(snippet_frames) < 2:
        return 0.0
    dis_flow = cv2.DISOpticalFlow_create(cv2.DISOPTICAL_FLOW_PRESET_FAST)
    gray_list = []
    for frm in snippet_frames:
        g = cv2.resize(frm, resize_dim)
        g = cv2.cvtColor(g, cv2.COLOR_BGR2GRAY)
        gray_list.append(g)
    prev = gray_list[0]
    mags = []
    for g2 in gray_list[1:]:
        flow = dis_flow.calc(prev, g2, None)
        fx, fy = flow[...,0], flow[...,1]
        mag = np.sqrt(fx**2 + fy**2)
        mags.append(float(np.mean(mag)))
        prev = g2
    return float(np.mean(mags))

# ============================
# 9. Full Summarizer
# ============================
class VideoSummarizer:
    def __init__(self, video_path, prompt, top_k=5, resize_dim=(640,360)):
        self.video_path = video_path
        self.prompt = prompt
        self.top_k = top_k
        self.resize_dim = resize_dim
        self.out_dir = "/kaggle/working/snippets"
        os.makedirs(self.out_dir, exist_ok=True)
        self.frames_in_mem, self.fps = load_entire_video(video_path, resize_dim=resize_dim)
        self.total_frames = len(self.frames_in_mem)
        print("[VideoSummarizer] Loading CLIP model on device.")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
        self.clip_proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        from torch.nn.functional import normalize
        text_inp = self.clip_proc(text=self.prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            txt_feat = self.clip_model.get_text_features(**text_inp)
        self.prompt_emb = normalize(txt_feat, p=2, dim=1).cpu().numpy()
        self.audio_proc = AudioProcessor()
    def run(self):
        keyframes = dynamic_extraction_in_memory(self.frames_in_mem, pixel_thresh=30, min_interval=10)
        if not keyframes:
            print("[VideoSummarizer] No frames after dynamic extraction, aborting.")
            return []
        keyframes = clip_score_keyframes(keyframes, self.prompt_emb, self.clip_model, self.clip_proc, self.device)
        print("[VideoSummarizer] Sorted by clip_score. Top 5 =>", [kf["clip_score"] for kf in keyframes[:5]])
        final_list = diversity_skip(keyframes, threshold_dot=0.98)
        print(f"[VideoSummarizer] After diversity => {len(final_list)} frames")
        final_list = final_list[:self.top_k]
        print(f"[VideoSummarizer] Taking top_k={self.top_k} => {len(final_list)} frames")
        del self.clip_model
        del self.clip_proc
        if self.device == "cuda":
            torch.cuda.empty_cache()
        self.audio_proc.extract_and_transcribe(self.video_path)
        final_list.sort(key=lambda x: x["real_index"])
        snippet_idx = 1
        results = []
        for kf in final_list:
            real_idx = kf["real_index"]
            start_temp = max(0, real_idx-5)
            end_temp = min(self.total_frames-1, real_idx+5)
            snippet_temp = generate_snippet(self.frames_in_mem, start_temp, end_temp)
            local_mot = measure_local_motion_snippet(snippet_temp, resize_dim=self.resize_dim)
            if local_mot < 0.3:
                halfw = 7
            elif local_mot < 0.7:
                halfw = 5
            else:
                halfw = 3
            startf = max(0, real_idx-halfw)
            endf = min(self.total_frames-1, real_idx+halfw)
            if endf <= startf:
                print(f"[Snippet {snippet_idx}] Zero-len snippet => skip. real_idx={real_idx}")
                continue
            start_s = startf / self.fps
            end_s = endf / self.fps
            overlapping_segments = []
            for seg in self.audio_proc.segments:
                seg_start = seg["start"]
                seg_end = seg["end"]
                if seg_end >= start_s and seg_start <= end_s:
                    overlapping_segments.append(seg)
            if overlapping_segments:
                final_seg_start = min(s["start"] for s in overlapping_segments)
                final_seg_end = max(s["end"] for s in overlapping_segments)
                startf = int(final_seg_start * self.fps)
                endf = int(final_seg_end * self.fps)
                startf = max(0, startf)
                endf = min(self.total_frames-1, endf)
                if endf <= startf:
                    print(f"[Snippet {snippet_idx}] After expansion, zero-len => skip. real_idx={real_idx}")
                    continue
            snippet_frames = generate_snippet(self.frames_in_mem, startf, endf)
            final_mot = measure_local_motion_snippet(snippet_frames, resize_dim=self.resize_dim)
            frames_written = len(snippet_frames)
            snippet_dir = os.path.join(self.out_dir, f"snippet_{snippet_idx:03d}")
            os.makedirs(snippet_dir, exist_ok=True)
            out_vid_path = os.path.join(snippet_dir, "video.mp4")
            fourcc = cv2.VideoWriter_fourcc(*'mp4v')
            outv = cv2.VideoWriter(out_vid_path, fourcc, self.fps, (self.resize_dim[0], self.resize_dim[1]))
            for frm in snippet_frames:
                resized = cv2.resize(frm, self.resize_dim)
                outv.write(resized)
            outv.release()
            audio_clip = self.audio_proc.get_audio_snippet(startf, endf, self.fps)
            audio_path = os.path.join(snippet_dir, "audio.wav")
            if audio_clip:
                audio_clip.export(audio_path, format="wav")
            snippet_start_s = float(startf / self.fps)
            snippet_end_s = float(endf / self.fps)
            local_words = []
            for w in self.audio_proc.word_timestamps:
                w_start = w.get("start", 0.0)
                if snippet_start_s <= w_start < snippet_end_s:
                    local_words.append(w)
            local_segments = []
            for seg in self.audio_proc.segments:
                if seg["end"] >= snippet_start_s and seg["start"] <= snippet_end_s:
                    local_segments.append(seg)
            meta = {
                "real_index": real_idx,
                "diff_score": kf["diff_score"],
                "clip_score": kf["clip_score"],
                "local_motion": final_mot,
                "snippet_start_frame": startf,
                "snippet_end_frame": endf,
                "snippet_start_s": snippet_start_s,
                "snippet_end_s": snippet_end_s,
                "frames_written": frames_written,
                "audio_words": local_words,
                "audio_segments": local_segments
            }
            meta_path = os.path.join(snippet_dir, "metadata.json")
            with open(meta_path, "w") as f:
                import json
                json.dump(meta, f, indent=2)
            snippet_secs = frames_written / self.fps
            print(f"[Snippet {snippet_idx:03d}] real_idx={real_idx}, clip_score={kf['clip_score']:.2f}, local_mot={final_mot:.2f}, frames={frames_written}, dur={snippet_secs:.2f}s")
            snippet_idx += 1
            results.append(snippet_dir)
        return results

if __name__ == "__main__":
    dataset_folder = "/kaggle/input/bro123"

    # 2) A local JSON file to store previously processed video names
    processed_json = "/kaggle/working/processed_videos.json"
    
    # Load or create the list of processed videos
    if os.path.exists(processed_json):
        with open(processed_json, "r") as f:
            processed_videos = json.load(f)
    else:
        processed_videos = []
    
    # Gather all .mp4 files in the dataset folder
    video_files = [f for f in os.listdir(dataset_folder) if f.lower().endswith(".mp4")]
    video_files.sort()  # optional, just for consistency
    
    # Filter to find only the new ones
    new_videos = [vf for vf in video_files if vf not in processed_videos]
    
    if not new_videos:
        print("No new .mp4 files to process.")
        VIDEO_PATH = None
        next_videos = None  # Define next_video as None if no new video exists
    else:
        # Pick the first new video
        next_videos = new_videos[0]
        VIDEO_PATH = os.path.join(dataset_folder, next_videos)
        print(f"Found new video to process: {next_videos}")

# After processing, only mark as processed if a new video was found.
    if next_videos is not None:
        processed_videos.append(next_videos)
        with open(processed_json, "w") as f:
            json.dump(processed_videos, f)
        print(f"Updated processed_videos.json with {next_videos}.")
    else:
        print("No new video was processed; processed_videos.json remains unchanged.")

    print(f"VIDEO_PATH = {VIDEO_PATH}")
    # Now you can process 'video_path' in your code below
    PROMPT = "Rank according to relevancy"
    TOP_K = 10
    summarizer = VideoSummarizer(video_path=VIDEO_PATH, prompt=PROMPT, top_k=TOP_K, resize_dim=(640,360))
    snippet_dirs = summarizer.run()
    print(f"\nGenerated {len(snippet_dirs)} snippet(s) in /kaggle/working/snippets:")
    for d in snippet_dirs:
        print(f"- {d}")
    del summarizer
    import gc
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    print("All models freed and memory cleared.")
'''
if not os.path.exists("video_summarization.py"):
    with open("video_summarization.py", "w") as f:
        f.write(code_content)
    print("Created video_summarization.py file.")
else:
    print("video_summarization.py already exists. Skipping file creation.")

# Run the new file as a subprocess.
result = subprocess.run(["python", "video_summarization.py"], capture_output=True, text=True)

print("Subprocess STDOUT:")
print(result.stdout)
print("Subprocess STDERR:")
print(result.stderr)


Created video_summarization.py file.
Subprocess STDOUT:
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting openai-whisper==20231106
  Downloading openai-whisper-20231106.tar.gz (798 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 798.6/798.6 kB 18.4 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting triton==2.0.0 (from openai-whisper==20231106)
  Downloading triton-2.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.0 kB)
Collecting lit (from triton==2.0.0->openai-whisper==20231106)
  Downloading 

In [3]:
# Change to the working directory
%cd /kaggle/working

# Clone the mPLUG-Owl repository
!git clone https://github.com/X-PLUG/mPLUG-Owl.git

# Navigate into the cloned repository
%cd /kaggle/working/mPLUG-Owl/mPLUG-Owl
!pip install -r requirements.txt
!pip install flash-attn
import os

SRC_DIR = "/kaggle/input/final-dataset"

%cd /kaggle/working/mPLUG-Owl/mPLUG-Owl
import torch
from mplug_owl_video.modeling_mplug_owl import MplugOwlForConditionalGeneration
from transformers import AutoTokenizer
from mplug_owl_video.processing_mplug_owl import MplugOwlImageProcessor, MplugOwlProcessor
pretrained_ckpt = SRC_DIR

# 1. Load model with device_map="auto"
model = MplugOwlForConditionalGeneration.from_pretrained(
    pretrained_ckpt,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # Let HF Accelerate handle multi-GPU
)

# 3. Load processors
image_processor = MplugOwlImageProcessor.from_pretrained(pretrained_ckpt)
tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt)
processor = MplugOwlProcessor(image_processor, tokenizer)
    
print("Model loaded successfully with device_map='auto'!")

/kaggle/working
Cloning into 'mPLUG-Owl'...
remote: Enumerating objects: 1351, done.[K
remote: Counting objects: 100% (266/266), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 1351 (delta 206), reused 154 (delta 154), pack-reused 1085 (from 1)[K
Receiving objects: 100% (1351/1351), 34.39 MiB | 39.09 MiB/s, done.
Resolving deltas: 100% (497/497), done.
/kaggle/working/mPLUG-Owl/mPLUG-Owl
Collecting transformers==4.28.1 (from -r requirements.txt (line 1))
  Downloading transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting icecream (from -r requirements.txt (line 3))
  Downloading icecream-2.1.4-py3-none-any.whl.metadata (1.3 kB)
Collecting ruamel.yaml (from -r requirements.txt (line 5))
  Downloading ruamel.yaml-0.18.10-py3-none-any.whl.metadata (23 kB)
Collecting uvicorn (from -r requirements.txt (line 6))
  Downloading uvi

  return torch.load(checkpoint_file, map_location="cpu")


Model loaded successfully with device_map='auto'!


In [4]:
import os
import json
import glob

###########################
# 2. Your Generation Settings (adjusted)
###########################
generate_kwargs = {
    'do_sample': False,
    'top_k': 5,
    'max_length': 70,   # Increased to allow for longer summaries
    'temperature': 0.5,
    'top_p': 0.9,
    'num_beams': 1,
    'no_repeat_ngram_size': 2,
    'early_stopping': True,
    'length_penalty': 1
}

###########################
# 3. Define getDescription
###########################
def getDescription(prompts, video_list, generate_kwargs, nframes=48):
    """
    Summarize a video using your loaded mPLUG-Owl model.
    """
    # Convert text+videos into model inputs
    inputs = processor(text=prompts, videos=video_list, num_frames=nframes, return_tensors='pt')
    # Convert float => bfloat16 if needed
    inputs = {
        k: v.bfloat16() if (torch.is_floating_point(v) and v.dtype == torch.float32) else v
        for k, v in inputs.items()
    }
    # Move inputs to GPU
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        res = model.generate(**inputs, **generate_kwargs)
    sentence = tokenizer.decode(res[0], skip_special_tokens=True)
    return sentence

###########################
# 4. Summarize All Snippets
###########################
def summarize_snippets(snippet_base="/kaggle/working/snippets", snippet_count=10):
    snippet_summaries = []

    for i in range(1, snippet_count + 1):
        snippet_dir = os.path.join(snippet_base, f"snippet_{i:03d}")
        video_path  = os.path.join(snippet_dir, "video.mp4")
        meta_path   = os.path.join(snippet_dir, "metadata.json")

        if not os.path.exists(video_path):
            print(f"[WARNING] Missing video.mp4 in {snippet_dir}, skipping.")
            snippet_summaries.append(None)
            continue

        if not os.path.exists(meta_path):
            print(f"[WARNING] Missing metadata.json in {snippet_dir}, skipping.")
            snippet_summaries.append(None)
            continue

        # Load metadata
        with open(meta_path, "r") as f:
            meta = json.load(f)

        # Extract audio text from audio_segments
        audio_segments = meta.get("audio_segments", [])
        if audio_segments:
            # For simplicity, join all segment texts
            text_snippet = " ".join(seg["text"] for seg in audio_segments).strip()
        else:
            text_snippet = "No audio segments found."

        # Build a clearer prompt
        prompt = (
            "You are an expert in correlating video content with accompanying audio transcripts.\n"
            "Video: <|video|>\n"
            f"Audio Transcript: {text_snippet}\n"
            "Based on the visual content and the audio transcript above, provide a concise and insightful summary that connects both modalities. "
            "Ensure your summary is complete and ends with a full stop.\n"
            "Summary:"
        )

        # Summarize snippet
        summary = getDescription([prompt], [video_path], generate_kwargs, nframes=48)
        snippet_summaries.append(summary)
        print(f"[Snippet {i}] => {summary}")

    return snippet_summaries

###########################
# 5. Run Summaries
###########################
if __name__ == "__main__":
    results = summarize_snippets(snippet_base="/kaggle/working/snippets", snippet_count=10)
    
    # Build the output string
    final_lines = [""]
    for i, summ in enumerate(results, start=1):
        line = f"Snippet {i:03d}: {summ}"
        print(line)                # Print to console
        final_lines.append(line)   # Add to list
    
    # Combine all lines into a single string
    final_output = "".join(final_lines)

[Snippet 1] => The image shows a man sitting at a desk in a busy office, talking on the phone. The audio transcript indicates that he is calling for Mr. Michael Anderson. This combination of visual and audio elements suggests that the man is trying to reach a specific person in the office and is using the telephone to communicate with them.
[Snippet 2] => A man is sitting at a desk in a busy office, talking on the phone. He appears to be a manager or supervisor, as he is wearing a suit and tie. The office is filled with other people working, creating a lively atmosphere.
[Snippet 3] => A man is sitting at a desk in a busy office, working on a computer. He is wearing a suit and tie, and appears to be focused on his work. The office is filled with other people, some of whom are sitting and working, while others are standing and talking. There are also a few chairs scattered around the
[Snippet 4] => A man is talking on the phone while sitting at a desk in a busy office. He is discussing 

In [11]:
import torch
import gc

# Free the model from memory
del model

# Clear the GPU cache
torch.cuda.empty_cache()

# Run garbage collection to free up unreferenced memory
gc.collect()

NameError: name 'model' is not defined

In [21]:
if final_output.startswith("=== Final Summaries ==="):
    final_output = final_output[len("=== Final Summaries ==="):].strip()
final_output = final_output.replace("\n", " ")
print(final_output)

Snippet 001: The image shows a man sitting at a desk in a busy office, talking on the phone. The audio transcript indicates that he is calling for Mr. Michael Anderson. This combination of visual and audio elements suggests that the man is trying to reach a specific person in the office and is using the telephone to communicate with them.Snippet 002: A man is sitting at a desk in a busy office, talking on the phone. He appears to be a manager or supervisor, as he is wearing a suit and tie. The office is filled with other people working, creating a lively atmosphere.Snippet 003: A man is sitting at a desk in a busy office, working on a computer. He is wearing a suit and tie, and appears to be focused on his work. The office is filled with other people, some of whom are sitting and working, while others are standing and talking. There are also a few chairs scattered around theSnippet 004: A man is talking on the phone while sitting at a desk in a busy office. He is discussing a lunch dat

In [20]:
import requests
ngrok_url = "https://17e3-2409-40e1-1067-f605-6840-f09a-93ec-e92f.ngrok-free.app/upload"
response = requests.post(ngrok_url, data={"output": final_output})

if response.status_code == 200:
    print("Output sent successfully!")
else:
    print("Failed to send output:", response.text)
    

Failed to send output: 
