In [1]:
!pip install torch torchvision transformers pillow numpy gradio moviepy scikit-image timm einops

Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.3/388.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting scikit-image
  Downloading scikit_image-0.23.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl.metadata (38 kB)
Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)
Collecting decorator<5.0,>=4.0.2 (from moviepy)
  Downloading decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Collecting imageio<3.0,>=2.5 (from moviepy)
  Downloading imageio-2.34.0-py3-none-any.whl.metadata (4.9 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl.metadata (1.7 kB)


In [42]:
import os
import tempfile
from moviepy.editor import VideoFileClip
from transformers import AutoModelForCausalLM, AutoTokenizer, DetrImageProcessor, DetrForObjectDetection
import torch
from torchvision.transforms import functional as F
from PIL import Image
import numpy as np
import datetime
import gradio as gr

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [47]:
# Lookup table from integer class index to a human-readable name (91 entries).
# save_image_data_to_file indexes it with the label produced by extract_predictions.
# NOTE(review): the 'N/A' and '__background__' entries look like placeholders that keep
# the remaining names at their expected index — confirm against the detector's id2label.
coco_names = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]


# Path of the image the chatbot should answer questions about. It is written by
# process_image / process_image_upload and read by moondream_chatbot; None means
# no image has been processed yet.
uploaded_image_path = None

def extract_predictions(model_output, threshold=0.4):
    """Convert raw detector output into a list of confident detections.

    Args:
        model_output: Object exposing ``logits`` and ``pred_boxes`` tensors with a
            leading batch dimension. Only batch item 0 is read.
        threshold: A query is kept only when its best class probability is
            strictly greater than this value. Defaults to 0.4.

    Returns:
        A list of ``(box, score, label)`` tuples, one per kept query, where
        ``box`` is the matching row of ``pred_boxes`` as a NumPy array, ``score``
        is the winning class probability and ``label`` is the winning class index.
        The list is empty when no query passes the threshold.
    """
    # Softmax over all classes, then drop the last column before picking the
    # winner (in DETR the last logit is the "no object" class).
    probas = model_output.logits.softmax(-1)[0, :, :-1].detach().cpu()
    scores, labels = probas.max(-1)

    # Build the mask as a torch tensor on the same (CPU) device as the boxes so the
    # boolean indexing never mixes NumPy masks with GPU tensors.
    keep = scores > threshold
    boxes = model_output.pred_boxes[0].detach().cpu()[keep]

    return list(zip(boxes.numpy(), scores[keep].numpy(), labels[keep].numpy()))

_DETR_CHECKPOINT = "facebook/detr-resnet-50"
# Holds the loaded (processor, detector) pair so the checkpoint is read once per kernel
# session instead of on every call.
_detr_cache = {}


def _load_detr():
    """Return the cached (image processor, detector) pair, loading it on first use."""
    if "pair" not in _detr_cache:
        processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
        detector = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT).to(device)
        _detr_cache["pair"] = (processor, detector)
    return _detr_cache["pair"]


def perform_object_detection(image):
    """Run the DETR object detector on one image.

    Args:
        image: The image to analyse. PIL images that are not in RGB mode
            (e.g. RGBA or palette PNG/GIF) are converted to RGB first.

    Returns:
        The list of ``(box, score, label)`` tuples produced by
        ``extract_predictions`` for this image.
    """
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    feature_extractor, detector = _load_detr()
    inputs = feature_extractor(images=image, return_tensors="pt").to(device)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = detector(**inputs)

    return extract_predictions(outputs)

def save_image_data_to_file(image_path, caption, predictions, responses):
    """Write a text report for one processed image into ``moondream_responses/``.

    The report file is named ``<image basename>_<YYYYmmddHHMMSS>.txt`` and lists the
    image name, the caption, every detected object with its score, and each
    question/answer pair.

    Args:
        image_path: Path of the image; only its basename is used.
        caption: Caption text entered by the user.
        predictions: Iterable of ``(box, score, label)`` tuples; ``label`` indexes
            ``coco_names``.
        responses: Iterable of ``(question, answer)`` pairs.
    """
    folder_name = "moondream_responses"
    os.makedirs(folder_name, exist_ok=True)

    image_name = os.path.basename(image_path)
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    file_path = os.path.join(folder_name, f"{image_name}_{timestamp}.txt")

    # Captions and model answers are free text, so write UTF-8 explicitly rather
    # than relying on the platform's default encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(f"Image: {image_name}\n")
        file.write(f"Caption: {caption}\n")
        file.write("Objects Detected:\n")
        for _, score, label in predictions:
            file.write(f"- {coco_names[label]} ({score:.2f})\n")
        file.write("\nMoondream Responses:\n")
        for number, (question, answer) in enumerate(responses, start=1):
            file.write(f"Question {number}: {question}\n")
            file.write(f"Answer {number}: {answer}\n\n")

    print(f"Image data saved to the file '{file_path}'.")

def process_image(image, caption, habit):
    """Analyse one uploaded image and save a report about it.

    The image is saved as ``uploaded_image.jpg`` (and that path is published through
    the global ``uploaded_image_path`` so ``moondream_chatbot`` can read it), objects
    are detected, three fixed questions are asked, and everything is written out with
    ``save_image_data_to_file``.

    Args:
        image: PIL image to analyse.
        caption: Caption text; interpolated into one of the questions.
        habit: Habit text; interpolated into two of the questions.

    Returns:
        A list of ``(question, answer)`` tuples, one per question asked.
    """
    global uploaded_image_path
    image_path = "uploaded_image.jpg"

    # JPEG cannot store alpha or palette data, so RGBA/P/LA uploads (typical for PNG
    # and GIF) must be converted before saving.
    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
    rgb_image.save(image_path)
    uploaded_image_path = image_path

    predictions = perform_object_detection(rgb_image)

    questions = [
        "Describe the color and ambience of the image.",
        f"What objects in the image are good or bad for the {habit}?",
        f"Based on the {caption}, Please provide suggestions that will help the user in maintaining their {habit}.",
    ]

    responses = []
    for question in questions:
        reply = moondream_chatbot(question)
        # moondream_chatbot returns chat-history pairs ([(question, answer)]) for the
        # Gradio chatbot; keep only the answer so the report does not contain the
        # repr of a nested list.
        answer = reply[0][1] if isinstance(reply, list) and reply else reply
        responses.append((question, answer))

    save_image_data_to_file(image_path, caption, predictions, responses)

    return responses

_MOONDREAM_ID = "vikhyatk/moondream2"
_MOONDREAM_REVISION = "2024-03-05"
# Holds the loaded (model, tokenizer) pair so the checkpoint is read once per kernel
# session instead of once per question. The model stays resident on `device`.
_moondream_cache = {}


def _load_moondream():
    """Return the cached (model, tokenizer) pair for the chatbot, loading it on first use."""
    if "pair" not in _moondream_cache:
        # NOTE: trust_remote_code=True executes Python code shipped with the model
        # repository; the revision is pinned so that code cannot change silently.
        model = AutoModelForCausalLM.from_pretrained(
            _MOONDREAM_ID, trust_remote_code=True, revision=_MOONDREAM_REVISION
        ).to(device)
        tokenizer = AutoTokenizer.from_pretrained(_MOONDREAM_ID, revision=_MOONDREAM_REVISION)
        _moondream_cache["pair"] = (model, tokenizer)
    return _moondream_cache["pair"]


def moondream_chatbot(user_input):
    """Answer a question about the most recently uploaded image.

    Reads the image from the global ``uploaded_image_path``. Every exchange is
    appended to the chat history via ``save_chat_history``.

    Args:
        user_input: The question to ask about the image.

    Returns:
        A single-element list ``[(user_input, answer)]``, the shape used as the
        Gradio chatbot output. When no image has been uploaded the answer is a
        prompt asking for one, and the model is not loaded.
    """
    if uploaded_image_path is None:
        answer = "Please upload an image first."
    else:
        model, tokenizer = _load_moondream()
        # Encode inside the context manager so the file handle is released afterwards.
        with Image.open(uploaded_image_path) as image:
            # Use the notebook-wide `device` instead of a literal "cuda" so the
            # chatbot also works on CPU-only machines.
            enc_image = model.encode_image(image).to(device)
        answer = model.answer_question(enc_image, user_input, tokenizer)

    if isinstance(answer, torch.Tensor):
        answer = answer.cpu()

    save_chat_history(user_input, answer)

    return [(user_input, answer)]



In [48]:
def extract_frames(video_path):
    """Decode every frame of a video into a PIL image.

    Note that all frames are held in memory at once.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A list of PIL images, one per frame yielded by the clip, in order.
    """
    print("Extracting frames from the video...")
    clip = VideoFileClip(video_path)
    try:
        frames = [Image.fromarray(frame) for frame in clip.iter_frames()]
    finally:
        # Always release the reader, even when decoding fails part-way through.
        clip.close()
    print("Frame extraction completed.")
    return frames

from skimage.metrics import structural_similarity as ssim

def is_similar(frame1, frame2, threshold=0.9):
    """Tell whether two frames look alike according to their SSIM score.

    Args:
        frame1: First PIL image.
        frame2: Second PIL image.
        threshold: Score the structural similarity must exceed. Defaults to 0.9.

    Returns:
        True when the SSIM of the grayscale frames is strictly above ``threshold``.
    """
    # SSIM is computed on single-channel ('L') versions of both frames.
    gray_arrays = [np.array(frame.convert('L')) for frame in (frame1, frame2)]
    return ssim(*gray_arrays) > threshold

def process_video(video_path, text_prompt, habit, caption, model, tokenizer,
                  target_samples=20, similarity_threshold=0.7):
    """Sample frames from a video and ask the model one question per sampled frame.

    Frames are taken at a fixed stride so that roughly ``target_samples`` of them are
    visited. A visited frame is skipped when it is similar to the last frame that was
    actually processed. Questions rotate through a fixed list of four, chosen by the
    sample position.

    Args:
        video_path: Path of the video file.
        text_prompt: Accepted for interface compatibility; not used.
        habit: Habit text; interpolated into two of the questions.
        caption: Caption text; interpolated into one of the questions.
        model: Object providing ``encode_image`` and ``answer_question``.
        tokenizer: Tokenizer passed through to ``model.answer_question``.
        target_samples: Approximate number of frames to visit. Defaults to 20.
        similarity_threshold: Threshold passed to ``is_similar``. Defaults to 0.7.

    Returns:
        A list of strings of the form ``"[Frame i] [timestamp] answer"``.
    """
    print("Processing the video...")
    frames = extract_frames(video_path)
    # Clamp the stride to 1: for clips with fewer frames than target_samples the
    # integer division yields 0, and range() rejects a zero step.
    stride = max(1, len(frames) // max(1, target_samples))
    responses = []
    prev_frame = None

    questions = [
        f"Based on the {caption}, Please provide suggestions that will help the user in maintaining their {habit}.",
        "Specify as many objects with their relative location that are present in the image.",
        "Describe the color and ambience and lighting conditions in the image.",
        f"How does the image relate to or impact the {habit}?"
    ]

    for i in range(0, len(frames), stride):
        frame = frames[i]
        print(f"Processing frame {i}...")

        if prev_frame is not None:
            print("Comparing frame similarity...")
            if is_similar(frame, prev_frame, threshold=similarity_threshold):
                print("Frame is similar to the previous frame. Skipping...")
                continue
            else:
                print("Frame is different from the previous frame.")

        enc_image = model.encode_image(frame)

        # Rotate through the questions by sample position, not by raw frame index.
        question = questions[i // stride % len(questions)]
        response = model.answer_question(enc_image, question, tokenizer)

        # Tag the answer with the frame index and the wall-clock time of processing.
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        response_with_tag = f"[Frame {i}] [{current_time}] {response}"

        print(f"Moondream output for frame {i}: {response_with_tag}")

        responses.append(response_with_tag)
        prev_frame = frame

    print("Video processing completed.")

    return responses

In [49]:
def save_video_responses_to_file(responses, video_path):
    """Write the per-frame answers for a video into ``moondream_responses/``.

    The output folder is always created. When ``responses`` is non-empty a file named
    ``video_<video basename>_<YYYYmmddHHMMSS>.txt`` is written with one response per
    line; otherwise only a notice is printed.

    Args:
        responses: List of response strings.
        video_path: Path of the source video; only its basename is used.
    """
    folder_name = "moondream_responses"
    os.makedirs(folder_name, exist_ok=True)

    if len(responses) > 0:
        video_name = os.path.basename(video_path)
        timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        file_path = os.path.join(folder_name, f"video_{video_name}_{timestamp}.txt")

        # Model answers are free text, so write UTF-8 explicitly rather than
        # relying on the platform's default encoding.
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(f"Video: {video_name}\n\n")
            for response in responses:
                file.write(response + "\n")

        print(f"Video responses saved to the file '{file_path}'.")
    else:
        print("No responses to save.")

def save_chat_history(user_input, response):
    """Append one question/answer exchange to ``moondream_responses/chat_history.txt``.

    Both lines of the exchange carry the same ``[YYYY-mm-dd HH:MM:SS]`` timestamp and
    the exchange is followed by a blank line.

    Args:
        user_input: The text the user asked.
        response: The answer to record.
    """
    folder_name = "moondream_responses"
    os.makedirs(folder_name, exist_ok=True)

    file_path = os.path.join(folder_name, "chat_history.txt")
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # User input and model answers are free text, so append as UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(file_path, "a", encoding="utf-8") as file:
        file.write(f"[{timestamp}] User: {user_input}\n")
        file.write(f"[{timestamp}] Moondream: {response}\n\n")

def process_image_upload(image_file, caption, habit):
    """Gradio handler: analyse an uploaded image file.

    Args:
        image_file: Uploaded file object exposing its path as ``.name``, or None.
        caption: Caption text forwarded to ``process_image``.
        habit: Habit text forwarded to ``process_image``.

    Returns:
        A ``(status message, image path)`` tuple; the path is None when nothing
        was uploaded.
    """
    global uploaded_image_path

    if image_file is not None:
        process_image(Image.open(image_file.name), caption, habit)

        # Point the chatbot at the uploaded file itself from now on.
        uploaded_image_path = image_file.name
        return "Image processed successfully. Responses saved to file.", uploaded_image_path

    return "Please upload an image file.", None

def process_video_upload(video_file, text_prompt, habit, caption):
    """Gradio handler: analyse an uploaded video file and save the answers.

    Args:
        video_file: Uploaded file object exposing its path as ``.name``, or None.
        text_prompt: Forwarded to ``process_video``.
        habit: Habit text forwarded to ``process_video``.
        caption: Caption text forwarded to ``process_video``.

    Returns:
        A status message string.
    """
    if video_file is None:
        return "Please upload a video file."

    video_path = video_file.name

    # Initialize the model and tokenizer. The model goes to the notebook-wide
    # `device` rather than a literal "cuda" so this also works without a GPU.
    # NOTE: trust_remote_code=True executes Python code shipped with the model
    # repository; the revision is pinned so that code cannot change silently.
    model_id = "vikhyatk/moondream2"
    revision = "2024-03-05"
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

    responses = process_video(video_path, text_prompt, habit, caption, model, tokenizer)
    save_video_responses_to_file(responses, video_path)

    return "Video processed successfully. Responses saved to file."

In [62]:
def _image_upload_status(image_file, caption, habit):
    """Adapter for the UI: run process_image_upload and return only its status text.

    process_image_upload returns (status, image path) but the button is wired to a
    single output textbox, so the path is dropped here to keep the counts matched.
    """
    status, _ = process_image_upload(image_file, caption, habit)
    return status


# Two-button UI: separate upload slots and submit buttons for images and videos,
# plus a chatbot that answers questions about the last processed image.
with gr.Blocks(theme=gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="blue",
    neutral_hue="gray",
    text_size=gr.themes.sizes.text_sm,
    spacing_size=gr.themes.sizes.spacing_md,
    radius_size=gr.themes.sizes.radius_md,
)) as demo:
    gr.Markdown("# Moondream Image and Video Processing App")

    with gr.Row():
        with gr.Column():
            image_file = gr.File(label="Upload Image")
            video_file = gr.File(label="Upload Video")
            habit = gr.Textbox(label="Enter Habit")
            caption = gr.Textbox(label="Enter Caption")
            with gr.Row():
                image_submit_button = gr.Button("Process Image")
                video_submit_button = gr.Button("Process Video")
        with gr.Column():
            output_text = gr.Textbox(label="Output")
            image_chatbot = gr.Chatbot(label="Moondream Chatbot - Image")
            image_user_input = gr.Textbox(label="User Input - Image")

    image_submit_button.click(_image_upload_status, inputs=[image_file, caption, habit], outputs=[output_text])
    # process_video_upload takes (video_file, text_prompt, habit, caption); there is no
    # separate prompt box, so the caption is passed for text_prompt as well.
    video_submit_button.click(process_video_upload, inputs=[video_file, caption, habit, caption], outputs=[output_text])

    image_user_input.submit(moondream_chatbot, inputs=[image_user_input], outputs=[image_chatbot])

# share=True publishes the app on a temporary public URL.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7874
Running on public URL: https://c25bc8fb85a4bfb574.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [66]:
def process_upload(file, caption="", habit=""):
    """Gradio handler: analyse an uploaded image or video, chosen by file extension.

    Args:
        file: Uploaded file object exposing its path as ``.name``, or None.
        caption: Caption text typed by the user. Must be the textbox *value*; it is a
            parameter so the function no longer picks up the module-level ``caption``
            name, which is bound to the Textbox component itself.
        habit: Habit text typed by the user (same remark as for ``caption``).

    Returns:
        A ``(status message, image path)`` tuple. The path is the uploaded file's
        path for images and None in every other case.
    """
    if file is None:
        return "Please upload an image or video file.", None

    file_name = file.name.lower()

    if file_name.endswith(('.png', '.jpg', '.jpeg', '.gif')):
        # Process image
        image = Image.open(file.name)
        process_image(image, caption, habit)
        return "Image processed successfully. Responses saved to file.", file.name
    elif file_name.endswith(('.mp4', '.avi', '.mov')):
        # Process video
        video_path = file.name
        # NOTE: trust_remote_code=True executes Python code shipped with the model
        # repository; the revision is pinned so that code cannot change silently.
        model_id = "vikhyatk/moondream2"
        revision = "2024-03-05"
        # Use the notebook-wide `device` rather than a literal "cuda" so this also
        # works without a GPU.
        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision).to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        # The caption doubles as the text_prompt argument of process_video.
        responses = process_video(video_path, caption, habit, caption, model, tokenizer)
        save_video_responses_to_file(responses, video_path)
        return "Video processed successfully. Responses saved to file.", None
    else:
        return "Unsupported file format. Please upload an image or video file.", None


def _upload_status(file, caption, habit):
    """Adapter for the UI: run process_upload and return only its status text.

    process_upload returns (status, image path) but the button is wired to a single
    output textbox, so the path is dropped here to keep the counts matched.
    """
    status, _ = process_upload(file, caption, habit)
    return status


# Single-upload UI: one file slot for either an image or a video, plus a chatbot
# that answers questions about the last processed image.
with gr.Blocks(theme=gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="blue",
    neutral_hue="gray",
    #background_fill="#F5F5F5",  # Light gray background
    text_size=gr.themes.sizes.text_sm,
    spacing_size=gr.themes.sizes.spacing_md,
    radius_size=gr.themes.sizes.radius_md,
)) as demo:
    gr.Markdown("# Moondream Image and Video Processing App")

    with gr.Row():
        with gr.Column():
            file_upload = gr.File(label="Upload Image or Video")
            habit = gr.Textbox(label="Enter Habit")
            caption = gr.Textbox(label="Enter Caption")
            submit_button = gr.Button("Process")
        with gr.Column():
            output_text = gr.Textbox(label="Output")
            image_chatbot = gr.Chatbot(label="Moondream Chatbot - Image")
            image_user_input = gr.Textbox(label="User Input - Image")

    # Pass the caption and habit textboxes as inputs so the handler receives their
    # current text values.
    submit_button.click(_upload_status, inputs=[file_upload, caption, habit], outputs=[output_text])

    image_user_input.submit(moondream_chatbot, inputs=[image_user_input], outputs=[image_chatbot])

# share=True publishes the app on a temporary public URL.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7876
Running on public URL: https://c463a39cca985c0fa7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


