In [None]:
import cv2
import os
import requests
import numpy as np
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoProcessor, AutoModel
import scipy # for saving to wav file
import openai

# set seed for reproducability
np.random.seed(45)

## Capture frames from a video input

OpenCV, frame-by-frame

In [20]:
video_path = 'data/bird.mp4'
cap = cv2.VideoCapture(video_path)

output_directory = 'output_frames'
os.makedirs(output_directory, exist_ok=True)

fps = cap.get(cv2.CAP_PROP_FPS)
interval_seconds = 2
interval_frames = int(fps * interval_seconds)
count = 1

if not cap.isOpened():
    print("Error opening video file")
else:
    while cap.isOpened():
        # Capture frame-by-frame
        ret, frame = cap.read()

        if not ret:
            break

        if count % interval_frames == 0:
            output_path = os.path.join(output_directory, f"frame_{count}.jpg")
            cv2.imwrite(output_path, frame)
            print(f"Saved frame {count}")
        count += 1
    cap.release()

Saved frame 120
Saved frame 240
Saved frame 360
Saved frame 480
Saved frame 600
Saved frame 720
Saved frame 840


In [21]:
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

In [23]:
snapshots = []
if os.path.exists(output_directory):
    for filename in os.listdir(output_directory):
        
        if filename.endswith(".jpg"):
            print(f"Processing {filename}")
            raw_image = Image.open(os.path.join(output_directory, filename), "r").convert('RGB')
            # unconditional image captioning
            inputs = processor(raw_image, return_tensors="pt")
            out = model.generate(**inputs)
            snapshots.append(processor.decode(out[0], skip_special_tokens=True))

        else:
            print(f"Skipping {filename}")
        
else:
    print(f"Directory '{directory}' does not exist.")

# TODO: send snapshots to gpt prompt
print(snapshots)

Processing frame_300.jpg
Processing frame_240.jpg
Processing frame_840.jpg
Processing frame_600.jpg
Processing frame_720.jpg
Processing frame_120.jpg
Processing frame_480.jpg
Processing frame_360.jpg
['there is a bird that is standing in the water with a snake', 'there is a bird that is standing in the water with a snake', 'there is a bird with a fish in its mouth in the grass', 'there is a bird that is standing in the water with a fish in its mouth', 'there is a bird with a long beak standing in the grass', "there is a bird that is standing in the water with a fish in it's mouth", 'there is a bird that is standing in the water with a fish in its mouth', "there is a bird that is standing in the water with a fish in it's mouth"]


## OpenAI

In [None]:
openai.api_key = os.environ.get("OPENAI_API_KEY")

def openai_prompt(prompt: List):
    conversation = [
        {"role": "user", "content": "Give me the mood, genre, and feeling of this description: \"" + prompt + "\""}
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation
    )

    conversation.append({"role": "user", "content": "Summarize your response in the format of the example: \"80s pop track with bassy drums and synth.\""})

    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation
    )

    return response['choices'][0]['message']['content']

## Music generation with MusicGen

In [18]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration

music_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

In [19]:
# pass in descriptions
descriptions = ["acoustic folk song to play during roadtrips: guitar flute choirs"]
inputs = processor(
    text=descriptions,
    padding=True,
    return_tensors="pt",
)
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)

Listen either using jupyter, or export as wav file

In [20]:
from IPython.display import Audio

sampling_rate = music_model.config.audio_encoder.sampling_rate
Audio(audio_values[0].numpy(), rate=sampling_rate)

In [34]:
Audio(audio_values[1].numpy(), rate=sampling_rate)

In [12]:
sampling_rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())

### Combine and export

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 2 huggingface models: processor, model | music_processor, music_model

# Define the two-stage inference function
def two_stage_inference(input_text):
    # Stage 1 inference
    inputs_stage1 = tokenizer_stage1(input_text, return_tensors="pt", padding=True, truncation=True)
    outputs_stage1 = model_stage1(**inputs_stage1)
    logits_stage1 = outputs_stage1.logits

    # Stage 2 inference using the output from stage 1
    inputs_stage2 = tokenizer_stage2(input_text, return_tensors="pt", padding=True, truncation=True)
    outputs_stage2 = model_stage2(**inputs_stage2, labels=logits_stage1)
    logits_stage2 = outputs_stage2.logits

    return logits_stage2

# Save the model
torch.save(model_stage2.state_dict(), "two_stage_model.pth")
