In [None]:
import os

# Read the Gemini API key from the environment so the secret never has to be
# pasted into (and saved with) the notebook. When the GOOGLE_API_KEY variable
# is not set this falls back to an empty string, the same value the
# placeholder used to hold.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

from audiocraft.models import musicgen
from audiocraft.utils.notebook import display_audio
import google.generativeai as genai
import time
import json

# Register the API key with the google.generativeai client; the upload and
# model calls further down go through this same `genai` module.
genai.configure(
    api_key=GOOGLE_API_KEY,
)

In [None]:
# Sample music-generation prompts embedded in the system instruction. The
# wording is kept verbatim; they are rendered one per line, single-quoted,
# comma-separated, with a period closing the last entry.
_MUSIC_PROMPT_SAMPLES = (
    'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
    'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
    '90s rock song with electric guitar and heavy drums, nightcore, 140bpm',
    'lofi melody loop, A minor, 110 bpm, jazzy chords evoking a feeling of curiosity, relaxing, vinyl recording',
    'J-Pop, 140bpm, 320kbps, 48kHz',
    'funk, disco, R&B, AOR, soft rock, and boogie',
    'a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130',
)
music_prompt_examples = (
    "\n" + ",\n".join(f"'{sample}'" for sample in _MUSIC_PROMPT_SAMPLES) + ".\n"
)

# Shape of the JSON object the model is told to answer with. The two keys are
# the ones read back from the parsed response later in the notebook.
json_schema = '\n{"Content Description": "string", "Music Prompt": "string"}\n'

# Role description that opens the system instruction (a single line of text).
_SUPERVISOR_BRIEF = (
    "You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. "
    "Given an image or video, describe the scene and generate a prompt suitable for music generation models. "
    "Generate a music prompt based on the description, and use keywords if provided by the user:"
)

# Full system instruction: role description, the sample prompts, then the
# required response schema, separated by blank lines.
gemini_instructions = (
    "\n"
    f"{_SUPERVISOR_BRIEF}\n"
    "\n"
    f"{music_prompt_examples}\n"
    "\n"
    f"You must return your response using this JSON schema: {json_schema}\n"
)


In [None]:
# File name of the clip that will be uploaded and scored.
video_file_name = "test_reel_video.mp4"

# Gemini model used to analyse the clip, configured with
# `gemini_instructions` as its system instruction.
mllm_model = genai.GenerativeModel(
    'gemini-1.5-flash-latest',
    system_instruction=gemini_instructions,
)

In [None]:
# Upload the clip, then re-fetch its record every 3 seconds for as long as
# the service reports it as PROCESSING. One dot is printed per poll.
video_file = genai.upload_file(video_file_name)

while True:
    if video_file.state.name != "PROCESSING":
        break
    print(".", end="")
    time.sleep(3)
    video_file = genai.get_file(video_file.name)

# Abort when the service reports the file as FAILED.
if video_file.state.name == "FAILED":
    raise ValueError(video_file.state.name)

In [None]:
def _extract_json_object(text):
    """Parse the JSON object embedded in a model reply.

    The reply may wrap the object in a Markdown code fence or surround it
    with other text. ``str.strip("```json\\n")`` is not a reliable way to
    remove that wrapping: it strips a *set of characters* from both ends
    rather than a prefix/suffix, so it stops at the first character outside
    the set (for example a trailing space after the closing fence) and leaves
    the backticks in place. Instead, take everything from the first ``{`` to
    the last ``}`` and parse that.

    Args:
        text: Raw text of the model reply.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the text contains no ``{...}`` span. ``json.loads``
            raises ``json.JSONDecodeError`` (a ``ValueError`` subclass) when
            the span is not valid JSON.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in model response: {text!r}")
    return json.loads(text[start:end + 1])


# Ask the model to describe the uploaded clip; the request may take up to
# 600 seconds before timing out.
response = mllm_model.generate_content(
    [video_file, 'Explain what is happening in this video. '],
    request_options={"timeout": 600},
)

# Parsed reply; later cells read its 'Music Prompt' entry.
cleaned_response = _extract_json_object(response.text)

In [None]:
# Load the pretrained 'musicgen-melody' checkpoint on the CUDA device.
musicgen_model = musicgen.MusicGen.get_pretrained(
    'musicgen-melody',
    device='cuda',
)

# Set the generation length to the duration reported in the uploaded file's
# video metadata.
musicgen_model.set_generation_params(
    duration=video_file.video_metadata.video_duration.seconds,
)

In [None]:
music_prompt = cleaned_response['Music Prompt']

# Stage 1: generate a 30-second draft from the text prompt alone. It gets its
# own name so re-running the cell never feeds a previous final result back in.
musicgen_model.set_generation_params(duration=30)
melody_draft = musicgen_model.generate([music_prompt], progress=True)

# Stage 2: generate at the full video length, conditioned on both the text
# prompt and the draft's melody.
musicgen_model.set_generation_params(duration=video_file.video_metadata.video_duration.seconds)

# Per the AudioCraft API, `generate_with_chroma` takes the melody either as a
# batched [B, C, T] tensor or as a list of [C, T] tensors, plus a required
# `melody_sample_rate`. `generate` already returns a batched tensor, so it is
# passed as-is (wrapping it in a list would supply a 3-D tensor where 2-D
# entries are expected). The draft was produced by this same model, so its
# sample rate is the model's own.
result = musicgen_model.generate_with_chroma(
    [music_prompt],
    melody_draft,
    melody_sample_rate=musicgen_model.sample_rate,
    progress=True,
)



In [None]:
# `display_audio` requires the playback sample rate alongside the waveform;
# the audio was generated by `musicgen_model`, so use that model's rate.
display_audio(result, sample_rate=musicgen_model.sample_rate)