In [None]:
!pip install -qU flash_attn==2.7.4.post1
!pip install -qU torch==2.6.0
!pip install -qU transformers==4.48.2
!pip install -qU accelerate==1.3.0
!pip install -qU soundfile==0.13.1
!pip install -qU pillow==11.1.0
!pip install -qU scipy==1.15.2
!pip install -qU torchvision==0.21.0
!pip install -qU backoff==2.2.1
!pip install -qU peft==0.13.2

In [1]:
import requests
import torch
import os
import io
from PIL import Image
import soundfile as sf
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from urllib.request import urlopen


# Hugging Face Hub id of the checkpoint used throughout the notebook.
# The repo ships custom modeling/processing code (hence trust_remote_code=True).
# The Hub warns that new versions of that code are downloaded unless a revision
# is pinned; consider passing `revision="<commit sha>"` to the three
# from_pretrained calls below once a known-good commit is chosen.
model_path = "microsoft/Phi-4-multimodal-instruct"

# Processor: builds model inputs from text / images / audio and decodes outputs.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# Model: device_map="cuda" already places the weights on the GPU at load time,
# so no additional `.cuda()` call is needed afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation='flash_attention_2',
)

# Default generation settings published alongside the checkpoint.
generation_config = GenerationConfig.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- processing_phi4mm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-multimodal-instruct:
- configuration_phi4mm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was do

In [2]:
# Chat-template markers used to assemble every prompt in this notebook:
# a user turn opener, the end-of-turn suffix, and the assistant turn opener.
user_prompt, assistant_prompt, prompt_suffix = (
    '<|user|>',
    '<|assistant|>',
    '<|end|>',
)

In [3]:
# Part 1: Image Processing
# Ask the model to describe a single image referenced by the <|image_1|> placeholder.
print("\n--- IMAGE PROCESSING ---")
image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Download and open image.
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into a clear exception instead of
# handing an error page to PIL. Reading `.content` (rather than the raw
# socket stream) also applies any transfer/content decoding.
image_response = requests.get(image_url, timeout=30)
image_response.raise_for_status()
image = Image.open(io.BytesIO(image_response.content))
inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')

# Generate response
generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)
# Drop the prompt tokens so only the newly generated continuation is decoded.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')


--- IMAGE PROCESSING ---
>>> Prompt
<|user|><|image_1|>What is shown in this image?<|end|><|assistant|>
>>> Response
A stop sign in front of a building with Chinese writing on it.


In [5]:

# Part 2: Audio Processing
# Ask the model to transcribe an audio clip and translate it to French,
# with the clip referenced by the <|audio_1|> placeholder.
print("\n--- AUDIO PROCESSING ---")
audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"

speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."

prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
print(f'>>> Prompt\n{prompt}')

# Download and open audio file.
# The context manager closes the HTTP response once the bytes are read, and
# the timeout keeps the cell from hanging on a stalled connection.
with urlopen(audio_url, timeout=60) as audio_response:
    audio, samplerate = sf.read(io.BytesIO(audio_response.read()))
# Alternative: load a local file instead of downloading.
# audio, samplerate = sf.read('audio.wav')


# Process with the model
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')

generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
    generation_config=generation_config,
)
# Drop the prompt tokens so only the newly generated continuation is decoded.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(f'>>> Response\n{response}')



--- AUDIO PROCESSING ---
>>> Prompt
<|user|><|audio_1|>Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation.<|end|><|assistant|>




>>> Response
What we do as a society, we have to think about where we're moving to. I frequently talk to students about cognitive enhancing drugs and a lot of students take them for studying and exams, but other students feel angry about this. They feel those students are cheating and we have no long-term health and safety studies in healthy people and we really need those before people start taking them. <sep> Ce que nous faisons en tant que société, nous devons penser à où nous allons. Je parle fréquemment avec des étudiants sur les médicaments cognitifs et beaucoup d'étudiants les prennent pour étudier et les examens, mais d'autres étudiants se sentent en colère à ce sujet. Ils sentent que ces étudiants trichent et nous n'avons pas d'études de santé et de sécurité à long terme sur des personnes saines et nous en avons vraiment besoin avant que les gens ne commencent à les prendre.


In [None]:
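# Reference text (Japanese) -- presumably the ground-truth transcript of the clip
# transcribed in the next cell; compare it against the model output there:
#   体はきっと拙の思いを理解してくれる。どうか正しいところに栄養が吸収されますように。
# English gloss: "My body will surely understand my feelings. May the nutrients
# be absorbed in the right places."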

In [10]:
# Emotion-tagged transcription: ask the model to transcribe the clip and append
# one of eight emotion tags to the end of the transcript.
speech_prompt2 = "Please transcribe the following audio content and give the <emotion> tag at the end. Emotions are divided into 8 categories: <sad>, <anger>, <neutral>, <happy>, <surprise>, <fear>, <disgust>, and <other>."

prompt = user_prompt + '<|audio_1|>' + speech_prompt2 + prompt_suffix + assistant_prompt
print('>>> Prompt\n' + prompt)

# Build model inputs from the prompt and the audio currently held in the kernel.
# NOTE(review): `audio` / `samplerate` are whatever an earlier cell last loaded.
# The recorded output of this cell is Japanese, so it was presumably run on a
# different clip than the one downloaded in the audio-processing cell -- confirm
# which file is intended and load it explicitly.
inputs = processor(
    text=prompt,
    audios=[(audio, samplerate)],
    return_tensors='pt',
).to('cuda:0')

# Number of prompt tokens; everything after this offset is newly generated.
prompt_len = inputs['input_ids'].shape[1]

generate_ids = model.generate(**inputs, max_new_tokens=1000, generation_config=generation_config)
generate_ids = generate_ids[:, prompt_len:]

decoded = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
response = decoded[0]
print('>>> Response\n' + response)

>>> Prompt
<|user|><|audio_1|>Please transcribe the following audio content and give the <emotion> tag at the end. Emotions are divided into 8 categories: <sad>, <anger>, <neutral>, <happy>, <surprise>, <fear>, <disgust>, and <other>.<|end|><|assistant|>
>>> Response
体はきっと節の思いを理解してくれる。どうか正しいところに栄養が吸収されますように。 <sad> The body will surely understand the intention of the section. May the nutrients be absorbed in the right place.


In [None]:
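## Notes on the three versions tried (presumably variants of the emotion-tag prompt):
## Version 1: vanilla             --> the output ends with <e>
## Version 2: *sad* ...           --> the output is also <e>
## Version 3: <sad>               --> the output becomes <sad>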