In [None]:
# Fetch the CoDi (i-Code-V3) sources; the next cell changes into the cloned tree.
!git clone https://github.com/simsim314/i-Code.git

In [None]:
# Work from the i-Code-V3 directory: the checkpoint names and ./assets paths used
# further down are relative to it. NOTE: this %cd is relative, so run it only once
# per kernel session.
%cd i-Code/i-Code-V3/
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirement.txt

In [None]:
# Download the CoDi checkpoints into the current directory.
# -c resumes a partial download and skips files that are already complete, so
# re-running this cell does not fetch everything again or create ".1" duplicates.
!wget -c https://huggingface.co/ZinengTang/CoDi/resolve/main/CoDi_audio_diffuser_m.pth
!wget -c https://huggingface.co/ZinengTang/CoDi/resolve/main/CoDi_encoders.pth
!wget -c https://huggingface.co/ZinengTang/CoDi/resolve/main/CoDi_text_diffuser.pth
!wget -c https://huggingface.co/ZinengTang/CoDi/resolve/main/CoDi_video_diffuser_8frames.pth

<a id='ContentList'></a>
# Content List

## Single to Single Generation

### 1. [Text To Image](#TextToImage)

### 2. [Image To Text](#ImageToText)

### 3. [Text To Audio](#TextToAudio)

### 4. [Audio To Text](#AudioToText)

### 5. [Image To Audio](#ImageToAudio)

### 6. [Audio To Image](#AudioToImage)

### 7. [Text To Video](#TextToVideo)

## Multi-Conditioning Generation

### 1. [Text + Audio To Image](#TextImageAudioToImage)

## Joint Multimodal Generation

### 1. [Text To Image+Text](#TextToImageText)

### 2. [Text To Video+Audio](#TextToVideoAudio)

<a id='LoadModel'></a>
# Load Model

In [None]:
"""
Build the inference module from the downloaded checkpoints.

Inference outputs are nested as [number of output modalities, number of samples];
for example, generating 4 samples of image + caption gives a [2, 4] structure.
"""

import os
from core.models.model_module_infer import model_module

# Checkpoint files expected in the current working directory.
model_load_paths = [
    'CoDi_encoders.pth',
    'CoDi_text_diffuser.pth',
    'CoDi_audio_diffuser_m.pth',
    'CoDi_video_diffuser_8frames.pth',
]

# Construct the module, move it to the GPU and switch it to evaluation mode.
inference_tester = model_module(data_dir='', pth=model_load_paths).cuda().eval()


<a id='TextToImage'></a>
# Text To Image
### [Back to Menu](#ContentList)

In [None]:
# Give a prompt (spelling fixed: "napolen" -> "Napoleon")
prompt = "cat wearing Napoleon clothes eating cheese"
# Generate one 256-pixel image conditioned on the text prompt
images = inference_tester.inference(
                xtype = ['image'],
                condition = [prompt],
                condition_types = ['text'],
                n_samples = 1,
                image_size = 256,
                ddim_steps = 100)
# Outputs are indexed [modality][sample]; show the first image
images[0][0]

<a id='ImageToText'></a>
# Image To Text
### [Back to Menu](#ContentList)

In [None]:
# Open the demo picture and resize it to 224x224
from PIL import Image

house_image_path = './assets/demo_files/house.jpeg'
im = Image.open(house_image_path).resize((224, 224))
im

In [None]:
# Sample four texts conditioned on the image held in `im`
text = inference_tester.inference(
    xtype=['text'],
    condition=[im],
    condition_types=['image'],
    n_samples=4,
    ddim_steps=50,
    scale=7.5,
)
# Outputs are indexed [modality][sample]; show the samples of the text modality
text[0]

<a id='TextToAudio'></a>
# Text To Audio
### [Back to Menu](#ContentList)

In [None]:
from IPython.display import Audio

# Text prompt describing the sound to synthesize
prompt = 'a train enters station.'

# Run inference, then keep the entry for the first (only) requested modality
audio_outputs = inference_tester.inference(
    xtype=['audio'],
    condition=[prompt],
    condition_types=['text'],
    scale=7.5,
    n_samples=1,
    ddim_steps=50,
)
audio_wave = audio_outputs[0]

# Render an audio player for the generated waveform (played back at 16 kHz)
Audio(audio_wave.squeeze(), rate=16000)

<a id='AudioToText'></a>
# Audio To Text
### [Back to Menu](#ContentList)

In [None]:
import torchaudio
import torch
from IPython.display import Audio

path = './assets/demo_files/train_sound.flac'

# Read the clip and resample it to 16 kHz
audio_wavs, sr = torchaudio.load(path)
resampled_wavs = torchaudio.functional.resample(waveform=audio_wavs, orig_freq=sr, new_freq=16000)
# Average over dim 0 and keep at most 10.23 s worth of samples
max_samples = int(16000 * 10.23)
audio_wavs = resampled_wavs.mean(0)[:max_samples]
Audio(audio_wavs.squeeze(), rate=16000)

In [None]:
# Number of texts to sample for the audio clip
n_samples = 4
text = inference_tester.inference(
    xtype=['text'],
    condition=[audio_wavs],
    condition_types=['audio'],
    n_samples=n_samples,
    ddim_steps=50,
    scale=7.5,
)
# Display the full nested output ([modality][sample])
text

<a id='ImageToAudio'></a>
# Image To Audio
### [Back to Menu](#ContentList)

In [None]:
# Open the demo picture that will condition the audio generation
from PIL import Image
from core.common.utils import regularize_image

rain_image_path = './assets/demo_files/rain_on_tree.jpg'
im = Image.open(rain_image_path)
im

In [None]:
from IPython.display import Audio

# Run inference conditioned on the image, then keep the entry for the
# first (only) requested modality
audio_outputs = inference_tester.inference(
    xtype=['audio'],
    condition=[im],
    condition_types=['image'],
    scale=7.5,
    n_samples=1,
    ddim_steps=50,
)
audio_wave = audio_outputs[0]

# Render an audio player for the generated waveform (played back at 16 kHz)
Audio(audio_wave.squeeze(), rate=16000)

<a id='AudioToImage'></a>
# Audio To Image
### [Back to Menu](#ContentList)

In [None]:
# Load the input audio, prepare it as the conditioning waveform, and play it
import torchaudio
import torch
from IPython.display import Audio

pad_time = 10.23  # seconds; the waveform is cropped / zero-padded to this length
target_len = int(16000 * pad_time)

path = './assets/demo_files/wind_chimes.wav'

audio_wavs, sr = torchaudio.load(path)
# Resample to 16 kHz, average over dim 0, and crop to at most target_len samples
audio_wavs = torchaudio.functional.resample(waveform=audio_wavs, orig_freq=sr, new_freq=16000).mean(0)[:target_len]
# Zero-pad shorter clips up to target_len (the padding is empty for full-length clips)
padding = torch.zeros([target_len - audio_wavs.size(0)])
audio_wavs = torch.cat([audio_wavs, padding], 0)

# Play the prepared waveform (what the model is conditioned on) rather than the
# raw file, matching the other audio-loading cells in this notebook.
Audio(audio_wavs.squeeze(), rate=16000)


In [None]:
import torch

# Generate a 256-pixel image conditioned on the prepared audio waveform
images = inference_tester.inference(
    xtype=['image'],
    condition=[audio_wavs],
    condition_types=['audio'],
    scale=7.5,
    image_size=256,
    ddim_steps=50,
)
# Outputs are indexed [modality][sample]; show the first image
images[0][0]

<a id='TextToVideo'></a>
# Text To Video
### [Back to Menu](#ContentList)

In [None]:
# Give a prompt (spelling fixed: "a ancy resrtourant" -> "a fancy restaurant")
prompt = 'people having meal in a fancy restaurant'

# Number of videos to sample; passed to the call below instead of a repeated literal
n_samples = 1
outputs = inference_tester.inference(
                ['video'],
                condition = [prompt],
                condition_types = ['text'],
                n_samples = n_samples,
                image_size = 256,
                ddim_steps = 50,
                num_frames = 8,
                scale = 7.5)

# Outputs are indexed [modality][sample]; keep the first video sample
video = outputs[0][0]

In [None]:
# Visualize the generated video as an animated GIF
from PIL import Image
from IPython import display
# Alias the IPython class so it does not shadow PIL's `Image` imported above.
from IPython.display import Image as IPyImage

frame_one = video[0]
path = "./generated_text2video.gif"
# Save the first frame and append the remaining frames; the per-frame duration
# is chosen so that all frames together span 2000 ms.
frame_one.save(path, format="GIF", append_images=video[1:],
               save_all=True, duration=2000/len(video), loop=0)

# Read the file through a context manager so the handle is closed promptly.
with open(path, 'rb') as gif_file:
    gif_bytes = gif_file.read()
# NOTE(review): the bytes are GIF data but format='png' is kept from the original;
# confirm whether the installed IPython accepts format='gif' before changing it.
IPyImage(data=gif_bytes, format='png')

<a id='TextImageAudioToImage'></a>
#  Text + Audio To Image

### [Back to Menu](#ContentList)

In [None]:
# Load the audio condition and play it
import torchaudio
import torch
from IPython.display import Audio

path = './assets/demo_files/sea_waves.wav'

# Read the clip and resample it to 16 kHz
audio_wavs, sr = torchaudio.load(path)
resampled_wavs = torchaudio.functional.resample(waveform=audio_wavs, orig_freq=sr, new_freq=16000)
# Average over dim 0 and keep at most 10.23 s worth of samples
max_samples = int(16000 * 10.23)
audio_wavs = resampled_wavs.mean(0)[:max_samples]
Audio(audio_wavs.squeeze(), rate=16000)


In [None]:
# Text prompt used as the text condition for the multi-conditioning example
prompt = 'dawn, dawn scenery, sunset, beautiful lighting.'


In [None]:
# Generate an image conditioned on both the audio clip and the text prompt.
# mix_weight is the weighting ratio of the condition inputs.

n_samples = 1
images = inference_tester.inference(
    ['image'],
    condition=[audio_wavs, prompt],
    condition_types=['audio', 'text'],
    n_samples=n_samples,
    image_size=256,
    mix_weight={'audio': 1, 'text': 2},
)

# Outputs are indexed [modality][sample]; show the first image
images[0][0]

<a id='TextToImageText'></a>
#  Text To Image + Text

### [Back to Menu](#ContentList)

In [None]:
# Text prompt for joint image + text generation
prompt = 'deep diving in coral reef underwater.'

# Request two output modalities in a single call
outputs = inference_tester.inference(
    ['image', 'text'],
    condition=[prompt],
    condition_types=['text'],
    n_samples=1,
    image_size=256,
)

# One entry per requested modality, in the order requested above
image, text = outputs

In [None]:
# Show the first sample of the image output from the joint generation
image[0]

In [None]:
# Show the first sample of the text output from the joint generation
text[0]

<a id='TextToVideoAudio'></a>
#  Text To Video + Audio

### [Back to Menu](#ContentList)

In [None]:
# Give a prompt
prompt = 'walking inside a beautiful forest, nature, birds.'

# Number of samples to draw; passed to the call below instead of a repeated literal
n_samples = 1
# Request video and audio jointly from the same text condition
outputs = inference_tester.inference(
                ['video', 'audio'],
                condition = [prompt],
                condition_types = ['text'],
                n_samples = n_samples,
                image_size = 256,
                ddim_steps = 50,
                num_frames = 8,
                scale = 7.5)


In [None]:
from IPython.display import Audio

# One entry per requested modality, in the order ['video', 'audio']
video, audio_wave = outputs

# Play the first generated audio sample at 16 kHz
first_audio = audio_wave[0]
Audio(first_audio.squeeze(), rate=16000)

In [None]:
# Visualize the generated video as an animated GIF
from PIL import Image
from IPython import display
# Alias the IPython class so it does not shadow PIL's `Image` imported above.
from IPython.display import Image as IPyImage

# Use a separate name for the first sample instead of overwriting `video`, so
# that re-running this cell does not index into an already-unwrapped value.
video_frames = video[0]
frame_one = video_frames[0]
path = "./generated_video.gif"
# Save the first frame and append the remaining frames; the per-frame duration
# is chosen so that all frames together span 2000 ms.
frame_one.save(path, format="GIF", append_images=video_frames[1:],
               save_all=True, duration=2000/len(video_frames), loop=0)

# Read the file through a context manager so the handle is closed promptly.
with open(path, 'rb') as gif_file:
    gif_bytes = gif_file.read()
# NOTE(review): the bytes are GIF data but format='png' is kept from the original;
# confirm whether the installed IPython accepts format='gif' before changing it.
IPyImage(data=gif_bytes, format='png')