### 0. pip & import

In [47]:
!pip install googletrans==4.0.0-rc1



In [48]:
!pip install transformers
!pip install diffusers



In [49]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [1]:
import nltk
# Fetch the NLTK data packages used later in the notebook:
# 'stopwords' backs nltk.corpus.stopwords; 'punkt' is presumably the
# tokenizer data nltk.word_tokenize relies on (verify for your NLTK version).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import torch, logging
from PIL import Image
from torchvision import transforms as tfms
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler

  from .autonotebook import tqdm as notebook_tqdm


### 1. 문장 번역

In [3]:
from googletrans import Translator
# googletrans client used by the translation cell below
translator = Translator()

In [4]:
# Korean source title: translated to English for the prompt, and reused
# verbatim as the output image file name.
new_title = '가을 산책할 때 듣기 좋은 노래'

In [5]:
# Translate the title to English and keep only the translated text
translation = translator.translate(new_title, dest='en')
new_title_trans = translation.text
new_title_trans

'A good song to listen to when walking in autumn'

- 토큰화

In [6]:
# Load NLTK's English stop-word list
stopwords = nltk.corpus.stopwords.words('english')

# Tokenize the translated title and keep only non-stop-word tokens.
# The stop-word list is all lowercase, so each token is lowercased for the
# comparison (the kept token itself retains its original casing).
tokens = [
    word
    for word in nltk.word_tokenize(new_title_trans)
    if word.lower() not in stopwords
]
print(tokens)
# TODO: document clearly why nltk was chosen (as opposed to konlpy)

['good', 'song', 'listen', 'walking', 'autumn']


In [125]:
# Words that should not steer the image even though they survive stop-word removal
remove_prompts = ['good', 'song', 'listen', 'hear']

# Drop those words from the token list. Tokens keep their original casing,
# so compare in lowercase (otherwise e.g. 'Good' would slip through).
kept_tokens = [item for item in tokens if item.lower() not in remove_prompts]

# Style keywords that are always appended
additional_prompts = ['photograph, landscape, simple']

# Build ONE prompt string. prompt_2_img generates one image per list element
# (bs = len(prompts)), so a list of separate tokens would be treated as a
# batch of unrelated prompts and would not match the single negative prompt.
prompts = [', '.join(kept_tokens + additional_prompts)]

### 2. 이미지 생성

In [155]:
import torch, logging

## Disable warnings: suppress all log records at WARNING level and below
logging.disable(logging.WARNING)

## Imaging  library
from PIL import Image
from torchvision import transforms as tfms

## Basic libraries
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import shutil
import os

## For video display
from IPython.display import HTML
from base64 import b64encode

## Import the CLIP artifacts
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler

## Hub identifiers shared by the loaders below
CLIP_MODEL_ID = "openai/clip-vit-large-patch14"
SD_MODEL_ID = "CompVis/stable-diffusion-v1-4"

## Tokenizer and text encoder (encoder explicitly cast to float32)
tokenizer = CLIPTokenizer.from_pretrained(CLIP_MODEL_ID)
text_encoder = CLIPTextModel.from_pretrained(CLIP_MODEL_ID).to(torch.float32)

## VAE, explicitly cast to float32
vae = AutoencoderKL.from_pretrained(SD_MODEL_ID, subfolder="vae").to(torch.float32)

## Scheduler, initialised with 50 sampling steps
scheduler = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000,
)
scheduler.set_timesteps(50)

## U-Net, explicitly cast to float32
unet = UNet2DConditionModel.from_pretrained(SD_MODEL_ID, subfolder="unet").to(torch.float32)

## Helper functions
def load_image(p):
    '''
    Load an image from disk as a 512x512 RGB PIL image.

    Args:
        p: Path (or file object) accepted by ``Image.open``.

    Returns:
        A new ``PIL.Image.Image`` in RGB mode, resized to (512, 512).
    '''
    # Image.open keeps the underlying file open until the data is read;
    # the context manager closes it once convert() has produced a new,
    # fully loaded image, so no file handle is leaked.
    with Image.open(p) as img:
        return img.convert('RGB').resize((512, 512))

def pil_to_latents(image):
    '''
    Encode a PIL image with the VAE and return scaled latents.

    The image tensor is mapped through ``x * 2.0 - 1.0``, moved to CPU as
    float32, encoded, and a sample drawn from the latent distribution is
    multiplied by 0.18215 (the factor that latents_to_pil divides out).
    '''
    to_tensor = tfms.ToTensor()
    pixels = to_tensor(image).unsqueeze(0)  # add a leading batch dimension
    pixels = pixels * 2.0 - 1.0
    pixels = pixels.to(device="cpu", dtype=torch.float32)  # run on CPU in float32
    posterior = vae.encode(pixels).latent_dist
    return posterior.sample() * 0.18215

def latents_to_pil(latents):
    '''
    Decode a batch of latents with the VAE into a list of PIL images.

    The latents are multiplied by ``1 / 0.18215`` before decoding; the decoded
    tensor is mapped through ``x / 2 + 0.5``, clamped to [0, 1], moved to
    channels-last layout and converted to uint8 arrays.
    '''
    scaled = latents * (1 / 0.18215)
    with torch.no_grad():
        decoded = vae.decode(scaled).sample
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    # permute(0, 2, 3, 1) moves axis 1 to the end before handing off to numpy
    arrays = decoded.detach().cpu().permute(0, 2, 3, 1).numpy()
    arrays = (arrays * 255).round().astype("uint8")
    return [Image.fromarray(arr) for arr in arrays]

def text_enc(prompts, maxlen=None):
    '''
    Tokenize textual prompt(s) and return the text encoder's embeddings.

    Args:
        prompts: Prompt string or list of prompt strings.
        maxlen: Padding/truncation length; defaults to
            ``tokenizer.model_max_length`` when None.

    Returns:
        The first element of the text encoder's output for the token ids.
    '''
    max_length = tokenizer.model_max_length if maxlen is None else maxlen
    tokenized = tokenizer(
        prompts,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    encoder_output = text_encoder(tokenized.input_ids)
    return encoder_output[0]

def prompt_2_img(prompts, neg_prompts=None, g=7.5, seed=100, steps=70, dim=512, save_int=False):
    """
    Run the diffusion loop that turns text prompt(s) into image(s).

    Args:
        prompts: A prompt string or a list of prompt strings. One image is
            generated per prompt.
        neg_prompts: Optional negative prompt(s). A single string or a
            one-element list is applied to every prompt; otherwise it must
            contain exactly one entry per prompt. When falsy, the empty
            string is used as the unconditional prompt.
        g: Guidance scale used in ``pred = u + g * (t - u)``.
        seed: Passed to ``torch.manual_seed`` when truthy; ``None`` or ``0``
            leaves the global RNG state untouched.
        steps: Number of scheduler timesteps.
        dim: Image side length in pixels; latents use ``dim // 8`` per side.
        save_int: When True, decode the latents after every step and save
            the first image to ``steps/{i:04}.jpeg``.

    Returns:
        The list of PIL images produced by ``latents_to_pil``.

    Raises:
        ValueError: If the number of negative prompts is neither 1 nor the
            number of prompts.
    """
    # A bare string must become a one-element batch; len() of a str would
    # otherwise count characters.
    if isinstance(prompts, str):
        prompts = [prompts]
    bs = len(prompts)

    # Normalise the negative prompts to exactly one entry per prompt so the
    # unconditional and conditional embeddings have matching batch sizes.
    if isinstance(neg_prompts, str):
        neg_prompts = [neg_prompts]
    if not neg_prompts:
        neg_prompts = [""] * bs
    elif len(neg_prompts) == 1:
        neg_prompts = list(neg_prompts) * bs
    elif len(neg_prompts) != bs:
        raise ValueError(
            f"neg_prompts has {len(neg_prompts)} entries but prompts has {bs}; "
            "pass a single negative prompt or one per prompt"
        )

    # Converting textual prompts to embeddings (inference only, so no
    # autograd graph is needed)
    with torch.no_grad():
        text = text_enc(prompts)
        uncond = text_enc(neg_prompts, text.shape[1])
    emb = torch.cat([uncond, text])

    # Setting the seed
    if seed: torch.manual_seed(seed)

    # Initiating random noise (explicitly float32)
    latents = torch.randn((bs, unet.config.in_channels, dim // 8, dim // 8), dtype=torch.float32)

    # Setting number of steps in scheduler
    scheduler.set_timesteps(steps)

    # Scaling the initial noise to the scheduler's starting sigma
    latents = latents * scheduler.init_noise_sigma

    # The output folder only needs to be created once, not on every step
    if save_int:
        os.makedirs('steps', exist_ok=True)

    # Iterating through defined steps
    for i, ts in enumerate(tqdm(scheduler.timesteps)):
        # Latents are duplicated so one U-Net call covers both the
        # unconditional and the conditional embeddings
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)

        # Predicting noise residual using U-Net
        with torch.no_grad():
            u, t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)

        # Performing guidance
        pred = u + g * (t - u)

        # Conditioning the latents
        latents = scheduler.step(pred, ts, latents).prev_sample

        # Saving intermediate images
        if save_int:
            latents_to_pil(latents)[0].save(f'steps/{i:04}.jpeg')

    # Decoding the final latents into PIL images
    return latents_to_pil(latents)

# Generate the image; the negative prompt lists human features to avoid.
# Note: `images` holds a single PIL image (the first result), not a list.
negative_prompts = ["body, hair, face, eyes, nose, ears, lip, legs, arms, hands, feet, girl, boy, people"]
images = prompt_2_img(
    prompts=prompts,
    neg_prompts=negative_prompts,
    steps=50,
    save_int=False,
)[0]

# Display the generated image without axes
plt.imshow(images)
plt.axis('off')
plt.show()

    # Debugging log:
    # 1. Changed the code so it can run on CPU instead of GPU.
    # 2. Error: "LayerNormKernelImpl" not implemented for 'Half' -> moved the model to CPU.
    # 3. Same error persisted -> converted the model to float32 -> converted the text_encoder model to 'Float' as well.
    # 4. Error: "value cannot be converted to type at::Half without overflow" -> force-cast the affected part to 'Float'.
    # 5. Error: "local variable 'inp' referenced before assignment" -> removed the inp variable.
    # 6. Error: "value cannot be converted to type at::Half without overflow" again -> set float32 explicitly -> same error.
    # 7. Converted both the model and the input data to float32 -> same error again -> some part inside the model still seems to run in half precision.
    # -> Next step: retrain the model and/or modify the model internals directly.

RuntimeError: value cannot be converted to type at::Half without overflow

### 3. 이미지 저장

In [None]:
# Folder that receives the generated image
output_folder = "result"

# Create the folder if it is missing; exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test
os.makedirs(output_folder, exist_ok=True)

# File name of the generated image, derived from the original title
output_file_name = f"{new_title}.jpg"

# Full path of the image file
output_file_path = os.path.join(output_folder, output_file_name)

In [None]:
# Save the generated image to output_file_path
images.save(output_file_path)