In [None]:
from google.colab import drive
drive.mount('/content/drive')
# The drive must be mounted to access the project scripts, e.g. simple_inference_custom

Mounted at /content/drive


In [None]:
# Install necessary libraries
!pip install transformers
!pip install adapter-transformers
!pip install torch
!pip install datasets
!pip install webdataset
!pip install pytorch_lightning
!pip install git+https://github.com/openai/CLIP.git
# download the linear mse model path
!wget https://github.com/microsoft/LMOps/raw/main/promptist/aesthetic/sac%2Blogos%2Bava1-l14-linearMSE.pth
!pip install openai
!pip install hpsv2

In [None]:
!pip install adapter-transformers -U

In [None]:
! pip install diffusers

In [None]:
# !pip install peft

In [None]:
pwd

'/content'

In [None]:
ls '/content/drive/Shared drives/capstone_rlt2i/codes/reward_predictor/aesthetic_scores/'

predict_aesthetic_scores.py  [0m[01;34m__pycache__[0m/  scores.csv  simple_inference_custom.py


In [None]:
import sys
sys.path.append('/content/drive/Shared drives/capstone_rlt2i/codes/reward_predictor/aesthetic_scores/')
import simple_inference_custom

In [None]:
# sys.path.append('/content/drive/Shared drives/capstone_rlt2i/codes/')
# import text2img
# from text2img import generate_images
# import importlib
# importlib.reload(text2img)
# from text2img import generate_images

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForCausalLM
import random
# simple_inference_custom is our customized script for calculating the aesthetic score
from simple_inference_custom import predict_aesthetic_score
import os
import hpsv2
from sklearn.preprocessing import MinMaxScaler
from transformers import CLIPProcessor, CLIPModel
from datasets import load_dataset
import numpy as np
import openai
import json
import os

from diffusers import StableDiffusionPipeline

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the openjourney Stable Diffusion checkpoint in float32 and place it on `device`.
t2i_pipe = StableDiffusionPipeline.from_pretrained(
    "prompthero/openjourney",
    torch_dtype=torch.float32,
).to(device)

In [None]:
# policy model as an actor-critic model
class PolicyModel(nn.Module):
    """Actor-critic network with a shared trunk and two sigmoid-bounded heads.

    The trunk sends the prompt features through two ReLU-activated linear
    layers (prompt_feature_size -> 128 -> 64). The actor head produces a
    sampling temperature and the critic head a value estimate; both pass
    through a sigmoid, so each output lies strictly between 0 and 1.
    """

    def __init__(self, prompt_feature_size):
        super().__init__()
        # Shared trunk
        self.fc1 = nn.Linear(prompt_feature_size, 128)
        self.fc2 = nn.Linear(128, 64)
        # Heads: actor (temperature) first, then critic (value)
        self.fc_actor_temp = nn.Linear(64, 1)
        self.fc_critic = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, prompt_features):
        """Return `(temperature, value_estimate)` for the given prompt features.

        A 1-D input is treated as a single sample and gets a leading batch
        dimension before it enters the trunk.
        """
        if prompt_features.dim() == 1:
            prompt_features = prompt_features.unsqueeze(0)

        hidden = self.fc2(self.fc1(prompt_features).relu()).relu()

        # Both heads are squashed by the sigmoid to keep predictions in (0, 1)
        temperature = self.sigmoid(self.fc_actor_temp(hidden))
        value_estimate = self.sigmoid(self.fc_critic(hidden))
        return temperature, value_estimate

    def initialize_with_defaults(self, default_temperature=0.9):
        """Set the actor bias so that a zero pre-bias activation yields `default_temperature`."""
        # Inverse sigmoid (logit) of the requested temperature
        target = torch.tensor(default_temperature)
        pre_activation = -torch.log(1. / target - 1.)
        nn.init.constant_(self.fc_actor_temp.bias, pre_activation.item())


def compute_combined_score(text, image, power=1, w_aesthetic=0.5, w_hps=0.5):
    """Blend the aesthetic score and the HPS v2 score of one image into a single reward.

    Args:
        text: Prompt the image was generated from (passed to `hpsv2.score`).
        image: Generated image (passed to both scorers).
        power: Exponent applied to the HPS score to amplify differences between scores.
        w_aesthetic: Weight of the normalized aesthetic score.
        w_hps: Weight of the transformed HPS score.

    Returns:
        A 0-dim float32 tensor on `device` holding the weighted sum, or None
        when either scorer raises (the error is printed).
    """
    try:
        aesthetic_score = predict_aesthetic_score(image)
        HPS_score = hpsv2.score(image, text)
    except Exception as e:
        print(f"Error processing image: {str(e)}")
        return None

    aesthetic_score_val = aesthetic_score.cpu().item()

    # Divide by 10 to bring the aesthetic score to roughly [0, 1]
    # (assumes the predictor scores on a 0-10 scale -- TODO confirm)
    aesthetic_score_normalized = aesthetic_score_val / 10.0

    # Apply power transformation to HPS score to amplify the difference between HPS scores
    HPS_score_transformed = HPS_score[0] ** power

    # HPS score is treated as already being in [0, 1], so it is not normalized again
    combined_score = w_aesthetic * aesthetic_score_normalized + w_hps * HPS_score_transformed

    # The reward is a constant of the optimisation problem: build it directly on
    # the target device as float32 and do not track gradients for it.
    combined_score_tensor = torch.tensor(float(combined_score), dtype=torch.float32, device=device)

    return combined_score_tensor


# text to image generation
# use DAL2E text to image model
# def text_to_image_generation(text, name):
#     generated_image = generate_images(text, name)
#     return generated_image

# use openjourney text to image model
def text_to_image_generation(prompts):
    """Run the module-level `t2i_pipe` on `prompts` and return the `.images` of its result."""
    return t2i_pipe(prompts).images

In [None]:
def prompt_enrichment(prompt, temperature=0.5, desired_length=300):
    """Expand a short base prompt into an enriched text-to-image prompt.

    The few-shot examples in the module-level `few_shot_prompt_encoded` are
    prepended to the query, `LM_model` samples a continuation, and only the
    first line of that continuation is kept.

    Args:
        prompt: Base prompt text.
        temperature: Sampling temperature. Values at or below zero are raised
            to a small positive floor, because sampling needs a strictly
            positive temperature.
        desired_length: Forwarded to `generate` as `max_length`; it therefore
            bounds the few-shot prefix, the query and the continuation together.

    Returns:
        The enriched prompt with surrounding whitespace stripped.
    """
    # Callers may clamp the temperature to exactly 0.0; keep sampling valid.
    min_temperature = 1e-2
    temperature = max(float(temperature), min_temperature)

    curr_prompt = "Base prompt: " + prompt + "\nEnriched prompt for text-to-image generation: "

    # Tokenize the query and append it to the pre-encoded few-shot examples
    curr_prompt_encoded = LM_tokenizer.encode("\n" + LM_tokenizer.bos_token + curr_prompt, return_tensors='pt')[0]
    input_ids = torch.cat((few_shot_prompt_encoded, curr_prompt_encoded), dim=0)
    # Add the batch dimension and make sure the ids live where the model does
    input_ids = input_ids.unsqueeze(0).to(LM_model.device)

    # Generate text with temperature and desired length
    output = LM_model.generate(input_ids, max_length=desired_length, num_return_sequences=1, bos_token_id=LM_tokenizer.bos_token_id,
                            eos_token_id=LM_tokenizer.eos_token_id, temperature=temperature, do_sample=True)

    # Decode the generated text
    generated_text = LM_tokenizer.decode(output[0], skip_special_tokens=True)

    # Drop everything up to and including the query itself
    start_idx = generated_text.find(curr_prompt)
    if start_idx != -1:
        generated_text = generated_text[start_idx + len(curr_prompt):]

    # Keep only the first line of the continuation
    end_idx = generated_text.find("\n")
    if end_idx != -1:
        generated_text = generated_text[:end_idx]

    # Cut off a follow-up example the model may have started on the same line
    end_idx = generated_text.find("Base prompt")
    if end_idx != -1:
        generated_text = generated_text[:end_idx]

    generated_text = generated_text.strip()

    return generated_text

### Load & Preprocess Data

In [None]:
# Read the fine-tuning CSV from the shared drive and pull out the enriched prompts
training_data_path = '/content/drive/Shared drives/capstone_rlt2i/Momo/eval_results/'
training_data = pd.read_csv(f"{training_data_path}finetuning-dataset.csv")
all_enriched_prompts = training_data.loc[:, 'Prompt']

In [None]:
all_base_prompts = training_data['Base_prompt']

In [None]:
# Mean length of the base prompts, measured in characters and truncated to an int
average_base_prompt_length = int(sum(map(len, all_base_prompts)) / len(all_base_prompts))
print(f"Average prompt length: {average_base_prompt_length}")

Average prompt length: 26


In [None]:
# Mean length of the enriched prompts, measured in characters and truncated to an int
average_prompt_length = int(sum(map(len, all_enriched_prompts)) / len(all_enriched_prompts))
print(f"Average prompt length: {average_prompt_length}")

Average prompt length: 253


In [None]:
# Take an independent copy of the first 5000 rows so that columns added later
# are written to this frame rather than to a slice of `training_data`.
df_data_samples = training_data.iloc[:5000].copy()

In [None]:
df_data_samples.head()

Unnamed: 0,Base_prompt,Prompt
0,realistic car 3 d,realistic car 3 d render sci - fi car and sci ...
1,a comic potrait of a female necromamcer,a comic potrait of a female necromamcer with b...
2,steampunk market,"steampunk market interior, colorful, 3 d scene..."
3,“ A portrait of a cyborg,"“A portrait of a cyborg in a golden suit, D&D ..."
4,A full portrait of a beautiful post,A full portrait of a beautiful post apocalypti...


In [None]:
import torch
from transformers import BertModel, BertTokenizer
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Shared tokenizer/model, filled on first use so the checkpoint is loaded once
# instead of once per DataFrame row.
_BERT_CACHE = {}


def _get_bert():
    """Return the shared `(tokenizer, model)` pair, loading bert-base-uncased on first call."""
    if not _BERT_CACHE:
        _BERT_CACHE['tokenizer'] = BertTokenizer.from_pretrained('bert-base-uncased')
        _BERT_CACHE['model'] = BertModel.from_pretrained('bert-base-uncased').to(device)
    return _BERT_CACHE['tokenizer'], _BERT_CACHE['model']


def extract_bert_features(row):
    """Encode `row['Base_prompt']` with BERT and return its pooled output on the CPU.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Base_prompt' entry.

    Returns:
        The model's `pooler_output` tensor for the prompt, moved to the CPU.
    """
    text = row['Base_prompt']

    tokenizer, model = _get_bert()

    # Tokenize the text and truncate/pad it to the maximum length
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length").to(device)

    # Extract text features without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        # The pooled output is used here; the last hidden states are the alternative
        text_features = outputs.pooler_output

    return text_features.to('cpu')

# Per-row wall-clock durations collected by timed_extract_bert_features
times = []

def timed_extract_bert_features(row):
    """Call `extract_bert_features(row)`, append its duration in seconds to `times`, and return the features."""
    started_at = time.time()
    text_features = extract_bert_features(row)
    finished_at = time.time()

    times.append(finished_at - started_at)
    return text_features

# Compute the features row by row and attach them as a new column. `assign`
# returns a new frame, so nothing is written into a slice of another DataFrame.
df_data_samples = df_data_samples.assign(
    text_features=df_data_samples.apply(timed_extract_bert_features, axis=1)
)

# Calculate total and average time (the average is 0.0 when no row was processed)
total_time_taken = sum(times)
average_time_taken = total_time_taken / len(times) if times else 0.0

print(f"Total time taken: {total_time_taken} seconds")
print(f"Average time taken per row: {average_time_taken} seconds")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight

Total time taken: 13.726128578186035 seconds
Average time taken per row: 2.745225715637207 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_samples['text_features'] = df_data_samples.apply(lambda x: timed_extract_bert_features(x), axis=1)


In [None]:
from torch.utils.data import Dataset, DataLoader

class TextImageDataset(Dataset):
    """Dataset view over a DataFrame that has 'Base_prompt' and 'text_features' columns."""

    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        # One sample per DataFrame row
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        """Return `(prompt, text_features)` for the row at position `idx`."""
        record = self.dataframe.iloc[idx]
        # An 'image_features' column could be returned here as a third element if it exists
        return record['Base_prompt'], record['text_features']

# Number of prompts per training batch
BATCH_SIZE = 5

# Wrap the sampled frame in the dataset and serve it in shuffled mini-batches
dataset = TextImageDataset(df_data_samples)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# initialization few-shot language model

# model_base = 'gpt2-medium' # model_name = "bigscience/bloom-7b1" # model_name = 'gpt2-medium' # adapter_path = '/content/drive/My Drive/NLP271B/model'

model_name = "meta-llama/Llama-2-7b-chat-hf"

# Credentials must never be stored in the notebook. Read the Hugging Face token
# from the HF_TOKEN environment variable, or ask for it interactively.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")
if not access_token:
    from getpass import getpass
    access_token = getpass("Hugging Face access token: ")

LM_tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
LM_model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)

# Set the model to evaluation mode
LM_model.eval()

In [None]:
# Few-shot examples, each wrapped in the language model's BOS/EOS tokens.
# They must be built and encoded with LM_tokenizer: prompt_enrichment
# concatenates these ids with ids produced by the same tokenizer.
few_shot_prompts = (LM_tokenizer.bos_token + "Base prompt: a green hair guy\nEnriched prompt for text-to-image generation: a highly detailed portrait of a man with dark green hair and green glowing eyes, high detail clothing, concept art, anime, artstation, professional."+ LM_tokenizer.eos_token
    + "\n" + LM_tokenizer.bos_token + "Base prompt: animal crossing werewolf\nEnriched prompt for text-to-image generation: a cute chibi werewolf animal crossing villager. animal crossing character. 3 d render, 3 d model, simplified, animal crossing new horizons, hq, arstation."+ LM_tokenizer.eos_token
    + "\n" + LM_tokenizer.bos_token + "Base prompt: an android woman\nEnriched prompt for text-to-image generation: portrait of a beautiful android woman, futuristic, chrome and colorful, photo realistic, ray tracing, 3 d shading, octane render."+ LM_tokenizer.eos_token
)

few_shot_prompt_encoded = LM_tokenizer.encode(few_shot_prompts, return_tensors='pt')[0]

In [None]:
# Inspect the first sample to find out how wide the text feature vectors are
sample_row = df_data_samples.iloc[0]
sample_text_feature = sample_row['text_features']

# The feature width is the size of the second dimension of one stored feature tensor
YOUR_PROMPT_FEATURE_SIZE = sample_text_feature.shape[1]
print("text feature size: ", YOUR_PROMPT_FEATURE_SIZE)

text feature size:  768


In [None]:
# Initialization of the default temperature and RL agent hyperparameters
DEFAULT_TEMPERATURE = 0.9  # temperature the policy's actor bias is initialized to
EPSILON = 0.9  # NOTE(review): not referenced in the visible cells -- presumably an exploration rate; confirm or remove
EPSILON_DECAY = 0.995  # NOTE(review): not referenced in the visible cells -- confirm or remove
LEARNING_RATE = 0.001  # learning rate passed to the Adam optimizer

In [None]:
# Build the policy network, bias its actor head towards DEFAULT_TEMPERATURE,
# and move it to the compute device
policy_model = PolicyModel(prompt_feature_size=YOUR_PROMPT_FEATURE_SIZE)
policy_model.initialize_with_defaults(default_temperature=DEFAULT_TEMPERATURE)
policy_model = policy_model.to(device)

# Adam optimizer over all policy parameters
optimizer = optim.Adam(params=policy_model.parameters(), lr=LEARNING_RATE)

In [None]:
# i = 0
# for prompts, prompt_features in dataloader:
#   prompt_features = prompt_features.to(device)
#   print(prompt_features)
#   i+=1
#   if i ==1:
#     break

In [None]:
# Training configuration
NUM_EPOCHS = 2  # Total number of epochs you intend to train, including resumed training
CLIP_EPSILON = 0.2  # half-width of the clipping interval applied to the probability ratio

# Re-create the optimizer from the shared LEARNING_RATE constant so the
# learning rate is defined in exactly one place
optimizer = optim.Adam(policy_model.parameters(), lr=LEARNING_RATE)

# Location of the checkpoint that training resumes from, if it exists
# checkpoint_file = "policy_model_checkpoint.pth"
checkpoint_file_path = "/content/drive/My Drive/NLP271b/model/policy_model_checkpoint.pth"

In [None]:
start_epoch = 0  # manually adjust this after stopping each time
start_iteration = 0  # manually adjust this after stopping each time

In [None]:
# Resume from the last checkpoint when one exists.
if os.path.exists(checkpoint_file_path):
    # map_location lets a checkpoint written on a GPU runtime be restored on a
    # CPU-only runtime (and vice versa).
    # NOTE(review): torch.load unpickles the file -- only load checkpoints you wrote yourself.
    checkpoint = torch.load(checkpoint_file_path, map_location=device)
    policy_model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # 'iteration' is the index of the next batch to process within 'epoch'
    start_epoch = checkpoint['epoch']
    start_iteration = checkpoint['iteration']

In [None]:
print(start_epoch)
print(start_iteration)

0
1


In [None]:
# Training Loop
#
# For every prompt the policy predicts a temperature; a few temperatures around
# that prediction are tried, each candidate is enriched, rendered and scored,
# and the best-scoring temperature is treated as the action to reinforce.
#
# NOTE(review): batches are shuffled, so the batches skipped on resume are not
# necessarily the ones processed before the interruption.
std_dev = 0.1  # fixed standard deviation of the Gaussian policy over temperature

for epoch in range(start_epoch, NUM_EPOCHS):
    print("current epoch: ", epoch)
    start_time_loop = time.time()

    # Loop through batches of data
    for iteration, (prompts, prompt_features) in enumerate(dataloader):
        if iteration < start_iteration:
            continue  # Skip batches processed before the last checkpoint

        # Move the prompt features to the device
        prompt_features = prompt_features.to(device)

        # Predict temperature and value for the entire batch
        predicted_temperature_tensor, value_estimate = policy_model(prompt_features)
        # The policy emits exactly one temperature and one value per prompt;
        # flatten both to shape (batch,) so they can be indexed per item.
        predicted_temperatures = predicted_temperature_tensor.reshape(len(prompts))
        value_estimates = value_estimate.reshape(len(prompts))

        # Kept for inspection after the cell has run
        enriched_prompts_batch = []
        images_batch = []
        scores_batch = []

        # Per-item results that feed the update (items without any valid score are left out)
        scored_indices = []
        best_temperatures = []
        best_rewards = []

        for i, prompt in enumerate(prompts):
            predicted_temperature = predicted_temperatures[i].item()

            # Candidate temperatures around the prediction, bounded to [0, 1],
            # de-duplicated and sorted
            temperature_levels = sorted({
                max(0.0, predicted_temperature - 0.1),   # Lower creative level
                predicted_temperature,                    # Current predicted level
                min(1.0, predicted_temperature + 0.05),  # Slightly higher creative level
                min(1.0, predicted_temperature + 0.1),   # Higher creative level
            })

            enriched_prompts = [prompt_enrichment(prompt, temperature=temp) for temp in temperature_levels]
            enriched_prompts_batch.append(enriched_prompts)

            # Generate images for each enriched prompt
            images = text_to_image_generation(enriched_prompts)
            images_batch.append(images)

            # Compute scores for each image (None marks a failed scoring)
            scores = [compute_combined_score(text, img) for text, img in zip(enriched_prompts, images)]
            scores_batch.append(scores)

            # Pair every valid score with the temperature that produced it, so the
            # best temperature is looked up in THIS prompt's candidate list
            candidates = [
                (float(score), temp)
                for score, temp in zip(scores, temperature_levels)
                if score is not None
            ]
            if not candidates:
                print(f"No valid score for prompt {prompt!r}; leaving it out of this update")
                continue

            reward, best_temperature = max(candidates, key=lambda candidate: candidate[0])
            scored_indices.append(i)
            best_temperatures.append(best_temperature)
            best_rewards.append(reward)

        if scored_indices:
            item_index = torch.tensor(scored_indices, dtype=torch.long, device=device)
            policy_means = predicted_temperatures[item_index]
            values = value_estimates[item_index]
            actions = torch.tensor(best_temperatures, dtype=policy_means.dtype, device=device)
            batch_rewards = torch.tensor(best_rewards, dtype=values.dtype, device=device)

            # Probability ratio of the selected action under the current policy
            # versus a frozen snapshot of it. With one update per batch the ratio
            # equals 1, so the step reduces to an advantage-weighted
            # policy-gradient step; the clipping only takes effect if several
            # updates are run on the same batch.
            temperature_dist = torch.distributions.Normal(policy_means, std_dev)
            new_log_prob = temperature_dist.log_prob(actions)
            old_log_prob = new_log_prob.detach()
            batch_ratios = torch.exp(new_log_prob - old_log_prob)
            clipped_ratios = torch.clamp(batch_ratios, 1 - CLIP_EPSILON, 1 + CLIP_EPSILON)

            batch_advantages = batch_rewards - values
            # The actor treats the advantage as a constant; only the critic loss
            # trains the value head.
            actor_advantages = batch_advantages.detach()

            # Actor (Policy) loss
            actor_loss = -torch.min(batch_ratios * actor_advantages, clipped_ratios * actor_advantages).mean()
            # Critic (Value) loss
            critic_loss = 0.5 * batch_advantages.pow(2).mean()
            # Total loss
            loss = actor_loss + critic_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            print("No item in this batch could be scored; skipping the optimizer step")

        # Update the iteration counter
        start_iteration = iteration + 1

        # Attempt to save the checkpoint after processing each batch
        try:
            torch.save({
                'epoch': epoch,
                'iteration': start_iteration,
                'model_state_dict': policy_model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, checkpoint_file_path)
            print(f"Checkpoint saved to {checkpoint_file_path}")
        except Exception as e:
            print(f"Error saving checkpoint: {e}")

    # The resume offset only applies to the epoch that was interrupted; every
    # following epoch has to start again at the first batch.
    start_iteration = 0

    end_time_loop = time.time()
    print(f"Time taken for one training loop: {end_time_loop - start_time_loop} seconds")

print("Training Complete")



# # Specify the path where you want to save the model
# model_save_path = "/content/drive/My Drive/NLP271B/policy_model.pth"

# # Save the model's state dictionary
# torch.save(policy_model.state_dict(), model_save_path)

# print(f"Model saved to {model_save_path}")


# # Initialize the model architecture (ensure it's the same architecture as the saved model)
# loaded_policy_model = PolicyModel(prompt_feature_size=YOUR_PROMPT_FEATURE_SIZE, image_feature_size=YOUR_IMAGE_FEATURE_SIZE)

# # Load the model's state dictionary
# loaded_policy_model.load_state_dict(torch.load(model_save_path))

# # If you're using a GPU, move the model to the GPU
# loaded_policy_model.to(device)

# # Set the model to evaluation mode (if you're doing inference)
# loaded_policy_model.eval()


current epoch:  0


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【', 'enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】 market, 【 enriched 】

  0%|          | 0/50 [00:00<?, ?it/s]

Aesthetic score predicted by the model:
tensor([[6.6420]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[4.8320]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[4.8251]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[5.2938]], device='cuda:0', grad_fn=<AddmmBackward0>)


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt :', 'realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : realistic car 3 d, enriched prompt : real

  0%|          | 0/50 [00:00<?, ?it/s]

Aesthetic score predicted by the model:
tensor([[5.5235]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[5.3579]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[5.3675]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[5.3897]], device='cuda:0', grad_fn=<AddmmBackward0>)


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a full portrait', 'post, enriched prompt : a full portrait of a beautiful post, enriched prompt : a f

  0%|          | 0/50 [00:00<?, ?it/s]

Aesthetic score predicted by the model:
tensor([[7.1929]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[7.2979]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[7.1236]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[6.9226]], device='cuda:0', grad_fn=<AddmmBackward0>)


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyan', ', 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyan', ', 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】, 【 cyanide 】

  0%|          | 0/50 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Aesthetic score predicted by the model:
tensor([[6.1559]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[4.6045]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[5.7946]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[6.2936]], device='cuda:0', grad_fn=<AddmmBackward0>)


The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necrom', 'enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prompt : a comic potrait of a female necromamcer, enriched prom

  0%|          | 0/50 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


Aesthetic score predicted by the model:
tensor([[4.6045]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[6.1557]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[6.0415]], device='cuda:0', grad_fn=<AddmmBackward0>)
Aesthetic score predicted by the model:
tensor([[4.6045]], device='cuda:0', grad_fn=<AddmmBackward0>)
Checkpoint saved to /content/drive/My Drive/NLP271b/model/policy_model_checkpoint.pth
Time taken for one training loop: 2137.377005338669 seconds
current epoch:  1
Time taken for one training loop: 0.04449868202209473 seconds
Training Complete
