In [None]:
!pip install transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the data directory used below is reachable (Colab-only).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the input scores CSV and receives the
# CSV / pickle files written by this notebook. File paths are built as
# `path_data + filename`, so the trailing slash is required.
path_data = '/content/drive/My Drive/NLP271b/data/'

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle

### Create DataFrame

In [None]:
# Load the raw score table (HPS and aesthetic scores) from the data directory.
scores = pd.read_csv(path_data + 'scores_1k.csv')

In [None]:
# Peek at the first three rows to confirm the expected columns were loaded.
scores.head(3)

Unnamed: 0,HPS_score,aesthetic_score
0,0.261,5.974581
1,0.2598,7.049436
2,0.2595,5.011903


In [None]:
# Summary statistics. The two scores live on very different numeric ranges
# (compare their min/max), which is why they are rescaled before being averaged.
scores.describe()

Unnamed: 0,HPS_score,aesthetic_score
count,1000.0,1000.0
mean,0.257622,5.901686
std,0.015214,0.75343
min,0.1975,2.913632
25%,0.2493,5.558878
50%,0.2588,6.022524
75%,0.2676,6.408821
max,0.302,7.371183


In [None]:
# Rescale both raw scores with a min-max scaler so they are comparable,
# then average the two rescaled columns into a single `combined_score`.
score_columns = ['HPS_score', 'aesthetic_score']
scaler = MinMaxScaler()
scores_norm = scores.copy()  # keep the raw table untouched
scores_norm[score_columns] = scaler.fit_transform(scores[score_columns])
scores_norm['combined_score'] = scores_norm[score_columns].mean(axis=1)

In [None]:
# Check the rescaled columns and the new combined_score on the first three rows.
scores_norm.head(3)

Unnamed: 0,HPS_score,aesthetic_score,combined_score
0,0.607656,0.686688,0.647172
1,0.596172,0.92782,0.761996
2,0.593301,0.470723,0.532012


In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the '2m_first_1k' configuration of poloclub/diffusiondb.
# NOTE(review): the download log for this cell shows a "builder script" being fetched,
# i.e. the dataset is loaded through a loading script. Newer `datasets` releases
# restrict or drop script-based loading, and the install in this notebook is unpinned —
# confirm this call still works on a fresh environment (or pin the version).
dataset = load_dataset("poloclub/diffusiondb", name='2m_first_1k')

Downloading builder script:   0%|          | 0.00/15.2k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/25.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/581M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Select the 'train' split, which is the split the loading log reports being generated.
train_data = dataset['train']

In [None]:
# Pull the prompt and image columns out of the train split (prompts first, then images).
all_prompts, all_image = train_data['prompt'], train_data['image']

In [None]:
# Assemble one table with prompt, image and combined score per row.
# NOTE(review): this pairs scores with prompts/images purely by position — it assumes
# scores_1k.csv was produced in the same order as the dataset split; confirm.
new_data = dict(
    prompt=all_prompts,
    image=all_image,
    combined_score=scores_norm['combined_score'],
)
df_1k = pd.DataFrame(data=new_data)

In [None]:
# Sanity-check the assembled table: prompt text, image object and combined score per row.
df_1k.head()

Unnamed: 0,prompt,image,combined_score
0,"a renaissance portrait of dwayne johnson, art ...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.647172
1,"portrait of a dancing eagle woman, beautiful b...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.761996
2,"epic 3 d, become legend shiji! gpu mecha contr...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.532012
3,an airbrush painting of cyber war machine scen...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.695919
4,concept art of a silent hill monster. painted ...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.464754


In [None]:
# Write the table to CSV without the index column.
# NOTE(review): the 'image' column holds PIL image objects (see the preview above);
# CSV can only store a text form of them, so the pixel data is presumably not
# recoverable from this file. The pickle written in the next cell is what gets reloaded.
df_1k.to_csv(path_data+'prompt_img_score_1k.csv', index=False)

In [None]:
# File name (relative to path_data) of the pickled prompt/image/score table.
file_path = 'prompt_img_score_1k.pkl'

# Serialise the whole DataFrame, image objects included, with pickle.dump();
# the `with` block closes the file handle once writing is done.
with open(path_data + file_path, 'wb') as file:
    pickle.dump(df_1k, file)

### CLIP Feature Extraction

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

In [None]:
# Reload the table that the "Create DataFrame" section pickled, so this section
# can start from the saved file.
# Caution: pickle.load can execute arbitrary code — only load files you wrote yourself.
file_path = 'prompt_img_score_1k.pkl'
with open(path_data + file_path , 'rb') as file:
    df_1k = pickle.load(file)

In [None]:
# Confirm the reloaded DataFrame has the expected columns and first rows.
df_1k.head()

Unnamed: 0,prompt,image,combined_score
0,"a renaissance portrait of dwayne johnson, art ...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.647172
1,"portrait of a dancing eagle woman, beautiful b...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.761996
2,"epic 3 d, become legend shiji! gpu mecha contr...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.532012
3,an airbrush painting of cyber war machine scen...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.695919
4,concept art of a silent hill monster. painted ...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.464754


In [None]:
# df_1k['image'][1]

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pretrained CLIP processor and model, placing the model on `device`
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16").to(device)

# Switch to evaluation mode; as the cell's last expression this also displays the model
model.eval()


Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
def extract_clip_features(row):
    """Embed one DataFrame row's prompt and image with CLIP.

    Reads ``row['prompt']`` and ``row['image']``, pushes each through the
    module-level ``processor`` and ``model`` on ``device`` with gradient
    tracking disabled, and returns the pair
    ``(text_features, image_features)`` moved back to the CPU.
    """
    prompt = row['prompt']
    picture = row['image']  # value of the 'image' column, handed to the processor as-is

    # Tokenise the prompt and preprocess the image, placing both on the model's device
    text_inputs = processor(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
    image_inputs = processor(images=picture, return_tensors="pt", padding=True).to(device)

    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        text_embedding = model.get_text_features(**text_inputs)
        image_embedding = model.get_image_features(**image_inputs)

    return text_embedding.cpu(), image_embedding.cpu()

In [None]:
# Apply the function to your DataFrame
# extract_clip_features is run row by row (axis=1); result_type='expand' spreads the
# returned (text, image) pair into the two new columns.
df_1k[['text_features', 'image_features']] = df_1k.apply(extract_clip_features, axis=1, result_type='expand')

# Combine Text and Image Features (concatenation)
# torch.cat along dim=1 joins the two embeddings end to end; this assumes both
# tensors are 2-D with the same first dimension, as returned per row above.
df_1k['combined_features'] = df_1k.apply(lambda row: torch.cat([row['text_features'], row['image_features']], dim=1), axis=1)

In [None]:
# Inspect the new feature columns; each cell holds a tensor, hence the nested tensor(...) display.
df_1k.head()

Unnamed: 0,prompt,image,combined_score,text_features,image_features,combined_features
0,"a renaissance portrait of dwayne johnson, art ...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.647172,"[[tensor(-0.0859), tensor(0.0469), tensor(-0.2...","[[tensor(0.3696), tensor(-0.0439), tensor(-0.3...","[[tensor(-0.0859), tensor(0.0469), tensor(-0.2..."
1,"portrait of a dancing eagle woman, beautiful b...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.761996,"[[tensor(0.1790), tensor(0.2683), tensor(0.169...","[[tensor(0.4292), tensor(-0.4263), tensor(0.33...","[[tensor(0.1790), tensor(0.2683), tensor(0.169..."
2,"epic 3 d, become legend shiji! gpu mecha contr...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.532012,"[[tensor(0.0891), tensor(-0.0423), tensor(-0.7...","[[tensor(0.9356), tensor(-1.0624), tensor(-0.1...","[[tensor(0.0891), tensor(-0.0423), tensor(-0.7..."
3,an airbrush painting of cyber war machine scen...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.695919,"[[tensor(0.2290), tensor(0.2456), tensor(0.075...","[[tensor(-0.1087), tensor(-1.2515), tensor(0.2...","[[tensor(0.2290), tensor(0.2456), tensor(0.075..."
4,concept art of a silent hill monster. painted ...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.464754,"[[tensor(0.1360), tensor(0.7336), tensor(-0.13...","[[tensor(-0.0140), tensor(0.0491), tensor(0.17...","[[tensor(0.1360), tensor(0.7336), tensor(-0.13..."


In [None]:
# attention weighted combination
def attention_based_fusion(text_features, image_features):
    """Fuse text and image embeddings with similarity-based attention.

    Every text row acts as a query over a context made of *both* the text rows
    and the image rows, and the result is the attention-weighted average of
    that context.

    Why not attend over ``image_features`` alone: when there is a single image
    row (one pooled embedding per sample), the score matrix has one column,
    softmax over a single element is always 1, and the "fused" result is just
    a copy of ``image_features``. Putting the text rows into the context makes
    the output a genuine mix of the two modalities.

    Scores are cosine similarities (rows are L2-normalised for scoring only),
    so large embedding norms cannot saturate the softmax; the values being
    averaged are the original, un-normalised features.

    Args:
        text_features: 2-D tensor of shape ``(n_text, dim)``.
        image_features: 2-D tensor of shape ``(n_image, dim)``.

    Returns:
        Tensor of shape ``(n_text, dim)``; each row is a convex combination of
        the rows of ``text_features`` and ``image_features``.

    Raises:
        ValueError: if either input is not 2-D or the feature sizes differ.
    """
    if text_features.dim() != 2 or image_features.dim() != 2:
        raise ValueError(
            "text_features and image_features must both be 2-D (rows, dim) tensors"
        )
    if text_features.shape[1] != image_features.shape[1]:
        raise ValueError(
            "text_features and image_features must share the same feature dimension"
        )

    # Keys/values: both modalities stacked row-wise -> (n_text + n_image, dim)
    context = torch.cat([text_features, image_features], dim=0)

    # Cosine-similarity scores between each text query and every context row
    queries = torch.nn.functional.normalize(text_features, dim=1)
    keys = torch.nn.functional.normalize(context, dim=1)
    attention_weights = torch.matmul(queries, keys.t())

    # Apply softmax to normalize the attention weights across the context rows
    attention_weights = torch.nn.functional.softmax(attention_weights, dim=1)

    # Weighted combination of text and image features
    fused_features = torch.matmul(attention_weights, context)

    return fused_features


In [None]:
# Fuse each row's text and image embeddings with attention_based_fusion and store the result in a new column.
df_1k['attention_fused_features'] = df_1k.apply(lambda row: attention_based_fusion(row['text_features'], row['image_features']), axis=1)

In [None]:
# Check that the attention_fused_features column was added alongside the other feature columns.
df_1k.head()

Unnamed: 0,prompt,image,combined_score,text_features,image_features,combined_features,attention_fused_features
0,"a renaissance portrait of dwayne johnson, art ...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.647172,"[[tensor(-0.0859), tensor(0.0469), tensor(-0.2...","[[tensor(0.3696), tensor(-0.0439), tensor(-0.3...","[[tensor(-0.0859), tensor(0.0469), tensor(-0.2...","[[tensor(0.3696), tensor(-0.0439), tensor(-0.3..."
1,"portrait of a dancing eagle woman, beautiful b...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.761996,"[[tensor(0.1790), tensor(0.2683), tensor(0.169...","[[tensor(0.4292), tensor(-0.4263), tensor(0.33...","[[tensor(0.1790), tensor(0.2683), tensor(0.169...","[[tensor(0.4292), tensor(-0.4263), tensor(0.33..."
2,"epic 3 d, become legend shiji! gpu mecha contr...",<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.532012,"[[tensor(0.0891), tensor(-0.0423), tensor(-0.7...","[[tensor(0.9356), tensor(-1.0624), tensor(-0.1...","[[tensor(0.0891), tensor(-0.0423), tensor(-0.7...","[[tensor(0.9356), tensor(-1.0624), tensor(-0.1..."
3,an airbrush painting of cyber war machine scen...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.695919,"[[tensor(0.2290), tensor(0.2456), tensor(0.075...","[[tensor(-0.1087), tensor(-1.2515), tensor(0.2...","[[tensor(0.2290), tensor(0.2456), tensor(0.075...","[[tensor(-0.1087), tensor(-1.2515), tensor(0.2..."
4,concept art of a silent hill monster. painted ...,<PIL.PngImagePlugin.PngImageFile image mode=RG...,0.464754,"[[tensor(0.1360), tensor(0.7336), tensor(-0.13...","[[tensor(-0.0140), tensor(0.0491), tensor(0.17...","[[tensor(0.1360), tensor(0.7336), tensor(-0.13...","[[tensor(-0.0140), tensor(0.0491), tensor(0.17..."


In [None]:
# File name (relative to path_data) for the DataFrame that now carries the feature columns.
file_path2 = 'df_features_1k.pkl'

# Use pickle.dump() to save the DataFrame to the file
# (the tensor-valued feature columns are serialised together with the rest of the table).
with open(path_data+ file_path2, 'wb') as file:
    pickle.dump(df_1k, file)