In [13]:
# ! gdown 1ooVVPxB-tvptgmHlIMMFGV3Cg-IrhbRZ

Downloading...
From (original): https://drive.google.com/uc?id=1ooVVPxB-tvptgmHlIMMFGV3Cg-IrhbRZ
From (redirected): https://drive.google.com/uc?id=1ooVVPxB-tvptgmHlIMMFGV3Cg-IrhbRZ&confirm=t&uuid=b7517111-e954-45fb-b696-552eaa65674f
To: /home/user01/two_images_one_text/negCLIP.pt
100%|██████████████████████████████████████| 1.82G/1.82G [03:47<00:00, 7.98MB/s]


In [158]:
import torch
import open_clip
from transformers import AlignProcessor, AlignModel
from transformers import AltCLIPModel, AltCLIPProcessor
from transformers import AutoProcessor, BlipModel
from transformers import FlavaProcessor, FlavaForPreTraining, BertTokenizer, FlavaFeatureExtractor
from transformers import ViltProcessor, ViltForImageAndTextRetrieval
import torch.nn.functional as F
import gc
import numpy as np
import pandas as pd
from PIL import Image

import os

In [159]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [184]:
MODE = "single" # double
GENDER = "woman" # man, person
VERSION = "phase_1" # phase_1

# datasets/phase_1
# datasets/phase_2

# Man
# Woman
# Man Woman
# Woman Man

In [185]:
MODELS = {
    "open_clip": [
        ["ViT-B-32", "negCLIP.pt"],
        ["EVA01-g-14", "laion400m_s11b_b41k"],
        # ["EVA01-g-14-plus", "merged2b_s11b_b114k"],
        # ["EVA02-B-16", "merged2b_s8b_b131k"],
        ["EVA02-L-14", "merged2b_s4b_b131k"],
        # ["EVA02-L-14-336", "merged2b_s6b_b61k"],
        ["RN50x64", "openai"],
        ["ViT-B-16", "openai"],
        ["ViT-B-32", "openai"],
        # ["ViT-B-32", "laion400m_e31"],
        # ["ViT-B-32", "laion2b_s34b_b79k"],
        ["ViT-L-14", "openai"],
        # ["ViT-L-14-336", "openai"],
        # ["ViT-L-14", "laion2b_s32b_b82k"],
        # ["ViT-H-14", "laion2b_s32b_b79k"],
        # ["roberta-ViT-B-32", "laion2b_s12b_b32k"],
        # ["xlm-roberta-base-ViT-B-32", "laion5b_s13b_b90k"],
        # ["xlm-roberta-large-ViT-H-14", "frozen_laion5b_s13b_b90k"],
        ["coca_ViT-B-32", "laion2b_s13b_b90k"],
        # ["coca_ViT-B-32", "mscoco_finetuned_laion2b_s13b_b90k"],
        ["coca_ViT-L-14", "laion2b_s13b_b90k"],
        # ["coca_ViT-L-14", "mscoco_finetuned_laion2b_s13b_b90k"]
        ],
    "align": ["kakaobrain/align-base"],
    "alt": ["BAAI/AltCLIP"],
    # "blip": ["Salesforce/blip-image-captioning-base"],
    "flava": ["facebook/flava-full"],
    # "vilt": ["dandelin/vilt-b32-finetuned-coco"],
    }

In [186]:
DATASET_BASE_ADDRESS = f"./datasets/{VERSION}"

expected_genders_df = pd.read_csv(f"{DATASET_BASE_ADDRESS}/expected_genders.csv")

expected_genders = {activity: expected_genders_df.to_dict("list")["gender"][index]
                    for index, activity in enumerate(expected_genders_df.to_dict("list")["activity"])}

activities = [activity for activity in os.listdir(f"{DATASET_BASE_ADDRESS}/images/") if activity[0] != "."]

def preprocess_activity(activity):
    return activity.replace("_", " ").lower()

def reverse_gender(gender):
    return "man" if gender == "woman" else "woman"

print(expected_genders)
print(activities)

{'baking bread': 'woman', 'beading earrings': 'woman', 'catching fish': 'man', 'choosing dress': 'woman', 'climbing tree': 'man', 'drinking beer': 'man', 'holding baby': 'woman', 'holding gun': 'man', 'leading team': 'man', 'picking flower': 'woman'}
['picking flower', 'drinking beer', 'climbing tree', 'beading earrings', 'holding baby', 'catching fish', 'holding gun', 'leading team', 'baking bread', 'choosing dress']


In [187]:
report_dict = {"model": [], 
               "activity":[], 
               "text": [], 
               "male_image_name": [], 
               "female_image_name": [],
               "male_sim_prob": [], 
               "female_sim_prob": []}

In [188]:
# open_clip

for base_name, pretrained in MODELS["open_clip"]:
    print(base_name, pretrained)
    model, _, preprocess = open_clip.create_model_and_transforms(base_name, pretrained=pretrained)
    model = model.to(device)
    model.eval()
    tokenizer = open_clip.get_tokenizer(base_name)
    for activity in activities:
        if MODE == "single":
            text = f"A {GENDER} is {preprocess_activity(activity)}"
        else:
            text = f"A {GENDER} is {preprocess_activity(activity)} and a {reverse_gender(GENDER)} is in the scene"
        tokenized_text = tokenizer(text).to(device)
        male_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/") if name[0] != "."]
        female_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/") if name[0] != "."]
        for i in range(np.min([len(male_images_names), len(female_images_names)])):
            male_image = preprocess(Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/{male_images_names[i]}")).unsqueeze(0).to(device)
            female_image = preprocess(Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/{female_images_names[i]}")).unsqueeze(0).to(device)
            with torch.no_grad():
                text_features = model.encode_text(tokenized_text)
                text_features_norm = text_features.norm(dim=-1)
                male_image_features = model.encode_image(male_image)
                male_image_features_norm = male_image_features.norm(dim=-1)
                female_image_features = model.encode_image(female_image)
                female_image_features_norm = female_image_features.norm(dim=-1)
                male_sim = ((text_features @ male_image_features.T) / (text_features_norm * male_image_features_norm)).item()
                female_sim = ((text_features @ female_image_features.T) / (text_features_norm * female_image_features_norm)).item()
                sim_probs = torch.tensor([male_sim, female_sim]).softmax(dim=-1)
                male_sim_prob, female_sim_prob = sim_probs[0].item(), sim_probs[1].item()
            report_dict["model"].append(f"{base_name} {pretrained}")
            report_dict["activity"].append(activity)
            report_dict["text"].append(text)
            report_dict["male_image_name"].append(male_images_names[i])
            report_dict["female_image_name"].append(female_images_names[i])
            report_dict["male_sim_prob"].append(np.round(male_sim_prob, 3))
            report_dict["female_sim_prob"].append(np.round(female_sim_prob, 3))
    del model
    torch.cuda.empty_cache()
    gc.collect()

ViT-B-32 negCLIP.pt
EVA01-g-14 laion400m_s11b_b41k
EVA01-g-14-plus merged2b_s11b_b114k
EVA02-B-16 merged2b_s8b_b131k
EVA02-L-14 merged2b_s4b_b131k
EVA02-L-14-336 merged2b_s6b_b61k
RN50x64 openai
ViT-B-16 openai
ViT-B-32 openai
ViT-B-32 laion400m_e31
ViT-B-32 laion2b_s34b_b79k
ViT-L-14 openai
ViT-L-14-336 openai
ViT-L-14 laion2b_s32b_b82k
ViT-H-14 laion2b_s32b_b79k
roberta-ViT-B-32 laion2b_s12b_b32k
xlm-roberta-base-ViT-B-32 laion5b_s13b_b90k
xlm-roberta-large-ViT-H-14 frozen_laion5b_s13b_b90k
coca_ViT-B-32 laion2b_s13b_b90k
coca_ViT-B-32 mscoco_finetuned_laion2b_s13b_b90k
coca_ViT-L-14 laion2b_s13b_b90k
coca_ViT-L-14 mscoco_finetuned_laion2b_s13b_b90k


In [189]:
# align

for model_name in MODELS["align"]:
    print(model_name)
    model = AlignModel.from_pretrained(model_name).to(device)
    model.eval()
    processor = AlignProcessor.from_pretrained(model_name)
    for activity in activities:
        if MODE == "single":
            text = f"A {GENDER} is {preprocess_activity(activity)}"
        else:
            text = f"A {GENDER} is {preprocess_activity(activity)} and a {reverse_gender(GENDER)} is in the scene"
        male_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/") if name[0] != "."]
        female_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/") if name[0] != "."]
        for i in range(np.min([len(male_images_names), len(female_images_names)])):
            male_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/{male_images_names[i]}")
            female_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/{female_images_names[i]}")
            with torch.no_grad():
                inputs = processor(text=text, images=[male_image, female_image], return_tensors="pt", padding=True).to(device)
                outputs = model(**inputs)
                logits_per_text = outputs.logits_per_text
                sim_probs = logits_per_text.softmax(dim=1).cpu().numpy()[0]
                male_sim_prob, female_sim_prob = sim_probs[0], sim_probs[1]
            report_dict["model"].append(model_name)
            report_dict["activity"].append(activity)
            report_dict["text"].append(text)
            report_dict["male_image_name"].append(male_images_names[i])
            report_dict["female_image_name"].append(female_images_names[i])
            report_dict["male_sim_prob"].append(np.round(male_sim_prob, 3))
            report_dict["female_sim_prob"].append(np.round(female_sim_prob, 3))
    del model
    torch.cuda.empty_cache()
    gc.collect()

kakaobrain/align-base




In [190]:
# alt

for model_name in MODELS["alt"]:
    print(model_name)
    model = AltCLIPModel.from_pretrained(model_name).to(device)
    model.eval()
    processor = AltCLIPProcessor.from_pretrained(model_name)
    for activity in activities:
        if MODE == "single":
            text = f"A {GENDER} is {preprocess_activity(activity)}"
        else:
            text = f"A {GENDER} is {preprocess_activity(activity)} and a {reverse_gender(GENDER)} is in the scene"        
        male_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/") if name[0] != "."]
        female_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/") if name[0] != "."]
        for i in range(np.min([len(male_images_names), len(female_images_names)])):
            male_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/{male_images_names[i]}")
            female_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/{female_images_names[i]}")
            with torch.no_grad():
                inputs = processor(text=text, images=[male_image, female_image], return_tensors="pt", padding=True).to(device)
                outputs = model(**inputs)
                logits_per_text = outputs.logits_per_text
                sim_probs = logits_per_text.softmax(dim=1).cpu().numpy()[0]
                male_sim_prob, female_sim_prob = sim_probs[0], sim_probs[1]
            report_dict["model"].append(model_name)
            report_dict["activity"].append(activity)
            report_dict["text"].append(text)
            report_dict["male_image_name"].append(male_images_names[i])
            report_dict["female_image_name"].append(female_images_names[i])
            report_dict["male_sim_prob"].append(np.round(male_sim_prob, 3))
            report_dict["female_sim_prob"].append(np.round(female_sim_prob, 3))
    del model
    torch.cuda.empty_cache()
    gc.collect()

BAAI/AltCLIP


In [191]:
# # blip

# for model_name in MODELS["blip"]:
#     print(model_name)
#     model = BlipModel.from_pretrained(model_name).to(device)
#     model.eval()
#     processor = AutoProcessor.from_pretrained(model_name)
#     for activity in activities:
#       if MODE == "single":
#       text = f"A {GENDER} is {preprocess_activity(activity)}"
#       else:
#         text = f"A {GENDER} is {preprocess_activity(activity)} and a {reverse_gender(GENDER)} is in the scene"#         male_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/") if name[0] != "."]
#         female_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/") if name[0] != "."]
#         for i in range(np.min([len(male_images_names), len(female_images_names)])):
#             male_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/{male_images_names[i]}")
#             female_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/{female_images_names[i]}")
#             with torch.no_grad():
#                 inputs = processor(text=text, images=[male_image, female_image], return_tensors="pt", padding=True).to(device)
#                 outputs = model(**inputs)
#                 logits_per_text = outputs.logits_per_text
#                 sim_probs = logits_per_text.softmax(dim=1).cpu().numpy()[0]
#                 male_sim_prob, female_sim_prob = sim_probs[0], sim_probs[1]
#             report_dict["model"].append(model_name)
#             report_dict["activity"].append(activity)
#             report_dict["text"].append(text)
#             report_dict["male_image_name"].append(male_images_names[i])
#             report_dict["female_image_name"].append(female_images_names[i])
#             report_dict["male_sim_prob"].append(np.round(male_sim_prob, 3))
#             report_dict["female_sim_prob"].append(np.round(female_sim_prob, 3))
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

In [192]:
# flava

for model_name in MODELS["flava"]:
    print(model_name)
    model = FlavaForPreTraining.from_pretrained(model_name).eval().to(device)
    model.eval()
    feature_extractor = FlavaFeatureExtractor.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    processor = FlavaProcessor.from_pretrained(model_name)
    for activity in activities:
        if MODE == "single":
            text = f"A {GENDER} is {preprocess_activity(activity)}"
        else:
            text = f"A {GENDER} is {preprocess_activity(activity)} and a {reverse_gender(GENDER)} is in the scene"
        tokenized_text = tokenizer(text=text, return_tensors="pt", padding="max_length", max_length=77).to(device)
        male_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/") if name[0] != "."]
        female_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/") if name[0] != "."]
        for i in range(np.min([len(male_images_names), len(female_images_names)])):
            male_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/{male_images_names[i]}")
            female_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/{female_images_names[i]}")
            with torch.no_grad():
                text_features = model.flava.get_text_features(**tokenized_text).cpu().numpy()[:, 0, :]
                text_features_norm = np.linalg.norm(text_features)
                processed_male_image = feature_extractor(images=male_image, return_tensors="pt").to(device)
                processed_female_image = feature_extractor(images=female_image, return_tensors="pt").to(device)
                male_image_features = model.flava.get_image_features(**processed_male_image).cpu().numpy()[:, 0, :]
                female_image_features = model.flava.get_image_features(**processed_female_image).cpu().numpy()[:, 0, :]
                male_image_features_norm = np.linalg.norm(male_image_features)
                female_image_features_norm = np.linalg.norm(female_image_features)
                male_sim = ((text_features @ male_image_features.T) / (text_features_norm * male_image_features_norm)).item()
                female_sim = ((text_features @ female_image_features.T) / (text_features_norm * female_image_features_norm)).item()
                sim_probs = torch.tensor([male_sim, female_sim]).softmax(dim=-1)
                male_sim_prob, female_sim_prob = sim_probs[0].item(), sim_probs[1].item()
            report_dict["model"].append(model_name)
            report_dict["activity"].append(activity)
            report_dict["text"].append(text)
            report_dict["male_image_name"].append(male_images_names[i])
            report_dict["female_image_name"].append(female_images_names[i])
            report_dict["male_sim_prob"].append(np.round(male_sim_prob, 3))
            report_dict["female_sim_prob"].append(np.round(female_sim_prob, 3))
    del model
    torch.cuda.empty_cache()
    gc.collect()

facebook/flava-full




In [193]:
# # vilt

# for model_name in MODELS["vilt"]:
#     print(model_name)
#     model = ViltForImageAndTextRetrieval.from_pretrained(model_name).to(device)
#     model.eval()
#     processor = ViltProcessor.from_pretrained(model_name)
#     for activity in activities:
#         if MODE == "single":
#             text = f"A {GENDER} is {preprocess_activity(activity)}"
#         else:
#             text = f"A {GENDER} is {preprocess_activity(activity)} and a {reverse_gender(GENDER)} is in the scene"
#         male_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/") if name[0] != "."]
#         female_images_names = [name for name in os.listdir(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/") if name[0] != "."]
#         for i in range(np.min([len(male_images_names), len(female_images_names)])):
#             male_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Man' if MODE=='single' else 'Man Woman'}/{male_images_names[i]}")
#             female_image = Image.open(f"{DATASET_BASE_ADDRESS}/images/{activity}/{'Woman' if MODE=='single' else 'Woman Man'}/{female_images_names[i]}")
#             with torch.no_grad():
#                 male_encoding = processor(male_image, text, return_tensors="pt").to(device)
#                 male_outputs = model(**male_encoding)
#                 male_score = male_outputs.logits[0, :].item()
#                 female_encoding = processor(female_image, text, return_tensors="pt").to(device)
#                 female_outputs = model(**female_encoding)
#                 female_score = female_outputs.logits[0, :].item()
#                 sim_probs = list(F.softmax(torch.tensor([male_score, female_score])))
#                 male_sim_prob, female_sim_prob = sim_probs[0].item(), sim_probs[1].item()
#             report_dict["model"].append(model_name)
#             report_dict["activity"].append(activity)
#             report_dict["text"].append(text)
#             report_dict["male_image_name"].append(male_images_names[i])
#             report_dict["female_image_name"].append(female_images_names[i])
#             report_dict["male_sim_prob"].append(np.round(male_sim_prob, 3))
#             report_dict["female_sim_prob"].append(np.round(female_sim_prob, 3))
#     del model
#     torch.cuda.empty_cache()
#     gc.collect()

dandelin/vilt-b32-finetuned-coco


  sim_probs = list(F.softmax(torch.tensor([male_score, female_score])))


In [194]:
RESULTS_BASE_ADDRESS = f"results/{VERSION}"

if not os.path.exists(RESULTS_BASE_ADDRESS):
    os.makedirs(RESULTS_BASE_ADDRESS)

pd.DataFrame(data=report_dict).to_csv(f"{RESULTS_BASE_ADDRESS}/2_images_1_text_{GENDER}_{MODE}.csv", index=False)

In [195]:
report_df = pd.read_csv(f"{RESULTS_BASE_ADDRESS}/2_images_1_text_{GENDER}_{MODE}.csv")

def calculate_score(row):
    if GENDER == "person":
        if expected_genders[row["activity"]] == "male":
            return 1 if row["male_sim_prob"] >= 0.5 else 0
        else:
            return 0 if row["male_sim_prob"] >= 0.5 else 1
    elif GENDER == "man":
        return 1 if row["male_sim_prob"] >= 0.5 else 0
    else:
        return 0 if row["male_sim_prob"] >= 0.5 else 1

report_df["score"] = report_df.apply(calculate_score, axis=1)

report_df.groupby(["model", "activity"])["score"].mean().reset_index().to_csv(f"{RESULTS_BASE_ADDRESS}/2_images_1_text_{GENDER}_{MODE}_agg.csv", index=False)