In [4]:
import torch
import clip

from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import  classification_report, accuracy_score

from datetime import datetime

In [5]:
# Notebook configuration: adjust these locations to match the local directory layout.

# CLIP checkpoint to load. Needs to be the same variant as the one used for encoding the images.
CLIP_VARIANT = 'ViT-B/32'

# Directory handed to clip.load as its download root.
CACHE_PATH = './.cache'

# Directory holding the '<split>_encodings' and '<split>_ratings' files.
DATA_PATH = './encodings/CLIP-ViT32/'

# Text file the class labels are parsed from.
CLASSES_PATH = './imagenet1000_clsidx_to_labels.txt'

# Default results file; the name carries the timestamp at which this cell was executed.
RESULTS_PATH = f"./results/prompting/{datetime.now():%Y%m%d-%H%M%S}.txt"

In [11]:
class TextIAA:
    """Prompt-based image aesthetic assessment (IAA) experiments with CLIP.

    Constructing an instance reads the class labels from ``CLASSES_PATH``, loads the CLIP
    model named by ``CLIP_VARIANT`` and loads the pre-computed image encodings and ratings
    of the 'test' and 'train' splits from ``DATA_PATH``.

    The evaluation methods compare the stored image encodings with the CLIP text embeddings
    of a "bad" and a "good" prompt, either as a binary classification (``binary_*``) or as a
    continuous score that is correlated with the mean rating (``continuous``).

    None of the evaluation methods modifies the stored encodings, so results do not depend
    on the order in which the methods are called.
    """

    # Score assigned to each of the ten rating columns when the mean rating is computed.
    _SCORES = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    # Prompt modes understood by generate_embeddings_binary.
    _MODES = ('fixed', 'content', 'ensembling')

    def __init__(self, res_file: str = '') -> None:
        """Load labels, the CLIP model and the encoded 'test' and 'train' splits.

        Args:
            res_file (str, optional): Path stored in ``self.res_path``. When left empty,
                ``RESULTS_PATH`` is stored instead. Defaults to ''.
        """
        with open(CLASSES_PATH, 'r') as text:
            parsed = (self._parse_label(line) for line in text)
            # Lines without a quoted label (e.g. blank lines) are skipped instead of crashing.
            self.labels = [label for label in parsed if label is not None]

        self.res_path = res_file if res_file != '' else RESULTS_PATH

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # clip.load also returns an image preprocessing transform; it is not needed here
        # because the images are already encoded.
        self.model, _ = clip.load(CLIP_VARIANT, self.device, download_root=CACHE_PATH)

        self.test_encodings, self.test_ratings, self.test_mean = self._load_split('test')
        self.train_encodings, self.train_ratings, self.train_mean = self._load_split('train')

    @staticmethod
    def _parse_label(line: str):
        """Return the first comma-separated name of the quoted label on ``line``.

        The label is the text between the first quote character (single or double) and the
        last occurrence of that same character, so a double-quoted label that contains an
        apostrophe is kept intact. Returns None when the line holds no quoted text.
        """
        candidates = [pos for pos in (line.find("'"), line.find('"')) if pos != -1]
        if not candidates:
            return None
        start = min(candidates)
        end = line.rfind(line[start])
        if end == start:
            # Opening quote without a closing one.
            return None
        return line[start + 1:end].split(",")[0]

    def _load_split(self, split: str):
        """Load encodings and ratings of one split and derive the mean rating per image.

        Returns:
            tuple: (encodings as float32, ratings, mean rating as float64)
        """
        def load(kind):
            # NOTE(security): torch.load unpickles the file; only use it on trusted data.
            with open(DATA_PATH + split + '_' + kind, 'rb') as file:
                return torch.load(file, map_location=self.device)

        # The files are expected to hold sequences of tensors that are concatenated along
        # the first dimension; the exact per-item shapes are not visible here - TODO confirm.
        encodings = torch.cat(load('encodings')).squeeze().float()
        ratings = torch.cat(load('ratings')).squeeze()

        scores = torch.tensor(self._SCORES, dtype=torch.float64, device=self.device)
        # Cast explicitly so the product also works when the ratings are not stored as float64.
        # Presumably each row of `ratings` is a vote distribution over the scores 1-10,
        # which makes this the expected rating - TODO confirm against the encoding script.
        mean = ratings.double() @ scores
        return encodings, ratings, mean

    def _fill_template(self, prompt: str) -> list:
        """Return one prompt per label, with the first '*' of ``prompt`` replaced by the label.

        Raises:
            ValueError: If ``prompt`` contains no '*'.
        """
        if '*' not in prompt:
            raise ValueError(f"Prompt {prompt!r} must contain a '*' marking where the content is inserted")
        head, tail = prompt.split('*', 1)
        return [head + label + tail for label in self.labels]

    def _encode_text(self, texts: list):
        """Tokenize ``texts`` and return their CLIP text embeddings, one row per text."""
        tokens = clip.tokenize(texts).to(self.device)
        with torch.no_grad():
            return self.model.encode_text(tokens)

    @staticmethod
    def _scaled_similarity(image_features, text_features, normalize: bool = True):
        """Return ``100 * image_features @ text_features.T`` without modifying the inputs.

        Args:
            image_features: Image encodings; the last dimension is the embedding dimension.
            text_features: Text embeddings of shape (number of texts, embedding dimension).
            normalize (bool, optional): Scale every embedding to unit length first, which
                turns the product into a scaled cosine similarity. Defaults to True.
        """
        if normalize:
            # Out-of-place division: `image_features` may be a view of the stored encodings,
            # and an in-place `/=` would silently overwrite them.
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        return 100.0 * image_features @ text_features.float().T

    def _evaluate_binary(self, prompt_bad: str, prompt_good: str, split: str, verbose: bool,
                         mode: str, title: str):
        """Shared implementation of the binary_* methods; returns the prediction accuracy."""
        image_features, labels = self.get_split(split=split)
        text_features = self.generate_embeddings_binary([prompt_bad, prompt_good], mode=mode)

        similarity = self._scaled_similarity(image_features, text_features).softmax(dim=-1)
        best = similarity.topk(1).indices.reshape(-1)

        # The first half of the text embeddings belongs to the bad prompt(s), the second
        # half to the good prompt(s), so integer division maps an index to class 0 or 1.
        per_class = text_features.shape[0] // 2
        pred = torch.div(best, per_class, rounding_mode='floor')

        y_true = labels.cpu().numpy()
        y_pred = pred.cpu().numpy()

        if verbose:
            print(f"------------------ \n Binary Task Results + {title} (Split: {split}, Prompts: {[prompt_bad, prompt_good]}) :")
            print(classification_report(y_true, y_pred, digits=4, target_names=['bad', 'good']))

        return accuracy_score(y_true, y_pred)

    def generate_embeddings_binary(self, prompts: list, mode = 'fixed'):
        """Generate Text Embeddings for binary task

        Args:
            prompts (list): Prompts to be embedded by CLIP, ordered as [bad, good]. For the
                modes 'content' and 'ensembling' each prompt must contain a '*', whose first
                occurrence is replaced by every label in turn.
            mode (str, optional): Prompt mode. One of ['fixed', 'content', 'ensembling']. Defaults to 'fixed'.

        Returns:
            torch.tensor: Embedded Prompts. 'fixed': one row per prompt. 'content': one row
            per (prompt, label) pair, all bad prompts first, then all good prompts.
            'ensembling': two rows, the mean embedding of the bad prompts followed by the
            mean embedding of the good prompts.

        Raises:
            ValueError: If ``mode`` is unknown or a required '*' is missing.
        """
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {list(self._MODES)}, got {mode!r}")

        if mode == 'fixed':
            return self._encode_text(list(prompts))

        bad_texts = self._fill_template(prompts[0])
        good_texts = self._fill_template(prompts[1])

        if mode == 'ensembling':
            # NOTE(review): the embeddings are averaged as returned by the model, without
            # normalising each one first.
            bad_features = self._encode_text(bad_texts).mean(0, keepdim=True)
            good_features = self._encode_text(good_texts).mean(0, keepdim=True)
            return torch.cat((bad_features, good_features))

        return self._encode_text(bad_texts + good_texts)

    def get_split(self, split: str = 'test', task: str = 'binary'):
        """Returns the encoded Images and Ratings for a Split

        Args:
            split (str, optional): 'test' or 'train'. Defaults to 'test'.
            task (str, optional): Determines the format of the ratings. One of ['binary', 'continuous']. Defaults to 'binary'.

        Returns:
            [tuple]: image_features, labels. ``image_features`` is the stored encoding tensor
            with an added leading dimension; it shares memory with the stored tensor, so it
            must not be modified in place. ``labels`` is ``mean rating > 5.0`` for 'binary'
            and the mean rating itself for 'continuous'.

        Raises:
            ValueError: If ``split`` or ``task`` is not one of the supported values.
        """
        if split == 'test':
            encodings, mean = self.test_encodings, self.test_mean
        elif split == 'train':
            encodings, mean = self.train_encodings, self.train_mean
        else:
            raise ValueError("Only 'test' or 'train' is a valid value for the argument split")

        if task == 'binary':
            labels = mean > 5.0
        elif task == 'continuous':
            labels = mean
        else:
            raise ValueError("Only 'binary' or 'continuous' is a valid value for the argument task")

        return encodings.unsqueeze(0), labels

    def binary_fixed(self, prompt_bad: str, prompt_good: str, split = 'test', verbose = True):
        """Performs a evaluation using the provided prompts on the DATASET

        Each image is assigned the class of the prompt it is most similar to.

        Args:
            prompt_bad (str): Prompt used for predicting unaesthetic images
            prompt_good (str): Prompt used for predicting aesthetic images
            split (str, optional): Split for the evaluation. Either 'test' or 'train'. Defaults to 'test'.
            verbose (bool, optional): Prints a  full classification report. Defaults to True.

        Returns:
            float: prediction accuracy
        """
        return self._evaluate_binary(prompt_bad, prompt_good, split, verbose,
                                     mode='fixed', title='Fixed')

    def binary_content(self, prompt_bad: str, prompt_good: str, split = 'test', verbose = True):
        """Performs a evaluation using content-aware-prompts on the binary task.
        The Prompts must contain one '*', which marks the place where the content should be inserted.

        Every label is inserted into both prompts; an image is assigned the class of the
        single most similar (prompt, label) combination.

        Args:
            prompt_bad (str): Prompt used for predicting unaesthetic images.
            prompt_good (str): Prompt used for predicting aesthetic images.
            split (str, optional): Split for the evaluation. Either 'test' or 'train'. Defaults to 'test'.
            verbose (bool, optional): Prints a  full classification report. Defaults to True.

        Returns:
            float: prediction accuracy
        """
        return self._evaluate_binary(prompt_bad, prompt_good, split, verbose,
                                     mode='content', title='Content')

    def binary_ensembling(self, prompt_bad: str, prompt_good: str, split = 'test', verbose = True):
        """Performs a evaluation using ensembled content-aware-prompts on the binary task.
        The Prompts must contain one '*', which marks the place where the content should be inserted.

        Every label is inserted into both prompts and the resulting embeddings are averaged
        per class; an image is assigned the class of the more similar mean embedding.

        Args:
            prompt_bad (str): Prompt used for predicting unaesthetic images.
            prompt_good (str): Prompt used for predicting aesthetic images.
            split (str, optional): Split for the evaluation. Either 'test' or 'train'. Defaults to 'test'.
            verbose (bool, optional): Print a full classification report. Defaults to True.

        Returns:
            float: prediction accuracy
        """
        return self._evaluate_binary(prompt_bad, prompt_good, split, verbose,
                                     mode='ensembling', title='Ensembling')

    def continuous(self, prompt_bad: str, prompt_good: str, split: str = 'test', mode: str = 'fixed', verbose = True, normalize: bool = False):
        """Performs a evaluation on the DATASET using the provided prompts for the provided split and mode.
        If the selected mode is not 'fixed', the Prompts must contain one '*', which marks the place where the content should be inserted.

        The score of an image is the sum of its similarities to the good prompt embeddings
        minus the sum of its similarities to the bad prompt embeddings. The scores are
        correlated with the mean ratings of the split.

        Args:
            prompt_bad (str): A prompt describing unaesthetic images
            prompt_good (str): A prompt describing aesthetic images
            split (str, optional): Split for the evaluation. Either 'test' or 'train'. Defaults to 'test'.
            mode (str, optional): Selects the mode of the prompts, one of ['fixed', 'content', 'ensembling']. Defaults to 'fixed'.
            verbose (bool, optional): Prints the results. Defaults to True.
            normalize (bool, optional): Scale image and text embeddings to unit length before
                comparing them, as the binary_* methods do. The default False compares the
                embeddings exactly as stored / as returned by the model. Defaults to False.

        Returns:
            float, float: returns the spearman and pearson correlation coefficient
        """
        image_features, labels = self.get_split(split=split, task='continuous')
        text_features = self.generate_embeddings_binary([prompt_bad, prompt_good], mode=mode)

        # -1 for every bad embedding (first half), +1 for every good embedding (second half).
        half = len(text_features) // 2
        weights = torch.cat((-torch.ones(half), torch.ones(half))).to(self.device)

        similarity = self._scaled_similarity(image_features, text_features, normalize=normalize)
        scores = (similarity @ weights).reshape(-1).cpu().numpy()
        targets = labels.cpu().numpy()

        spearman, _ = spearmanr(scores, targets)
        pearson, _ = pearsonr(scores, targets)

        if verbose:
            print(f"------------------ \n\n Continuous Task Results (Split: {split}, Mode: {mode}, Prompts: {[prompt_bad, prompt_good]}) :\n")
            print(f"Spearman: {spearman}\n")
            print(f"Pearson: {pearson}\n")

        return spearman, pearson

In [16]:
# Build the evaluator: this reads the labels and loads the CLIP model and both encoded splits.
# NOTE(review): the name `eval` shadows the Python builtin of the same name in this notebook.
eval = TextIAA()

# Alternative experiments, currently disabled:
#eval.binary_content("A horrible picture, of a #*", "A smashing picture, of a #*", split='test')
#eval.continuous("A horrible picture", "A outstanding picture")
#eval.continuous("A horrible picture, of a #*", "A outstanding picture, of a #*", mode = 'ensembling')

# Last expression of the cell, so the returned accuracy is displayed below the printed report.
eval.binary_fixed(prompt_bad="A atrocious picture", prompt_good="A outstanding picture")



------------------ 
 Binary Task Results + Fixed (Split: test, Prompts: ['A atrocious picture', 'A outstanding picture']) :
              precision    recall  f1-score   support

         bad     0.5681    0.3182    0.4079      7599
        good     0.7567    0.8976    0.8212     17952

    accuracy                         0.7253     25551
   macro avg     0.6624    0.6079    0.6145     25551
weighted avg     0.7006    0.7253    0.6983     25551



0.7252945090211733