In [1]:
import numpy as np
import torch
import pandas as pd
import os
import json
from matplotlib import pyplot as plt

# Extend the import path with the local imagenet-testbed checkout (relative to
# the current working directory) before importing `registry`.
# NOTE(review): `registry` is presumably resolved from one of these two
# directories — confirm against the testbed layout.
import sys
sys.path.append('./imagenet-testbed/')
sys.path.append('./imagenet-testbed/src/')
from registry import registry
# Load the full registry up front; later cells query registry.model_names().
registry.load_full_registry()

# Arguments

In [2]:
# Root directory containing one sub-directory per model, each holding one
# sub-directory per evaluation setting with a metrics.json file inside.
result_base_dir = './imagenet-testbed/outputs'

# Evaluation settings to collect, in the order they appear as table columns.
eval_settings = [
    'val',
    'val_subsampled_class_1_8',
    'imagenetv2-matched-frequency',
    'imagenet-sketch',
    'ytbb-robust',
    'imagenet-vid-robust',
    'objectnet-1.0-beta',
    'imagenet-a',
    'imagenet-r',
]

# Collect Results

In [3]:
# Keep only the registered models whose name contains the substring 'clip'.
eval_models = list(filter(lambda name: 'clip' in name, registry.model_names()))
eval_models

['clip_vit_pretrained', 'clip_vit_finetuned_base', 'clip_vit_finetuned_ent']

In [4]:
# Collect the metric of every evaluation setting for every model into
# result_dict[model][setting], plus an 'average' entry per model.
#
# A model is only added to result_dict when the metrics.json of *every*
# requested setting exists, so a partially evaluated model can neither raise
# FileNotFoundError halfway through nor leave a half-filled row behind.
result_dict = {}

# Settings whose metrics.json is read through the 'pm0'/'pm10' keys instead of 'top1'.
video_settings = ('ytbb-robust', 'imagenet-vid-robust')
# Settings that are left out of the per-model 'average'.
non_averaged_settings = ('val', 'val_subsampled_class_1_8')

for model in eval_models:
    model_dir = os.path.join(result_base_dir, model)

    # A model without validation metrics is treated as not evaluated at all.
    if not os.path.exists(os.path.join(model_dir, 'val/metrics.json')):
        print('Not exist: ', model)
        continue

    # Resolve every metrics file first and make sure none is missing before
    # reading anything.
    metrics_paths = {setting: os.path.join(model_dir, setting, 'metrics.json')
                     for setting in eval_settings}
    missing_settings = [setting for setting, path in metrics_paths.items()
                        if not os.path.exists(path)]
    if missing_settings:
        print('Not exist: ', model, missing_settings)
        continue

    sub_result_dict = {}
    for setting, result_path in metrics_paths.items():
        with open(result_path) as json_file:
            eval_result_dict = json.load(json_file)

        if setting in video_settings:
            # Mean of the 'pm0' and 'pm10' entries, scaled by 100.
            # NOTE(review): presumably these are stored as fractions while
            # 'top1' is already a percentage — confirm against the testbed.
            result = (eval_result_dict['pm0'] + eval_result_dict['pm10']) / 2 * 100
        else:
            result = eval_result_dict['top1']
        sub_result_dict[setting] = result

    sub_result_dict['average'] = np.mean([sub_result_dict[s] for s in eval_settings
                                           if s not in non_averaged_settings])

    # Register the model only once its row is complete.
    result_dict[model] = sub_result_dict

Finetuning CLIP with an entropy bottleneck leads to better robustness on natural distribution shift datasets than finetuning without it. Note that both finetuned models are worse than pretrained CLIP, probably due to the quality of the LAION-400M dataset.

The results could be slightly different from those in the paper, due to code cleaning and randomness.

In [5]:
# Show one digit after the decimal point in rendered frames.
pd.options.display.precision = 1

# One row per model, one column per evaluation setting (plus 'average').
results_table = pd.DataFrame.from_dict(result_dict, orient='index')
results_table

Unnamed: 0,val,val_subsampled_class_1_8,imagenetv2-matched-frequency,imagenet-sketch,ytbb-robust,imagenet-vid-robust,objectnet-1.0-beta,imagenet-a,imagenet-r,average
clip_vit_pretrained,75.4,90.6,64.2,41.1,58.3,71.3,42.8,27.7,62.9,52.6
clip_vit_finetuned_base,73.8,90.0,62.1,37.0,56.9,68.8,41.3,26.0,58.1,50.0
clip_vit_finetuned_ent,74.0,90.4,62.6,39.0,58.9,69.9,41.9,26.2,60.9,51.3
