# Checking Probe datasets 

In [2]:
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score 

In [3]:
def get_ade20k_features():
    
    features = {'train':[], 'val':[], 'test':[]}
    attr = {'train':[], 'val':[], 'test':[]}
    predictions = {'train':[], 'val':[], 'test':[]}
    names = {'train':[], 'val':[], 'test':[]}

    A = pickle.load(open('../ADE20k/ade20k_imagelabels_with_texture.pkl', 'rb'))


    for split in ['train', 'val', 'test']:
        img_names = A[split]
        feat_split = pickle.load(open('../ADE20k/{}_features.pkl'.format(split), 'rb'))
        pred_split = pickle.load(open('../ADE20k/{}_scene.pkl'.format(split), 'rb'))
        #logit_split = pickle.load(open('../ADE20k/tra'))
        for img in img_names:
            features[split].append(feat_split[img].squeeze())
            predictions[split].append(pred_split[img])
            temp = np.zeros(1200)
            temp[A['labels'][img]] = 1 
            attr[split].append(temp)

        features[split] = np.stack(features[split])
        predictions[split] = np.array(predictions[split])
        attr[split] = np.stack(attr[split])
        names[split] = img_names
    
    return features, attr, predictions, names

In [4]:
def get_pascal_features():
    
    features = {'train':[], 'val':[], 'test':[]}
    attr = {'train':[], 'val':[], 'test':[]}
    predictions = {'train':[], 'val':[], 'test':[]}
    names = {'train':[], 'val':[], 'test':[]}

    A = pickle.load(open('pascal_dataset_split.pkl', 'rb'))
    feat_split = pickle.load(open('Pascal/full_features.pkl', 'rb'))
    pred_split = pickle.load(open('Pascal/full_scenegroup.pkl', 'rb'))
        
    path = '../NetDissect-Lite/dataset/broden1_224/images'
    for split in ['train', 'val', 'test']:
        img_names = A[split]
        #logit_split = pickle.load(open('../ADE20k/tra'))
        for img in img_names:
            features[split].append(feat_split['{}/{}'.format(path, img)].squeeze())
            predictions[split].append(pred_split['{}/{}'.format(path, img)][0])
            temp = np.zeros(1200)
            temp[A['obj_labels_only'][img]] = 1 
            attr[split].append(temp)

        features[split] = np.stack(features[split])
        predictions[split] = np.array(predictions[split])
        attr[split] = np.stack(attr[split])
        names[split] = img_names
    
    return features, attr, predictions, names

In [5]:
ade_features, ade_attr, ade_predictions, ade_names = get_ade20k_features()
pascal_features, pascal_attr, pascal_predictions, pascal_names = get_pascal_features()

## Computing Concept Activation vectors

In [7]:
new_imp_attr = []

for at in range(1200):
    if pascal_attr['train'][:, at].mean() > 0.02 and ade_attr['train'][:, at].mean() > 0.02:
        new_imp_attr.append(at)

In [15]:
def hyperparam_search_l2(train_features, train_labels, val_features, val_labels, 
                      Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5]):
    best_clf = None
    best_auc = 0
    
    for c in Cs:
        clf = LogisticRegression(solver='liblinear', C=c, penalty='l2')
        clf.fit(train_features, train_labels)
        score = roc_auc_score(val_labels, clf.predict_proba(val_features)[:, 1])
        if score>best_auc:
            best_auc = score
            best_clf = clf
            print(score, c)
    
    return best_clf
        

In [11]:
import pandas as pd

labels = pd.read_csv('../../dataset/broden1_224/label.csv', index_col=0)['name'].to_dict()

In [17]:
pascal_clfs = []
ade_clfs = []

for at in new_imp_attr:
    clf = hyperparam_search(ade_features['train'], ade_attr['train'][:, at], 
                            ade_features['val'], ade_attr['val'][:, at])
    ade_clfs.append(clf)
    ade_score = roc_auc_score(ade_attr['test'][:, at], clf.predict_proba(ade_features['test'])[:, 1])
    to_print = [labels[at], ade_score]
    
    clf = hyperparam_search(pascal_features['train'], pascal_attr['train'][:, at],
                           pascal_features['val'], pascal_attr['val'][:, at])
    pascal_clfs.append(clf)
    pascal_score = roc_auc_score(pascal_attr['test'][:, at], clf.predict_proba(pascal_features['test'])[:, 1])
    to_print.append(pascal_score)
    print(*to_print, sep='\t')

0.9562235332730283 0.001
0.9599079951862911 0.005
0.9607279800834128 0.01
0.9610001520961248 0.05
0.9250227755845734 0.001
0.9278148935452306 0.005
wall	0.9604064539211602	0.9279037826199933
0.9831079192980495 0.001
0.9885788943644299 0.005
0.9897492501723206 0.01
0.9906500296424388 0.05
0.8443003837776164 0.001
0.847183902084846 0.005
sky	0.9890929759483067	0.8339622641509434
0.9710759529756943 0.001
0.9752193702252335 0.005
0.9761585927431513 0.01
0.9761963465605246 0.05


KeyboardInterrupt: 

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

for i, at in enumerate(new_imp_attr):
    print(labels[at], cosine_similarity(ade_clfs[i].coef_, pascal_clfs[i].coef_))

wall [[0.19992691]]
sky [[0.4033036]]


IndexError: list index out of range

## Computing baseline explanations

In [21]:
def hyperparam_search_l1(train_features, train_labels, val_features, val_labels, 
                      Cs = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5]):
    best_clf = None
    best_acc = 0
    
    for c in Cs:
        clf = LogisticRegression(solver='liblinear', C=c, penalty='l1')
        clf.fit(train_features, train_labels)
        score = clf.score(val_features, val_labels)
        if score>best_acc:
            best_acc = score
            best_clf = clf
            print(score, c)
    
    return best_clf
        

In [22]:
ade_exp = hyperparam_search_l1(ade_attr['train'], ade_predictions['train'],
                              ade_attr['val'], ade_predictions['val'])

print(ade_exp.score(ade_attr['test'], ade_predictions['test']))

0.03242969343805422 0.001
0.1233848492525969 0.005
0.1603749683303775 0.01
0.23714213326577147 0.05
0.27514568026349123 0.1
0.36711426399797314 0.5
0.3701545477577907 1
0.3914365340765138


In [None]:
pascal_exp = hyperparam_search_l1(pascal_attr['train'], pascal_predictions['train'],
                              pascal_attr['val'], pascal_predictions['val'])

print(pascal_exp.score(pascal_attr['test'], pascal_predictions['test']))