## CAV explanation on text

In [1]:
import torch
from transformers import AutoModel, AutoTokenizer

### Concepts

In [2]:
cooking = [
    "Heat a non-stick pan with a drizzle of oil and add the chopped onion, letting it brown over medium heat.",
    "Add the peeled tomatoes, adjust the salt, and let cook at low heat for 15 minutes in the oven.",
    "Bring a pot of salted water to a boil, cook the pasta until al dente, and drain it directly into the prepared sauce."
]

preparation = [
    "Finely chop the parsley and garlic, then set them aside in a bowl.",
    "Cut the vegetables into evenly sized cubes to ensure uniform cooking.",
    "Beat the eggs with a fork until you get a smooth mixture, then add a pinch of salt."
]

ingredients = [
    "400 grams of chicken breast, 2 zucchinis, 1 garlic clove.",
    "3 eggs, 100 grams of butter, a pinch of salt.",
    "250 ml of milk, 50 grams of cocoa powder, 1 teaspoon of vanilla extract."
]

## Get model activations

In [3]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  

def get_activations(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()  # Mean across tokens

cooking_activations = get_activations(cooking)
preparation_activations = get_activations(preparation)
ingredients_activations = get_activations(ingredients)

## Find a linear boundary between examples

In [4]:
from sklearn.linear_model import LogisticRegression
import numpy as np

X = np.vstack([cooking_activations, preparation_activations, ingredients_activations])
y = np.array([0] * len(cooking_activations) + [1] * len(preparation_activations) + [2] * len(ingredients_activations))

cav_classifier = LogisticRegression().fit(X, y)
cav_cottura = cav_classifier.coef_[0]
cav_preparazione = cav_classifier.coef_[1]
cav_ingredienti = cav_classifier.coef_[2]


## Compute concept importance

In [7]:
def concept_importance(input_text):
    activations = get_activations([input_text])
    cottura_rel = np.dot(activations, cav_cottura)
    preparazione_rel = np.dot(activations, cav_preparazione)
    ingredienti_rel = np.dot(activations, cav_ingredienti)
    return np.array([cottura_rel, preparazione_rel, ingredienti_rel])

c_example = "boil water and add salt. Use a pan to heat oil."
p_example = "cut onions in slices, stage the flavor in a cup"
i_example = "300 g of tomato sauce, 10 grams of olive oil"
x_example = "go in computer settings and set up an update of the os"

out = concept_importance(x_example)

labels = ['cooking', 'preparation', 'ingredients']
for i, x in enumerate(out):
    print(f"{labels[i]}: {x[0]}")

cooking: 0.7030524984789631
preparation: 0.09999056806400607
ingredients: -0.8030430665429442
