In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from itertools import combinations
import ast

# Directory prefix shared by every result CSV read below
path = 'results/'

# One frame per result file; the target names encode the layer index (and the "-it" model variant)
df0, df6, df10, df12, df12it = (
    pd.read_csv(path + file_name)
    for file_name in (
        'cifar100-enriched_gemma-2b_layer0.csv',
        'cifar100-enriched_gemma-2b_layer6.csv',
        'cifar100-enriched_gemma-2b_layer10.csv',
        'cifar100-enriched_gemma-2b_layer12.csv',
        'cifar100-enriched_gemma-2b-it_layer12.csv',
    )
)

In [None]:
# Peek at the raw 'retrieved_features' value stored under index label 0 of the layer-12 frame
df12['retrieved_features'][0]

In [None]:
# Display the layer-12 frame for inspection
df12

In [4]:
def map_label(df):
    """Add a 'label' column that repeats the values of 'coarse_label'.

    The frame is modified in place; the same object is returned for convenience.
    """
    df['label'] = list(df['coarse_label'])
    return df

# Attach the 'label' column to every layer frame, in the same order as before
df0, df6, df10, df12, df12it = map(map_label, (df0, df6, df10, df12, df12it))

In [None]:
# Function to parse 'retrieved_features' column into a list of IDs
def parse_retrieved_features(df):
    """Return a copy of ``df`` with a 'feature_ids' column of parsed dict keys.

    Each 'retrieved_features' cell is expected to hold the text of a Python
    dict literal; its keys, in literal order, become that row's 'feature_ids'.

    Parameters:
        df: DataFrame with a 'retrieved_features' column.

    Returns:
        A new DataFrame (the input is not modified). Cells that cannot be
        parsed, or that parse to something other than a dict, yield ``[]``.
    """
    def parse_cell(raw_value):
        try:
            parsed = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError, TypeError):
            # Best effort: a malformed cell contributes no features
            return []
        # A valid literal that is not a dict (e.g. a list) has no .keys();
        # treat it like any other unparseable cell instead of crashing.
        if not isinstance(parsed, dict):
            return []
        return list(parsed.keys())

    df = df.copy()
    # Column-wise map avoids building a row object per record (df.apply(axis=1))
    # and also works on an empty frame.
    df['feature_ids'] = df['retrieved_features'].map(parse_cell)
    return df

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=None):
    """Shuffle ``df`` and split it into train and test frames.

    Parameters:
        df: DataFrame to split.
        test_size: forwarded to ``train_test_split``.
        seed: ``random_state`` used for both the shuffle and the split.
            ``None`` means "use NumPy's global random state", so the result
            is random unless the caller seeded NumPy beforehand.

    Returns:
        ``(train_df, test_df)``.
    """
    # The previous ``seed = np.random.seed()`` always produced None (the call
    # returns nothing) and, as a side effect, re-seeded the global NumPy RNG,
    # silently discarding any seed set by the caller. None is now passed
    # through untouched.
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    train_df, test_df = train_test_split(df_shuffled, test_size=test_size, random_state=seed)
    return train_df, test_df

# Assume df0, df6, df10, df12, df12it are already loaded as pandas DataFrames
datasets = dict(df0=df0, df6=df6, df10=df10, df12=df12, df12it=df12it)

# Check the required columns, swap each frame for its parsed copy
# (with 'feature_ids'), and pool every row's ID list
all_feature_ids_list = []
for name, df in datasets.items():
    available = df.columns
    if not ('retrieved_features' in available):
        raise ValueError(f"Dataset '{name}' does not have 'retrieved_features' column.")
    if not ('label' in available):
        raise ValueError(f"Dataset '{name}' does not have 'label' column.")
    df = parse_retrieved_features(df)
    # Replacing the value of an existing key is safe while iterating items()
    datasets[name] = df
    all_feature_ids_list += list(df['feature_ids'])

# Flatten the per-row lists into one long list of feature IDs
all_feature_ids = []
for row_ids in all_feature_ids_list:
    all_feature_ids.extend(row_ids)

# Fit the binarizer on a single "sample" that contains every ID seen in any
# dataset, so all layers share one encoding
mlb = MultiLabelBinarizer()
mlb.fit([all_feature_ids])

# Function to run the experiment multiple times
def run_experiment(df, n_runs=3, test_size=0.2, seed=None):
    """Train and score a logistic-regression classifier on ``n_runs`` splits.

    Features are the multi-hot encoding of 'feature_ids' produced by the
    module-level ``mlb``; the target is the 'label' column.

    Parameters:
        df: DataFrame with 'feature_ids' and 'label' columns.
        n_runs: number of train/test splits to evaluate.
        test_size: forwarded to ``shuffle_and_split``.
        seed: base seed. Run ``k`` (0-based) uses ``seed + k`` so that runs
            differ from each other yet stay reproducible; ``None`` leaves
            every run unseeded.

    Returns:
        List of ``{'macro_f1': float, 'micro_f1': float}``, one per run.
    """
    f1_scores_list = []
    for run_idx in range(n_runs):
        # Reusing the same seed for every run would repeat one identical
        # split n_runs times and report a standard deviation of zero.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size=test_size, seed=run_seed)
        X_train = mlb.transform(train_df['feature_ids'])
        y_train = train_df['label']
        X_test = mlb.transform(test_df['feature_ids'])
        y_test = test_df['label']

        clf = LogisticRegression(max_iter=1000, n_jobs=-1)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        macro_f1 = f1_score(y_test, predictions, average='macro')
        micro_f1 = f1_score(y_test, predictions, average='micro')
        f1_scores_list.append({'macro_f1': macro_f1, 'micro_f1': micro_f1})
    return f1_scores_list

# Run experiment for each dataset
results_list = []
for layer_name, df in datasets.items():
    # Keys are 'df0', 'df6', ...; dropping the leading 'd'/'f' characters leaves the layer tag
    layer_name = "layer "+layer_name.lstrip('df')
    print(f"Running experiment for {layer_name}")
    # np.random.seed() returns None, so the old 'seed' column was always empty
    # and the value was never handed to the experiment. Draw one real integer
    # seed per run (fresh OS entropy, global RNG untouched) and record exactly
    # the seed each split was produced with, so any run can be reproduced.
    run_seeds = np.random.SeedSequence().generate_state(3)
    for run_idx, run_seed in enumerate(run_seeds):
        seed = int(run_seed)
        scores = run_experiment(df, n_runs=1, seed=seed)[0]
        results_list.append({
            'Layer': layer_name,
            'Run': run_idx + 1,
            'Macro F1 Score': scores['macro_f1'],
            'Micro F1 Score': scores['micro_f1'],
            'seed': seed
        })

# Create a DataFrame from results_list
results_df = pd.DataFrame(results_list)

# Compute mean and std per layer.
# sort=False keeps the layers in the order they were run; the default sorts
# the labels as strings, which puts 'layer 10' and 'layer 12' before 'layer 6'
# and scrambles the x-axis of the bar charts drawn from this table.
summary_df = results_df.groupby('Layer', sort=False).agg({
    'Macro F1 Score': ['mean', 'std'],
    'Micro F1 Score': ['mean', 'std']
}).reset_index()

# Flatten MultiIndex columns
summary_df.columns = ['Layer', 'Macro F1 Mean', 'Macro F1 Std', 'Micro F1 Mean', 'Micro F1 Std']

# Print out the results
print("Detailed Results per Run:")
print(results_df)

print("\nSummary Results per Layer:")
print(summary_df)

# One bar chart per metric (Macro first, then Micro): mean F1 per layer with
# the per-layer standard deviation as error bars
for metric_name, bar_color in (('Macro', 'skyblue'), ('Micro', 'lightgreen')):
    plt.figure(figsize=(10, 6))
    plt.bar(
        summary_df['Layer'],
        summary_df[f'{metric_name} F1 Mean'],
        yerr=summary_df[f'{metric_name} F1 Std'],
        capsize=5,
        color=bar_color,
    )
    plt.xlabel('Layer')
    plt.ylabel(f'{metric_name} F1 Score')
    plt.title(f'{metric_name} F1 Score by Layer with Error Bars')
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

In [None]:
# Layers in the order they should be listed
layer_order = ['layer 0', 'layer 6', 'layer 10', 'layer 12', 'layer 12it']

# Re-type 'Layer' as an ordered categorical following layer_order, then sort on it
summary_df = summary_df.assign(
    Layer=pd.Categorical(summary_df['Layer'], categories=layer_order, ordered=True)
).sort_values('Layer')

# Display the sorted DataFrame
summary_df

In [None]:
# Function to convert 'retrieved_features' column to string
def convert_retrieved_features_to_string(df):
    """Return a copy of ``df`` with a 'text' column: 'retrieved_features' cast to str.

    The input frame is left untouched.
    """
    as_text = df['retrieved_features'].astype(str)
    return df.assign(text=as_text)

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=None):
    """Shuffle ``df`` with a fresh 0..n-1 index, then split it into train and test.

    ``seed`` is used as ``random_state`` for both the shuffle and the split;
    ``test_size`` is forwarded to ``train_test_split``.
    Returns ``(train_df, test_df)``.
    """
    reordered = df.sample(frac=1, random_state=seed)
    reordered = reordered.reset_index(drop=True)
    train_part, test_part = train_test_split(
        reordered,
        test_size=test_size,
        random_state=seed,
    )
    return train_part, test_part

# Function to perform TF-IDF classification and calculate F1 scores
def tfidf_classification(train_texts, train_labels, test_texts, test_labels):
    """Fit TF-IDF + logistic regression on the training texts and score the test texts.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts. Returns ``(macro_f1, micro_f1)`` for the test predictions.
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    model = LogisticRegression(max_iter=1000, n_jobs=-1)
    model.fit(train_matrix, train_labels)
    predicted = model.predict(test_matrix)

    macro_f1, micro_f1 = (
        f1_score(test_labels, predicted, average=averaging)
        for averaging in ('macro', 'micro')
    )
    return macro_f1, micro_f1

# Function to run the experiment multiple times
def run_experiment(df, n_runs=3, test_size=0.2, seed=None):
    """Evaluate the TF-IDF classifier on ``n_runs`` train/test splits of ``df``.

    Parameters:
        df: DataFrame with 'text' and 'label' columns.
        n_runs: number of splits to evaluate.
        test_size: forwarded to ``shuffle_and_split``.
        seed: base seed. Run ``k`` (0-based) uses ``seed + k`` so that runs
            differ from each other yet stay reproducible; ``None`` leaves
            every run unseeded.

    Returns:
        List of ``{'macro_f1': float, 'micro_f1': float}``, one per run.
    """
    f1_scores_list = []
    for run_idx in range(n_runs):
        # Passing the same seed to every run would evaluate one identical
        # split n_runs times, making the reported std exactly zero.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size=test_size, seed=run_seed)
        train_texts = train_df['text']
        train_labels = train_df['label']
        test_texts = test_df['text']
        test_labels = test_df['label']

        macro_f1, micro_f1 = tfidf_classification(train_texts, train_labels, test_texts, test_labels)
        f1_scores_list.append({'macro_f1': macro_f1, 'micro_f1': micro_f1})
    return f1_scores_list

# Assume df0, df6, df10, df12, df12it are already loaded as pandas DataFrames
datasets = dict(df0=df0, df6=df6, df10=df10, df12=df12, df12it=df12it)

# Check the required columns, then replace each frame with a copy that
# carries the stringified 'text' column
for name, df in datasets.items():
    available = df.columns
    if not ('retrieved_features' in available):
        raise ValueError(f"Dataset '{name}' does not have 'retrieved_features' column.")
    if not ('label' in available):
        raise ValueError(f"Dataset '{name}' does not have 'label' column.")
    # Replacing the value of an existing key is safe while iterating items()
    datasets[name] = convert_retrieved_features_to_string(df)

# Run experiment for each dataset and collect one result row per run
results_list = []
for layer_name, df in datasets.items():
    layer_name = "layer "+layer_name.lstrip('df')
    print(f"Running experiment for {layer_name}")
    f1_scores_list = run_experiment(df, n_runs=3, seed=42)
    results_list.extend(
        {
            'Layer': layer_name,
            'Run': run_number,
            'Macro F1 Score': scores['macro_f1'],
            'Micro F1 Score': scores['micro_f1']
        }
        for run_number, scores in enumerate(f1_scores_list, start=1)
    )

# Create a DataFrame from results_list
results_df = pd.DataFrame(results_list)

# Compute mean and std per layer.
# sort=False keeps the layers in the order they were run; the default sorts
# the labels as strings, which puts 'layer 10' and 'layer 12' before 'layer 6'
# and scrambles the x-axis of the bar charts drawn from this table.
summary_df = results_df.groupby('Layer', sort=False).agg({
    'Macro F1 Score': ['mean', 'std'],
    'Micro F1 Score': ['mean', 'std']
}).reset_index()

# Flatten MultiIndex columns
summary_df.columns = ['Layer', 'Macro F1 Mean', 'Macro F1 Std', 'Micro F1 Mean', 'Micro F1 Std']

# Print out the results
print("Detailed Results per Run:")
print(results_df)

print("\nSummary Results per Layer:")
print(summary_df)

# One bar chart per metric (Macro first, then Micro): mean F1 per layer with
# the per-layer standard deviation as error bars
for metric_name, bar_color in (('Macro', 'skyblue'), ('Micro', 'lightgreen')):
    plt.figure(figsize=(10, 6))
    plt.bar(
        summary_df['Layer'],
        summary_df[f'{metric_name} F1 Mean'],
        yerr=summary_df[f'{metric_name} F1 Std'],
        capsize=5,
        color=bar_color,
    )
    plt.xlabel('Layer')
    plt.ylabel(f'{metric_name} F1 Score')
    plt.title(f'{metric_name} F1 Score by Layer with Error Bars')
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()


In [None]:
# Layers in the order they should be listed
layer_order = ['layer 0', 'layer 6', 'layer 10', 'layer 12', 'layer 12it']

# Re-type 'Layer' as an ordered categorical following layer_order, then sort on it
summary_df = summary_df.assign(
    Layer=pd.Categorical(summary_df['Layer'], categories=layer_order, ordered=True)
).sort_values('Layer')

# Display the sorted DataFrame
summary_df

In [None]:
def get_feature_counts(df, max_count=1000):
    """Count in how many rows each feature key occurs and drop the very frequent ones.

    Parameters:
        df: DataFrame whose 'retrieved_features' cells hold the text of a
            Python dict literal.
        max_count: keys occurring in more than this many rows are removed
            (default 1000, the threshold previously hard-coded here).

    Returns:
        dict mapping key -> row count, ordered from most to least frequent,
        containing only keys with count <= ``max_count``.

    Prints the number of distinct keys before and after filtering.
    """
    all_keys = []
    for features in df['retrieved_features']:
        # ast.literal_eval replaces eval here: the text comes from a CSV file,
        # and eval would execute any expression stored in it. literal_eval
        # accepts only literals and raises ValueError/SyntaxError otherwise.
        feature_dict = ast.literal_eval(features)
        all_keys.extend(feature_dict.keys())

    ranked = Counter(all_keys).most_common()
    print('before:',len(ranked))
    kept = {key: count for key, count in ranked if count <= max_count}
    print('after:',len(kept))
    print('=====')
    return kept

# NOTE(review): the five numbers below are unexplained; they look like counts
# noted from an earlier run, one per layer in the order used here. Confirm.
# 265
# 1186
# 2213
# 1823
# 2016

# Per-layer key counts (very frequent keys removed), computed in the order 0, 6, 10, 12, 12it
feature_counts0, feature_counts6, feature_counts10, feature_counts12, feature_counts12it = map(
    get_feature_counts, (df0, df6, df10, df12, df12it)
)

def clean_features(df, feature_counts):
    """Add a 'cleaned_features' column keeping only the keys present in ``feature_counts``.

    Parameters:
        df: DataFrame whose 'retrieved_features' cells hold the text of a
            Python dict literal. Modified in place.
        feature_counts: container of allowed keys (membership is tested with ``in``).

    Returns:
        The same ``df`` object, with 'cleaned_features' holding one dict per
        row (possibly empty).
    """
    # ast.literal_eval replaces eval: the text comes from a CSV file, and eval
    # would execute any expression stored in it.
    df['cleaned_features'] = [
        {key: value for key, value in ast.literal_eval(features).items() if key in feature_counts}
        for features in df['retrieved_features']
    ]
    return df

# Attach 'cleaned_features' to every layer frame using that layer's own key counts
df0, df6, df10, df12, df12it = (
    clean_features(frame, counts)
    for frame, counts in (
        (df0, feature_counts0),
        (df6, feature_counts6),
        (df10, feature_counts10),
        (df12, feature_counts12),
        (df12it, feature_counts12it),
    )
)

In [None]:
# Fraction of rows per layer that still have at least one feature after cleaning.
# The denominator is the frame's own row count instead of a hard-coded 10000,
# so the fraction stays correct if a frame has a different size.
for layer_tag, frame in (
    ('layer0', df0),
    ('layer6', df6),
    ('layer10', df10),
    ('layer12', df12),
    ('layer12it', df12it),
):
    non_empty_rows = sum(bool(features) for features in frame['cleaned_features'])
    print(f'{layer_tag}:', non_empty_rows / len(frame) if len(frame) else 0.0)

In [None]:
# For each label value, find which keys of 'cleaned_features' appear most often
def get_top5_features(df, coi='fine_label', topn=5):
    """Report the most frequent 'cleaned_features' keys for every value of a label column.

    Parameters:
        df: DataFrame with a 'cleaned_features' column of dicts and the column ``coi``.
        coi: name of the label column to group by.
        topn: how many keys to keep per label. Previously this only limited
            the printed values while the counts were always cut at 5.

    Returns:
        List with one ``{label: [(key, row_count), ...]}`` dict per distinct
        label, each list holding at most ``topn`` pairs, most frequent first.

    For every label, prints the label, its top pairs and the number of rows
    carrying that label, followed by the value stored for each top key.
    """
    # Key -> value lookup over the whole frame (for a repeated key the last
    # row wins). It does not depend on the label, so build it once.
    all_features = {}
    for features in df['cleaned_features']:
        all_features.update(features)

    top5_features = []
    labels = set(df[coi])
    for label in labels:
        label_rows = df[df[coi] == label]
        all_keys = []
        for features in label_rows['cleaned_features']:
            all_keys.extend(features.keys())
        top = Counter(all_keys).most_common()[:topn]
        top5_features.append({label: top})
        # Actual number of rows with this label, not an assumed 10000 / n_labels
        print(label, top, 'This is out of:', len(label_rows))
        print([all_features[key] for key, _ in top])
    return top5_features

# Layer 0, grouped by the default label column ('fine_label')
top5_features0 = get_top5_features(df0)

In [None]:
# Layer 12, grouped by the default label column ('fine_label')
top5_features12 = get_top5_features(df12)

In [None]:
# Layer 10, grouped by the default label column ('fine_label').
# Stored under its own name: it used to be assigned to top5_features12,
# overwriting the layer-12 result with layer-10 data.
top5_features10 = get_top5_features(df10)

In [None]:
# Layer 12 of the "-it" model, grouped by the default label column ('fine_label')
top5_features12it = get_top5_features(df12it)

In [None]:
# Layer 12 again, this time grouped by 'coarse_label' (replaces the earlier top5_features12)
top5_features12 = get_top5_features(df12, coi='coarse_label')

In [None]:
# Layer 12 of the "-it" model, grouped by 'coarse_label' (replaces the earlier top5_features12it)
top5_features12it = get_top5_features(df12it, coi='coarse_label')

In [None]:
# Same report for the "-it" layer-12 frame with topn=2; the returned list is displayed as the cell output
get_top5_features(df12it, topn=2)

In [10]:
# TODO
# get these vectors, get their pre vit embedding, post vit embedding, and the difference, and top five activation patterns
# then plot them


In [None]:
# Display the layer-12 frame again (it now also carries 'cleaned_features')
df12

In [None]:
import requests
import torch
from PIL import Image
from transformers import LlavaForConditionalGeneration, AutoProcessor
import pandas as pd
from sae_lens import SAE
from datasets import load_dataset
from tqdm import tqdm
import json
from torch.utils.data import DataLoader

# Standard imports
import os
import torch
from tqdm import tqdm

# Restrict this process to physical GPU 1.
# NOTE: this only takes effect if CUDA has not been initialised yet in this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

if torch.cuda.is_available():
    # With CUDA_VISIBLE_DEVICES="1" exactly one GPU is visible and it is
    # re-indexed as device 0, so "cuda:1" would be an invalid device ordinal.
    device = "cuda:0"
else:
    device = "cpu"

print(f"Device: {device}")

# SAE release prefix and transformer block index used by get_activated_sae
release = "gemma-2b-it"
layer = 12

def create_vector(size, indices):
    """
    Build a 1 x ``size`` indicator row: 1 at every position in ``indices``, 0 elsewhere.

    Parameters:
    size (int): Number of columns in the row.
    indices (list of int): Positions to set to 1.

    Returns:
    torch.Tensor: Tensor of shape (1, size), moved to the module-level ``device``.
    """
    flat = torch.zeros(size)
    for position in indices:
        flat[position] = 1
    # Add the leading batch dimension before moving to the target device
    return flat.unsqueeze(0).to(device)

def get_activated_sae(release, layer, df):
    """Load an SAE and count how often each of its features appears in ``df``.

    Parameters:
        release: release prefix; the SAE loaded is ``f"{release}-res-jb"``.
        layer: block index; the SAE id is ``f"blocks.{layer}.hook_resid_post"``.
        df: DataFrame whose 'cleaned_features' column holds dicts keyed by
            feature id (anything ``int()`` accepts).

    Returns:
        dict mapping every id in ``range(cfg_dict['d_sae'])`` to
        ``{'count': int, 'vector': tensor}``. 'count' is the number of rows
        whose 'cleaned_features' contains the id; 'vector' is the matching
        row of ``sae.W_dec`` with a leading dimension of size 1.
    """
    sae, cfg_dict, sparsity = SAE.from_pretrained(
        release=f"{release}-res-jb",
        sae_id=f"blocks.{layer}.hook_resid_post",
        device=device
    )
    # one_hot(i) @ W_dec is just row i of W_dec. Slicing the row keeps the
    # (1, width) shape while avoiding d_sae separate matrix products.
    # The slices are views of W_dec, not copies.
    sae_dict = {
        i: {'count': 0, 'vector': sae.W_dec[i:i + 1]}
        for i in range(cfg_dict['d_sae'])
    }
    for features in df['cleaned_features']:
        # An empty dict simply contributes nothing
        for sae_id in features.keys():
            sae_dict[int(sae_id)]['count'] += 1
    return sae_dict

# Per-feature counts and decoder vectors for the SAE selected by `release` / `layer`, counted over df12.
# NOTE(review): `release` is "gemma-2b-it" while df12 was read from the gemma-2b (non "-it") CSV; df12it may have been intended here. Confirm.
dict_12 = get_activated_sae(release, layer, df12)

In [None]:
# Display the whole id -> {'count', 'vector'} mapping (one entry per SAE feature, so the output is large)
dict_12

In [None]:
# Occurrence count of every SAE feature, in dict_12's key order
counts = [entry['count'] for entry in dict_12.values()]

import matplotlib.pyplot as plt
# Count plotted against the feature's position in that order
positions = range(len(counts))
plt.scatter(positions, counts)
plt.show()


In [None]:
import umap
import matplotlib.colors as mcolors

# Assuming sae_dict is obtained from get_activated_sae
# NOTE(review): this repeats the exact call that produced dict_12 above, so the SAE is loaded a second time. Confirm whether dict_12 could be reused.
sae_dict = get_activated_sae(release, layer, df12)

# Extract vectors and counts
# Each stored vector is detached, moved to CPU and flattened to 1-D before being handed to UMAP
vectors = [sae_dict[key]['vector'].detach().cpu().numpy().flatten() for key in sae_dict]
counts = [sae_dict[key]['count'] for key in sae_dict]

# Perform UMAP
# NOTE(review): no random_state is passed, so the layout may differ between runs
reducer = umap.UMAP()
embedding = reducer.fit_transform(vectors)

# Create a custom colormap
# Colour scale spans the smallest to the largest count
cmap = plt.cm.viridis
norm = mcolors.Normalize(vmin=min(counts), vmax=max(counts))

# Plot the UMAP embedding
plt.figure(figsize=(10, 8))
scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=counts, cmap=cmap, norm=norm, s=100)

# Add color bar
cbar = plt.colorbar(scatter)
cbar.set_label('Count')

# Add labels and title
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.title('UMAP of SAE Vectors')

# Show the plot
plt.show()

In [None]:
# Extract vectors and counts
# Relies on sae_dict, umap and mcolors from the previous cell
vectors = [sae_dict[key]['vector'].detach().cpu().numpy().flatten() for key in sae_dict]
counts = [sae_dict[key]['count'] for key in sae_dict]

# Perform UMAP
# NOTE(review): UMAP is refitted here without a random_state, so this layout need not match the previous plot
reducer = umap.UMAP()
embedding = reducer.fit_transform(vectors)

# Create a custom colormap
cmap = plt.cm.viridis
norm = mcolors.Normalize(vmin=min(counts), vmax=max(counts))

# Create an array for alpha values based on counts
# Features with fewer than 20 occurrences get alpha 0 (fully transparent); the rest are opaque
alphas = np.array([1 if count >= 20 else 0 for count in counts])

# Plot the UMAP embedding
plt.figure(figsize=(10, 8))
scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=counts, cmap=cmap, norm=norm, s=100, alpha=alphas)

# Add color bar
cbar = plt.colorbar(scatter)
cbar.set_label('Count')

# Add labels and title
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.title('UMAP of SAE Vectors')

# Show the plot
plt.show()