In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from itertools import combinations
import ast

# Directory holding the per-layer result tables.
path = 'results/'

# Read the five result CSVs in one pass; the targets on the left line up
# one-to-one with the file names on the right.
df0, df6, df10, df12, df12it = map(pd.read_csv, (
    f'{path}imagenet-1k-256x256_gemma-2b_layer0.csv',
    f'{path}imagenet-1k-256x256_gemma-2b_layer6.csv',
    f'{path}imagenet-1k-256x256_gemma-2b_layer10.csv',
    f'{path}imagenet-1k-256x256_gemma-2b_layer12.csv',
    f'{path}imagenet-1k-256x256_gemma-2b-it_layer12.csv',
))

In [2]:
# Load the label table stored in image_net_labels.txt.
# The file is read as a Python literal (presumably a dict keyed by class
# index -- confirm against the file). ast.literal_eval only accepts literals,
# so unlike eval() it cannot execute code that might be embedded in the file.
with open('image_net_labels.txt', 'r') as f:
    image_net_labels = ast.literal_eval(f.read())

In [None]:
# Show the layer-0 DataFrame as this cell's output for a quick visual check.
df0

In [4]:
def map_label(df):
    """Add a 'label' column looked up from each row's 'fine_label'.

    Every value of ``df['fine_label']`` is used to index the module-level
    ``image_net_labels`` table. The new column is written onto ``df`` itself
    (the input is modified in place) and that same object is returned.
    """
    df['label'] = list(map(image_net_labels.__getitem__, df['fine_label']))
    return df

# Attach the 'label' column to every layer table. map_label writes into the
# frame it receives and hands the same frame back, so the names are unchanged.
df0, df6, df10, df12, df12it = (
    map_label(frame) for frame in (df0, df6, df10, df12, df12it)
)

In [None]:
# Function to parse 'retrieved_features' column into a list of IDs
def parse_retrieved_features(df):
    """Return a copy of ``df`` with a 'feature_ids' column of parsed feature IDs.

    Each 'retrieved_features' cell is parsed with ``ast.literal_eval`` and the
    keys of the resulting dict become that row's ID list. Cells that cannot
    be parsed (``ValueError`` / ``SyntaxError``) produce an empty list instead
    of aborting the whole pass.

    Args:
        df: DataFrame with a 'retrieved_features' column of dict literals
            stored as strings.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    def parse_cell(retrieved_features_str):
        try:
            return list(ast.literal_eval(retrieved_features_str).keys())
        except (ValueError, SyntaxError):
            return []

    df = df.copy()
    # Only one column is needed, so apply over that Series directly. A
    # row-wise DataFrame.apply(axis=1) builds a Series object per row and
    # returns a DataFrame (not a Series) when the frame has no rows.
    df['feature_ids'] = df['retrieved_features'].apply(parse_cell)
    return df

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=None):
    """Shuffle ``df`` and split it into a train and a test DataFrame.

    Args:
        df: DataFrame to split.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Seed used for both the shuffle and the split. When ``None`` a
            fresh integer seed is drawn so both steps still share one seed.

    Returns:
        ``(train_df, test_df)``.
    """
    if seed is None:
        # np.random.seed() returns None (it only re-seeds the global
        # generator), so it can never supply a seed value. Draw an actual
        # integer from the global generator instead.
        seed = int(np.random.randint(0, 2**31 - 1))
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    train_df, test_df = train_test_split(df_shuffled, test_size=test_size, random_state=seed)
    return train_df, test_df

# Layer tables under study, keyed by the name of the variable holding them.
datasets = dict(df0=df0, df6=df6, df10=df10, df12=df12, df12it=df12it)

# Validate each table, swap it for a copy carrying the parsed 'feature_ids'
# column, and gather the per-row ID lists of every table in one list.
all_feature_ids_list = []
for name in list(datasets):
    df = datasets[name]
    if 'retrieved_features' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'retrieved_features' column.")
    if 'label' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'label' column.")
    df = parse_retrieved_features(df)
    datasets[name] = df
    all_feature_ids_list.extend(df['feature_ids'])

# Concatenate the per-row lists into one flat list of feature IDs.
all_feature_ids = []
for row_ids in all_feature_ids_list:
    all_feature_ids.extend(row_ids)

# The binarizer is fitted on a single "sample" that contains every ID seen in
# any table, so all tables are later encoded against the same set of classes.
mlb = MultiLabelBinarizer()
mlb.fit([all_feature_ids])

# Function to run the experiment multiple times
def run_experiment(df, n_runs=1, test_size=0.2, seed=None):
    """Train and score a logistic-regression classifier on ``n_runs`` random splits.

    Each run splits ``df`` with ``shuffle_and_split``, one-hot encodes the
    'feature_ids' lists with the module-level ``mlb`` binarizer, fits a
    ``LogisticRegression`` on the training part and scores the held-out part.

    Args:
        df: DataFrame with 'feature_ids' and 'label' columns.
        n_runs: Number of train/test repetitions.
        test_size: Forwarded to ``shuffle_and_split``.
        seed: Base seed. Run ``k`` uses ``seed + k`` so repeated runs see
            different splits; ``None`` is forwarded unchanged to every run.

    Returns:
        A list with one ``{'macro_f1': ..., 'micro_f1': ...}`` dict per run.
    """
    f1_scores_list = []
    for run_idx in range(n_runs):
        # Reusing one fixed seed for every run would reproduce the identical
        # split n_runs times and give a spread of exactly zero.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size=test_size, seed=run_seed)
        X_train = mlb.transform(train_df['feature_ids'])
        y_train = train_df['label']
        X_test = mlb.transform(test_df['feature_ids'])
        y_test = test_df['label']

        clf = LogisticRegression(max_iter=1000, n_jobs=-1)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        macro_f1 = f1_score(y_test, predictions, average='macro')
        micro_f1 = f1_score(y_test, predictions, average='micro')
        f1_scores_list.append({'macro_f1': macro_f1, 'micro_f1': micro_f1})
    return f1_scores_list

# Run experiment for each dataset: three runs per layer, each on its own seed.
results_list = []
for layer_name, df in datasets.items():
    # Drop the literal 'df' prefix. str.lstrip('df') strips any leading run of
    # the characters 'd' and 'f', which is not the same as removing a prefix.
    layer_name = "layer" + (layer_name[2:] if layer_name.startswith('df') else layer_name)
    print(f"Running experiment for {layer_name}")
    for run_idx in range(3):
        # np.random.seed() returns None, so it cannot provide a seed to record.
        # Draw a real integer and pass it on, so the 'seed' stored with each
        # result is the one that produced it and the run can be repeated.
        seed = int(np.random.randint(0, 2**31 - 1))
        scores = run_experiment(df, n_runs=1, seed=seed)[0]
        results_list.append({
            'Layer': layer_name,
            'Run': run_idx + 1,
            'Macro F1 Score': scores['macro_f1'],
            'Micro F1 Score': scores['micro_f1'],
            'seed': seed
        })

# Create a DataFrame from results_list (one row per layer and run)
results_df = pd.DataFrame(results_list)

# Compute mean and std per layer.
# sort=False keeps the layers in the order they were run; the default
# alphabetical sort of the group keys would place 'layer10' and 'layer12'
# ahead of 'layer6' in the table and in the bar charts built from it.
summary_df = results_df.groupby('Layer', sort=False).agg({
    'Macro F1 Score': ['mean', 'std'],
    'Micro F1 Score': ['mean', 'std']
}).reset_index()

# Flatten MultiIndex columns
summary_df.columns = ['Layer', 'Macro F1 Mean', 'Macro F1 Std', 'Micro F1 Mean', 'Micro F1 Std']

# Print out the results
print("Detailed Results per Run:")
print(results_df)

print("\nSummary Results per Layer:")
print(summary_df)

# One bar chart per metric (macro first, then micro). Each entry lists the
# mean column, the std column used for the error bars, the y-axis label,
# the title and the bar colour.
_f1_bar_charts = (
    ('Macro F1 Mean', 'Macro F1 Std', 'Macro F1 Score',
     'Macro F1 Score by Layer with Error Bars', 'skyblue'),
    ('Micro F1 Mean', 'Micro F1 Std', 'Micro F1 Score',
     'Micro F1 Score by Layer with Error Bars', 'lightgreen'),
)
for mean_col, std_col, y_label, chart_title, bar_color in _f1_bar_charts:
    plt.figure(figsize=(10, 6))
    plt.bar(summary_df['Layer'], summary_df[mean_col], yerr=summary_df[std_col],
            capsize=5, color=bar_color)
    plt.xlabel('Layer')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

In [None]:
# Function to convert 'retrieved_features' column to string
def convert_retrieved_features_to_string(df):
    """Return a copy of ``df`` with a 'text' column.

    'text' holds the 'retrieved_features' column cast to ``str``. The frame
    passed in is left untouched.
    """
    return df.assign(text=df['retrieved_features'].astype(str))

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=None):
    """Shuffle ``df``, renumber its index and split it into (train, test).

    ``seed`` is used as ``random_state`` for both the shuffle and the split;
    ``test_size`` is forwarded to ``train_test_split``.
    """
    reshuffled = df.sample(frac=1, random_state=seed)
    reshuffled = reshuffled.reset_index(drop=True)
    train_part, test_part = train_test_split(
        reshuffled, test_size=test_size, random_state=seed
    )
    return train_part, test_part

# Function to perform TF-IDF classification and calculate F1 scores
def tfidf_classification(train_texts, train_labels, test_texts, test_labels):
    """Fit TF-IDF + logistic regression on the training data and score the test data.

    The vectorizer vocabulary is learned from ``train_texts`` only and then
    applied to ``test_texts``.

    Returns:
        ``(macro_f1, micro_f1)`` of the predictions on the test set.
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    model = LogisticRegression(max_iter=1000, n_jobs=-1)
    model.fit(train_matrix, train_labels)
    predicted = model.predict(test_matrix)

    return (
        f1_score(test_labels, predicted, average='macro'),
        f1_score(test_labels, predicted, average='micro'),
    )

# Function to run the experiment multiple times
def run_experiment(df, n_runs=1, test_size=0.2, seed=None):
    """Run the TF-IDF classification on ``n_runs`` random train/test splits.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        n_runs: Number of train/test repetitions.
        test_size: Forwarded to ``shuffle_and_split``.
        seed: Base seed. Run ``k`` uses ``seed + k`` so repeated runs see
            different splits; ``None`` is forwarded unchanged to every run.

    Returns:
        A list with one ``{'macro_f1': ..., 'micro_f1': ...}`` dict per run.
    """
    f1_scores_list = []
    for run_idx in range(n_runs):
        # With one fixed seed shared by all runs every repetition would use
        # the same split, so the per-layer std would always come out as 0.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size=test_size, seed=run_seed)
        macro_f1, micro_f1 = tfidf_classification(
            train_df['text'], train_df['label'], test_df['text'], test_df['label']
        )
        f1_scores_list.append({'macro_f1': macro_f1, 'micro_f1': micro_f1})
    return f1_scores_list

# Layer tables under study, keyed by the name of the variable holding them.
datasets = dict(df0=df0, df6=df6, df10=df10, df12=df12, df12it=df12it)

# Check the required columns, then replace each table with a copy that has
# the 'text' column used by the TF-IDF vectorizer.
for name in list(datasets):
    df = datasets[name]
    if 'retrieved_features' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'retrieved_features' column.")
    if 'label' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'label' column.")
    datasets[name] = convert_retrieved_features_to_string(df)

# Three runs per table; every run contributes one row to results_list.
results_list = []
for layer_name, df in datasets.items():
    layer_name = "layer "+layer_name
    print(f"Running experiment for {layer_name}")
    results_list.extend(
        {
            'Layer': layer_name,
            'Run': run_number,
            'Macro F1 Score': scores['macro_f1'],
            'Micro F1 Score': scores['micro_f1']
        }
        for run_number, scores in enumerate(run_experiment(df, n_runs=3, seed=42), start=1)
    )

# Create a DataFrame from results_list (one row per layer and run)
results_df = pd.DataFrame(results_list)

# Compute mean and std per layer.
# sort=False keeps the layers in the order they were run; the default
# alphabetical sort of the group keys would place 'layer df10' and
# 'layer df12' ahead of 'layer df6' in the table and in the bar charts.
summary_df = results_df.groupby('Layer', sort=False).agg({
    'Macro F1 Score': ['mean', 'std'],
    'Micro F1 Score': ['mean', 'std']
}).reset_index()

# Flatten MultiIndex columns
summary_df.columns = ['Layer', 'Macro F1 Mean', 'Macro F1 Std', 'Micro F1 Mean', 'Micro F1 Std']

# Print out the results
print("Detailed Results per Run:")
print(results_df)

print("\nSummary Results per Layer:")
print(summary_df)

# One bar chart per metric (macro first, then micro). Each entry lists the
# mean column, the std column used for the error bars, the y-axis label,
# the title and the bar colour.
_f1_bar_charts = (
    ('Macro F1 Mean', 'Macro F1 Std', 'Macro F1 Score',
     'Macro F1 Score by Layer with Error Bars', 'skyblue'),
    ('Micro F1 Mean', 'Micro F1 Std', 'Micro F1 Score',
     'Micro F1 Score by Layer with Error Bars', 'lightgreen'),
)
for mean_col, std_col, y_label, chart_title, bar_color in _f1_bar_charts:
    plt.figure(figsize=(10, 6))
    plt.bar(summary_df['Layer'], summary_df[mean_col], yerr=summary_df[std_col],
            capsize=5, color=bar_color)
    plt.xlabel('Layer')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()


In [None]:
# Function to convert 'retrieved_features' column to string
def convert_retrieved_features_to_string(df):
    """Return a copy of ``df`` with a 'text' column.

    'text' holds the 'retrieved_features' column cast to ``str``. The frame
    passed in is left untouched.
    """
    return df.assign(text=df['retrieved_features'].astype(str))

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=None):
    """Shuffle ``df``, renumber its index and split it into (train, test).

    ``seed`` is used as ``random_state`` for both the shuffle and the split;
    ``test_size`` is forwarded to ``train_test_split``.
    """
    reshuffled = df.sample(frac=1, random_state=seed)
    reshuffled = reshuffled.reset_index(drop=True)
    train_part, test_part = train_test_split(
        reshuffled, test_size=test_size, random_state=seed
    )
    return train_part, test_part

# Function to perform TF-IDF classification
def tfidf_classification(train_texts, train_labels, test_texts, test_labels):
    """Fit TF-IDF + logistic regression on the training data; return the test macro F1.

    The vectorizer vocabulary is learned from ``train_texts`` only and then
    applied to ``test_texts``.
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    model = LogisticRegression(max_iter=1000, n_jobs=-1)
    model.fit(train_matrix, train_labels)
    predicted = model.predict(test_matrix)

    return f1_score(test_labels, predicted, average='macro')

# Function to run the experiment multiple times
def run_experiment(df, n_runs=3, test_size=0.2, seed=None):
    """Run the TF-IDF classification on ``n_runs`` random train/test splits.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        n_runs: Number of train/test repetitions.
        test_size: Forwarded to ``shuffle_and_split``.
        seed: Base seed. Run ``k`` uses ``seed + k`` so repeated runs see
            different splits; ``None`` is forwarded unchanged to every run.

    Returns:
        A list with the macro F1 score of each run.
    """
    macro_f1_scores = []
    for run_idx in range(n_runs):
        # With one fixed seed shared by all runs every repetition would use
        # the same split and simply repeat the same score.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size=test_size, seed=run_seed)
        macro_f1 = tfidf_classification(
            train_df['text'], train_df['label'], test_df['text'], test_df['label']
        )
        macro_f1_scores.append(macro_f1)
    return macro_f1_scores

# Layer tables under study, keyed by the name of the variable holding them.
datasets = dict(df0=df0, df6=df6, df10=df10, df12=df12, df12it=df12it)

# Check the required columns, then replace each table with a copy that has
# the 'text' column used by the TF-IDF vectorizer.
for name in list(datasets):
    df = datasets[name]
    if 'retrieved_features' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'retrieved_features' column.")
    if 'label' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'label' column.")
    datasets[name] = convert_retrieved_features_to_string(df)

# Run experiment for each dataset.
# A t-test needs more than one observation per group: with a single score per
# layer ttest_ind has zero degrees of freedom and returns NaN. Each layer is
# therefore evaluated once per seed below, one split per call, so the scores
# differ between runs whichever way run_experiment treats its seed argument.
run_seeds = (42, 43, 44)
layer_macro_f1_scores = {}
for layer_name, df in datasets.items():
    print(f"Running experiment for {layer_name}")
    macro_f1_scores = []
    for run_seed in run_seeds:
        macro_f1_scores.extend(run_experiment(df, n_runs=1, seed=run_seed))
    layer_macro_f1_scores[layer_name] = macro_f1_scores

# Perform t-tests between each pair of layers
layer_pairs = list(combinations(datasets.keys(), 2))
t_test_results = {}
for (layer1, layer2) in layer_pairs:
    scores1 = layer_macro_f1_scores[layer1]
    scores2 = layer_macro_f1_scores[layer2]
    t_stat, p_value = ttest_ind(scores1, scores2)
    t_test_results[(layer1, layer2)] = {'t_stat': t_stat, 'p_value': p_value}

# Plot results.
# Keep the layers in the order they were evaluated (df0, df6, df10, ...).
# Sorting the names alphabetically would put 'df10' and 'df12' before 'df6'
# on the x-axis.
layers = list(layer_macro_f1_scores.keys())
means = [np.mean(layer_macro_f1_scores[layer]) for layer in layers]
stds = [np.std(layer_macro_f1_scores[layer]) for layer in layers]

plt.figure(figsize=(10, 6))
plt.bar(layers, means, yerr=stds, capsize=5, color='skyblue')
plt.xlabel('Layer')
plt.ylabel('Macro F1 Score')
plt.title('Macro F1 Score by Layer with Error Bars')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Text report: per-layer mean/std with the raw scores, then every pairwise t-test.
print("Layer Macro F1 Scores:")
for layer in layers:
    scores = layer_macro_f1_scores[layer]
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f"{layer}: Mean = {mean_score:.4f}, Std = {std_score:.4f}, Scores = {scores}")

print("\nT-Test Results:")
for pair, result in t_test_results.items():
    layer1, layer2 = pair
    t_stat, p_value = result['t_stat'], result['p_value']
    print(f"{layer1} vs {layer2}: t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")


In [None]:
def get_feature_counts(df, max_count=1000):
    """Count how often each feature key occurs and drop the very frequent ones.

    Every 'retrieved_features' cell is parsed as a dict literal and its keys
    are tallied over the whole frame. The number of distinct keys before and
    after filtering is printed.

    Args:
        df: DataFrame with a 'retrieved_features' column of dict literals
            stored as strings.
        max_count: Keys that occur more than this many times are removed.

    Returns:
        A dict mapping key -> count for the remaining keys, ordered from the
        most to the least frequent.
    """
    key_counts = Counter()
    for features in df['retrieved_features']:
        # literal_eval parses the dict literal without executing code, which
        # eval() would do for anything that ends up in the CSV column.
        key_counts.update(ast.literal_eval(features).keys())

    print('before:', len(key_counts))
    x = {key: count for key, count in key_counts.most_common() if count <= max_count}
    print('after:', len(x))
    print('=====')
    return x

# 265
# 1186
# 2213
# 1823
# 2016

# Key counts (very frequent keys already removed) for each layer table, in
# the same order as the frames on the right.
(feature_counts0,
 feature_counts6,
 feature_counts10,
 feature_counts12,
 feature_counts12it) = map(get_feature_counts, (df0, df6, df10, df12, df12it))

def clean_features(df, feature_counts):
    """Add a 'cleaned_features' column keeping only the keys in ``feature_counts``.

    Args:
        df: DataFrame with a 'retrieved_features' column of dict literals
            stored as strings. The new column is written onto this frame
            (the input is modified in place).
        feature_counts: Container of keys to keep; only membership is tested.

    Returns:
        The same ``df`` object, now carrying 'cleaned_features' (one dict per row).
    """
    # literal_eval parses the dict literal without executing code, which
    # eval() would do for anything that ends up in the CSV column.
    df['cleaned_features'] = [
        {k: v for k, v in ast.literal_eval(features).items() if k in feature_counts}
        for features in df['retrieved_features']
    ]
    return df

# Add 'cleaned_features' to each table, pairing every frame with its own key
# counts. clean_features writes into the frame it receives and returns it.
df0, df6, df10, df12, df12it = (
    clean_features(frame, counts)
    for frame, counts in (
        (df0, feature_counts0),
        (df6, feature_counts6),
        (df10, feature_counts10),
        (df12, feature_counts12),
        (df12it, feature_counts12it),
    )
)

In [None]:
# Build the 'text' column from the filtered features
def convert_retrieved_features_to_string(df):
    """Return a copy of ``df`` with a 'text' column.

    'text' holds the 'cleaned_features' column cast to ``str`` (note: the
    cleaned column, not 'retrieved_features'). The frame passed in is left
    untouched.
    """
    return df.assign(text=df['cleaned_features'].astype(str))
# Layer tables under study, keyed by the name of the variable holding them.
datasets = dict(df0=df0, df6=df6, df10=df10, df12=df12, df12it=df12it)

# Check the required columns, then replace each table with a copy that has
# the 'text' column used by the TF-IDF vectorizer.
for name in list(datasets):
    df = datasets[name]
    if 'retrieved_features' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'retrieved_features' column.")
    if 'label' not in df.columns:
        raise ValueError(f"Dataset '{name}' does not have 'label' column.")
    datasets[name] = convert_retrieved_features_to_string(df)

# Run experiment for each dataset.
# A t-test needs more than one observation per group: with a single score per
# layer ttest_ind has zero degrees of freedom and returns NaN. Each layer is
# therefore evaluated once per seed below, one split per call, so the scores
# differ between runs whichever way run_experiment treats its seed argument.
run_seeds = (42, 43, 44)
layer_macro_f1_scores = {}
for layer_name, df in datasets.items():
    print(f"Running experiment for {layer_name}")
    macro_f1_scores = []
    for run_seed in run_seeds:
        macro_f1_scores.extend(run_experiment(df, n_runs=1, seed=run_seed))
    layer_macro_f1_scores[layer_name] = macro_f1_scores

# Perform t-tests between each pair of layers
layer_pairs = list(combinations(datasets.keys(), 2))
t_test_results = {}
for (layer1, layer2) in layer_pairs:
    scores1 = layer_macro_f1_scores[layer1]
    scores2 = layer_macro_f1_scores[layer2]
    t_stat, p_value = ttest_ind(scores1, scores2)
    t_test_results[(layer1, layer2)] = {'t_stat': t_stat, 'p_value': p_value}

# Plot results.
# Keep the layers in the order they were evaluated (df0, df6, df10, ...).
# Sorting the names alphabetically would put 'df10' and 'df12' before 'df6'
# on the x-axis.
layers = list(layer_macro_f1_scores.keys())
means = [np.mean(layer_macro_f1_scores[layer]) for layer in layers]
stds = [np.std(layer_macro_f1_scores[layer]) for layer in layers]

plt.figure(figsize=(10, 6))
plt.bar(layers, means, yerr=stds, capsize=5, color='skyblue')
plt.xlabel('Layer')
plt.ylabel('Macro F1 Score')
plt.title('Macro F1 Score by Layer with Error Bars')
plt.grid(axis='y')
plt.tight_layout()
plt.show()

# Text report: per-layer mean/std together with the raw scores.
print("Layer Macro F1 Scores:")
for layer in layers:
    scores = layer_macro_f1_scores[layer]
    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f"{layer}: Mean = {mean_score:.4f}, Std = {std_score:.4f}, Scores = {scores}")

## analysis


In [None]:
def get_feature_counts(df, max_count=1000):
    """Count how often each feature key occurs and drop the very frequent ones.

    Every 'retrieved_features' cell is parsed as a dict literal and its keys
    are tallied over the whole frame. The number of distinct keys before and
    after filtering is printed.

    Args:
        df: DataFrame with a 'retrieved_features' column of dict literals
            stored as strings.
        max_count: Keys that occur more than this many times are removed.

    Returns:
        A dict mapping key -> count for the remaining keys, ordered from the
        most to the least frequent.
    """
    key_counts = Counter()
    for features in df['retrieved_features']:
        # literal_eval parses the dict literal without executing code, which
        # eval() would do for anything that ends up in the CSV column.
        key_counts.update(ast.literal_eval(features).keys())

    print('before:', len(key_counts))
    x = {key: count for key, count in key_counts.most_common() if count <= max_count}
    print('after:', len(x))
    print('=====')
    return x

# 265
# 1186
# 2213
# 1823
# 2016

# Key counts (very frequent keys already removed) for df0, df6, df10 and df12;
# the df12it call below is commented out in this cell.
(feature_counts0,
 feature_counts6,
 feature_counts10,
 feature_counts12) = map(get_feature_counts, (df0, df6, df10, df12))
# feature_counts12it = get_feature_counts(df12it)

def clean_features(df, feature_counts):
    """Add a 'cleaned_features' column keeping only the keys in ``feature_counts``.

    Args:
        df: DataFrame with a 'retrieved_features' column of dict literals
            stored as strings. The new column is written onto this frame
            (the input is modified in place).
        feature_counts: Container of keys to keep; only membership is tested.

    Returns:
        The same ``df`` object, now carrying 'cleaned_features' (one dict per row).
    """
    # literal_eval parses the dict literal without executing code, which
    # eval() would do for anything that ends up in the CSV column.
    df['cleaned_features'] = [
        {k: v for k, v in ast.literal_eval(features).items() if k in feature_counts}
        for features in df['retrieved_features']
    ]
    return df

# Add 'cleaned_features' to df0, df6, df10 and df12, pairing every frame with
# its own key counts; the df12it call below is commented out in this cell.
df0, df6, df10, df12 = (
    clean_features(frame, counts)
    for frame, counts in (
        (df0, feature_counts0),
        (df6, feature_counts6),
        (df10, feature_counts10),
        (df12, feature_counts12),
    )
)
# df12it = clean_features(df12it, feature_counts12it)

In [None]:
# Show the layer-12 DataFrame as this cell's output for a quick visual check.
df12

In [None]:
# Fraction of rows in each table whose 'cleaned_features' dict is not empty,
# i.e. rows that keep at least one feature after the frequency filter.
for tag, frame in (('layer0:', df0), ('layer6:', df6), ('layer10:', df10), ('layer12:', df12)):
    non_empty_rows = sum(1 for kept in frame['cleaned_features'] if kept != {})
    print(tag, non_empty_rows / len(frame))
# print('layer12it:',sum([i!={} for i in df12it['cleaned_features']])/10000)

In [None]:
# Show the layer-12 DataFrame again as this cell's output.
df12

In [None]:
# For rows sharing the same value of a grouping column (fine 'label' by
# default, or e.g. 'coarse_label'), find the five most frequent feature keys.

def get_top5_features(df, coi='label'):
    """Report the five most frequent 'cleaned_features' keys per group.

    Rows are grouped by the value of column ``coi``. For every group the keys
    of its 'cleaned_features' dicts are counted; the five most common keys are
    printed together with the average number of rows per group, followed by
    the values stored under those keys.

    Args:
        df: DataFrame with a 'cleaned_features' column (one dict per row) and
            the grouping column ``coi``.
        coi: Name of the column to group by.

    Returns:
        A list with one ``{group_value: [(key, count), ...]}`` dict per group,
        each holding at most five ``(key, count)`` pairs.
    """
    # Merge every row's features into one key -> value lookup (later rows
    # overwrite earlier ones). It depends only on df, so it is built once
    # here rather than once per group inside the loop.
    all_features = {}
    for features in df['cleaned_features']:
        all_features.update(features)

    labels = set(df[coi])
    top5_features = []
    for label in labels:
        key_counts = Counter()
        for features in df.loc[df[coi] == label, 'cleaned_features']:
            key_counts.update(features.keys())
        top5 = key_counts.most_common()[:5]
        top5_features.append({label: top5})
        print(label, top5, 'This is out of:', len(df[coi])/len(labels))
        print([all_features[key] for key, _ in top5])
    return top5_features

# Top-5 feature report for df0, grouped by the default column 'label'.
top5_features0 = get_top5_features(df0)

In [None]:
# Top-5 feature report for df12, grouped by the default column 'label'.
top5_features12 = get_top5_features(df12)

In [None]:
# Top-5 feature report for df12it, grouped by the default column 'label'.
top5_features12it = get_top5_features(df12it)

In [None]:
# Top-5 feature report for df12 grouped by 'coarse_label'.
# Note: this rebinds top5_features12, replacing the per-'label' result.
top5_features12 = get_top5_features(df12, coi='coarse_label')

In [None]:
# Top-5 feature report for df12it grouped by 'coarse_label'.
# Note: this rebinds top5_features12it, replacing the per-'label' result.
top5_features12it = get_top5_features(df12it, coi='coarse_label')

In [10]:
# TODO
# get these vectors, get their pre vit embedding, post vit embedding, and the difference, and top five activation patterns
# then plot them


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from collections import defaultdict

# Function to convert retrieved_features column to string
def convert_retrieved_features_to_string(df):
    """Add a 'text' column holding ``str()`` of every 'retrieved_features' value.

    The column is written onto ``df`` itself (no copy is made) and the same
    object is returned.
    """
    df['text'] = df['retrieved_features'].map(str)
    return df

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=42):
    """Shuffle ``df``, renumber its index and split it into (train, test).

    ``seed`` is used as ``random_state`` for both the shuffle and the split;
    ``test_size`` is forwarded to ``train_test_split``.
    """
    reshuffled = df.sample(frac=1, random_state=seed)
    reshuffled = reshuffled.reset_index(drop=True)
    train_part, test_part = train_test_split(
        reshuffled, test_size=test_size, random_state=seed
    )
    return train_part, test_part

# Function to perform TF-IDF classification
def tfidf_classification(train_texts, train_labels, test_texts, test_labels):
    """Fit TF-IDF + logistic regression on the training data; return the test macro F1.

    The vectorizer vocabulary is learned from ``train_texts`` only and then
    applied to ``test_texts``.
    """
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Use the same classifier settings as the other TF-IDF experiments in
    # this notebook (raised iteration limit, all cores) so that the scores
    # are comparable and the solver is not cut off by the library default.
    clf = LogisticRegression(max_iter=1000, n_jobs=-1)
    clf.fit(X_train, train_labels)
    predictions = clf.predict(X_test)

    macro_f1 = f1_score(test_labels, predictions, average='macro')
    return macro_f1

# Function to run the experiment multiple times
def run_experiment(df, n_runs=5, test_size=0.2, seed=42):
    """Run the TF-IDF classification on ``n_runs`` random train/test splits.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        n_runs: Number of train/test repetitions.
        test_size: Forwarded to ``shuffle_and_split``.
        seed: Base seed. Run ``k`` uses ``seed + k`` so repeated runs see
            different splits; ``None`` is forwarded unchanged to every run.

    Returns:
        A list with the macro F1 score of each run.
    """
    macro_f1_scores = []
    for run_idx in range(n_runs):
        # Passing the same seed to every run reproduced one split n_runs
        # times: all scores were equal, the std was 0 and a t-test on such
        # constant samples is degenerate.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size, run_seed)
        macro_f1 = tfidf_classification(
            train_df['text'], train_df['label'], test_df['text'], test_df['label']
        )
        macro_f1_scores.append(macro_f1)
    return macro_f1_scores

# Layer tables under study, keyed by the name of the variable holding them
# (df12it is listed first here).
datasets = dict(df12it=df12it, df0=df0, df6=df6, df10=df10, df12=df12)

# Add the 'text' column to every table. The helper writes into the frame it
# is given, so the module-level frames gain the column as well.
for name in list(datasets):
    datasets[name] = convert_retrieved_features_to_string(datasets[name])

# Five runs per table, macro F1 of each run collected per layer.
layer_macro_f1_scores = {}
for layer_name, df in datasets.items():
    print(f"Running experiment for {layer_name}")
    layer_macro_f1_scores[layer_name] = run_experiment(df, n_runs=5)

# Each unordered pair of tables once: the pair is kept only in the
# orientation where the first name sorts before the second as a string.
layer_pairs = []
for first_name in datasets:
    for second_name in datasets:
        if first_name < second_name:
            layer_pairs.append((first_name, second_name))

# Independent-samples t-test on the macro F1 scores of each pair.
t_test_results = {}
for layer1, layer2 in layer_pairs:
    t_stat, p_value = ttest_ind(layer_macro_f1_scores[layer1], layer_macro_f1_scores[layer2])
    t_test_results[(layer1, layer2)] = (t_stat, p_value)

# Per-layer mean and std of the macro F1 scores, in evaluation order.
layers, means, stds = [], [], []
for layer, layer_scores in layer_macro_f1_scores.items():
    layers.append(layer)
    means.append(np.mean(layer_scores))
    stds.append(np.std(layer_scores))

# Point plot of the means with red error bars of one std.
plt.figure(figsize=(10, 6))
plt.errorbar(layers, means, yerr=stds, fmt='o', capsize=5, capthick=2, ecolor='red')
plt.xlabel('Layer')
plt.ylabel('Macro F1 Score')
plt.title('Macro F1 Score by Layer with Error Bars')
plt.grid(True)
plt.show()

# Raw numbers behind the figure
print("Layer Macro F1 Scores:", layer_macro_f1_scores)
print("T-Test Results:", t_test_results)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
from collections import defaultdict

# Function to convert retrieved_features column to string
def convert_retrieved_features_to_string(df):
    """Add a 'text' column holding ``str()`` of every 'retrieved_features' value.

    The column is written onto ``df`` itself (no copy is made) and the same
    object is returned.
    """
    df['text'] = df['retrieved_features'].map(str)
    return df

# Function to shuffle and split data
def shuffle_and_split(df, test_size=0.2, seed=42):
    """Shuffle ``df``, renumber its index and split it into (train, test).

    ``seed`` is used as ``random_state`` for both the shuffle and the split;
    ``test_size`` is forwarded to ``train_test_split``.
    """
    reshuffled = df.sample(frac=1, random_state=seed)
    reshuffled = reshuffled.reset_index(drop=True)
    train_part, test_part = train_test_split(
        reshuffled, test_size=test_size, random_state=seed
    )
    return train_part, test_part

# Function to perform TF-IDF classification
def tfidf_classification(train_texts, train_labels, test_texts, test_labels):
    """Fit TF-IDF + logistic regression on the training data; return the test macro F1.

    The vectorizer vocabulary is learned from ``train_texts`` only and then
    applied to ``test_texts``.
    """
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Use the same classifier settings as the other TF-IDF experiments in
    # this notebook (raised iteration limit, all cores) so that the scores
    # are comparable and the solver is not cut off by the library default.
    clf = LogisticRegression(max_iter=1000, n_jobs=-1)
    clf.fit(X_train, train_labels)
    predictions = clf.predict(X_test)

    macro_f1 = f1_score(test_labels, predictions, average='macro')
    return macro_f1

# Function to run the experiment multiple times
def run_experiment(df, n_runs=5, test_size=0.2, seed=42):
    """Run the TF-IDF classification on ``n_runs`` random train/test splits.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        n_runs: Number of train/test repetitions.
        test_size: Forwarded to ``shuffle_and_split``.
        seed: Base seed. Run ``k`` uses ``seed + k`` so repeated runs see
            different splits; ``None`` is forwarded unchanged to every run.

    Returns:
        A list with the macro F1 score of each run.
    """
    macro_f1_scores = []
    for run_idx in range(n_runs):
        # Passing the same seed to every run reproduced one split n_runs
        # times: all scores were equal, the std was 0 and a t-test on such
        # constant samples is degenerate.
        run_seed = None if seed is None else seed + run_idx
        train_df, test_df = shuffle_and_split(df, test_size, run_seed)
        macro_f1 = tfidf_classification(
            train_df['text'], train_df['label'], test_df['text'], test_df['label']
        )
        macro_f1_scores.append(macro_f1)
    return macro_f1_scores

# Layer tables under study, keyed by the name of the variable holding them
# (df12it is listed first here).
datasets = dict(df12it=df12it, df0=df0, df6=df6, df10=df10, df12=df12)

# Add the 'text' column to every table. The helper writes into the frame it
# is given, so the module-level frames gain the column as well.
for name in list(datasets):
    datasets[name] = convert_retrieved_features_to_string(datasets[name])

# Five runs per table, macro F1 of each run collected per layer.
layer_macro_f1_scores = {}
for layer_name, df in datasets.items():
    print(f"Running experiment for {layer_name}")
    layer_macro_f1_scores[layer_name] = run_experiment(df, n_runs=5)

# Each unordered pair of tables once: the pair is kept only in the
# orientation where the first name sorts before the second as a string.
layer_pairs = []
for first_name in datasets:
    for second_name in datasets:
        if first_name < second_name:
            layer_pairs.append((first_name, second_name))

# Independent-samples t-test on the macro F1 scores of each pair.
t_test_results = {}
for layer1, layer2 in layer_pairs:
    t_stat, p_value = ttest_ind(layer_macro_f1_scores[layer1], layer_macro_f1_scores[layer2])
    t_test_results[(layer1, layer2)] = (t_stat, p_value)

# Per-layer mean and std of the macro F1 scores, in evaluation order.
layers, means, stds = [], [], []
for layer, layer_scores in layer_macro_f1_scores.items():
    layers.append(layer)
    means.append(np.mean(layer_scores))
    stds.append(np.std(layer_scores))

# Point plot of the means with red error bars of one std.
plt.figure(figsize=(10, 6))
plt.errorbar(layers, means, yerr=stds, fmt='o', capsize=5, capthick=2, ecolor='red')
plt.xlabel('Layer')
plt.ylabel('Macro F1 Score')
plt.title('Macro F1 Score by Layer with Error Bars')
plt.grid(True)
plt.show()

# Raw numbers behind the figure
print("Layer Macro F1 Scores:", layer_macro_f1_scores)
print("T-Test Results:", t_test_results)