In [6]:
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm, trange

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random number generators
# with one shared value, and ask cuDNN for deterministic algorithms.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True


def read_jsonl_file(jsonl_path):
    """Lazily yield one parsed JSON value per non-blank line of a JSONL file.

    Parameters:
    - jsonl_path: Path of the JSON Lines file to read.

    Yields:
    The value decoded by ``json.loads`` for each non-blank line, in file order.

    Raises:
    json.JSONDecodeError if a non-blank line is not valid JSON.

    The file is opened when iteration starts and stays open until the
    generator is exhausted or closed.
    """
    # Explicit UTF-8 so decoding does not depend on the platform default encoding.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines hold no record, and json.loads
            # would raise on them, so skip them.
            if not line.strip():
                continue
            yield json.loads(line)

In [9]:
# One directory name per layer: "layer0" through "layer11".
sae_paths = [f"layer{layer_idx}" for layer_idx in range(12)]

# File name of each classifier's JSONL results; these are joined onto a
# layer directory when the results are read.
decision_tree_path = "decision_tree.jsonl"
knn_path = "knn.jsonl"
logistic_regression_path = "logistic_regression.jsonl"
svm_path = "svm.jsonl"

In [19]:
def process_info(decision_tree):
    """Sum confusion-matrix counts per ``n_features`` and derive metrics.

    Parameters:
    - decision_tree: Records accepted by ``pd.DataFrame`` that provide the
      columns "n_features", "tp", "fp", "tn" and "fn".

    Returns:
    A DataFrame indexed by "n_features" holding the group-wise sum of every
    input column plus the derived columns "tpr", "fpr", "accuracy",
    "precision", "recall" and "f1" (added in that order).
    """
    summary = pd.DataFrame(decision_tree).groupby("n_features").sum()

    # Summed counts for each feature count.
    tp, fp = summary["tp"], summary["fp"]
    tn, fn = summary["tn"], summary["fn"]

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    summary["tpr"] = tp / (tp + fn)
    summary["fpr"] = fp / (fp + tn)
    summary["accuracy"] = (tp + tn) / (tp + fp + tn + fn)
    summary["precision"] = precision
    summary["recall"] = recall
    summary["f1"] = 2 * (precision * recall) / (precision + recall)

    return summary



# Read the results of all four classifiers for every layer directory.
# Each list receives one processed DataFrame per entry of `sae_paths`, in the
# same order as `sae_paths`.
decision_tree_results = []
knn_results = []
logistic_regression_results = []
svm_results = []

# Pair each results file name with the list that collects its per-layer frames,
# so the read/process/append step is written once instead of four times.
results_by_file = [
    (decision_tree_path, decision_tree_results),
    (knn_path, knn_results),
    (logistic_regression_path, logistic_regression_results),
    (svm_path, svm_results),
]

for sae_path in sae_paths:
    for results_file, collected_results in results_by_file:
        result = list(read_jsonl_file(os.path.join(sae_path, results_file)))
        processed_result = process_info(result)
        collected_results.append(processed_result)

    
    








In [39]:
import numpy as np
import plotly.graph_objects as go

   

In [41]:
def plot_decision_tree_f1(decision_tree_results, selected_layers=(0, 10), fillcolor='rgba(0,100,255,0.1)', title="Decision Tree F1 Score"):
    """
    Plot per-layer F1 curves, highlighting selected layers against the rest.

    Each layer listed in ``selected_layers`` is drawn as its own line. The
    remaining layers are summarised by their mean F1 curve and a shaded band
    reaching one standard deviation above and below that mean (a spread band,
    not a statistical confidence interval). If no layers remain, only the
    selected layers are drawn.

    Parameters:
    - decision_tree_results: Non-empty list of DataFrames with an "f1" column;
      the position in the list is used as the layer number. The x values come
      from each frame's index, and the aggregate traces assume every frame
      shares the first frame's index.
    - selected_layers: Layer positions to plot individually.
    - fillcolor: Fill color of the mean ± std band (default is light blue).
    - title: Figure title.

    Returns:
    None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    line_styles = ['solid', 'dash', 'dot', 'dashdot']  # Different line styles

    # x values for the aggregate traces (assuming all results have the same index)
    all_indices = decision_tree_results[0].index

    # Plot the selected individual layers
    for i, result in enumerate(decision_tree_results):
        if i not in selected_layers:
            continue
        fig.add_trace(go.Scatter(
            x=result.index,
            y=result["f1"],
            mode='lines+markers',
            name=f"Layer {i}",
            line=dict(dash=line_styles[i % len(line_styles)])  # Cycle through styles
        ))

    # F1 curves of every layer that is not drawn individually
    filtered_results = [result["f1"] for i, result in enumerate(decision_tree_results) if i not in selected_layers]

    # When every layer is selected there is nothing left to average: np.mean
    # of an empty list is a scalar NaN, which cannot be turned into the band's
    # point list. Skip the aggregate traces in that case.
    if filtered_results:
        mean_f1 = np.mean(filtered_results, axis=0)
        std_f1 = np.std(filtered_results, axis=0)

        # Shaded mean ± std region: trace the upper bound left to right, then
        # the lower bound right to left, so 'toself' fills a closed polygon.
        fig.add_trace(go.Scatter(
            x=list(all_indices) + list(all_indices[::-1]),  # x values for fill
            y=list(mean_f1 + std_f1) + list((mean_f1 - std_f1)[::-1]),  # Upper & lower bound
            fill='toself',  # Fill the area
            fillcolor=fillcolor,  # Transparency
            line=dict(color='rgba(255,255,255,0)'),  # Hide border line
            name="Mean ± Std",
        ))

        # Add the mean line
        fig.add_trace(go.Scatter(
            x=all_indices,
            y=mean_f1,
            mode='lines',
            name="Mean of Other Layers",
            line=dict(color='green', dash='solid', width=2)
        ))

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Number of Features",
        yaxis_title="F1 Score",
        legend_title="Layers",
        width=1200,
        height=800,
        template="plotly_white"
    )

    fig.show()


# Decision tree: layers 0 and 10 drawn individually, the others as mean ± std.
plot_decision_tree_f1(
    decision_tree_results,
    selected_layers=[0, 10],
    title="Decision Tree F1 Score",
)


In [50]:

# KNN: layers 0 and 10 drawn individually, the others as mean ± std.
plot_decision_tree_f1(
    knn_results,
    selected_layers=[0, 10],
    title="KNN F1 Score",
)


In [51]:

# Logistic regression: layers 0 and 10 drawn individually, the others as mean ± std.
plot_decision_tree_f1(
    logistic_regression_results,
    selected_layers=[0, 10],
    title="Logistic Regression F1 Score",
)


In [52]:

# SVM: layers 0 and 10 drawn individually, the others as mean ± std.
plot_decision_tree_f1(
    svm_results,
    selected_layers=[0, 10],
    title="SVM F1 Score",
)


In [65]:
# Get a table with all the results for F1 score, accuracy, precision, recall

# \end{figure*}
# \begin{table}[h]
#     \centering
#     \begin{tabular}{|c|c|c|c|c|c|c|}
#         \hline
#         \multirow{2}{*}{Layer}  & \multicolumn{6}{c|}{Top k Features F1 Score} \\
#         \cline{2-7}
#          & k=1 & k=5 & k=10 & k=25 & k=50 & k=100 \\
#         \hline
#         0 & 0.1000 & 0.1000 & 0.1000 & 0.1000 & 0.1000 & 0.1000  \\
#         \hline

#     \end{tabular}
#     \caption{Top k Features F1 Scores across different layers using Decision Tree Algorithm}
#     \label{tab:top_f1}
# \end{table}


# Feature counts (k) that become the table columns, and the layer numbers
# (0 through 11) that become the table rows.
top_k = [1, 5, 10, 25, 50, 99]
layers = list(range(12))

def get_table(decision_tree_results, top_k, layers, metric="f1", column_name="F1 Score"):
    """Build a layer-by-k table of one metric.

    Parameters:
    - decision_tree_results: Sequence of DataFrames looked up by layer number;
      each is read with ``.loc[k, metric]``.
    - top_k: Index labels (feature counts) to read, one table column each.
    - layers: Layer numbers to include, one table row each, in this order.
    - metric: Name of the column to read from each DataFrame.
    - column_name: Text used in the column headers ("Top <k> <column_name>").

    Returns:
    A DataFrame with a "Layer" column followed by one column per entry of
    ``top_k``.
    """
    headers = ["Layer"] + [f"Top {k} {column_name}" for k in top_k]
    rows = [
        [layer] + [decision_tree_results[layer].loc[k, metric] for k in top_k]
        for layer in layers
    ]
    return pd.DataFrame(rows, columns=headers)

# Keep the F1 table under its own name so it is not overwritten;
# `decision_tree_table` holds the accuracy table that this cell displays.
decision_tree_f1_table = get_table(decision_tree_results, top_k, layers, metric="f1", column_name="F1 Score")
decision_tree_table = get_table(decision_tree_results, top_k, layers, metric="accuracy", column_name="Accuracy")
decision_tree_table

Unnamed: 0,Layer,Top 1 Accuracy,Top 5 Accuracy,Top 10 Accuracy,Top 25 Accuracy,Top 50 Accuracy,Top 99 Accuracy
0,0,0.566667,0.616667,0.633333,0.666667,0.65,0.666667
1,1,0.7,0.783333,0.716667,0.75,0.783333,0.75
2,2,0.683333,0.683333,0.75,0.7,0.766667,0.783333
3,3,0.716667,0.75,0.766667,0.766667,0.8,0.733333
4,4,0.666667,0.766667,0.7,0.766667,0.733333,0.766667
5,5,0.75,0.75,0.733333,0.733333,0.683333,0.733333
6,6,0.733333,0.75,0.783333,0.733333,0.733333,0.733333
7,7,0.683333,0.7,0.75,0.7,0.766667,0.733333
8,8,0.633333,0.733333,0.716667,0.766667,0.733333,0.7
9,9,0.633333,0.733333,0.733333,0.733333,0.733333,0.766667


In [67]:
# Keep the F1 table under its own name so it is not overwritten;
# `knn_table` holds the accuracy table that this cell displays.
knn_f1_table = get_table(knn_results, top_k, layers, metric="f1", column_name="F1 Score")
knn_table = get_table(knn_results, top_k, layers, metric="accuracy", column_name="Accuracy")
knn_table

Unnamed: 0,Layer,Top 1 Accuracy,Top 5 Accuracy,Top 10 Accuracy,Top 25 Accuracy,Top 50 Accuracy,Top 99 Accuracy
0,0,0.583333,0.6,0.583333,0.616667,0.616667,0.616667
1,1,0.666667,0.666667,0.666667,0.733333,0.716667,0.716667
2,2,0.616667,0.65,0.616667,0.666667,0.666667,0.666667
3,3,0.65,0.65,0.633333,0.65,0.666667,0.75
4,4,0.616667,0.6,0.583333,0.583333,0.6,0.633333
5,5,0.633333,0.6,0.616667,0.633333,0.633333,0.616667
6,6,0.683333,0.583333,0.616667,0.616667,0.633333,0.633333
7,7,0.6,0.666667,0.666667,0.65,0.65,0.633333
8,8,0.633333,0.616667,0.666667,0.666667,0.716667,0.7
9,9,0.6,0.6,0.65,0.666667,0.683333,0.7


In [68]:
# Only the F1 table is built and displayed in this cell; the accuracy variant
# below is left disabled.
logistic_regression_table = get_table(logistic_regression_results, top_k, layers, metric="f1", column_name="F1 Score")
# logistic_regression_table = get_table(logistic_regression_results, top_k, layers, metric="accuracy", column_name="Accuracy")
logistic_regression_table

Unnamed: 0,Layer,Top 1 F1 Score,Top 5 F1 Score,Top 10 F1 Score,Top 25 F1 Score,Top 50 F1 Score,Top 99 F1 Score
0,0,0.683544,0.645161,0.645161,0.644068,0.631579,0.631579
1,1,0.657534,0.6,0.566038,0.526316,0.618182,0.678571
2,2,0.676056,0.634921,0.622951,0.588235,0.688525,0.733333
3,3,0.676056,0.644068,0.610169,0.709677,0.711864,0.677966
4,4,0.644068,0.644068,0.631579,0.566667,0.730159,0.701754
5,5,0.677419,0.707692,0.580645,0.610169,0.655172,0.701754
6,6,0.688525,0.6875,0.642857,0.644068,0.666667,0.654545
7,7,0.698413,0.688525,0.655738,0.561404,0.666667,0.655738
8,8,0.465116,0.656716,0.709677,0.618182,0.7,0.7
9,9,0.676056,0.676056,0.633333,0.654545,0.688525,0.666667
