In [1]:
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm, trange

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Seed each random number generator imported above with one shared value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True


def read_jsonl_file(jsonl_path):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Parameters:
    - jsonl_path: Path of the JSON Lines file to read.

    Yields:
    - The value decoded by ``json.loads`` for each non-blank line, in file order.

    Raises:
    - OSError: if the file cannot be opened (raised when iteration starts,
      because this is a generator).
    - json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification, so do not depend on the locale default.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) hold no
            # record and would make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)

In [2]:
# One results directory per layer: "layer0" through "layer11".
sae_paths = [f"layer{layer_idx}" for layer_idx in range(12)]

# File name of each classifier's results inside a layer directory.
decision_tree_path = "decision_tree.jsonl"
knn_path = "knn.jsonl"
logistic_regression_path = "logistic_regression.jsonl"
svm_path = "svm.jsonl"

In [3]:
def process_info(decision_tree):
    """Sum confusion-matrix counts per feature count and derive metrics from them.

    Parameters:
    - decision_tree: Records accepted by ``pd.DataFrame`` (e.g. a list of dicts)
      providing an "n_features" key and "tp", "fp", "tn", "fn" counts. Despite
      the parameter name, the notebook passes results of every classifier.

    Returns:
    - DataFrame indexed by "n_features" holding the summed input columns plus
      "tpr", "fpr", "accuracy", "precision", "recall" and "f1" (in that order).
      A ratio whose denominator is zero is NaN.
    """
    totals = pd.DataFrame(decision_tree).groupby("n_features").sum()
    tp, fp, tn, fn = (totals[name] for name in ("tp", "fp", "tn", "fn"))

    # True-positive rate and recall are the same quantity; compute it once.
    recall = tp / (tp + fn)

    totals["tpr"] = recall
    totals["fpr"] = fp / (fp + tn)
    totals["accuracy"] = (tp + tn) / (tp + fp + tn + fn)
    totals["precision"] = tp / (tp + fp)
    totals["recall"] = recall
    # F1 straight from the counts: algebraically equal to
    # 2 * precision * recall / (precision + recall), but it evaluates to 0
    # instead of NaN (0/0) when there are no true positives.
    totals["f1"] = 2 * tp / (2 * tp + fp + fn)

    return totals



# Read every classifier's results for every layer; each list follows the order of sae_paths.
decision_tree_results = []
knn_results = []
logistic_regression_results = []
svm_results = []

# Pair each results file with the list that collects its processed frames.
results_by_file = (
    (decision_tree_path, decision_tree_results),
    (knn_path, knn_results),
    (logistic_regression_path, logistic_regression_results),
    (svm_path, svm_results),
)

for sae_path in sae_paths:
    for results_file, collected in results_by_file:
        result = list(read_jsonl_file(os.path.join(sae_path, results_file)))
        processed_result = process_info(result)
        collected.append(processed_result)

    
    








In [6]:
import numpy as np
import plotly.graph_objects as go

   

In [7]:
def plot_decision_tree_f1(decision_tree_results, selected_layers=(0, 10), fillcolor='rgba(0,100,255,0.1)', title="Decision Tree F1 Score"):
    """
    Plots F1 curves of selected layers plus the mean ± std band of all other layers.

    Despite the name, nothing here is specific to decision trees: any list of
    per-layer result frames with an "f1" column can be passed.

    Parameters:
    - decision_tree_results: List of DataFrames containing an "f1" column; the
      position in the list is used as the layer number. All frames are assumed
      to share the index of the first one (used as the x axis of the band).
    - selected_layers: Layer numbers to plot individually; every other layer
      only contributes to the mean and standard deviation.
    - fillcolor: Color of the mean ± std band (default is light blue)
    - title: Figure title

    Returns:
    - None. The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()
    line_styles = ['solid', 'dash', 'dot', 'dashdot']  # Different line styles
    selected = set(selected_layers)

    # Get indices (assuming all results have the same index)
    all_indices = decision_tree_results[0].index

    # Plot the selected individual layers
    for i, result in enumerate(decision_tree_results):
        if i not in selected:
            continue
        fig.add_trace(go.Scatter(
            x=result.index,
            y=result["f1"],
            mode='lines+markers',
            name=f"Layer {i}",
            line=dict(dash=line_styles[i % len(line_styles)])  # Cycle through styles
        ))

    # F1 curves of the layers that are not plotted individually
    other_f1 = [result["f1"] for i, result in enumerate(decision_tree_results) if i not in selected]

    # With every layer selected there is nothing to aggregate; np.mean of an
    # empty list is a scalar NaN and building the band from it would raise.
    if other_f1:
        mean_f1 = np.mean(other_f1, axis=0)
        std_f1 = np.std(other_f1, axis=0)  # population std (numpy default ddof=0)

        # Add the mean ± std band as a shaded region: upper bound left to
        # right, then lower bound right to left, closing the polygon.
        fig.add_trace(go.Scatter(
            x=list(all_indices) + list(all_indices[::-1]),  # x values for fill
            y=list(mean_f1 + std_f1) + list((mean_f1 - std_f1)[::-1]),  # Upper & lower bound
            fill='toself',  # Fill the area
            fillcolor=fillcolor,  # Transparency
            line=dict(color='rgba(255,255,255,0)'),  # Hide border line
            name="Mean ± Std",
        ))

        # Add the mean line
        fig.add_trace(go.Scatter(
            x=all_indices,
            y=mean_f1,
            mode='lines',
            name="Mean of Other Layers",
            line=dict(color='green', dash='solid', width=2)
        ))

    # Update layout
    fig.update_layout(
        title=title,
        xaxis_title="Number of Features",
        yaxis_title="F1 Score",
        legend_title="Layers",
        width=1200,
        height=800,
        template="plotly_white"
    )

    fig.show()


# Decision tree: layers 0 and 10 individually, all other layers as mean ± std.
plot_decision_tree_f1(
    decision_tree_results,
    selected_layers=[0, 10],
    title="Decision Tree F1 Score",
)


In [8]:

# KNN: layers 0 and 10 individually, all other layers as mean ± std.
plot_decision_tree_f1(
    knn_results,
    selected_layers=[0, 10],
    title="KNN F1 Score",
)


In [15]:

# Logistic regression: layers 0 and 10 individually, all other layers as mean ± std.
plot_decision_tree_f1(
    logistic_regression_results,
    selected_layers=[0, 10],
    title="Logistic Regression F1 Score",
)


In [10]:

# SVM: layers 0 and 10 individually, all other layers as mean ± std.
plot_decision_tree_f1(
    svm_results,
    selected_layers=[0, 10],
    title="SVM F1 Score",
)


In [None]:
# Get a table with all the results for F1 score, accuracy, precision, recall

# \end{figure*}
# \begin{table}[h]
#     \centering
#     \begin{tabular}{|c|c|c|c|c|c|c|}
#         \hline
#         \multirow{2}{*}{Layer}  & \multicolumn{6}{c|}{Top k Features F1 Score} \\
#         \cline{2-7}
#          & k=1 & k=5 & k=10 & k=25 & k=50 & k=100 \\
#         \hline
#         0 & 0.1000 & 0.1000 & 0.1000 & 0.1000 & 0.1000 & 0.1000  \\
#         \hline

#     \end{tabular}
#     \caption{Top k Features F1 Scores across different layers using Decision Tree Algorithm}
#     \label{tab:top_f1}
# \end{table}


# Feature counts looked up in each result frame; one table column per value.
top_k = [1, 5, 10, 25, 50, 99]
# Layer numbers, used as positions into the per-layer result lists.
layers = list(range(12))

def get_table(decision_tree_results, top_k, layers, metric="f1", column_name="F1 Score"):
    """Tabulate one metric per layer at the requested feature counts.

    Parameters:
    - decision_tree_results: Sequence of per-layer DataFrames, indexed by
      feature count and containing a ``metric`` column.
    - top_k: Feature counts to look up in each frame's index.
    - layers: Positions into ``decision_tree_results``; one output row each.
    - metric: Column to read from each frame.
    - column_name: Label used in the generated column headers.

    Returns:
    - DataFrame with a "Layer" column followed by one
      "Top {k} {column_name}" column per entry of ``top_k``.
    """
    rows = [
        [layer] + [decision_tree_results[layer].loc[k, metric] for k in top_k]
        for layer in layers
    ]
    header = ["Layer"] + [f"Top {k} {column_name}" for k in top_k]

    return pd.DataFrame(rows, columns=header)

# Decision tree: F1 score per layer at each top-k feature count.
decision_tree_table = get_table(
    decision_tree_results,
    top_k,
    layers,
    metric="f1",
    column_name="F1 Score",
)
decision_tree_table

Unnamed: 0,Layer,Top 1 Accuracy,Top 5 Accuracy,Top 10 Accuracy,Top 25 Accuracy,Top 50 Accuracy,Top 99 Accuracy
0,0,0.617021,0.643617,0.62234,0.654255,0.654255,0.62766
1,1,0.707447,0.643617,0.654255,0.643617,0.638298,0.702128
2,2,0.664894,0.638298,0.654255,0.659574,0.632979,0.617021
3,3,0.606383,0.611702,0.601064,0.569149,0.617021,0.648936
4,4,0.691489,0.611702,0.579787,0.611702,0.62766,0.590426
5,5,0.56383,0.611702,0.590426,0.643617,0.632979,0.547872
6,6,0.569149,0.601064,0.632979,0.638298,0.62766,0.601064
7,7,0.611702,0.595745,0.62766,0.638298,0.638298,0.670213
8,8,0.590426,0.590426,0.643617,0.601064,0.617021,0.632979
9,9,0.579787,0.611702,0.617021,0.574468,0.617021,0.617021


In [16]:

# Decision tree: accuracy per layer at each top-k feature count.
decision_tree_table = get_table(
    decision_tree_results,
    top_k,
    layers,
    metric="accuracy",
    column_name="Accuracy",
)
decision_tree_table

Unnamed: 0,Layer,Top 1 Accuracy,Top 5 Accuracy,Top 10 Accuracy,Top 25 Accuracy,Top 50 Accuracy,Top 99 Accuracy
0,0,0.617021,0.643617,0.62234,0.654255,0.654255,0.62766
1,1,0.707447,0.643617,0.654255,0.643617,0.638298,0.702128
2,2,0.664894,0.638298,0.654255,0.659574,0.632979,0.617021
3,3,0.606383,0.611702,0.601064,0.569149,0.617021,0.648936
4,4,0.691489,0.611702,0.579787,0.611702,0.62766,0.590426
5,5,0.56383,0.611702,0.590426,0.643617,0.632979,0.547872
6,6,0.569149,0.601064,0.632979,0.638298,0.62766,0.601064
7,7,0.611702,0.595745,0.62766,0.638298,0.638298,0.670213
8,8,0.590426,0.590426,0.643617,0.601064,0.617021,0.632979
9,9,0.579787,0.611702,0.617021,0.574468,0.617021,0.617021


In [19]:
# KNN: F1 score per layer at each top-k feature count.
knn_table = get_table(
    knn_results,
    top_k,
    layers,
    metric="f1",
    column_name="F1 Score",
)
knn_table

Unnamed: 0,Layer,Top 1 F1 Score,Top 5 F1 Score,Top 10 F1 Score,Top 25 F1 Score,Top 50 F1 Score,Top 99 F1 Score
0,0,0.582278,0.578616,0.655914,0.638554,0.626506,0.614458
1,1,0.719212,0.681564,0.705263,0.703518,0.708333,0.719577
2,2,0.040404,0.722513,0.695187,0.659574,0.67033,0.709677
3,3,0.162162,0.691489,0.673913,0.721311,0.699454,0.695652
4,4,0.691489,0.69697,0.681081,0.655914,0.655556,0.655914
5,5,0.6,0.681564,0.681319,0.67052,0.666667,0.685393
6,6,0.539474,0.714286,0.695187,0.707182,0.692308,0.67033
7,7,0.348485,0.692737,0.688889,0.677596,0.666667,0.659091
8,8,0.57485,0.670103,0.673575,0.655556,0.67033,0.714286
9,9,0.563218,0.695652,0.701571,0.685393,0.677596,0.666667


In [20]:

# KNN: accuracy per layer at each top-k feature count.
knn_table = get_table(
    knn_results,
    top_k,
    layers,
    metric="accuracy",
    column_name="Accuracy",
)
knn_table

Unnamed: 0,Layer,Top 1 Accuracy,Top 5 Accuracy,Top 10 Accuracy,Top 25 Accuracy,Top 50 Accuracy,Top 99 Accuracy
0,0,0.648936,0.643617,0.659574,0.680851,0.670213,0.659574
1,1,0.696809,0.696809,0.702128,0.68617,0.702128,0.718085
2,2,0.494681,0.718085,0.696809,0.659574,0.680851,0.712766
3,3,0.505319,0.691489,0.680851,0.728723,0.707447,0.702128
4,4,0.691489,0.680851,0.68617,0.659574,0.670213,0.659574
5,5,0.595745,0.696809,0.691489,0.696809,0.691489,0.702128
6,6,0.62766,0.723404,0.696809,0.718085,0.702128,0.680851
7,7,0.542553,0.707447,0.702128,0.68617,0.675532,0.680851
8,8,0.62234,0.659574,0.664894,0.670213,0.680851,0.723404
9,9,0.595745,0.702128,0.696809,0.702128,0.68617,0.68617


In [None]:
# Logistic regression: F1 score per layer at each top-k feature count.
logistic_regression_table = get_table(
    logistic_regression_results,
    top_k,
    layers,
    metric="f1",
    column_name="F1 Score",
)
logistic_regression_table

Unnamed: 0,Layer,Top 1 F1 Score,Top 5 F1 Score,Top 10 F1 Score,Top 25 F1 Score,Top 50 F1 Score,Top 99 F1 Score
0,0,0.496732,0.717703,0.724638,0.736318,0.736318,0.736318
1,1,0.714286,0.670886,0.714286,0.717391,0.717391,0.705882
2,2,0.729064,0.726368,0.746114,0.728261,0.735135,0.732984
3,3,0.729064,0.706522,0.711111,0.705882,0.709677,0.711111
4,4,0.571429,0.707182,0.707865,0.719577,0.696133,0.681081
5,5,0.601093,0.73,0.714286,0.715789,0.716578,0.72043
6,6,0.611111,0.73,0.725275,0.705882,0.688525,0.709677
7,7,0.722513,0.72449,0.72043,0.688889,0.703297,0.711111
8,8,0.632768,0.729064,0.744898,0.729282,0.729282,0.717391
9,9,0.624277,0.729064,0.717172,0.72043,0.722513,0.714286


In [18]:
# Logistic regression: accuracy per layer at each top-k feature count.
logistic_regression_table = get_table(
    logistic_regression_results,
    top_k,
    layers,
    metric="accuracy",
    column_name="Accuracy",
)
logistic_regression_table

Unnamed: 0,Layer,Top 1 Accuracy,Top 5 Accuracy,Top 10 Accuracy,Top 25 Accuracy,Top 50 Accuracy,Top 99 Accuracy
0,0,0.590426,0.68617,0.696809,0.718085,0.718085,0.718085
1,1,0.659574,0.723404,0.723404,0.723404,0.723404,0.707447
2,2,0.707447,0.707447,0.739362,0.734043,0.739362,0.728723
3,3,0.707447,0.712766,0.723404,0.707447,0.712766,0.723404
4,4,0.648936,0.718085,0.723404,0.718085,0.707447,0.68617
5,5,0.611702,0.712766,0.702128,0.712766,0.718085,0.723404
6,6,0.62766,0.712766,0.734043,0.707447,0.696809,0.712766
7,7,0.718085,0.712766,0.723404,0.702128,0.712766,0.723404
8,8,0.654255,0.707447,0.734043,0.739362,0.739362,0.723404
9,9,0.654255,0.707447,0.702128,0.723404,0.718085,0.723404
