In [None]:
from utils import *
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from matplotlib.colors import TwoSlopeNorm
import pickle
import json
from scipy.stats import ttest_rel, norm
import matplotlib.ticker as mticker

# Shared plotting state and the idiom filter used by the cells below.
# NOTE(review): this figure is opened in the setup cell, not in the cell that
# draws the first plot - confirm it is still the current figure when plotting.
plt.figure(figsize=(8, 9))
plt.rcParams.update({
    "font.family": "Serif",
    "mathtext.fontset": "dejavuserif",
})

# Number of layers iterated over by the per-layer statistics loops below.
num_layers = 16
# Qualitative palette and the order in which its entries are picked.
cmap = plt.get_cmap("tab10")
color_order = [0, 1, 2, 4, 5, 3]

# Idiom indices contained here are skipped by the loading loops below.
# NOTE(review): pickle.load can execute code from the file - only point this
# at a trusted pickle.
with open('../data/filtered_indices.pkl', 'rb') as pkl_file:
    filtered_indices = pickle.load(pkl_file)

## Layer Knockout - ambiguous sentence

In [None]:
# Which knockout run to read: replacement setting and knocked-out component.
replace = 'orig'
comp = 'mlp'  # alternative: 'attn'

# Idiom indices in [0, 245) that are not listed in filtered_indices.
kept_idioms = [idx for idx in range(245) if idx not in filtered_indices]

# fig_drop files are read first, then lit_drop files; each .npz contributes
# the array stored under 'arr_0'.
dirname = f"./results/Llama-3.2-1B/knockout/{replace}/{comp}/fig_drop"
all_fig = [np.load(f'{dirname}/idiom_{idx}.npz')['arr_0'] for idx in kept_idioms]

dirname = f"./results/Llama-3.2-1B/knockout/{replace}/{comp}/lit_drop"
all_lit = [np.load(f'{dirname}/idiom_{idx}.npz')['arr_0'] for idx in kept_idioms]

In [None]:
# Ambiguous-sentence layer knockout: per-layer mean curves with confidence
# bands for the lit_drop and fig_drop results, plus significance markers.
literal = np.array(all_lit)
figurative = np.array(all_fig)

# compute_confidence_interval is presumably provided by `from utils import *`;
# its result is unpacked as (average, lower bound, upper bound) per layer.
lit_avg, lit_lower, lit_upper = compute_confidence_interval(literal)
fig_avg, fig_lower, fig_upper = compute_confidence_interval(figurative)

# Paired t-test and Cohen's d (computed on the paired differences) per layer.
p_values = []
cohen_ds = []
for i in range(num_layers):
    lit_layer = literal[:, i]
    fig_layer = figurative[:, i]
    _, p_val = ttest_rel(lit_layer, fig_layer)
    diff = lit_layer - fig_layer
    std_diff = np.std(diff, ddof=1)
    # A zero standard deviation would divide by zero; report no effect instead.
    cohen_d = abs(np.mean(diff) / std_diff) if std_diff != 0 else 0
    p_values.append(p_val)
    cohen_ds.append(cohen_d)

# One x position per layer.
x_axis = np.arange(len(lit_avg))

# Apply the 8x9 inch size in the cell that actually draws. With the Jupyter
# inline backend, a figure opened in an earlier cell is rendered and closed
# when that cell finishes, so it is not the figure being drawn on here.
plt.gcf().set_size_inches(8, 9)

plt.fill_between(x_axis, fig_lower, fig_upper, color=cmap.colors[color_order[5]], alpha=0.15)
plt.plot(x_axis, fig_avg, label=r'$c \in C_f, s_a$', linewidth=2, color=cmap.colors[color_order[5]])

plt.fill_between(x_axis, lit_lower, lit_upper, color=cmap.colors[color_order[0]], alpha=0.15)
plt.plot(x_axis, lit_avg, label=r'$c \in C_l, s_a$', linewidth=2, color=cmap.colors[color_order[0]])

# A layer gets a red asterisk when the paired test is significant AND the gap
# between the two mean curves exceeds the average gap over all layers.
alpha = 0.05
abs_mean_gap = np.abs(np.asarray(lit_avg) - np.asarray(fig_avg))
THRESHOLD_DIFF = abs_mean_gap.mean()
for i, p_val in enumerate(p_values):
    if p_val < alpha and abs_mean_gap[i] > THRESHOLD_DIFF:
        # Asterisks sit at y = -0.6, the lowest y tick set below.
        plt.text(x_axis[i], -0.6, '*', ha='center', va='bottom',
                 fontsize=27, color='red')

plt.xlabel("Layers", fontsize=27)
plt.xticks(fontsize=27)
plt.yticks(np.arange(-0.6, 0.7, 0.2), fontsize=20)
plt.ylabel(r'$\Delta I(s_a)$', fontsize=27)
plt.axhline(0, linestyle='--', color='gray', linewidth=2, label='No Effect')
plt.legend(loc='upper right', fontsize=27)
plt.grid(True, color='gray', linestyle='dotted', linewidth=0.5, alpha=0.4)
plt.savefig(f"{comp}_knockout_sa.pdf", format='pdf', bbox_inches='tight')
plt.show()

## Layer Knockout -- all sentences

In [None]:
# Per-idiom knockout results for the three runs read by the next cell.
# Every list holds the array stored under 'arr_0' (not the .npz handle), and
# each archive is closed as soon as its array has been read.
all_orig = []
all_lit = []
all_fig = []
comp = 'mlp'  # alternative: 'attn'

dirname = f"./results/Llama-3.2-1B/knockout/orig/{comp}/fig_drop"
for i in range(0, 245):
    if i not in filtered_indices:
        with np.load(f'{dirname}/idiom_{i}.npz') as temp:
            all_orig.append(temp['arr_0'])

dirname = f"./results/Llama-3.2-1B/knockout/fig/{comp}/fig_drop"
for i in range(0, 245):
    if i not in filtered_indices:
        with np.load(f'{dirname}/idiom_{i}.npz') as temp:
            all_fig.append(temp['arr_0'])

dirname = f"./results/Llama-3.2-1B/knockout/lit/{comp}/lit_drop"
for i in range(0, 245):
    if i not in filtered_indices:
        with np.load(f'{dirname}/idiom_{i}.npz') as temp:
            all_lit.append(temp['arr_0'])

In [None]:
# Layer knockout for all sentence types: mean curves with confidence bands for
# the original, literal and figurative runs, plus significance markers.
plt.figure(figsize=(6, 7))


def _stack_results(entries):
    """Stack per-idiom results into one array (first axis follows ``entries``).

    An entry may be an array or a still-open ``np.load`` handle of an ``.npz``
    archive; for a handle, the array stored under ``'arr_0'`` is used.
    """
    return np.array([e['arr_0'] if hasattr(e, 'files') else e for e in entries])


def _paired_layer_stats(first, second, min_effect=0.2):
    """Return (p-value, |Cohen's d|) of the paired differences ``first - second``.

    The p-value is replaced by 1 when the effect size is below ``min_effect``,
    so that small effects are never marked as significant.
    """
    _, p_val = ttest_rel(first, second)
    diff = first - second
    std_diff = np.std(diff, ddof=1)
    # A zero standard deviation would divide by zero; report no effect instead.
    cohen_d = abs(np.mean(diff) / std_diff) if std_diff != 0 else 0
    if cohen_d < min_effect:
        p_val = 1
    return p_val, cohen_d


# Plain lists cannot be sliced with [:, i]; work on stacked arrays instead.
orig_arr = _stack_results(all_orig)
lit_arr = _stack_results(all_lit)
fig_arr = _stack_results(all_fig)

original_avg, original_lower, original_upper = compute_confidence_interval(orig_arr)
literal_avg, literal_lower, literal_upper = compute_confidence_interval(lit_arr)
figurative_avg, figurative_lower, figurative_upper = compute_confidence_interval(fig_arr)

# Paired statistics for Original vs. Literal and Original vs. Figurative.
p_values_org_lit = []
p_values_org_fig = []
cohen_ds_org_lit = []
cohen_ds_org_fig = []

for i in range(num_layers):
    orig_layer = orig_arr[:, i]

    p_val_org_lit, cohen_d_org_lit = _paired_layer_stats(orig_layer, lit_arr[:, i])
    p_values_org_lit.append(p_val_org_lit)
    cohen_ds_org_lit.append(cohen_d_org_lit)

    p_val_org_fig, cohen_d_org_fig = _paired_layer_stats(orig_layer, fig_arr[:, i])
    p_values_org_fig.append(p_val_org_fig)
    cohen_ds_org_fig.append(cohen_d_org_fig)

# Plot configuration.
plt.rcParams["font.family"] = "Serif"
plt.rcParams["mathtext.fontset"] = "dejavuserif"
cmap = plt.get_cmap("tab10")
# Identity ordering: color_order[k] is simply palette entry k from here on.
color_order = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
x_axis = np.arange(num_layers)

# Confidence band and mean for "Original".
plt.fill_between(x_axis, original_lower, original_upper,
                 color=cmap.colors[color_order[3]], alpha=0.15)
plt.plot(x_axis, original_avg, label=r'$c \in C_f, s_a$', linewidth=2,
         color=cmap.colors[color_order[3]])

# Confidence band and mean for "Literal".
plt.fill_between(x_axis, literal_lower, literal_upper,
                 color=cmap.colors[color_order[4]], alpha=0.15)
plt.plot(x_axis, literal_avg, label=r'$c \in C_l, s_l$', linewidth=2,
         color=cmap.colors[color_order[4]])

# Confidence band and mean for "Figurative".
plt.fill_between(x_axis, figurative_lower, figurative_upper,
                 color=cmap.colors[color_order[2]], alpha=0.15)
plt.plot(x_axis, figurative_avg, label=r'$c \in C_f, s_f$', linewidth=2,
         color=cmap.colors[color_order[2]])

# A layer gets an asterisk only if both comparisons are significant and the
# Original-vs-Literal gap exceeds its own average over all layers. The
# threshold is derived from this cell's curves, so the cell does not depend on
# averages left behind by another section.
alpha = 0.05
orig_lit_gap = np.abs(np.asarray(original_avg) - np.asarray(literal_avg))
THRESHOLD_DIFF = orig_lit_gap.mean()
for i in range(num_layers):
    both_significant = p_values_org_lit[i] < alpha and p_values_org_fig[i] < alpha
    if both_significant and orig_lit_gap[i] > THRESHOLD_DIFF:
        # Asterisks sit at y = -0.6, the lowest y tick set below.
        plt.text(x_axis[i], -0.6, '*', ha='center', va='bottom',
                 fontsize=20, color='red')

plt.xlabel("Layers", fontsize=20)
plt.ylabel(r'$\Delta I(s), \; s\in \{s_a, s_f, s_l\}$', fontsize=30)
plt.axhline(0, linestyle='--', color='gray', linewidth=1, label='No Effect')
plt.legend(loc='upper right', fontsize=20)
plt.grid(True, color='gray', linestyle='dotted', linewidth=0.5, alpha=0.4)
plt.xticks(fontsize=20)
plt.yticks(np.arange(-0.6, 0.7, 0.2), fontsize=20)
plt.tight_layout()
plt.savefig(f"{comp}_knockout_all.pdf", format='pdf', bbox_inches='tight')
plt.show()

## Head Knockout

In [None]:
# Candidate whose head-knockout results are read from `{cand}_drop`.
cand = 'lit'  # alternative: 'fig'
dirname = f"./results/Llama-3.2-1B/knockout/original/head/{cand}_drop"

# NOTE(review): absolute, user-specific path - the notebook only runs where
# this file exists; consider a path relative to a configurable data directory.
knowns_path = '/nethome/soyoung/idiom/idiom_process/data_gen/data/w_prefix_all_most_single_token_cand_literal_constrain.json'
with open(knowns_path, 'r') as json_file:
    knowns = json.load(json_file)

class Avg:
    """Collects arrays and reports statistics over their concatenation.

    The collected arrays live in the list ``d``. ``add`` stores one array with
    a new leading axis of length 1; ``add_all`` stores an array unchanged.
    """

    def __init__(self):
        self.d = list()

    def add(self, v):
        """Store ``v`` with a leading length-1 axis prepended."""
        self.d.append(v[np.newaxis])

    def add_all(self, vv):
        """Store ``vv`` as given."""
        self.d.append(vv)

    def _joined(self):
        """Concatenate everything stored so far along the first axis."""
        return np.concatenate(self.d)

    def avg(self):
        """Mean along the first axis of the concatenated arrays."""
        return self._joined().mean(axis=0)

    def std(self):
        """Standard deviation along the first axis of the concatenated arrays."""
        return self._joined().std(axis=0)

    def size(self):
        """Total length of the first axis over all stored arrays."""
        total = 0
        for chunk in self.d:
            total += chunk.shape[0]
        return total

# Accumulate the transposed 'arr_0' matrix of every idiom that is not filtered.
avg = Avg()
for idx in range(len(knowns)):
    if idx not in filtered_indices:
        try:
            avg.add(np.load(f'{dirname}/idiom_{idx}.npz')['arr_0'].T)
        except Exception as err:
            # Best effort: report the failure and continue with the next idiom.
            print(err)

# Mean over the accumulated idioms, and the per-idiom matrices stacked along
# the first axis (the length-1 axis added by Avg.add is squeezed away).
result = avg.avg()
data = np.array(avg.d).squeeze(1)

In [None]:
# Heat map of `result`, the mean head-knockout effect computed above.
plt.rcParams["font.family"] = "Serif"
plt.rcParams["mathtext.fontset"] = "dejavuserif"

# Draw everything on one figure/axes pair so the colorbar belongs to the same
# figure as the image and no second, empty figure is created.
fig, ax = plt.subplots(figsize=(11, 10))

img = ax.imshow(result, cmap="RdBu")

layer_ticks = [0, 2, 4, 6, 8, 10, 12, 14]
ax.set_xticks(layer_ticks)
ax.set_xticklabels(layer_ticks)

cbar = fig.colorbar(img, ax=ax)
# Alternative label: r"$\Delta F(s_a)$" (presumably for cand = 'fig' - confirm).
cbar.set_label(r"$\Delta L(s_a)$", rotation=270, labelpad=15, fontsize=14)
ax.set_xlabel("Layers", fontsize=14)
ax.set_ylabel("Heads", fontsize=14)
fig.savefig("attn_heads_disamb.pdf", format='pdf', bbox_inches='tight')
# plt.show() does not take an artist; it displays the open figures.
plt.show()

## Information flow

- Idiom_Fig
- Idiom_Lit
- IdiomA_IdiomB

In [None]:
# Information-flow run to read.
replace = 'idiom_fig'  # alternatives: 'idiom_lit', 'idiomA_idiomB'

# Idiom indices in [0, 245) that are not listed in filtered_indices.
kept_idioms = [idx for idx in range(245) if idx not in filtered_indices]

# fig_drop files are read first, then lit_drop files; each .npz contributes
# the array stored under 'arr_0'.
dirname = f"./results/Llama-3.2-1B/information_flow/{replace}/fig_drop"
fig_drop = [np.load(f'{dirname}/idiom_{idx}.npz')['arr_0'] for idx in kept_idioms]

dirname = f"./results/Llama-3.2-1B/information_flow/{replace}/lit_drop"
lit_drop = [np.load(f'{dirname}/idiom_{idx}.npz')['arr_0'] for idx in kept_idioms]

In [None]:
# Information flow: per-layer mean curves with confidence bands for the
# fig_drop and lit_drop results of the selected `replace` run.
idiom_replace_figc_ = np.array(fig_drop)
idiom_replace_litc_ = np.array(lit_drop)

attn_averages, attn_lower, attn_upper = compute_confidence_interval(idiom_replace_figc_)
attn_averages1, attn_lower1, attn_upper1 = compute_confidence_interval(idiom_replace_litc_)

# Paired t-test and Cohen's d (computed on the paired differences) per layer.
p_values = []
cohen_ds = []
for i in range(num_layers):
    lit_layer = idiom_replace_litc_[:, i]
    fig_layer = idiom_replace_figc_[:, i]
    _, p_val = ttest_rel(lit_layer, fig_layer)
    diff = lit_layer - fig_layer
    std_diff = np.std(diff, ddof=1)
    # A zero standard deviation would divide by zero; report no effect instead.
    cohen_d = abs(np.mean(diff) / std_diff) if std_diff != 0 else 0
    p_values.append(p_val)
    cohen_ds.append(cohen_d)

# One x position per layer.
x_axis = np.arange(len(attn_averages))

# NOTE(review): cmap and color_order are whatever the most recently executed
# earlier cell left behind, so the colours depend on execution order.
# NOTE(review): the legend and y-axis labels hard-code s_f although `replace`
# can select other runs - confirm the labels when switching `replace`.
plt.fill_between(x_axis, attn_lower, attn_upper, color=cmap.colors[color_order[5]], alpha=0.15)
plt.plot(x_axis, attn_averages, label=r'$\Delta F(s_a \hookleftarrow s_f)$', linewidth=2, color=cmap.colors[color_order[5]])

plt.fill_between(x_axis, attn_lower1, attn_upper1, color=cmap.colors[color_order[0]], alpha=0.15)
plt.plot(x_axis, attn_averages1, label=r'$\Delta L(s_a \hookleftarrow s_f)$', linewidth=2, color=cmap.colors[color_order[0]])

# A layer gets a red asterisk when the paired test is significant AND the gap
# between the two mean curves exceeds the average gap over all layers.
alpha = 0.05
abs_mean_gap = np.abs(np.asarray(attn_averages) - np.asarray(attn_averages1))
THRESHOLD_DIFF = abs_mean_gap.mean()
for i, p_val in enumerate(p_values):
    if p_val < alpha and abs_mean_gap[i] > THRESHOLD_DIFF:
        # Asterisks sit at y = -0.15, the lowest y tick set below.
        plt.text(x_axis[i], -0.15, '*', ha='center', va='bottom', fontsize=27, color='red')

plt.xlabel("Layers", fontsize=30)
plt.xticks(fontsize=30)
plt.yticks(np.arange(-0.15, 0.3, 0.05), fontsize=30)
plt.ylabel(r'$\Delta I(s_a \hookleftarrow s_f)$', fontsize=40)
plt.axhline(0, linestyle='--', color='gray', linewidth=2, label='No Effect')
plt.legend(loc='upper right', fontsize=30)
plt.grid(True, color='gray', linestyle='dotted', linewidth=0.5, alpha=0.4)
# File name follows `replace`, so different runs do not overwrite each other
# ("replace_idiom_fig.pdf" for replace = 'idiom_fig').
plt.savefig(f"replace_{replace}.pdf", format='pdf', bbox_inches='tight')
plt.show()