|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 5:</h2>|<h1>Observation (non-causal) mech interp</h1>|
|<h2>Section:</h2>|<h1>Identifying circuits and components</h1>|
|<h2>Lecture:</h2>|<h1><b>Challenges with sparse logistic regression in large datasets</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
# numerical arrays, plotting, and the logistic-regression classifier used throughout
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# request SVG output for the figures rendered inline in the notebook
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# The data

In [None]:
# data dimensions
n_neurons = 3000
n_tokens = 200

# seeded generator so the simulated activations (and every result derived
# from them) are reproducible after a kernel restart
rng = np.random.default_rng(202508)
data = rng.standard_normal((n_tokens,n_neurons))

# data labels: first half of the tokens is category 0, the rest is category 1
# (built from the token index so it always has exactly n_tokens entries,
#  also when n_tokens is odd)
labels = (np.arange(n_tokens) >= n_tokens//2).astype(int)

# shift data to create a constant offset (+5 on every neuron of category-1 tokens)
data += labels[:,None]*5


# histograms of all activations, separately for the two labels
y0,x0 = np.histogram(data[labels==0,:].flatten(),100)
y1,x1 = np.histogram(data[labels==1,:].flatten(),100)

# np.histogram returns bin EDGES (one more than counts); plot against the bin
# centers so the curves are not shifted half a bin to the left
xc0 = (x0[:-1]+x0[1:])/2
xc1 = (x1[:-1]+x1[1:])/2

fig,axs = plt.subplots(1,2,figsize=(12,3.5))

# left panel: the tokens-by-neurons activation matrix
h = axs[0].imshow(data,aspect='auto',vmin=-3,vmax=6)
fig.colorbar(h,ax=axs[0],pad=.01)
axs[0].set(xlabel='Neuron index',ylabel='Token index',
              title='Image of activations')


# right panel: activation distributions per category
axs[1].plot(xc0,y0,linewidth=3,label='Category 0')
axs[1].plot(xc1,y1,linewidth=3,label='Category 1')

axs[1].set(xlabel='Activation value',ylabel='Count',ylim=[-100,None],
              title='Distributions of category-specific activations')
axs[1].legend()

plt.tight_layout()
plt.show()

# Sparsity by L1 regularization amount

In [None]:
# values of C (scikit-learn's inverse regularization strength:
# smaller C means a stronger L1 penalty)
Cvals = np.linspace(1,5,17)
sparsityByC = np.zeros(len(Cvals))

# loop over C
for i,c in enumerate(Cvals):
  # use the same iteration budget as the main model fit later in the notebook;
  # with the solver's default budget, saga tends to stop before converging on
  # this many features, which makes the measured sparsity unreliable
  logreg = LogisticRegression(penalty='l1', max_iter=1000, solver='saga', C=c)
  logreg.fit(data,labels)

  # percentage of coefficients that were driven to exactly zero
  sparsityByC[i] = 100*(logreg.coef_==0).mean()

In [None]:
# visualize how coefficient sparsity changes with the regularization parameter C
fig,ax = plt.subplots(figsize=(8,5))

ax.plot(Cvals,sparsityByC,'ks-',markerfacecolor=[.9,.7,.7],markersize=10)
ax.set(
    xlabel='C parameter',
    ylabel='Sparsity (% total params)',
    title='Sparsity (proportion of zero-valued coefficients) as a function of C',
)

plt.show()

# The sparse logistic regression

In [None]:
# Run the L1-penalized (sparse) logistic regression on all tokens
logreg = LogisticRegression(penalty='l1', max_iter=1000, solver='saga', C=3)
logreg.fit(data,labels)
coefs = logreg.coef_.squeeze() # beta values (singleton dimension removed)


### model performance results
#  accuracy (do the predictions match the true labels?)
#  note: this is in-sample accuracy -- predictions are made on the same data
#  the model was fit to
accuracy = 100*(logreg.predict(data) == labels).mean()


# get sparsity (percentage of coefficients that are exactly zero)
sparsity = 100*(coefs==0).mean()

# both quantities are scaled to percent, so both are printed with a '%' sign
print(f'Accuracy: {accuracy:.2f}%, Sparsity: {sparsity:.2f}%')

In [None]:
# FYI, the actual number of iterations that were run
# (compare against the max_iter=1000 passed to the model above to see whether
#  the solver stopped on its own or used up the full budget)
logreg.n_iter_

In [None]:
# also FYI, the intercept term is stored separately
# (logreg.intercept_ is not part of coefs, which was taken from logreg.coef_)
print(logreg.intercept_)

# shape of the squeezed coefficient vector (shown as the cell output)
coefs.shape

# Large effect sizes with beta=0 coefficients

In [None]:
# effect size (Cohen's d), computed separately for every neuron:
# difference between the category means divided by the pooled standard deviation
grp0 = data[labels==0,:]
grp1 = data[labels==1,:]
n0,n1 = len(grp0),len(grp1)

mean_diff = grp1.mean(axis=0) - grp0.mean(axis=0)

# the pooled SD is the square root of the df-weighted average of the two
# sample variances (ddof=1); simply averaging the two SDs is not the pooled SD
pooled_var = ( (n1-1)*grp1.var(axis=0,ddof=1) + (n0-1)*grp0.var(axis=0,ddof=1) ) / (n1+n0-2)
pooled_sd = np.sqrt(pooled_var)
cohens_d = mean_diff / pooled_sd

# and show the results
plt.figure(figsize=(8,6))

# green: neurons kept by the model; red: neurons whose coefficient was zeroed
plt.plot(coefs[coefs!=0],cohens_d[coefs!=0],'ko',markersize=8,alpha=.6,markerfacecolor=[.7,.9,.7],label=r'$\beta \neq 0$')
plt.plot(coefs[coefs==0],cohens_d[coefs==0],'ko',markersize=8,alpha=.8,markerfacecolor=[.9,.7,.7],label=r'$\beta = 0$')

plt.gca().set(ylabel="Effect size (Cohen's d)",xlabel=r'$\beta$ coefficient',title='Large effect sizes in zeroed coefficients')
plt.grid(linestyle='--',linewidth=.4)
plt.legend()

plt.show()

# Negative betas?!

In [None]:
# find the neurons with the smallest (most negative) and largest coefficient
minbeta = coefs.argmin()
maxbeta = coefs.argmax()

# show their distributions: one panel per neuron, one histogram per label
fig,axs = plt.subplots(1,2,figsize=(12,4))

for ax,neuron in zip(axs,[minbeta,maxbeta]):
  ax.hist(data[labels==0,neuron],bins=20,color=[.9,.7,.7],edgecolor='k',linewidth=.1,label='Label 0')
  ax.hist(data[labels==1,neuron],bins=20,color=[.7,.9,.7],edgecolor='k',linewidth=.1,label='Label 1')
  ax.set(title=f'Neuron {neuron} with $\\beta$={coefs[neuron]:.2f}',xlabel='Data value',ylabel='Count')

  # add the legend per axes: a single plt.legend() call only annotates the
  # current (last) axes and would leave the first panel without a legend
  ax.legend()

plt.show()

In [None]:
# refit a logistic regression that sees only the two extreme-coefficient neurons
extremeColumns = [minbeta,maxbeta]
extremeData = data[:,extremeColumns]

# default (no-argument) model settings; fit() hands back the fitted estimator
logreg2 = LogisticRegression().fit(extremeData,labels)
print(logreg2.coef_)

In [None]:
# these neurons are strongly correlated, so their contributions are redundant
# (the correlation is computed across all tokens, i.e. it includes the
#  offset that was added to the category-1 tokens)
plt.plot(extremeData[labels==0,0],extremeData[labels==0,1],'ko',alpha=.5,markerfacecolor=[.7,.9,.9],label='Category 0')
plt.plot(extremeData[labels==1,0],extremeData[labels==1,1],'ko',alpha=.5,markerfacecolor=[.9,.7,.9],label='Category 1')

plt.legend()

# show the correlation as the axes TITLE; passing it as `label=` only sets the
# axes' internal artist label, which is never drawn on the figure
plt.gca().set(xlabel='Min-beta neuron',ylabel='Max-beta neuron',title=f'r = {np.corrcoef(extremeData.T)[0,1]:.2f}')
plt.show()

# Which neurons get selected?

In [None]:
# per-neuron data variance plotted against the fitted coefficient,
# split into neurons the model kept vs. neurons it zeroed out
keptMask = coefs!=0
zeroMask = coefs==0

fig,ax = plt.subplots(figsize=(8,5))

marker_style = dict(markersize=8,alpha=.6)
ax.plot(coefs[keptMask],data[:,keptMask].var(axis=0),'ko',
        markerfacecolor=[.7,.9,.9],label='Non-zero coefs',**marker_style)
ax.plot(coefs[zeroMask],data[:,zeroMask].var(axis=0),'ko',
        markerfacecolor=[.9,.7,.7],label='Zero coefs',**marker_style)

ax.legend()
ax.set(
    ylabel='Data variance',
    xlabel=r'$\beta$ coefficient',
    title='Variance in zeroed coefficients',
)

plt.show()

In [None]:
# also doesn't trivially follow inter-variable correlation

# correlations are computed over the second half of the tokens only (the
# category-1 rows) -- presumably so that the between-category offset does not
# inflate every pairwise correlation
cat1 = data[n_tokens//2:,:]
R0 = np.corrcoef(cat1[:,coefs==0].T) # neurons with zeroed coefficients
R1 = np.corrcoef(cat1[:,coefs!=0].T) # neurons with non-zero coefficients

# unique neuron pairs = strict upper triangle of each correlation matrix.
# Index it directly: selecting via np.nonzero(np.triu(R,1)) would silently
# drop any pair whose correlation happens to be exactly zero.
pairs0 = np.triu_indices_from(R0,k=1)
pairs1 = np.triu_indices_from(R1,k=1)

y0,x0 = np.histogram(R0[pairs0],80,density=True)
y1,x1 = np.histogram(R1[pairs1],80,density=True)

# np.histogram returns bin edges; plot the densities against the bin centers
plt.figure(figsize=(10,3))
plt.plot((x0[:-1]+x0[1:])/2,y0,label=r'$\beta = 0$')
plt.plot((x1[:-1]+x1[1:])/2,y1,label=r'$\beta \neq 0$')

plt.gca().set(xlabel='Pairwise correlation (r)',ylabel='Density',
              title='Distributions of inter-neuron correlations')
plt.legend()
plt.show()