|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 3:</h2>|<h1>Evaluating LLMs</h1>|
|<h2>Section:</h2>|<h1>Qualitative evaluations</h1>|
|<h2>Lecture:</h2>|<h1><b>Distributions of model hidden-state activations</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib as mpl

# for pairplot
import seaborn as sns
import pandas as pd

# vector plots: ask the inline backend to render figures as SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [None]:
from transformers import AutoModelForCausalLM, GPT2Tokenizer

# the tokenizer that matches the pretrained GPT-2 checkpoint
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# the pretrained GPT-2 model, switched to evaluation mode
# (the notebook only runs forward passes; nothing is trained)
gpt2 = AutoModelForCausalLM.from_pretrained('gpt2').eval()

# Get model outputs and hidden-layer activations

In [None]:
# the text to analyze (a short passage about correlation coefficients)
text = 'The goal of a correlation analysis is to compute a correlation coefficient. This coefficient is indicated using r, and is a number that encodes the normalized strength of the linear relationship between two variables. The normalization imposes boundaries of -1 to +1. Negative, zero, and positive correlation coefficients have distinct interpretations.'
# tokenize the text; return_tensors='pt' requests a PyTorch tensor
tokens = tokenizer.encode(text,return_tensors='pt')
# show the token IDs as the cell output
tokens

In [None]:
# forward pass through the model without gradient tracking,
# also requesting the hidden states in the returned outputs
with torch.no_grad():
  outputs = gpt2(input_ids=tokens,output_hidden_states=True)

In [None]:
# report the sizes: token count, number of hidden states, and the shape of one of them
print(f'There are {len(tokens[0])} tokens.',
      f'There are {len(outputs.hidden_states)} "hidden states."',
      f'And each hidden state has size {list(outputs.hidden_states[3].shape)}.',
      sep='\n')

# Scatter plots

In [None]:
# pick one layer to visualize
whichLayer = 3

# activations of that hidden state for the first (only) sequence,
# with the first token dropped: rows are tokens, columns are embedding dimensions
layerActs = outputs.hidden_states[whichLayer][0,1:,:].detach()

fig,axs = plt.subplots(1,2,figsize=(12,4))

# each panel: (data to plot, x-axis label, title)
#  left  -> x-axis is the embedding dimension, one color per token
#  right -> x-axis is the token index, one color per embedding dimension
panels = [
  (layerActs.T, 'Embedding dimension', 'Each color is a token'),
  (layerActs,   'Token index',         'Each color is an embedding dimension'),
]

for ax,(data,xlab,ttl) in zip(axs,panels):
  ax.plot(data,'o',alpha=.4)
  ax.set(xlabel=xlab,ylabel='Activation',title=ttl)

fig.suptitle(f'Output activations from attention layer {whichLayer}',fontweight='bold')
fig.tight_layout()
plt.show()

# Layer covariance and shared variance plots


In [None]:
# collect the activations of every hidden state into one layers-by-values matrix

# values per layer: (number of tokens minus the first one) x embedding dimensionality
npnts = (tokens.shape[1]-1) * gpt2.config.n_embd

# preallocate one row per hidden state
allDataMat = np.zeros((len(outputs.hidden_states),npnts))

# fill in one row per layer
for layeri,hiddenState in enumerate(outputs.hidden_states):

  # this layer's activations (first token excluded), unrolled into a vector
  allDataMat[layeri,:] = hiddenState[0,1:,:].detach().numpy().ravel()

In [None]:
# create a covariance matrix (each row of allDataMat is one layer, so this is layers-by-layers)
covmat = np.cov(allDataMat)

# and a correlation matrix (squared to get R2 -> shared variance)
cormat = np.corrcoef(allDataMat)**2

# layer names: row 0 is labeled as the embeddings, row i>0 as transformer block h.{i-1}
# (the count comes from the data rather than a hard-coded 13, so the labels
#  stay correct if a model with a different number of layers is loaded)
layerlabels = ['emb' if i==0 else f'h.{i-1}' for i in range(allDataMat.shape[0])]

# set the figure
_,axs = plt.subplots(1,2,figsize=(10,5))

# each panel: (matrix to show, color limits, title)
panels = [
  (covmat,     (-10,10), 'Covariance matrix'),
  (100*cormat, (0,100),  r'R$^2$ matrix (% shared variance)'),
]

for ax,(mat,(cmin,cmax),ttl) in zip(axs,panels):

  h = ax.imshow(mat,vmin=cmin,vmax=cmax,origin='lower')

  # x-axis ticks label the even-indexed layers, y-axis ticks the odd-indexed ones
  ax.set(xlabel='Layer',ylabel='Layer',title=ttl,
         xticks=range(0,len(layerlabels),2),xticklabels=layerlabels[::2],
         yticks=range(1,len(layerlabels),2),yticklabels=layerlabels[1::2])
  plt.colorbar(h,ax=ax,pad=.04,fraction=.046)

plt.tight_layout()
plt.show()

# Pairplot in seaborn

In [None]:
# pairplots are great but do not scale to many variables, so only a subset is shown
numpnts = 1000 # keep the first 1000 values of each layer
numlayers = 4 # keep the first 4 rows of the data matrix

# select the subset, then transpose so each layer becomes a dataframe column
dataSubset = allDataMat[:numlayers,:numpnts]
df = pd.DataFrame(dataSubset.T)

# draw all pairwise scatter plots (seaborn works directly from the dataframe)
sns.pairplot(df)
plt.show()

# Histograms

In [None]:
# histogram bin edges (300 equal-width bins from -15 to +15) and the bin centers.
# Activations outside this range are not counted by np.histogram.
binbounds = np.linspace(-15,15,301)
bincenters = (binbounds[:-1]+binbounds[1:])/2

_,axs = plt.subplots(1,2,figsize=(12,4))

# one color per hidden state
linecolors = mpl.cm.plasma_r(np.linspace(0,1,len(outputs.hidden_states)))

for layeri in range(len(outputs.hidden_states)):

  # calculate histogram of activations from this layer
  counts,_ = np.histogram(allDataMat[layeri,:],bins=binbounds)

  # plot at the bin centers (plotting at the left edges would shift each curve by half a bin)
  axs[0].plot(bincenters,counts,color=linecolors[layeri],label=layerlabels[layeri])

  # the tiny offset avoids log(0) for empty bins; those points fall far below zero
  # and are hidden by the lower y-axis limit of 0 set below
  axs[1].plot(bincenters,np.log(counts+1e-14),color=linecolors[layeri],label=layerlabels[layeri])

for a in axs:
  a.legend(fontsize=8)
  a.set(xlim=binbounds[[0,-1]],ylim=[0,None],xlabel='Activation value',ylabel='Count')

# the right panel shows the natural log of the counts, so label it accordingly
axs[1].set_ylabel('Log count')

axs[0].set_title('Linear counts')
axs[1].set_title('Log counts')
plt.show()