In [1]:
%cd ../neural-nlp

/Users/amith/Documents/columbia/phd/2021-f/computation_and_the_brain/coms6998-project/neural-nlp


In [2]:
from neural_nlp.benchmarks import benchmark_pool
pereira = benchmark_pool["Pereira2018-encoding"]
data = pereira._load_assembly()

data

  class Score(DataAssembly):


In [3]:
# post-processing copied from PereiraBenchmark in neural.py (lines 393/394)

data = data.dropna('neuroid')

print(data.shape)

(627, 49760)


In [6]:
import numpy as np

# Extract stimuli (sentences) from xarray
stimuli_texts = {}
for _, row in data.attrs['stimulus_set'].iterrows():
    stimuli_texts[row['stimulus_id']] = row['sentence']


# Extract fmri data
stimuli = []
brain_reps = []
for idx, stimulus_id in enumerate(data.coords['stimulus_id']):
    stimuli.append(stimuli_texts[stimulus_id.item()])
    brain_reps.append(data.values[idx, :])

brain_reps = np.array(brain_reps)

assert len(stimuli) == len(brain_reps)

print(brain_reps.shape)

(627, 49760)


In [7]:
import pickle

with open('../corpora/pereira.pkl', 'wb') as f:
    pickle.dump((stimuli, brain_reps), f)

In [12]:
# start running the notebook here if you 
# have the pereira.pkl file already!

import pickle
import numpy as np

with open('../corpora/pereira.pkl', 'rb') as f:
    stimuli, brain_reps = pickle.load(f)

print(brain_reps.shape)

(627, 49760)


In [13]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
model = model.eval()  # Turn off dropout

model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP

In [14]:
import numpy as np
from tqdm import tqdm

hidden_states = []
with torch.no_grad():
    for stimulus in tqdm(stimuli):
        output = model(
            **tokenizer([stimulus], add_special_tokens=True, return_tensors='pt')
        )
        
        layer_reps = []
        for layer_hidden_states in output.hidden_states:
            # in models/implementations.py, Transformer uses the rep
            # of the last word (line 595)
            layer_reps.append(layer_hidden_states.squeeze()[-1, :].numpy())
        hidden_states.append(layer_reps)
hidden_states = np.array(hidden_states)
print(hidden_states.shape)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 627/627 [00:39<00:00, 15.98it/s]


(627, 13, 768)


In [15]:
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

k_folds = KFold(n_splits=5, shuffle=True)

pearsonrs = []
for fold, (train_indices, test_indices) in enumerate(k_folds.split(hidden_states)):
    pearsonrs.append([])
    train_brain_reps, test_brain_reps = brain_reps[train_indices], brain_reps[test_indices]
    for layer_num in tqdm(range(len(hidden_states[0])), desc='fold%s' % fold):
        train_hidden_states, test_hidden_states = \
            hidden_states[train_indices, layer_num, :], \
            hidden_states[test_indices, layer_num, :]

        # TODO: Are they doing any kind of hyperparameter tuning
        # (regularization, etc) here?  We're using SKLearn's defaults

        model = LinearRegression().fit(train_hidden_states, train_brain_reps)
        pred_brain_reps = model.predict(test_hidden_states)
        
        # We aggregated voxel/electrode/ROI predictivity scores by taking the
        # median of scores for each participant’s voxels/electrodes/ROIs and
        # then computing the median across participants. Finally, this score was
        # divided by the estimated ceiling value (see below) to yield a final score in
        # the range [0, 1].
        
        # https://github.com/brain-score/brain-score/blob/master/brainscore/metrics/xarray_utils.py#L78
        # https://github.com/brain-score/brain-score/blob/master/brainscore/metrics/regression.py#L33
        # https://github.com/brain-score/brain-score/blob/master/brainscore/metrics/transformations.py#L42
        
        # not totally sure this is right...
        
        layer_pearson_rs = []
        for pred_brain_rep, test_brain_rep in zip(pred_brain_reps, test_brain_reps):
            layer_pearson_rs.append(pearsonr(pred_brain_rep, test_brain_rep))
        
        pearsonrs[-1].append(np.median(layer_pearson_rs))

fold0: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [03:14<00:00, 14.97s/it]
fold1: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [03:20<00:00, 15.39s/it]
fold2: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [03:24<00:00, 15.75s/it]
fold3: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [03:25<00:00, 15.79s/it]
fold4: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [03:48<00:00, 17.57s/it]


In [16]:
pearsonrs = np.array(pearsonrs)
for layer_num in range(pearsonrs.shape[1]):
    print((layer_num, np.mean(pearsonrs[:, layer_num]), np.median(pearsonrs[:, layer_num])))

(0, 0.008726872021927195, 0.00026370962555591007)
(1, 5.5704096546878854e-11, 1.2012132205600063e-30)
(2, 1.9213049567458508e-11, 4.8402302423740277e-23)
(3, 3.3988302001727236e-11, 3.363861954594302e-23)
(4, 2.258517415353001e-19, 4.567767534087973e-24)
(5, 1.7192467627783422e-09, 7.084778940287923e-16)
(6, 1.5807451868999074e-17, 6.01822025428152e-25)
(7, 2.617089164714207e-13, 1.0951639991689443e-24)
(8, 2.7787744654038066e-24, 2.200889953594277e-31)
(9, 1.7324031317170693e-08, 8.390406345911877e-20)
(10, 1.7481852619377185e-09, 7.804518140563757e-12)
(11, 1.545092748524449e-08, 5.150342559864029e-15)
(12, 2.6385651505677922e-12, 3.57456361746838e-15)
