In [1]:
%cd ../

/Users/amith/Documents/columbia/phd/2021-f/computation_and_the_brain/coms6998-project


In [2]:
from neural_nlp.benchmarks import benchmark_pool
pereira = benchmark_pool["Pereira2018-encoding"]
data = pereira._load_assembly(version='base')

  class Score(DataAssembly):


In [3]:
# here are the various stimuli (passages with their accompanying sentences) which 
# were presented to the human participants in the experiment

from collections import Counter

# from _PereiraBenchmark#call
# we add a new passage identifier (experiment + the index of the passage read)
# this will allow us to process each stimulus together (passage by passage)

stimulus_set = data.attrs['stimulus_set']
stimulus_set.loc[:, 'passage_id'] = stimulus_set['experiment'] + stimulus_set['passage_index'].astype(str)

print(stimulus_set)
print(Counter(stimulus_set['passage_id']))

                                              sentence  sentence_num  \
0    Beekeeping encourages the conservation of loca...             0   
1    It is in every beekeeper's interest to conserv...             1   
2    As a passive form of agriculture, it does not ...             2   
3    Beekeepers also discourage the use of pesticid...             3   
4    Artisanal beekeepers go to extremes for their ...             4   
..                                                 ...           ...   
622  Some windows have multiple panes to increase i...           379   
623                   A woman is a female human adult.           380   
624    A woman is stereotypically seen as a caregiver.           381   
625     A woman can become pregnant and bear children.           382   
626  A woman has different reproductive organs than...           383   

          stimulus_id    experiment                       story  \
0      243sentences.0  243sentences     243sentences.beekeeping   
1

In [30]:
import torch
from transformers import GPT2TokenizerFast, GPT2Model

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
model = model.eval()  

model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP

In [31]:
from tqdm import tqdm

# now we run the stimuli through our model and get their corresponding activations
# we do so for each "story" (identified by passage_id) separately by concatenating
# it's constituent sentences, keeping track of each sentences start and end tokens
# to be able to retrieve their token representans

# from stimulus_id -> 13 x 768 tensor (final representations from each layer)
activations = {}
for story in tqdm(sorted(set(stimulus_set['passage_id'].values))):
    story_stimuli = stimulus_set[stimulus_set['passage_id'] == story]
    
    sentences = []
    stimulus_ids = []
    stimulus_ends = []
    length_so_far = 0
    for _, stimulus in story_stimuli.sort_values(by='sentence_num', ascending=True).iterrows():
        length_so_far += len(stimulus['sentence'])
        sentences.append(stimulus['sentence'])
        stimulus_ids.append(stimulus['stimulus_id'])
        stimulus_ends.append(length_so_far - 1)
        
        # we'll join the sentences with spaces 
        length_so_far += 1
    
    with torch.no_grad():
        tokenized = tokenizer(
            [' '.join(sentences)], 
            add_special_tokens=True,
            return_tensors='pt'
        )
        
        # note that the ending character here is usually a period 
        # (we can experiment w/ the last word by subtracting 1)
        stimulus_token_ends = [
            tokenized.char_to_token(stimulus_end) for stimulus_end in stimulus_ends
        ]
        
        output = model(**tokenized)
        
        for stimulus_id, stimulus_token_end in zip(stimulus_ids, stimulus_token_ends):
            assert stimulus_id not in activations
            
            # get hidden state of each final token for each stimulus
            
            activations[stimulus_id] = torch.stack([
                output.hidden_states[i][0][stimulus_token_end] for i in range(len(output.hidden_states))
            ])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168/168 [00:24<00:00,  6.96it/s]


In [31]:
import pickle

with open('cache/gpt2_activations.pkl', 'wb') as f:
    pickle.dump(activations, f)

In [34]:
import pickle

with open('cache/gpt2_activations.pkl', 'rb') as f:
    activations = pickle.load(f)
    
# sanity check that we have gpt activations for every stimulus
    
assert set(activations.keys()) == set(data['stimulus_id'].values)

In [15]:
import numpy as np
from tqdm import tqdm
from collections import defaultdict

# now we have to split / group the data as done in the neural_nlp repo

# here are our raw voxels
print(data.values.shape)

# for each subject in our data (dim 0), here are their corresponding experiments
experiment_counts = Counter(data['experiment'].values)
print(len(data['experiment']), experiment_counts)

# for each voxel in our data (dim 1), here is its corresponding brain region (atlas)
print(len(data['atlas']), Counter(data['atlas'].values))

# we split the data by experiment and atlas (this is very slow...)
# from brainscore.metrics.transformations import CartesianProduct
# splitter = CartesianProduct(dividers=['experiment', 'atlas'])
# splits = splitter(data, apply=lambda split: split.drop_vars(['experiment', 'atlas']))

experiment_voxels = defaultdict(list)
experiment_voxel_nas = defaultdict(set)
experiment_subjects = defaultdict(list)
experiment_stimuli = defaultdict(list)
for subject_id, subject, stimulus_id, experiment in tqdm(zip(
    range(data.shape[0]), 
    data['subject'].values, 
    data['stimulus_id'].values, 
    data['experiment'].values
)):
    subject_voxels = []
    for voxel_id, atlas in zip(range(data.shape[1]), data['atlas'].values):
        if atlas == 'language':
            voxel = data.values[subject_id][voxel_id]
            if np.isnan(voxel):
                experiment_voxel_nas[experiment].add(len(subject_voxels))
            subject_voxels.append(voxel)

    experiment_voxels[experiment].append(subject_voxels)
    experiment_subjects[experiment].append(subject)
    experiment_stimuli[experiment].append(stimulus_id)

(627, 103900)
627 Counter({'384sentences': 384, '243sentences': 243})
103900 Counter({'visual': 43741, 'MD': 29936, 'language': 13553, 'DMN': 10978, 'auditory': 5692})


627it [00:31, 19.86it/s]


In [16]:
import numpy as np

# we filter out the voxels that are na

for experiment in experiment_voxels:
    print(
        experiment, 
        len(experiment_voxels[experiment]), 
        len(experiment_voxels[experiment][0]), 
        len(experiment_voxel_nas[experiment])
    )
    
experiments = {}
for experiment in experiment_voxels:
    experiments[experiment] = np.array(
        [
            [voxel for voxel_id, voxel in enumerate(voxels) if voxel_id not in experiment_voxel_nas[experiment]]
            for voxels in experiment_voxels[experiment]
        ]
    )
    print(experiment, experiments[experiment].shape)

384sentences 384 13553 1398
243sentences 243 13553 5522
384sentences (384, 12155)
243sentences (243, 8031)


In [37]:
from tqdm import tqdm
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupShuffleSplit

# 2 experiments x 5 folds x 13 layers
experiment_pearsonrs = defaultdict(lambda: np.zeros((5, 13)))
for experiment, brain_reps in experiments.items(): 
    # splits need to be by stimulus_id (how do we shuffle here?)
    # (though really they should be by passage_id given how we're doing the GPT2 encoding...
    # otherwise the test set will leak into the train set...)
    k_folds = GroupShuffleSplit(n_splits=5, train_size=0.8)

    for fold, (train_indices, test_indices) in enumerate(
        k_folds.split(brain_reps, groups=experiment_stimuli[experiment])
    ):
        test_subjects = [
            experiment_subjects[experiment][brain_rep_idx] 
            for brain_rep_idx in test_indices
        ]
        
        train_brain_reps, test_brain_reps = brain_reps[train_indices], brain_reps[test_indices]
        for layer_num in tqdm(range(13), desc='%s-fold%s' % (experiment, fold)):
            train_hidden_states = np.stack([
                activations[experiment_stimuli[experiment][brain_rep_idx]][layer_num].numpy()
                for brain_rep_idx in train_indices
            ])
            test_hidden_states = np.stack([
                activations[experiment_stimuli[experiment][brain_rep_idx]][layer_num] .numpy()
                for brain_rep_idx in test_indices
            ])

            # TODO: Are they doing any kind of hyperparameter tuning
            # (regularization, etc) here?  We're using SKLearn's defaults
            
            model = LinearRegression().fit(train_hidden_states, train_brain_reps)
            pred_brain_reps = model.predict(test_hidden_states)

            # We aggregated voxel/electrode/ROI predictivity scores by taking the
            # median of scores for each participant’s voxels/electrodes/ROIs and
            # then computing the median across participants. Finally, this score was
            # divided by the estimated ceiling value (see below) to yield a final score in
            # the range [0, 1].

            # https://github.com/brain-score/brain-score/blob/master/brainscore/metrics/xarray_utils.py#L78
            # https://github.com/brain-score/brain-score/blob/master/brainscore/metrics/regression.py#L33
            # https://github.com/brain-score/brain-score/blob/master/brainscore/metrics/transformations.py#L42

            # not totally sure this is right...

            layer_pearson_rs_by_subj = defaultdict(list)
            for pred_brain_rep, test_brain_rep, test_subject in zip(pred_brain_reps, test_brain_reps, test_subjects):
                layer_pearson_rs_by_subj[test_subject].append(pearsonr(pred_brain_rep, test_brain_rep))

            experiment_pearsonrs[experiment][fold][layer_num] = np.median([
                np.median(subj_pearson_rs) for subj_pearson_rs in layer_pearson_rs_by_subj.values()
            ])


384sentences-fold0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:24<00:00,  1.87s/it]
384sentences-fold1: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:23<00:00,  1.79s/it]
384sentences-fold2: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:23<00:00,  1.79s/it]
384sentences-fold3: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:25<00:00,  1.92s/it]
384sentences-fold4: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 13/13 

In [38]:
for experiment, pearsonrs in experiment_pearsonrs.items():
    print(experiment)
    for layer_num in range(pearsonrs.shape[1]):
        print((layer_num, np.mean(pearsonrs[:, layer_num]), np.median(pearsonrs[:, layer_num])))

384sentences
(0, 0.0007542024393813523, 9.990087661635725e-06)
(1, 0.0017777627476372741, 2.357027559063973e-09)
(2, 0.008793941305723538, 1.3007383400174443e-06)
(3, 0.013204335414826036, 0.003345693082923446)
(4, 0.0002499079852198603, 0.00011448259210339833)
(5, 0.0027065503204595953, 4.4438247550250133e-07)
(6, 0.0008908926065907989, 0.0006890723884412221)
(7, 0.0015121320130124338, 0.000723096937342519)
(8, 0.0031632659345197907, 1.8181763954417475e-05)
(9, 0.004871656723277321, 0.0013079726432238597)
(10, 0.00040889134113173797, 2.5757334633372933e-06)
(11, 0.005160606227962996, 4.883990555615461e-15)
(12, 0.0076998286128112745, 2.422599302678769e-06)
243sentences
(0, 0.001581102759864113, 1.1841863145481332e-08)
(1, 0.010051317046319164, 0.006394502371637348)
(2, 0.009191029002548591, 0.011375808695666824)
(3, 0.026980557036389907, 0.029602002921695573)
(4, 0.008841013505484187, 9.916468992196217e-05)
(5, 0.026080265066930574, 0.021168201102157207)
(6, 0.006399673583976173, 0.00

In [39]:
# Compare with the ceiling
pereira.ceiling